In [18]:
import os

import pandas as pd

from util import stopwords, viet_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer

# Read Corpus

In [19]:
# Load every article under ../Corpus/<topic>/<file> into one DataFrame.
# Each file is read as: first line = link, remainder = article body.
corpus_path = os.path.join('..', 'Corpus')

# os.listdir() returns entries in arbitrary order, so sort to keep the row
# order reproducible between runs and machines. Only directories are treated
# as topics, so stray files in the corpus root do not break the loop.
topics = sorted(
    entry for entry in os.listdir(corpus_path)
    if os.path.isdir(os.path.join(corpus_path, entry))
)

records = []
for topic in topics:
    topic_dir = os.path.join(corpus_path, topic)
    for file in sorted(os.listdir(topic_dir)):
        file_path = os.path.join(topic_dir, file)
        # Skip anything that is not a regular file (e.g. nested folders).
        if not os.path.isfile(file_path):
            continue
        with open(file_path, encoding='utf-8') as txt_file:
            # Strip only the line terminator; slicing with [:-1] would eat the
            # last character of the link when the line has no trailing newline.
            link = txt_file.readline().rstrip('\n')
            article = txt_file.read()
        # The first three characters of the file name serve as the article index.
        records.append((file[:3], link, article, topic))

data = pd.DataFrame(records, columns=['article_index', 'link', 'article', 'topic'])

# Feature Extraction

In [36]:
# TF-IDF over word uni-, bi- and tri-grams, tokenised with the project's
# Vietnamese tokenizer and filtered by its stop-word list. As float values,
# min_df / max_df are document-frequency proportions: terms found in fewer
# than 5% or more than 70% of the articles are dropped from the vocabulary.
vectoriser = TfidfVectorizer(
    tokenizer=viet_tokenize,
    stop_words=stopwords,
    ngram_range=(1, 3),
    min_df=.05,
    max_df=.7,
)

# Learn the vocabulary and weights, and vectorise the corpus in one pass.
tfidf_feat = vectoriser.fit_transform(data['article'])

# Dense view of the sparse matrix, one column per vocabulary term.
features_df = pd.DataFrame(
    tfidf_feat.toarray(),
    columns=vectoriser.get_feature_names_out(),
)



In [37]:
# Print the (rows, columns) shape of the feature table, then leave the frame
# as the cell's last expression so the notebook renders it as the output.
print(features_df.shape)
features_df

(2432, 702)


Unnamed: 0,a,an,an khang,an toàn,australia,ban,ban đầu,bao gồm,bay,biến,...,đội tuyển,động vật,đợt,đức,đứng,ưu tiên,ảnh,ảnh hưởng,ổn định,ứng dụng
0,0.000000,0.00000,0.0,0.0,0.000000,0.000000,0.0,0.0000,0.0,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0
1,0.000000,0.00000,0.0,0.0,0.000000,0.032266,0.0,0.0000,0.0,0.0,...,0.000000,0.0,0.037188,0.000000,0.027888,0.0,0.000000,0.0,0.033319,0.0
2,0.000000,0.00000,0.0,0.0,0.000000,0.147407,0.0,0.0000,0.0,0.0,...,0.000000,0.0,0.000000,0.000000,0.063703,0.0,0.000000,0.0,0.076110,0.0
3,0.000000,0.00000,0.0,0.0,0.117756,0.000000,0.0,0.0411,0.0,0.0,...,0.000000,0.0,0.000000,0.000000,0.041463,0.0,0.000000,0.0,0.000000,0.0
4,0.211352,0.05126,0.0,0.0,0.000000,0.000000,0.0,0.0000,0.0,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.048346,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2427,0.000000,0.00000,0.0,0.0,0.000000,0.000000,0.0,0.0000,0.0,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0
2428,0.000000,0.00000,0.0,0.0,0.000000,0.000000,0.0,0.0000,0.0,0.0,...,0.000000,0.0,0.000000,0.066136,0.028919,0.0,0.000000,0.0,0.000000,0.0
2429,0.000000,0.00000,0.0,0.0,0.000000,0.000000,0.0,0.0000,0.0,0.0,...,0.154562,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0
2430,0.000000,0.00000,0.0,0.0,0.000000,0.000000,0.0,0.0000,0.0,0.0,...,0.233441,0.0,0.000000,0.000000,0.084942,0.0,0.000000,0.0,0.000000,0.0


# Save

In [38]:
from pathlib import Path

import joblib

# Persist the raw corpus table, the dense TF-IDF features and the fitted
# vectoriser so that later steps can reuse them.
temp_result_dir = os.path.join('.', 'temp')
output_dir = os.path.join('..', 'Output')

# Create both target directories up front. exist_ok=True makes the cell safe
# to re-run, and creating ../Output avoids a FileNotFoundError from
# joblib.dump on a fresh checkout where that folder does not exist yet.
for target_dir in (temp_result_dir, output_dir):
    Path(target_dir).mkdir(parents=True, exist_ok=True)

# index=False: the default integer index is not written to the CSV files.
data.to_csv(os.path.join(temp_result_dir, 'data.csv'), index=False)
features_df.to_csv(os.path.join(temp_result_dir, 'features.csv'), index=False)

joblib.dump(vectoriser, os.path.join(output_dir, 'tfidf_vec.pkl'))