In [11]:
import os

import pandas as pd


from string import punctuation

from pyvi import ViTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Read Coprus

In [12]:
corpus_path = os.path.join('..', 'Corpus')
topics = os.listdir(corpus_path)
data = []
for topic in topics:
    for file in os.listdir(os.path.join(corpus_path, topic)):
        with open(os.path.join(corpus_path, topic, file), encoding='utf-8') as txt_file:
            link = txt_file.readline()[:-1]
            article = txt_file.read()
        data.append((file[:3], link, article, topic))

data = pd.DataFrame(data, columns=['article_index', 'link', 'article', 'topic'])

# Feature Extraction

In [13]:
with open(os.path.join('..', 'Misc', 'vietnamese-stopwords.txt'), encoding='utf-8') as file:
    stopwords = list(i[:-1] for i in file.readlines())
    
punctuation += '...'
def viet_tokenize(article: str) -> list[str]:
  tokens = []
  text = article.replace('\n', '')
  for token in ViTokenizer.tokenize(text).split():
    if (token in punctuation) or\
        token.isnumeric():
       continue
    tokens.append(token.lower().replace('_', ' '))
  return tokens

In [14]:
vectoriser = TfidfVectorizer(tokenizer=viet_tokenize, stop_words=stopwords,
                             ngram_range=(1,1), min_df=.1, max_df=.6)
tfidf_feat = vectoriser.fit_transform(data['article'])
features_df = pd.DataFrame(tfidf_feat.toarray(), columns=vectoriser.get_feature_names_out())



In [15]:
print(features_df.shape)
features_df

(2432, 212)


Unnamed: 0,...,an toàn,ban,ban đầu,bao gồm,bàn,bác sĩ,bóng,bảng,bảo vệ,....1,đại diện,đại học,đầu,địa phương,đồng,độ,đội,đức,đứng,ảnh hưởng
0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,0.0,0.302320,0.080360,0.0,0.290264,0.00000,0.000000,0.000000,0.000000,0.0
1,0.035204,0.0,0.051499,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,0.0,0.038739,0.000000,0.0,0.260362,0.00000,0.000000,0.000000,0.044511,0.0
2,0.000000,0.0,0.221029,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.066293,0.0,0.000000,0.00000,0.000000,0.000000,0.095520,0.0
3,0.000000,0.0,0.000000,0.0,0.061062,0.000000,0.000000,0.0,0.000000,0.0,...,0.0,0.107230,0.042754,0.0,0.000000,0.16607,0.000000,0.000000,0.061603,0.0
4,0.000000,0.0,0.000000,0.0,0.000000,0.160479,0.146397,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2427,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.000000,0.000000,0.0
2428,0.000000,0.0,0.000000,0.0,0.000000,0.194941,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.142154,0.086354,0.037760,0.0
2429,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.215017,0.000000,0.000000,0.0
2430,0.000000,0.0,0.000000,0.0,0.000000,0.055772,0.000000,0.0,0.350854,0.0,...,0.0,0.000000,0.112464,0.0,0.000000,0.00000,0.254186,0.000000,0.108030,0.0


# Save

In [16]:
from pathlib import Path

import joblib

In [17]:
temp_result_dir = os.path.join('.', 'temp')
if not os.path.exists(temp_result_dir): Path(temp_result_dir).mkdir()

data.to_csv(os.path.join(temp_result_dir, 'data.csv'), index=False)
features_df.to_csv(os.path.join(temp_result_dir, 'features.csv'), index=False)
joblib.dump(vectoriser, os.path.join(temp_result_dir, 'tfidf_vec.pkl'))

['.\\temp\\tfidf_vec.pkl']