In [1]:
import os
import re

import pandas as pd


from string import punctuation

from pyvi import ViTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Read Coprus

In [2]:
corpus_path = os.path.join('..', 'Corpus')
topics = os.listdir(corpus_path)
data = []
for topic in topics:
    for file in os.listdir(os.path.join(corpus_path, topic)):
        with open(os.path.join(corpus_path, topic, file), encoding='utf-8') as txt_file:
            link = txt_file.readline()[:-1]
            article = txt_file.read()
        data.append((file[:3], link, article, topic))

data = pd.DataFrame(data, columns=['article_index', 'link', 'article', 'topic'])

# Feature Extraction

In [5]:
with open(os.path.join('..', 'Misc', 'vietnamese-stopwords.txt'), encoding='utf-8') as file:
    stopwords = list(i[:-1] for i in file.readlines())
    
def viet_tokenize(article: str):
  tokens = []
  text = re.sub(r'[^\w\s]', '', article)
  for token in ViTokenizer.tokenize(text).split():
    if (token in (punctuation + '\n')) or\
        token.isnumeric():
       continue
    tokens.append(token.lower().replace('_', ' '))
  return tokens

In [6]:
vectoriser = TfidfVectorizer(tokenizer=viet_tokenize, stop_words=stopwords,
                             ngram_range=(1,1), min_df=.1, max_df=.6)
tfidf_feat = vectoriser.fit_transform(data['article'])
features_df = pd.DataFrame(tfidf_feat.toarray(), columns=vectoriser.get_feature_names_out())



In [8]:
print(features_df.shape)
features_df

(2432, 206)


Unnamed: 0,an toàn,ban,ban đầu,bao gồm,bàn,bác sĩ,bóng,bảng,bảo vệ,bắt,...,đại diện,đại học,đầu,địa phương,đồng,độ,đội,đức,đứng,ảnh hưởng
0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.308729,0.082363,0.0,0.197870,0.000000,0.000000,0.000000,0.000000,0.0
1,0.0,0.051609,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.038525,0.000000,0.0,0.259257,0.000000,0.000000,0.000000,0.044233,0.0
2,0.0,0.222428,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.000000,0.066442,0.0,0.000000,0.000000,0.000000,0.000000,0.095318,0.0
3,0.0,0.000000,0.0,0.059949,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.105351,0.042158,0.0,0.000000,0.164392,0.000000,0.000000,0.060480,0.0
4,0.0,0.000000,0.0,0.000000,0.166039,0.150897,0.0,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2427,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
2428,0.0,0.000000,0.0,0.000000,0.195273,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.141858,0.086679,0.037681,0.0
2429,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.215722,0.000000,0.000000,0.0
2430,0.0,0.000000,0.0,0.000000,0.055339,0.000000,0.0,0.346813,0.0,0.0,...,0.0,0.000000,0.111654,0.0,0.000000,0.000000,0.251258,0.000000,0.106786,0.0


# Save

In [9]:
from pathlib import Path

import joblib

In [10]:
temp_result_dir = os.path.join('.', 'temp')
if not os.path.exists(temp_result_dir): Path(temp_result_dir).mkdir()

data.to_csv(os.path.join(temp_result_dir, 'data.csv'), index=False)
features_df.to_csv(os.path.join(temp_result_dir, 'features.csv'), index=False)
joblib.dump(vectoriser, os.path.join(temp_result_dir, 'tfidf_vec.pkl'))

['.\\temp\\tfidf_vec.pkl']