In [47]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import string
from pyvi import ViTokenizer, ViPosTagger
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.naive_bayes import MultinomialNB
from gensim.models import word2vec
from tensorflow.keras import preprocessing
from src.dtprocess import cleandt

## Import and Adjust data

In [48]:
vnexpress = pd.read_csv('./data/vnexpress/csv/vnexpress.csv').drop(columns=['Unnamed: 0'], axis=1)
vnexpress.shape

(2481, 6)

In [49]:
vnexpress['tag'] = vnexpress['content'] + vnexpress['title'] + vnexpress['description']
vnexpress.drop(columns=['content', 'title', 'description'], inplace=True)

In [50]:
mask = vnexpress['tag'].isna()
vnexpress[mask]

Unnamed: 0,article_id,topic,sub-topic,tag
110,110,doi-song,bai-hoc-song,
136,136,doi-song,bai-hoc-song,
215,215,doi-song,nha,
337,337,doi-song,tieu-dung,
407,407,du-lich,diem-den,
...,...,...,...,...
2338,2338,the-thao,photo,
2353,2353,the-thao,photo,
2361,2361,the-thao,photo,
2381,2381,the-thao,photo,


In [51]:
vnexpress.dropna(axis=0, inplace=True)
vnexpress.reset_index(inplace=True)
vnexpress.drop(columns=['index'], axis=1, inplace=True)
vnexpress.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2418 entries, 0 to 2417
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   article_id  2418 non-null   int64 
 1   topic       2418 non-null   object
 2   sub-topic   2418 non-null   object
 3   tag         2418 non-null   object
dtypes: int64(1), object(3)
memory usage: 75.7+ KB


In [52]:
vnexpress['tag'] = vnexpress['tag'].apply(lambda x: x.lower())
vnexpress['tag'] = vnexpress['tag'].apply(lambda x: cleandt.remove_punctuation(x))
vnexpress['tag'] = vnexpress['tag'].apply(ViTokenizer.tokenize)
vnexpress['tag'] = vnexpress['tag'].apply(lambda x: cleandt.remove_stopword(x, './data/vietnamese-stopwords-dash.txt'))

In [53]:
vnexpress['topic'].value_counts()

topic
giai-tri    614
the-thao    498
khoa-hoc    410
doi-song    361
giao-duc    355
du-lich     180
Name: count, dtype: int64

In [54]:
news = vnexpress.copy()
label_encoder = LabelEncoder()
news['topic'] = label_encoder.fit_transform(vnexpress['topic'])

news.head()

Unnamed: 0,article_id,topic,sub-topic,tag
0,0,0,to-am,xin_lỗi đi thế_hệ câu trẻ anh_chị_em bạn_bè xô...
1,1,0,to-am,nghiên_cứu đại_học wisconsinmadison mỹ xem_xét...
2,2,0,to-am,yếu_tố độ trẻ quan_hệ cha_mẹ hai mức_độ xung_đ...
3,3,0,to-am,đôi_mươi trần thị liên_kết_hôn lâm_sơn hải ấp ...
4,4,0,to-am,hoàng thị hòa 34 hà nam chồng học 3 km bố_mẹ h...


In [55]:
X = news[['article_id', 'tag']]
y = news[['topic']]
news_train, news_test, label_train, label_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [56]:
print("The length of news_train: ", len(news_train))
print("The length of news_test: ", len(news_test))

The length of news_train:  1934
The length of news_test:  484


In [57]:
print(label_train.value_counts())
print(label_test.value_counts())

topic
2        483
5        393
4        336
0        301
3        272
1        149
Name: count, dtype: int64
topic
2        131
5        105
3         83
4         74
0         60
1         31
Name: count, dtype: int64


## Re-present text data in numeric vector by pre-train Word2Vec model

In [83]:
# Đệm vector cho từng word với số chiều chiều là 128
def mean_text_embedding(model, data):
    sequences = data['tag'].to_list()
    input_gensim = []

    for sen in sequences:
        input_gensim.append(sen.split())
    
    text_embeddings = []
    for text in input_gensim:
        text_embedding = [model.wv[word] for word in text if word in model.wv]
        text_embeddings.append(text_embedding)
        
    # Tính giá trị trung bình cho từng word vector, mỗi giá trị trung bình đại diện cho 1 word
    mean_text_embeddings = []
    for emebbed in tqdm(text_embeddings):
        temp = np.mean(emebbed, axis=1)
        mean_text_embeddings.append(list(temp))
    
    return mean_text_embeddings

In [91]:
def text_padding(mean_text_embeddings):
    # Đệm thêm 0 để đủ độ dài trong từng văn bản
    max_length = max([len(i) for i in mean_text_embeddings])
    mean_text_embeddings_padding = np.array(preprocessing.sequence.pad_sequences(mean_text_embeddings, maxlen=1995, padding='pre', dtype='float32'))
    
    return mean_text_embeddings_padding

In [85]:
model = word2vec.Word2Vec.load("./model/word.model")

In [86]:
mean_text_embeddings = mean_text_embedding(model, news_train)

100%|██████████| 1934/1934 [00:00<00:00, 4487.24it/s]


In [87]:
X_train = text_padding(mean_text_embeddings)
X_train[0:5]

array([[ 0.        ,  0.        ,  0.        , ..., -0.00202429,
        -0.00148585, -0.0094535 ],
       [ 0.        ,  0.        ,  0.        , ...,  0.02569173,
         0.00980861, -0.02831943],
       [ 0.        ,  0.        ,  0.        , ...,  0.01418668,
        -0.00689139, -0.02060469],
       [ 0.        ,  0.        ,  0.        , ..., -0.03609761,
        -0.04463908, -0.03185184],
       [ 0.        ,  0.        ,  0.        , ..., -0.00892404,
        -0.00881314, -0.01030831]], dtype=float32)

## Create text classification model by Naive Bayes

In [64]:
mnb = MultinomialNB()
mnb.fit(abs(X_train), label_train)

  y = column_or_1d(y, warn=True)


In [92]:
mean_text_embeddings_test = mean_text_embedding(model, news_test)
X_test = text_padding(mean_text_embeddings_test)

100%|██████████| 484/484 [00:00<00:00, 4566.12it/s]


In [93]:
mnb.predict(abs(X_test))

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,