In [None]:
!pip install pyvi

In [None]:
!unzip /content/drive/MyDrive/data_train.zip

In [None]:
import codecs
import os
import pickle
import re
from tokenize import group

import gensim
import pandas as pd
import regex
from nltk.stem import WordNetLemmatizer
from pyvi import ViTokenizer, ViPosTagger
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from tqdm import tqdm

# **Tiền xử lý**

In [None]:
def remove_html(txt):
  """Return ``txt`` with every ``<...>`` span deleted.

  A span runs from a ``<`` to the nearest following ``>``; a ``<`` that is
  never closed is left untouched. No entity decoding is performed.
  """
  tag_pattern = r'<[^>]*>'
  without_tags = regex.sub(tag_pattern, '', txt)
  return without_tags
# Stop-word lexicon consulted by remove_stopwords(); one entry per line of the file.
# Kept as a set so each `word in stopword` test is a hash lookup rather than a
# scan over a list.
with open('/content/drive/MyDrive/vietnamese-stopwords.txt', 'r', encoding='utf-8') as _fp:
  # strip() removes the trailing newline and stray surrounding whitespace;
  # blank lines are skipped.
  # Spaces inside an entry become '_' because the cleaned text is segmented by
  # ViTokenizer, which joins the syllables of one word with '_' (e.g. hôm_nay),
  # and remove_stopwords() compares whitespace-separated tokens: an entry that
  # still contained a space could never match. If the file already uses '_',
  # this is a no-op.
  stopword = {
      entry.strip().replace(' ', '_')
      for entry in _fp
      if entry.strip()
  }
def remove_stopwords(line, stopwords=None):
  """Drop stop words from a whitespace-tokenized string.

  Args:
    line: text whose tokens are separated by whitespace.
    stopwords: optional container of tokens to remove (anything supporting
      ``in``). When omitted, the module-level ``stopword`` collection is used,
      which is the original behaviour.

  Returns:
    The remaining tokens, in their original order, joined by single spaces.
    An empty string if nothing remains.
  """
  if stopwords is None:
    # Looked up at call time so the function sees the lexicon loaded by the notebook.
    stopwords = stopword
  return ' '.join(token for token in line.split() if token not in stopwords)
def text_preprocess(document):
    """Normalize one raw document into a lower-case, segmented, punctuation-free string.

    Steps, in order:
      1. strip HTML tags (remove_html);
      2. lemmatize each whitespace-separated word with NLTK's WordNetLemmatizer;
      3. segment words with ViTokenizer;
      4. lower-case;
      5. delete every character that is neither whitespace nor a word character;
      6. squeeze whitespace runs to one space and trim;
      7. collapse a run of the same ASCII letter to a single letter.

    Args:
        document: raw text.

    Returns:
        The cleaned text as a single-spaced string.
    """
    document = remove_html(document)
    # NOTE(review): WordNet is an English lexicon, so this step can only affect
    # English-looking words; confirm it is wanted for Vietnamese input. It also
    # needs the NLTK 'wordnet' corpus, and no nltk.download(...) call is visible
    # in this notebook.
    lemmatizer = WordNetLemmatizer()
    document = ' '.join(lemmatizer.lemmatize(word) for word in document.split())
    document = ViTokenizer.tokenize(document)
    document = document.lower()
    document = regex.sub(r'[^\s\wáàảãạăắằẳẵặâấầẩẫậéèẻẽẹêếềểễệóòỏõọôốồổỗộơớờởỡợíìỉĩịúùủũụưứừửữựýỳỷỹỵđ_]', '', document)
    document = regex.sub(r'\s+', ' ', document).strip()
    # Collapse elongated letters ("ngonnnn" -> "ngon"). The replacement is the
    # matched letter itself, so the text stays lower-case; upper-casing it here
    # would turn "ngonnnn" into "ngoN" and break lower-case stop-word matching.
    document = re.sub(r'([A-Z])\1+', r'\1', document, flags=re.IGNORECASE)
    return document


# **Tải dữ liệu**

In [None]:
def load_data(directory_path, label):
    """Read every file in a directory, clean it, and return a labelled DataFrame.

    Args:
        directory_path: folder whose regular files each hold one UTF-8 document.
        label: value stored in the 'label' column of every row.

    Returns:
        DataFrame with columns 'text' (the file content passed through
        text_preprocess and then remove_stopwords) and 'label', one row per
        file, indexed 0..n-1. If the folder holds no files the frame is empty
        but still has both columns.
    """
    texts = []
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order (and therefore the saved CSV) reproducible across runs.
    for file_name in sorted(os.listdir(directory_path)):
        file_path = os.path.join(directory_path, file_name)
        # Skip sub-directories (e.g. a stray .ipynb_checkpoints): open() would raise on them.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            raw_text = file.read()
        texts.append(remove_stopwords(text_preprocess(raw_text)))
    # Build the frame once instead of concatenating one single-row frame per file.
    return pd.DataFrame(
        {'text': texts, 'label': [label] * len(texts)},
        columns=['text', 'label'],
    )
# Build the training split: files under pos/ get label 1, files under neg/ get
# label 0. The combined frame is cached as CSV.
pos_path, neg_path = '/content/data_train/train/pos', '/content/data_train/train/neg'
pos_data = load_data(pos_path, 1)
neg_data = load_data(neg_path, 0)
traindata = pd.concat((pos_data, neg_data), ignore_index=True)
traindata.to_csv('/content/traindata.csv', index=False)

In [None]:
# Build the test split. load_data() is defined once, in the training-data cell
# above, and is reused here rather than being declared a second time.
pos_path = '/content/data_train/test/pos'
neg_path = '/content/data_train/test/neg'
pos_data = load_data(pos_path, 1)  # files under pos/ -> label 1
neg_data = load_data(neg_path, 0)  # files under neg/ -> label 0
testdata = pd.concat([pos_data, neg_data], ignore_index=True)
testdata.to_csv('/content/testdata.csv', index=False)

# **Trích xuất đặc trưng**

In [None]:
# Reload the cleaned splits from the CSV files written by the loading cells
# (same absolute paths as the to_csv calls).
# dtype=str + keep_default_na=False keep the 'text' column as plain strings:
# by default read_csv turns an empty field, or a text that is exactly e.g.
# "nan" / "null" / "na", into float NaN, which TfidfVectorizer rejects.
traindata = pd.read_csv('/content/traindata.csv', dtype={'text': str}, keep_default_na=False)
testdata = pd.read_csv('/content/testdata.csv', dtype={'text': str}, keep_default_na=False)
X_train = traindata['text']
y_train = traindata['label']
X_test = testdata['text']
y_test = testdata['label']

In [None]:
# Preview the first rows of the training split (columns: text, label).
traindata.head()

Unnamed: 0,text,label
0,hôm_nay mua 30 sinh_nhật chất lương xưa tuyệt_...,1
1,đồ uống chất_lượng thái_độ phục_vụ ok uống quá...,1
2,matcha đầu thử ghiền kem trà xanh cho_đến bây_...,1
3,đồ nướng thoải_mái toàn xiên xiên tươi nướng n...,1
4,quán nằm vị_trí mặt_tiền đường mặc_dù gửi xe 1...,1


In [None]:
# Preview the first rows of the test split (columns: text, label).
testdata.head()

Unnamed: 0,text,label
0,đạo phật thỉnh_thoảng đi ăn_chay lắm nhà_hàng ...,1
1,món phong_cách phục_vụ đặc_biệt bảo_vệ tận_tìn...,1
2,trà sữa ngon giá ok rẻ ding_tea chatime cốc to...,1
3,đồ chiên ngon kem test súp cua dở giá rẻ hình_...,1
4,quán 6 vị bắp giá rẻ ổn tuy_nhiên phô_mai mặn ...,1


In [None]:
# Word-level TF-IDF limited to 30000 features; the vocabulary and IDF weights
# are learned from the training split only and then applied to the test split.
tfidf_vect = TfidfVectorizer(analyzer='word', max_features=30000)
X_train_tfidf = tfidf_vect.fit_transform(X_train)
X_test_tfidf = tfidf_vect.transform(X_test)

# Reduce the sparse TF-IDF matrices to 300 dense components. The projection is
# fitted on the training matrix; both splits then go through transform().
svd = TruncatedSVD(n_components=300, random_state=42).fit(X_train_tfidf)
X_train_tfidf_svd = svd.transform(X_train_tfidf)
X_test_tfidf_svd = svd.transform(X_test_tfidf)

# **Mô hình KNN**

In [None]:
# k-nearest-neighbours classifier (k = 5) on the SVD-reduced TF-IDF features,
# scored on the test split.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train_tfidf_svd, y_train)
y_pred = knn_model.predict(X_test_tfidf_svd)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

Accuracy: 0.6944
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.67      0.69      5000
           1       0.68      0.72      0.70      5000

    accuracy                           0.69     10000
   macro avg       0.70      0.69      0.69     10000
weighted avg       0.70      0.69      0.69     10000



# **Mô hình SVM**

In [None]:
# Linear-kernel SVM on the SVD-reduced TF-IDF features.
# X_train_tfidf_svd / X_test_tfidf_svd come from the feature-extraction cell.
# Refitting a TruncatedSVD here with the same input, n_components and
# random_state would only repeat that work and rebind the shared matrices, so
# every model is trained and scored on the same projections.
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_tfidf_svd, y_train)
y_pred = svm_model.predict(X_test_tfidf_svd)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.8543
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.84      0.85      5000
           1       0.85      0.87      0.86      5000

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



# **Mô hình Naive Bayes**

In [None]:
# Multinomial Naive Bayes on the SVD-reduced TF-IDF features.
# MultinomialNB (imported with the other scikit-learn estimators in the imports
# cell) rejects negative training inputs, while SVD components can be negative,
# so each component is first rescaled to [0, 1] using the ranges observed on
# the training split.
# NOTE(review): test values outside the training range fall outside [0, 1]
# after scaling; confirm this is acceptable for the model.
scaler = MinMaxScaler()
X_train_tfidf_svd_scaled = scaler.fit_transform(X_train_tfidf_svd)
X_test_tfidf_svd_scaled = scaler.transform(X_test_tfidf_svd)
naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(X_train_tfidf_svd_scaled, y_train)
y_pred = naive_bayes_model.predict(X_test_tfidf_svd_scaled)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.8045
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.80      0.80      5000
           1       0.80      0.81      0.81      5000

    accuracy                           0.80     10000
   macro avg       0.80      0.80      0.80     10000
weighted avg       0.80      0.80      0.80     10000

