### Install libraries

In [1]:
!pip install --upgrade numpy==1.24.4
!pip install --upgrade --force-reinstall gensim

Collecting gensim
  Using cached gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Using cached scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Using cached smart_open-7.1.0-py3-none-any.whl.metadata (24 kB)
Collecting wrapt (from smart-open>=1.8.1->gensim)
  Using cached wrapt-1.17.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.4 kB)
Using cached gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
Using cached scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (

In [2]:
!pip install underthesea scikit-learn transformers torch sentencepiece



### Import main libraries

In [3]:
import gensim
from gensim.models import Word2Vec, KeyedVectors

### Import data

In [91]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
data = pd.read_excel('/content/drive/MyDrive/DATA ANALYSIS/NLP/nlp.xlsx')

### Data preprocessing

In [4]:
import re
import string
from underthesea import word_tokenize, pos_tag

# Vietnamese stopwords (expand this list if needed)
vi_stopwords = set([
    'và', 'là', 'có', 'cho', 'của', 'rằng', 'một', 'những', 'các', 'được',
    'với', 'để', 'thì', 'lại', 'tôi', 'anh', 'em', 'này', 'đó', 'ở', 'ra', 'vào',
    'toàn'
])

slang_dict = {
    "ko": "không",
    "hok": "không",
    "k": "không",
    "j": "gì",
    "thik": "thích",
    "cx": "cũng",
    "mik": "mình",
    "mk": "mình",
    "bt": "bình thường",
    "dc": "được",
    "vs": "với",
    "đc": "được",
    "r": "rồi",
    "wa": "quá",
    "đk": "được",
    "ko bt": "không biết",
    "vl": "bậy bạ",
}

def normalize_slang(text, slang_dict):
    words = text.split()
    normalized = [slang_dict.get(word, word) for word in words]
    return " ".join(normalized)

invalid_indices = []
valid_corpus = []

# Order logic of preprocessing steps: Cleaning ➝ Tokenizing ➝ Filtering

for idx, review in enumerate(data['review']):
    # 1. Clean text: remove digits, lowercase, remove punctuation
    review = re.sub(r"\d+", "", review)
    review = review.lower()
    review = review.translate(str.maketrans('', '', string.punctuation))
    review = re.sub(r'\s+', ' ', review).strip()

    # Extra step: Domain knowledge (normalizing slang, idiom, abbreviations)
    review = normalize_slang(review, slang_dict)

    # 2. Tokenize
    tokens = word_tokenize(review, format="text").split()

    # 3. POS tagging
    tagged = pos_tag(" ".join(tokens))  # list of (word, POS)

    # 4. Keep Nouns, Adjectives, Verbs only
    filtered = [word for word, pos in tagged if pos.startswith(('N', 'A', 'M', 'V', 'R'))]

    # 5. Remove stopwords and single-letter tokens
    final_tokens = [word for word in filtered if word not in vi_stopwords and len(word) > 1]

    # If no valid tokens, log and save index
    if len(final_tokens) == 0:
        print(f"Empty tokens after filtering at index {idx}:")
        print(f"Original review: {data['review'][idx]}")
        invalid_indices.append(idx)
    else:
        valid_corpus.append(final_tokens)

In [5]:
# Define a preprocessing function to reuse
def preprocess_review(text):
    # Clean the text
    text = re.sub(r"\d+", "", text)  # Remove digits
    text = text.lower()  # Lowercase
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces

    # Normalize slang
    text = normalize_slang(text, slang_dict)

    # Tokenize
    tokens = word_tokenize(text, format="text").split()

    # POS tagging
    tagged = pos_tag(" ".join(tokens))

    # Filter for nouns, adjectives, and verbs
    filtered = [word for word, pos in tagged if pos.startswith(('N', 'A', 'M', 'V', 'R'))]

    # Remove stopwords and single-letter words
    final_tokens = [word for word in filtered if word not in vi_stopwords and len(word) > 1]

    return final_tokens

In [6]:
valid_corpus

[['ứng_dụng', 'luyện', 'đầy_đủ', 'cấp_độ'],
 ['quảng_cáo'],
 ['hiệu_quả', 'tuyệt_vời'],
 ['quảng_cáo', 'kinh_khủng'],
 ['ứng_dụng', 'đầy_đủ'],
 ['quảng_cáo', 'âm_thanh', 'tạp_âm'],
 ['quảng_cáo',
  'quảng_cáo',
  'chục',
  'tua',
  'một_chút',
  'quảng_cáo',
  'hát_hò',
  'điên'],
 ['quảng_cáo', 'hoài_phiền'],
 ['ứng_dụng', 'hữu_ích', 'bổ_sung', 'memo', 'đoạn', 'văn'],
 ['hữu_ích_tra', 'nhanh_nghĩa', 'chính_xác', 'chữ', 'hán', 'tuyệt_vời'],
 ['rất', 'hay', 'ý_nghĩa', 'bé'],
 ['rất', 'không', 'quảng_cáo', 'rất', 'thoải_mái'],
 ['rất', 'tốt', 'bé', 'nhà', 'rất', 'thích'],
 ['rất', 'hay', 'bổ_ích', 'thú_vị'],
 ['tuyệt_vời', 'rất', 'vui_vẻ'],
 ['ứng_dụng', 'rất', 'hay', 'bổ_ích'],
 ['rất', 'hay', 'hiệu_quả', 'bé'],
 ['dino', 'đi', 'học', 'rất', 'tốt'],
 ['app',
  'tệ',
  'quá',
  'lúc',
  'không',
  'gửi',
  'mã_otp',
  'lúc',
  'kết_nối',
  'thất_bại',
  'lũ',
  'ngu',
  'làm',
  'app'],
 ['chỗ',
  'ghép',
  'hàng',
  'đủ',
  'số',
  'chữ',
  'con',
  'làm',
  'đúng',
  'rồi',
  'bảo',
  

In [8]:
from gensim.utils import simple_preprocess
from underthesea import sent_tokenize

processed_words = []

for text in valid_corpus:  # `corpus` contains preprocessed reviews (strings or token lists)
    # Join list of tokens into text if needed
    if isinstance(text, list):
        text = ' '.join(text)

    sentences = sent_tokenize(text)  # Vietnamese sentence segmentation
    for sentence in sentences:
        processed_words.append(simple_preprocess(sentence))  # Tokenize, lowercase, clean

### Model selection

#### Word2Vec Model

In [10]:
from gensim.models import Word2Vec

# 1. Train Word2Vec model
w2v_model = Word2Vec(
    sentences=processed_words,  # list of tokenized sentences
    vector_size=10,
    window=5,
    min_count=1, # might cause overfitting
    workers=4,
    sg=1,  # use skip-gram (better for smaller datasets), set sg=0 for CBOW
    seed=42
)

# 2. View vocabularies
print("Top 10 most frequent tokens:", w2v_model.wv.index_to_key[:10])  # top 10 frequent tokens
print("Vocabulary size:", len(w2v_model.wv.index_to_key))

# 3. Corpus size (number of sentences)
print("Number of training sentences:", w2v_model.corpus_count)

# 4. Similar words to input (check if word exists in vocab first)
word = 'điên'
if word in w2v_model.wv:
    print("Words most similar to 'điên':", w2v_model.wv.most_similar(word))
else:
    print(f"'{word}' not in vocabulary")

# 5. Vector shape for a word (again, check if it exists)
word = 'quảng_cáo'
if word in w2v_model.wv:
    print(f"Vector shape of '{word}':", w2v_model.wv[word].shape)
else:
    print(f"'{word}' not in vocabulary")

Top 10 most frequent tokens: ['rất', 'không', 'học', 'app', 'quảng_cáo', 'ứng_dụng', 'tốt', 'hay', 'con', 'bé']
Vocabulary size: 189
Number of training sentences: 50
Words most similar to 'điên': [('mìnhrất_ý', 0.7684803009033203), ('nhiều', 0.728179931640625), ('âm_thanh', 0.7148966193199158), ('vẫn', 0.7006509900093079), ('đạt', 0.6749110221862793), ('vừa', 0.6685372591018677), ('dạy', 0.6450703740119934), ('phải', 0.5831232070922852), ('tư_duy', 0.5791965126991272), ('nghĩamà', 0.5183154940605164)]
Vector shape of 'quảng_cáo': (10,)


In [12]:
import numpy as np

def avg_word2vec(doc, model):
    # filter out-of-vocab words
    valid_words = [word for word in doc if word in model.wv.index_to_key]

    # debug print
    print(valid_words)

    if not valid_words:
        # fallback: return zero vector with same shape as model vector size
        return np.zeros(model.vector_size)

    # average vectors of all valid words
    return np.mean([model.wv[word] for word in valid_words], axis=0)

In [13]:
!pip install tqdm



In [14]:
from tqdm import tqdm

In [47]:
# independent
X_w2v = []
# for i in tqdm(range(len(processed_words))):
for i in range(len(processed_words)):
  X_w2v.append(avg_word2vec(processed_words[i],w2v_model))

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_w2v = scaler.fit_transform(X_w2v)

X_new_w2v = np.array(X_w2v)

['ứng_dụng', 'luyện', 'đầy_đủ', 'cấp_độ']
['quảng_cáo']
['hiệu_quả', 'tuyệt_vời']
['quảng_cáo', 'kinh_khủng']
['ứng_dụng', 'đầy_đủ']
['quảng_cáo', 'âm_thanh', 'tạp_âm']
['quảng_cáo', 'quảng_cáo', 'chục', 'tua', 'một_chút', 'quảng_cáo', 'hát_hò', 'điên']
['quảng_cáo', 'hoài_phiền']
['ứng_dụng', 'hữu_ích', 'bổ_sung', 'memo', 'đoạn', 'văn']
['hữu_ích_tra', 'nhanh_nghĩa', 'chính_xác', 'chữ', 'hán', 'tuyệt_vời']
['rất', 'hay', 'ý_nghĩa', 'bé']
['rất', 'không', 'quảng_cáo', 'rất', 'thoải_mái']
['rất', 'tốt', 'bé', 'nhà', 'rất', 'thích']
['rất', 'hay', 'bổ_ích', 'thú_vị']
['tuyệt_vời', 'rất', 'vui_vẻ']
['ứng_dụng', 'rất', 'hay', 'bổ_ích']
['rất', 'hay', 'hiệu_quả', 'bé']
['dino', 'đi', 'học', 'rất', 'tốt']
['app', 'tệ', 'quá', 'lúc', 'không', 'gửi', 'mã_otp', 'lúc', 'kết_nối', 'thất_bại', 'lũ', 'ngu', 'làm', 'app']
['chỗ', 'ghép', 'hàng', 'đủ', 'số', 'chữ', 'con', 'làm', 'đúng', 'rồi', 'bảo', 'sai']
['trò', 'đã', 'bạn', 'nhiều', 'kiến_thức']
['app', 'học', 'tốt', 'giúp', 'con', 'phát_triển', 

#### PhoBERT model

In [48]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load PhoBERT base model
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)
phoBert_model = AutoModel.from_pretrained("vinai/phobert-base")

In [49]:
def get_bert_embedding(text_tokens, model):
    """
    text_tokens: list of tokens (joined by whitespace), e.g., ['quảng_cáo', 'đầy_đủ']
    """
    sentence = " ".join(text_tokens)  # Join tokens into a single string
    inputs = tokenizer(
      sentence,
      return_tensors="pt",
      truncation=True,
      padding=True,
      max_length=128  # or 256 depending on your content
      )

    with torch.no_grad():
        outputs = model(**inputs)

    # Get the average of the last hidden state (you could also use [CLS] token)
    last_hidden_state = outputs.last_hidden_state  # [batch_size, seq_len, hidden_size]
    embedding = torch.mean(last_hidden_state, dim=1)  # mean pooling over tokens
    return embedding.squeeze().numpy()

In [50]:
X_pb = []

for tokens in processed_words:
    embedding = get_bert_embedding(tokens, phoBert_model)
    X_pb.append(embedding)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_pb = scaler.fit_transform(X_pb)

X_new_pb = np.array(X_pb)

In [51]:
from sklearn.decomposition import PCA # Not working for cases since PCA requires n_components requires the n_samples at least equal to its size (n_components ≤ min(n_samples, n_features))

# apply PCA on PhoBERT vectors (e.g., from 768 → 10):
pca = PCA(n_components=10)
X_new_pb = pca.fit_transform(X_new_pb)

# from sklearn.decomposition import TruncatedSVD

# svd = TruncatedSVD(n_components=100)
# X_pb = svd.fit_transform(X_pb)

In [52]:
from joblib import dump
# Save both the scaler and PCA
dump(scaler, 'scaler_pb.joblib')
dump(pca, 'pca_pb.joblib')

['pca_pb.joblib']

### Apply model on data

In [53]:
print("Shape of Word2Vec features:", X_new_w2v.shape)

Shape of Word2Vec features: (50, 10)


In [54]:
print("Shape of PhoBERT features:", X_new_pb.shape)

Shape of PhoBERT features: (50, 10)


In [55]:
# # Dependent - output
# y = pd.get_dummies(data['sentiment']) # create 2 new cols: 0 and 1
# y = y.iloc[:,0].values # 0 = first column; -1 = last column

In [56]:
y = data['sentiment'] # since the value of sentiment column is already a binary

In [57]:
# transform X from array to dataframe
df_w2v = pd.DataFrame(X_new_w2v)
df_pb = pd.DataFrame(X_new_pb)

# df.columns = [f'feature_{i}' for i in range(df.shape[1])]

X_new_w2v = df_w2v
X_new_pb = df_pb

### Train test split

In [58]:
# train test split
from sklearn.model_selection import train_test_split
X1_train, X1_test, y1_train, y1_test = train_test_split(X_new_w2v, y, test_size = 0.20, random_state = 0)
X2_train, X2_test, y2_train, y2_test = train_test_split(X_new_pb, y, test_size = 0.20, random_state = 0)

### ML model

In [59]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()

In [92]:
W2V_classifier = classifier.fit(X1_train, y1_train) # cannot handle 0, null values from train data

In [93]:
PB_classifier = classifier.fit(X2_train, y2_train) # cannot handle 0, null values from train data

### Predict on test set

In [94]:
y1_pred = W2V_classifier.predict(X1_test)

In [95]:
y2_pred = PB_classifier.predict(X2_test)

### Evaluation metrics

In [96]:
from sklearn.metrics import accuracy_score, classification_report

In [97]:
print(accuracy_score(y1_test, y1_pred))
print(accuracy_score(y2_test, y2_pred))

0.9
0.9


In [98]:
print(classification_report(y1_test, y1_pred))
print(classification_report(y2_test, y2_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.90      1.00      0.95         9

    accuracy                           0.90        10
   macro avg       0.45      0.50      0.47        10
weighted avg       0.81      0.90      0.85        10

              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       1.00      0.89      0.94         9

    accuracy                           0.90        10
   macro avg       0.75      0.94      0.80        10
weighted avg       0.95      0.90      0.91        10



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [99]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
skf = StratifiedKFold(n_splits=5)
scores = cross_val_score(W2V_classifier, X1_train, y1_train, cv=skf, scoring="recall")
print("W2V: precision (5‑fold CV):", scores)

W2V: precision (5‑fold CV): [0.33333333 1.         1.         1.         1.        ]


In [100]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
skf = StratifiedKFold(n_splits=5)
scores = cross_val_score(PB_classifier, X2_train, y2_train, cv=skf, scoring="precision")
print("PB: recall (5‑fold CV):", scores)

PB: recall (5‑fold CV): [0.83333333 0.66666667 0.83333333 0.83333333 0.71428571]


In [101]:
from collections import Counter
print("W2v - Train labels:", Counter(y1_train))
print("W2v - Test  labels:", Counter(y1_test))

W2v - Train labels: Counter({1: 26, 0: 14})
W2v - Test  labels: Counter({1: 9, 0: 1})


In [102]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y1_test, y1_pred, labels=[0,1])
print("Confusion matrix:\n", cm)

Confusion matrix:
 [[0 1]
 [0 9]]


In [103]:
cm = confusion_matrix(y2_test, y2_pred, labels=[0,1])
print("Confusion matrix:\n", cm)

Confusion matrix:
 [[1 0]
 [1 8]]


### Classify on new data

In [104]:
# new_text = "tạp âm ! quảng cáo hoài phiền lắm" # PB better
# new_text = "Bài học này hay và thú vị! Mình học xong rồi." # Both
# new_text = "Mình ấn vào ứng dụng xong bị out ra luôn dù mk đã xoá app và tải lại rồi, vừa cập nhật phiên bản mới xong cũng k vào được luôn" # PB better
new_text = "Thích phiên bản cũ có lớp cộng đồng hơn" # None (but this case is hard)

# Preprocess the new text
processed_text = preprocess_review(new_text)

## WORD2VEC
# Get the vector representation using Word2Vec model
text_vector = np.mean([w2v_model.wv[word] for word in processed_text if word in w2v_model.wv.index_to_key], axis=0)

w2v_prediction = W2V_classifier.predict([text_vector])
print("W2V Predicted class:", w2v_prediction)

## PhoBERT
from joblib import load

# Load scaler and PCA
scaler = load('scaler_pb.joblib')
pca = load('pca_pb.joblib')

# Get PhoBERT embedding for the input sentence
embedding = get_bert_embedding(processed_text, phoBert_model)  # assuming this returns a single 768-d vector
X_new_pb = [embedding]  # Wrap in a list to make it 2D

# Apply saved scaler and PCA
text_vector_pb = scaler.transform(X_new_pb)
text_vector = pca.transform(text_vector_pb)

pb_prediction = PB_classifier.predict(text_vector)
print("PB Predicted class:", pb_prediction)

W2V Predicted class: [1]
PB Predicted class: [1]


In [105]:
# apply hyperparameter tuning in random forest