# Classification for Vietnamese Text

Dataset: UIT-VSMEC (http://nlp.uit.edu.vn/datasets/)

## Set up

In [1]:
# Install packages
!pip install pandas underthesea emoji demoji langdetect gensim huggingface_hub

Collecting numpy>=1.23.2 (from pandas)
  Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.2.4
    Uninstalling numpy-2.2.4:
      Successfully uninstalled numpy-2.2.4
Successfully installed numpy-1.26.4


In [2]:
!pip install --upgrade --force-reinstall numpy
!pip install --upgrade --force-reinstall pandas

Collecting numpy
  Using cached numpy-2.2.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Using cached numpy-2.2.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.4 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.3 requires numpy<2.0,>=1.18.5, but you have numpy 2.2.4 which is incompatible.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.2.4 which is incompatible.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 2.2.4 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-2.2.4
Collecting pandas
  Using cached pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x

In [3]:
# Import libraries for general purpose
import copy
import inspect
import json
import torch
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer, RobertaForSequenceClassification, BertTokenizer, BertForSequenceClassification, AdamW
from transformers import AutoModelForSequenceClassification, EarlyStoppingCallback, Trainer, TrainingArguments
from torch.utils.data import DataLoader, TensorDataset

# Text cleaning
import re
import emoji
import demoji
import unicodedata
from langdetect import detect, LangDetectException
from underthesea import word_tokenize, text_normalize

# Data preprocessing
from sklearn.model_selection import train_test_split

# Gensim for Doc2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# sklearn imports
from keras.models import Sequential
from keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, confusion_matrix, make_scorer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn import linear_model
from sklearn.tree import DecisionTreeClassifier

# Set style for plots.
# `sns.despine()` was dropped from this cell: it only strips spines from axes
# that already exist, and no figure has been drawn yet at setup time, so the
# call had no styling effect on the plots created later in the notebook.
sns.set_style("whitegrid")
plt.rc("figure", autolayout=True)
plt.rc("axes", labelweight="bold", labelsize="large", titleweight="bold", titlepad=10)

# Handle warnings
import warnings
warnings.filterwarnings('ignore')

KeyboardInterrupt: 

In [None]:
# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

In [None]:
# from huggingface_hub import notebook_login

# notebook_login()

## Part 1: Build a dataset and preprocess


### Load dataset


In [None]:
def load_data(path):
  """
  Read one dataset workbook and return its labelled sentences.

  Args:
      path: Location of the Excel file (anything `pd.read_excel` accepts).

  Returns:
      DataFrame with the columns `Emotion` and `Sentence`; the leading
      index column stored in the sheet is discarded.
  """
  frame = pd.read_excel(path, sheet_name='Sheet1')
  # The sheet is expected to hold exactly three columns, in this order.
  frame.columns = ['index', 'Emotion', 'Sentence']
  return frame.drop(columns=['index'])

# Download the train / validation / test workbooks from the project's GitHub repository.
_SPLIT_URL = 'https://github.com/patuaans/vietnamese-sentiment-analysis/blob/main/{}_nor_811.xlsx?raw=true'
train_df, val_df, test_df = [load_data(_SPLIT_URL.format(split)) for split in ('train', 'valid', 'test')]

# Stack the three splits into a single frame with a fresh 0..n-1 index
df = pd.concat([train_df, val_df, test_df], ignore_index=True)

In [None]:
df.head()

In [None]:
df.info()

**=> No Missing Values**

In [None]:
df.duplicated().sum()

**=> There are some duplicated comments; we will remove them.**

In [None]:
df = df[~df.duplicated()]

In [None]:
df.info()

In [None]:
unique_sentiments = df.Emotion.unique()
num_labels = len(unique_sentiments)
unique_sentiments

**=> 7 labels**

In [None]:
ax = df.Emotion.value_counts().plot.bar()
ax.bar_label(ax.containers[0])
ax.set_xticklabels(ax.get_xticklabels(), rotation=0)
ax.set_yticks([])
plt.show()

**=> The labels are imbalanced, but we still want to classify all 7 classes.**

In [None]:
# mapping to the sentiment column
dicto = {'Disgust': 0, 'Enjoyment': 1 , 'Anger': 2, 'Surprise': 3, 'Sadness': 4, 'Fear': 5, 'Other': 6}

df.Emotion = df.Emotion.map(dicto)

In [None]:
df.head()

### Data preprocessing

#### Load teencode dictionary

In [None]:
def load_teencode_dict(file_path):
    """
    Download the teencode dictionary and return it as parsed JSON.

    Args:
        file_path: URL of the JSON file; it is fetched with `requests.get`.

    Returns:
        The decoded JSON payload of the response.

    Raises:
        requests.HTTPError: if the server replies with an error status.
        requests.Timeout: if no response arrives within 30 seconds.
    """
    response = requests.get(file_path, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    response.raise_for_status()
    return response.json()

# Load the dictionary
teencode_dict = load_teencode_dict('https://github.com/patuaans/vietnamese-sentiment-analysis/blob/main/teencode.json?raw=true')
print(teencode_dict)

In [None]:
def replace_teencode(text, teencode_dict):
    """
    Swap every token that has an entry in `teencode_dict` for its mapped value.

    Args:
        text: Sentence to normalize; it is split with `word_tokenize`.
        teencode_dict: Mapping from a token to its replacement.

    Returns:
        The tokens joined by single spaces; tokens without an entry are kept.
    """
    return ' '.join(teencode_dict.get(token, token) for token in word_tokenize(text))

replace_teencode('ctrai khôg bme', teencode_dict)

#### Convert emoticons to emoji

In [None]:
# Regex patterns for text emoticons mapped to the emoji (or single punctuation
# mark) that replaces them. Patterns are applied in insertion order.
emoticon_to_emoji = {
    r'\?{1,}': '?',
    r'\!{1,}': '!',
    r'=\]{1,}': '😊',     # =] or =]] or =]]] etc.
    r':\){1,}': '😊',     # :) or :)) or :))) etc.
    r'=D': '😃',
    r'=d': '😃',
    r'<\s?3': '❤️',       # <3 as well as the spaced form < 3
    r'=\){1,}': '😂',     # =) or =)) or =))) etc.
    r':\({1,}': '😢',     # :( or :(( or :((( etc.
    r':-\({1,}': '😢',    # :-( or :-(( etc.
    r';\){1,}': '😉',     # ;) or ;)) etc.
    r':D': '😄',
    r':d': '😄',
    r':P': '😜',
    r':p': '😜',
    r'T_T': '😭',
    r't_t': '😭',
    r'-_-': '😑',
    r'@@': '😣',
    r'XD': '😂',
    r'xd': '😂',
    r':v': '😚',
    r':3': '😺',
    r'\^[-_]\^': '😊',    # ^-^ and ^_^
}

def convert_emoticons_to_emoji(text):
    """
    Replace text emoticons in `text` with their emoji equivalents.

    Every pattern of `emoticon_to_emoji` is applied with `re.sub`, one after
    the other, so runs such as `:))))` or `???` collapse to a single symbol.

    Args:
        text: Input string.

    Returns:
        The string with all matched emoticons substituted.
    """
    for emoticon_pattern, emoji_char in emoticon_to_emoji.items():
        text = re.sub(emoticon_pattern, emoji_char, text)
    return text

# Test the function
test_sentences = [
    "Cảm ơn bạn nhé :))))",
    "nay buồn :((((",
    "Vui quá =))))",
    "Thích quá :D",
    "Huhu buồn lắm T_T",
    "Cười xỉu luôn =D",
    "Hihi dễ thương quá ^-^",
]

for sentence in test_sentences:
    converted = convert_emoticons_to_emoji(sentence)
    print(f"Original: {sentence}")
    print(f"Converted: {converted}\n")

#### Preprocessing Function

In [None]:
def clean_hashtags(text):
    """
    Strip hashtag markup from a sentence.

    A run of hashtags closing the sentence is deleted entirely; hashtags
    elsewhere keep their word and only lose the leading `#`.

    Args:
        text: Input string.

    Returns:
        The cleaned string with surrounding whitespace trimmed.
    """
    trailing_tags = re.compile(r'(\s+#[\w-]+)+\s*$')
    inline_tag = re.compile(r'#([\w-]+)')
    without_trailing = trailing_tags.sub('', text).strip()
    return inline_tag.sub(r'\1', without_trailing).strip()

clean_hashtags('cccc #aaaa ddd #bbbb')

In [None]:
# def filter_non_vietnamese(text):
#   try:
#     lang = detect(text)
#   except LangDetectException:
#     lang = 'unknown'
#   return text if lang == 'vi' else ''

# filter_non_vietnamese('run có ☺️')

In [None]:
def remove_short_words(text, min_len=2):
    """
    Drop whitespace-separated words shorter than `min_len` characters.

    Args:
        text: Input string.
        min_len: Minimum number of characters a word needs to be kept.

    Returns:
        The surviving words joined by single spaces.
    """
    return ' '.join(token for token in text.split() if not len(token) < min_len)

remove_short_words('c')

In [None]:
def preprocess_text(text):
    """
    Clean one raw comment and return its normalized form.

    Steps, in order: NFC normalization, lowercasing, removal of URLs and
    mentions, hashtag clean-up, emoticon -> emoji -> text description,
    replacement of characters outside the allowed set, teencode replacement,
    collapsing of repeated words and repeated letters, and `text_normalize`.

    Args:
        text: Raw comment string.

    Returns:
        The value returned by `text_normalize` for the cleaned text.
    """
    # Convert to precomposed (NFC) Unicode so the character whitelist below
    # sees each accented letter as a single code point
    text = unicodedata.normalize('NFC', text)
    # Convert to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'https?://\S+', '', text)
    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    # Remove hashtag
    text = clean_hashtags(text)
    # Convert emoticons to emojis
    text = convert_emoticons_to_emoji(text)
    # Convert emojis to text descriptions
    text = emoji.demojize(text)
    # Strip any emoji that is still present
    text = demoji.replace(text, '')
    # Replace special characters with a space; digits, underscores, whitespace
    # and Vietnamese letters are kept. The class has to list every accented
    # lowercase letter, including `ỵ`, otherwise words containing it are split.
    text = re.sub(r"[^0-9a-zA-Záàảãạăắằẳẵặâấầẩẫậđéèẻẽẹêếềểễệíìỉĩịóòỏõọôốồổộỗơớờởợỡúùủũụưứừửữựýỳỷỹỵ_\s]", " ", text)
    # Replace teencode with standard words
    text = replace_teencode(text, teencode_dict)
    # Collapse repeated emoji descriptions
    text = re.sub(r"\b(\w+)( \1\b)+", r"\1", text)
    # Collapse repeated characters ("vuiiii" -> "vui"). Digits are excluded so
    # that numbers keep their value ("100" must not turn into "10").
    text = re.sub(r'([^\W\d])\1+', r'\1', text)
    # Text normalize
    normalized_words = text_normalize(text)
    return normalized_words

comment = '   ctrai Check    this out! https://example.com @user123 #amazing Cảm ơn bạn! ☺️☺️   '
preprocess_text(comment)

In [None]:
# Apply preprocessing to the 'Sentence' column
df['text_clean'] = df['Sentence'].apply(preprocess_text)

In [None]:
df.info()

In [None]:
df['text_clean'].duplicated().sum()

In [None]:
df.drop_duplicates('text_clean', inplace=True)

In [None]:
df.info()

In [None]:
df.head()

#### Comments length analysis

In [None]:
df['text_len'] = [len(text.split()) for text in df.text_clean]

In [None]:
plt.figure(figsize=(7,5))
ax = sns.countplot(x='text_len', data=df[df['text_len']<10], palette='mako')
plt.title('Count of comments with less than 10 words', fontsize=20)
plt.yticks([])
for container in ax.containers:
    ax.bar_label(container)
plt.ylabel('count')
plt.xlabel('')
plt.show()

In [None]:
df.sort_values(by=['text_len'], ascending=False)

## Part 2: Classical models with different methods

In [None]:
train_valid_df, test_df = train_test_split(df[['text_clean', 'Emotion']], test_size=0.2, random_state=42)
train_df, valid_df = train_test_split(train_valid_df, test_size=0.1, random_state=42)

### Classical Model

In [None]:
def classical_model(train_df, test_df , bow=False, TFIDF=False, Ngram=False,
                    model=None):
    """
    Fit a classical classifier on vectorized text and print its test metrics.

    Exactly one feature extractor is chosen, checked in this order:
    `bow` (word counts), `TFIDF`, then `Ngram` (word counts over n-grams).

    Args:
        train_df: Frame with `text_clean` and `Emotion` columns used for fitting.
        test_df: Frame with the same columns used for evaluation.
        bow: Use a `CountVectorizer`.
        TFIDF: Use a `TfidfVectorizer`.
        Ngram: `ngram_range` tuple for a `CountVectorizer`, e.g. `(1, 2)`.
        model: Estimator exposing `fit` / `predict`. When omitted, a fresh
            `LogisticRegression(solver='liblinear')` is created for this call.

    Raises:
        ValueError: if none of `bow`, `TFIDF` or `Ngram` is set.
    """
    # Build the default estimator per call; a default created in the signature
    # would be one shared instance that every call refits.
    if model is None:
        model = linear_model.LogisticRegression(solver='liblinear')

    if bow:
        vectorizer = CountVectorizer(tokenizer=word_tokenize, token_pattern=None)
    elif TFIDF:
        vectorizer = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)
    elif Ngram:
        vectorizer = CountVectorizer(tokenizer=word_tokenize, token_pattern=None, ngram_range=Ngram)
    else:
        raise ValueError("Select a feature extractor: bow=True, TFIDF=True or Ngram=(min_n, max_n)")

    # The vocabulary is learned on the training split only.
    xtrain = vectorizer.fit_transform(train_df.text_clean)
    xtest = vectorizer.transform(test_df.text_clean)
    model.fit(xtrain, train_df.Emotion)
    preds = model.predict(xtest)
    macro_precision = precision_score(test_df.Emotion, preds, average='macro')
    macro_recall = recall_score(test_df.Emotion, preds, average='macro')
    print('precision score:', macro_precision)
    print('recall score:', macro_recall)
    print("========================================================")

    print(classification_report(test_df.Emotion, preds))

#### Logistic Regression

In [None]:
# BoW
classical_model(train_df, test_df , bow =True,model=linear_model.LogisticRegression(solver = 'liblinear'))

In [None]:
# TF-IDF
classical_model(train_df, test_df , TFIDF=True,model=linear_model.LogisticRegression(solver = 'liblinear'))

In [None]:
# N-gram
classical_model(train_df, test_df , Ngram=(1,2),model=linear_model.LogisticRegression(solver = 'liblinear'))

#### Naive Bayes

In [None]:
# BoW
classical_model(train_df, test_df ,bow = True,model = MultinomialNB()) # multiclassification

In [None]:
# TF-IDF
classical_model(train_df, test_df, model= MultinomialNB(),TFIDF=True)

In [None]:
# N-gram
classical_model(train_df, test_df, model = MultinomialNB(), Ngram=(1,2))

#### Decision Tree

In [None]:
# BoW
classical_model(train_df, test_df, bow = True, model = DecisionTreeClassifier())

In [None]:
# TF-IDF
classical_model(train_df, test_df, TFIDF=True, model = DecisionTreeClassifier())

In [None]:
# N-gram
classical_model(train_df, test_df, Ngram=(1,2), model = DecisionTreeClassifier())

#### Random Forest

In [None]:
# BoW
classical_model(train_df, test_df, bow = True, model = RandomForestClassifier())

In [None]:
# TF-IDF
classical_model(train_df, test_df, TFIDF=True, model = RandomForestClassifier())

In [None]:
# N-gram
classical_model(train_df, test_df, Ngram=(1,2), model = RandomForestClassifier())

#### SVM

In [None]:
# BoW
classical_model(train_df, test_df, bow = True, model = SVC(kernel='linear'))

In [None]:
# TF-IDF
classical_model(train_df, test_df, TFIDF=True, model = SVC(kernel='linear'))

In [None]:
# N-gram
classical_model(train_df, test_df, Ngram=(1,2), model = SVC(kernel='linear'))

## Part 3: Deep Learning models with Doc2Vec method

#### 1. Without a pretrained model

##### Doc2Vec with Neural Network

In [None]:
# Use a larger vector size and more epochs for Doc2Vec
def train_doc2vec(tagged_data, vector_size=300, window=5, epochs=50):
    """
    Build and train a Doc2Vec model on `tagged_data`.

    Args:
        tagged_data: Corpus handed to `build_vocab` and `train`.
        vector_size: Dimensionality of the document vectors.
        window: Context window size.
        epochs: Number of training epochs.

    Returns:
        The trained `Doc2Vec` model.
    """
    d2v = Doc2Vec(
        vector_size=vector_size,
        window=window,
        min_count=2,
        workers=4,
        epochs=epochs,
    )
    d2v.build_vocab(tagged_data)
    d2v.train(tagged_data, total_examples=d2v.corpus_count, epochs=d2v.epochs)
    return d2v

# NOTE(review): `tagged_data` and `get_vectors` were used in this cell without
# being defined in any earlier cell, so the notebook failed on a fresh kernel.
# The definitions below are a reconstruction — confirm they match the intent.
def get_vectors(model, texts):
    """
    Infer one Doc2Vec vector per document.

    Args:
        model: Trained Doc2Vec model.
        texts: Iterable of cleaned sentences.

    Returns:
        Array with one inferred vector per input text.
    """
    return np.array([model.infer_vector(word_tokenize(text)) for text in texts])

# Tag each training sentence with its position; only the training split is
# used so the embedding model never sees validation or test text.
tagged_data = [
    TaggedDocument(words=word_tokenize(text), tags=[idx])
    for idx, text in enumerate(train_df['text_clean'])
]

# Train the Doc2Vec model with the parameters defined above
model_d2v = train_doc2vec(tagged_data)

# Get vectors for the train, validation and test sets
train_vectors = get_vectors(model_d2v, train_df['text_clean'])
valid_vectors = get_vectors(model_d2v, valid_df['text_clean'])
test_vectors = get_vectors(model_d2v, test_df['text_clean'])

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

def build_nn_model(input_dim, num_labels):
    """
    Create and compile a feed-forward classifier with Dropout regularization.

    Args:
        input_dim: Length of each input feature vector.
        num_labels: Number of output classes.

    Returns:
        A compiled `Sequential` model (Adam optimizer, sparse categorical
        cross-entropy loss, accuracy metric).
    """
    layer_stack = [
        Dense(512, activation='relu', input_shape=(input_dim,)),  # wide first hidden layer
        Dropout(0.5),                                             # regularize the first hidden layer
        Dense(256, activation='relu'),                            # second hidden layer
        Dropout(0.5),                                             # regularize the second hidden layer
        Dense(num_labels, activation='softmax'),                  # output layer
    ]
    classifier = Sequential()
    for layer in layer_stack:
        classifier.add(layer)
    classifier.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return classifier


In [None]:
# Build the NN model; the input width is read from the document vectors so it
# always matches the Doc2Vec vector size instead of a hard-coded number.
model_nn = build_nn_model(input_dim=np.asarray(train_vectors).shape[1], num_labels=num_labels)

# Stop early when the validation loss stalls, and halve the learning rate when
# the validation accuracy plateaus
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.5, patience=3, min_lr=1e-6)

# Train the model with the callbacks
history = model_nn.fit(
    train_vectors, train_df['Emotion'].values,
    epochs=25, batch_size=64,
    validation_data=(valid_vectors, valid_df['Emotion'].values),
    callbacks=[early_stopping, reduce_lr]
)

# Evaluate the model on the test set
def evaluate_nn_model(model_nn, test_vectors, test_labels):
    """
    Print a classification report for the network's test-set predictions.

    Args:
        model_nn: Trained model exposing `predict`.
        test_vectors: Feature vectors of the test samples.
        test_labels: True class ids of the test samples.
    """
    class_scores = model_nn.predict(test_vectors)
    # The predicted class is the index of the highest score in each row.
    predicted_labels = np.argmax(class_scores, axis=1)
    print("Doc2Vec + Neural Network Classification Report:")
    print(classification_report(test_labels, predicted_labels))

# Đánh giá mô hình
evaluate_nn_model(model_nn, test_vectors, test_df['Emotion'].values)

### BERT

In [None]:
# Convert the cleaned pandas frame into a `Dataset`, dropping the helper
# columns that are not needed for fine-tuning.
# NOTE(review): `Dataset` is not imported in any visible cell — presumably
# `datasets.Dataset`; confirm and add the import to the top import cell.
# NOTE(review): this rebinds `df` to a different kind of object, so re-running
# this cell (or any earlier pandas cell afterwards) will likely fail.
df = Dataset.from_pandas(df).remove_columns(["text_len", "Sentence", "__index_level_0__"])

In [None]:
# First split: 80% train + validation, 20% test
# NOTE(review): despite the `_df` suffix, `train_valid_df` and `train_df` hold
# the result of `train_test_split` (indexed by "train"/"test"), not DataFrames;
# `train_df` also shadows the pandas split created in Part 2.
train_valid_df = df.train_test_split(test_size=0.2, seed=42)

# Second split: carve 10% of the training portion out as the validation set
train_df = train_valid_df["train"].train_test_split(test_size=0.1, seed=42)

# Final train, validation, and test sets used by the transformer models
# NOTE(review): presumably these rows differ from the Part 2 splits, so the
# scores are not measured on the same test set — confirm if that matters.
train_dataset = train_df["train"]
valid_dataset = train_df["test"]
test_dataset = train_valid_df["test"]

In [None]:
# Multilingual BERT checkpoint plus the label maps stored in its config
bert_check_point = "google-bert/bert-base-multilingual-uncased"
label2id = {'Disgust': 0, 'Enjoyment': 1 , 'Anger': 2, 'Surprise': 3, 'Sadness': 4, 'Fear': 5, 'Other': 6}
# Inverse mapping (class id -> emotion name), derived from `label2id`
id2label = {class_id: emotion for emotion, class_id in label2id.items()}

bert_tokenizer = AutoTokenizer.from_pretrained(bert_check_point)
bert_model = AutoModelForSequenceClassification.from_pretrained(
    bert_check_point,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)
bert_model.to(device)

In [None]:
def preprocess_data(dataset, tokenizer, label_column="Emotion"):
    """
    Tokenize the `text_clean` column of `dataset` and attach the targets.

    Args:
        dataset: Object exposing `map(function, batched=True)` whose batches
            contain a `text_clean` column.
        tokenizer: Callable applied to the batch of texts with
            `truncation=True`; it must return a mapping.
        label_column: Column holding the class ids. Its values are copied to
            a `labels` entry; pass `None` to skip this.

    Returns:
        Whatever `dataset.map` returns for the tokenized batches.
    """
    def tokenize_function(examples):
        # Tokenize inputs
        model_inputs = tokenizer(
            examples["text_clean"],
            truncation=True,
        )
        # Expose the targets under the name `labels`: Trainer only forwards
        # columns that the model's forward() accepts, so a column called
        # `Emotion` would be dropped and no loss could be computed.
        if label_column is not None and label_column in examples:
            model_inputs["labels"] = examples[label_column]

        return model_inputs

    # Apply tokenization to dataset
    tokenized_data = dataset.map(tokenize_function, batched=True)
    return tokenized_data

In [None]:
tokenized_train_df = preprocess_data(train_dataset, bert_tokenizer)
tokenized_valid_df = preprocess_data(valid_dataset, bert_tokenizer)
tokenized_test_df = preprocess_data(test_dataset, bert_tokenizer)

In [None]:
model_name = bert_check_point.split("/")[-1]

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases; use whichever keyword the installed version accepts so the cell
# works with both.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_kwarg = "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-VSMEC",
    eval_steps=200,               # Adjust based on your dataset size and training speed
    logging_steps=200,            # Log metrics more frequently
    save_steps=200,               # Ensure saving steps is a round multiple of eval_steps
    load_best_model_at_end=True,  # Automatically load the best model found during training
    metric_for_best_model="accuracy", # Choose the metric you consider most important
    greater_is_better=True,
    **{_eval_strategy_kwarg: "steps"},  # Evaluate every `eval_steps` steps
)


In [None]:
def compute_metrics(eval_pred):
    """
    Compute the accuracy of the model's predictions for one evaluation pass.

    Args:
        eval_pred: Pair `(logits, labels)`; `logits` holds one row of class
            scores per sample and `labels` the true class ids.

    Returns:
        Dict with a single `accuracy` entry (fraction of correct predictions).
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Plain numpy accuracy: avoids depending on `accuracy_score`, which is not
    # imported anywhere in this notebook.
    return {"accuracy": float(np.mean(predictions == np.asarray(labels)))}

In [None]:
# Fine-tune multilingual BERT on the tokenized training split, evaluating on
# the validation split with `compute_metrics`.
# NOTE(review): newer transformers releases presumably expect
# `processing_class=` instead of `tokenizer=` — confirm against the installed
# version.
trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=tokenized_train_df,
    eval_dataset=tokenized_valid_df,
    tokenizer=bert_tokenizer,
    compute_metrics=compute_metrics,
    # Stop once the monitored metric has not improved for 3 evaluations
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()

In [None]:
# Load PhoBERT tokenizer and model.
# The label maps defined for the BERT model are passed here as well, so both
# fine-tuned models report emotion names instead of generic LABEL_<n> ids.
phobert_check_point = "vinai/phobert-base"
phobert_tokenizer = AutoTokenizer.from_pretrained(phobert_check_point)
phobert_model = AutoModelForSequenceClassification.from_pretrained(phobert_check_point,
                                                                   num_labels=num_labels,
                                                                   id2label=id2label,
                                                                   label2id=label2id)
phobert_model.to(device)

In [None]:
tokenized_train_df = preprocess_data(train_dataset, phobert_tokenizer)
tokenized_valid_df = preprocess_data(valid_dataset, phobert_tokenizer)
tokenized_test_df = preprocess_data(test_dataset, phobert_tokenizer)

In [None]:
model_name = phobert_check_point.split("/")[-1]

# Reuse the shared hyper-parameters but write checkpoints to a PhoBERT-specific
# directory; `training_args.output_dir` was built from the BERT checkpoint name,
# so using it unchanged would mix both runs in the same folder.
phobert_training_args = copy.copy(training_args)
phobert_training_args.output_dir = f"{model_name}-finetuned-VSMEC"

trainer = Trainer(
    model=phobert_model,
    args=phobert_training_args,
    train_dataset=tokenized_train_df,
    eval_dataset=tokenized_valid_df,
    # Must be the tokenizer that produced the inputs: padding and the saved
    # tokenizer files have to match PhoBERT's vocabulary, not BERT's.
    tokenizer=phobert_tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()