In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import ml_collections
import datasets
import torch
import transformers
from datasets import  Dataset, DatasetDict
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    set_seed,
)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, precision_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import emoji
from wordcloud import WordCloud, STOPWORDS
import re,string, nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.stem.snowball import SnowballStemmer

import warnings
warnings.filterwarnings(action="ignore")

  from .autonotebook import tqdm as notebook_tqdm

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\hung0\AppData\Roaming\Python\Python311\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "C:\Users\hung0\AppData\Roaming\Python\Python311\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "C:\Users\hung0\AppData\Roaming\Python\Python311\site-packages\ipykerne

In [2]:
def model_config():
    cfg_dictionary = {
        "data_path": "../../data/data.csv",
        "model_path": "/kaggle/working/bert_model.h5",
        "model_type": "transformer",

        "test_size": 0.1,
        "validation_size":0.2,
        "train_batch_size": 32,
        "eval_batch_size": 32,

        "epochs": 5,
        "adam_epsilon": 1e-8,
        "lr": 3e-5,
        "num_warmup_steps": 10,

        "max_length": 128,
        "random_seed": 42,
        "num_labels": 3,
        "model_checkpoint":"roberta-base",
    }
    cfg = ml_collections.FrozenConfigDict(cfg_dictionary)

    return cfg
cfg = model_config()



In [8]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import emoji
import nltk

from nltk.stem import WordNetLemmatizer
from wordcloud import STOPWORDS as WC_STOPWORDS


# ==============================
# 1) Tải tài nguyên NLTK
# ==============================
nltk.download("wordnet")
nltk.download("omw-1.4")
lemmatizer = WordNetLemmatizer()

# Stopwords: KHÔNG bỏ từ phủ định
CUSTOM_STOPS = set(WC_STOPWORDS) - {
    "not", "no", "nor", "against", "ain", "aren", "couldn", "didn", "doesn",
    "hadn", "hasn", "haven", "isn", "mightn", "mustn", "needn", "shan",
    "shouldn", "wasn", "weren", "won", "wouldn"
}


# ==============================
# 2) Regex compile trước cho nhanh
# ==============================
RE_HTML = re.compile(r"<.*?>")
RE_URL = re.compile(r"(https?://\S+|www\.\S+)")
RE_MENTION = re.compile(r"@\w+")
RE_HASH = re.compile(r"#\w+")
# Giữ chữ, số, khoảng trắng và ký hiệu tài chính: $ % + - . , /
RE_KEEP = re.compile(r"[^A-Za-z0-9\$\%\+\-\,\./\s]")
RE_MULTIWS = re.compile(r"\s+")

# ==============================
# 3) Tiền xử lý văn bản
# ==============================
def expand_contractions(text: str) -> str:
    """Mở rộng các contractions phổ biến trong tiếng Anh."""
    t = re.sub(r"won\'t", "will not", text)
    t = re.sub(r"can\'t", "can not", t)
    t = re.sub(r"n\'t", " not", t)
    t = re.sub(r"\'re", " are", t)
    t = re.sub(r"\'s", " is", t)
    t = re.sub(r"\'d", " would", t)
    t = re.sub(r"\'ll", " will", t)
    t = re.sub(r"\'t", " not", t)
    t = re.sub(r"\'ve", " have", t)
    t = re.sub(r"\'m", " am", t)
    return t


def preprocess_text_fin(
    text: str,
    *,
    use_lemma: bool = True,
    remove_emoji: bool = False,
    normalize_currency: bool = True,
) -> str:
    """Tiền xử lý văn bản: bỏ HTML/URL/mention/hashtag, mở rộng contractions, chuẩn hóa ký hiệu tiền tệ, loại bỏ ký tự không cần thiết, lemmatize & bỏ stopwords (giữ từ phủ định)."""
    if text is None:
        return ""

    t = str(text).lower()

    # 1) Bỏ HTML, URL, mention/hashtag
    t = RE_HTML.sub(" ", t)
    t = RE_URL.sub(" ", t)
    t = RE_MENTION.sub(" ", t)
    t = RE_HASH.sub(" ", t)

    # 2) Mở rộng contractions
    t = expand_contractions(t)

    # 3) Chuẩn hóa tiền tệ thành token (tùy chọn)
    if normalize_currency:
        t = t.replace("$", " <currency> ")

    # 4) Emoji/Non-ascii (tùy chọn)
    if remove_emoji:
        t = emoji.replace_emoji(t, replace=" ")

    # 5) Lọc ký tự nhưng GIỮ số & % + - . , /
    t = RE_KEEP.sub(" ", t)
    t = RE_MULTIWS.sub(" ", t).strip()

    # 6) Lemmatization + bỏ stopwords (giữ từ phủ định)
    if use_lemma:
        tokens = []
        for w in t.split():
            if w in CUSTOM_STOPS:
                continue
            tokens.append(lemmatizer.lemmatize(w))
        t = " ".join(tokens)

    return t

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hung0\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\hung0\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [6]:
def preprocess_csv(csv_file: str, cfg=None) -> pd.DataFrame:
    df = pd.read_csv(csv_file)

    # Nhãn số -> cột 'labels'
    df["labels"] = LabelEncoder().fit_transform(df["Sentiment"])

    # Loại trùng câu
    df.drop_duplicates(subset=["Sentence"], keep="first", inplace=True)

    # Chọn logic: nếu KHÔNG dùng transformer thì áp dụng preprocess_text_fin
    use_transformer = (cfg is not None and getattr(cfg, "model_type", "") == "transformer")
    if not use_transformer:
        df["Sentence"] = df["Sentence"].astype(str).apply(preprocess_text_fin)

    return df

In [9]:
df = preprocess_csv(cfg.data_path)
X_train, X_test, y_train, y_test = train_test_split(np.array(df["Sentence"]),np.array(df["labels"]), test_size=0.25, random_state=42)

In [10]:
tfidf = TfidfVectorizer(use_idf=True, tokenizer=word_tokenize,min_df=0.00002,max_df=0.70)
X_train_tf = tfidf.fit_transform(X_train.astype('U'))
X_test_tf = tfidf.transform(X_test.astype('U'))

print(f"TF_IDF Model: Train features shape:{X_train_tf.shape} and Test features shape:{X_test_tf.shape}")

TF_IDF Model: Train features shape:(3991, 9933) and Test features shape:(1331, 9933)


In [11]:
rf = RandomForestClassifier(random_state=42)
gb = GradientBoostingClassifier(random_state=42)
ada = AdaBoostClassifier(random_state=42)
lgb = LGBMClassifier(random_state=42)
xgb = XGBClassifier(eval_metric="mlogloss",random_state=42)
dt = DecisionTreeClassifier(random_state=42)
svc = SVC(random_state=42)
nb = MultinomialNB()
mlp = MLPClassifier(random_state=42)

clfs = {
    "Random Forest": rf,
    "Gradient Boosting":gb,
    "AdaBoost": ada,
    "LightGBM": lgb,
    "XGBoost": xgb,
    "Decision Tree":dt,
    "Support Vector Machine":svc,
    "Naive Bayes": nb,
    "Multilayer Perceptron":mlp
}

def fit_model(clf,x_train,y_train,x_test, y_test):
    clf.fit(x_train,y_train)
    y_pred = clf.predict(x_test)
    accuracy = accuracy_score(y_pred, y_test)
    return accuracy

accuracys = []

for name,clf in tqdm(clfs.items()):
    curr_acc = fit_model(clf,X_train_tf,y_train,X_test_tf,y_test)
    accuracys.append(curr_acc)

 33%|███▎      | 3/9 [00:09<00:17,  2.92s/it]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005250 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8620
[LightGBM] [Info] Number of data points in the train set: 3991, number of used features: 425
[LightGBM] [Info] Start training from score -2.235013
[LightGBM] [Info] Start training from score -0.604717
[LightGBM] [Info] Start training from score -1.059064


100%|██████████| 9/9 [01:12<00:00,  8.06s/it]


In [12]:
models_df = pd.DataFrame({"Models":clfs.keys(),"Accuracy Scores":accuracys}).sort_values('Accuracy Scores',ascending=False)
models_df

Unnamed: 0,Models,Accuracy Scores
0,Random Forest,0.733283
4,XGBoost,0.732532
1,Gradient Boosting,0.729527
6,Support Vector Machine,0.728775
3,LightGBM,0.717506
8,Multilayer Perceptron,0.711495
5,Decision Tree,0.691961
7,Naive Bayes,0.676935
2,AdaBoost,0.62284


In [13]:
import numpy as np
import pandas as pd
from sklearn.metrics import (
    accuracy_score, f1_score, classification_report,
    confusion_matrix, roc_auc_score
)
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt
import seaborn as sns

def _maybe_proba_or_decision(clf, X):
    """Trả về scores xác suất/decision nếu có, else None."""
    if hasattr(clf, "predict_proba"):
        return clf.predict_proba(X)
    if hasattr(clf, "decision_function"):
        s = clf.decision_function(X)
        # Decision có thể 1D với binary; chuyển thành 2 cột
        if s.ndim == 1:
            s = np.vstack([-s, s]).T
        return s
    return None

def plot_confusion(y_true, y_pred, labels=None, title="Confusion Matrix"):
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    plt.figure(figsize=(5.2, 4.5))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=labels, yticklabels=labels)
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title(title)
    plt.tight_layout()
    plt.show()

def evaluate_model(name, clf, Xtr, ytr, Xte, yte, label_names=None, show_plot=True):
    """Fit + in ra báo cáo & trả về metrics tổng hợp."""
    clf.fit(Xtr, ytr)
    y_pred = clf.predict(Xte)

    acc = accuracy_score(yte, y_pred)
    f1_macro = f1_score(yte, y_pred, average="macro")
    f1_weighted = f1_score(yte, y_pred, average="weighted")

    print(f"\n=== {name} ===")
    print(f"Accuracy       : {acc:.4f}")
    print(f"F1-macro       : {f1_macro:.4f}")
    print(f"F1-weighted    : {f1_weighted:.4f}")
    print("\nClassification report:")
    print(classification_report(yte, y_pred, target_names=label_names) if label_names is not None
          else classification_report(yte, y_pred))

    # Confusion matrix (plot)
    if show_plot:
        labels = list(range(len(np.unique(yte)))) if label_names is None else label_names
        plot_confusion(yte, y_pred, labels=labels, title=f"Confusion Matrix – {name}")

    # ROC-AUC macro (OVR) nếu có score
    auc_macro = None
    scores = _maybe_proba_or_decision(clf, Xte)
    if scores is not None:
        classes = np.unique(yte)
        Y_true_bin = label_binarize(yte, classes=classes)
        # Nếu scores shape không khớp số lớp, bỏ qua AUC
        if scores.shape[1] == Y_true_bin.shape[1]:
            try:
                auc_macro = roc_auc_score(Y_true_bin, scores, average="macro", multi_class="ovr")
                print(f"ROC-AUC macro  : {auc_macro:.4f}")
            except Exception as e:
                print(f"(Bỏ qua ROC-AUC: {e})")

    return {
        "model": name,
        "accuracy": acc,
        "f1_macro": f1_macro,
        "f1_weighted": f1_weighted,
        "roc_auc_macro": auc_macro
    }

# ==== Chạy đánh giá cho tất cả models trong clfs ====
results = []
label_names = None  # hoặc: ["negative","neutral","positive"] nếu anh có mapping
for name, clf in clfs.items():
    res = evaluate_model(name, clf, X_train_tf, y_train, X_test_tf, y_test,
                         label_names=label_names, show_plot=False)  # đặt True nếu muốn vẽ heatmap từng model
    results.append(res)

results_df = pd.DataFrame(results).sort_values("f1_macro", ascending=False)
print("\n=== Tổng hợp ===")
print(results_df.to_string(index=False))



=== Random Forest ===
Accuracy       : 0.7333
F1-macro       : 0.5810
F1-weighted    : 0.7020

Classification report:
              precision    recall  f1-score   support

           0       0.74      0.14      0.23       165
           1       0.75      0.92      0.83       698
           2       0.70      0.66      0.68       468

    accuracy                           0.73      1331
   macro avg       0.73      0.57      0.58      1331
weighted avg       0.73      0.73      0.70      1331

ROC-AUC macro  : 0.8649

=== Gradient Boosting ===
Accuracy       : 0.7295
F1-macro       : 0.6244
F1-weighted    : 0.7072

Classification report:
              precision    recall  f1-score   support

           0       0.74      0.28      0.41       165
           1       0.72      0.94      0.81       698
           2       0.75      0.58      0.65       468

    accuracy                           0.73      1331
   macro avg       0.74      0.60      0.62      1331
weighted avg       0.73    