In [131]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score, f1_score
)
import numpy as np
import pandas as pd
import sys, os

# ====== YOUR PREPROCESSOR ======
sys.path.append(os.path.abspath("../.."))
from src.preprocessing import preprocess_text_fin


In [132]:
DATA_PATH = "../../data/data.csv"
RANDOM_STATE = 42

# 1) C·∫•u h√¨nh chung (√°p d·ª•ng cho m·ªçi m√¥ h√¨nh)

**L√Ω do thi·∫øt k·∫ø TF-IDF & CV:**

* **`ngram_range=(1,2)`**: c√¢u ng·∫Øn ‚Üí bigram b·∫Øt c·ª•m gi√†u t√≠n hi·ªáu (‚Äúrose sharply‚Äù, ‚Äúbeat estimates‚Äù).
* **`min_df‚àà{3,5,10}`**: l·ªçc t·ª´ qu√° hi·∫øm do t·∫≠p ch·ªâ \~5.8k c√¢u.
* **`max_df‚àà{0.9,0.95}`**: lo·∫°i t·ª´ qu√° ph·ªï bi·∫øn (g·∫ßn stopword).
* **`max_features‚àà{5k,10k,15k}`**: c√¢n b·∫±ng bias/variance v·ªõi c·ª° d·ªØ li·ªáu \~5.8k.
* **`sublinear_tf=True`**: gi·∫£m ‚Äúburstiness‚Äù c·ªßa t·ª´ xu·∫•t hi·ªán nhi·ªÅu.
* **`norm='l2'`**: ph√π h·ª£p linear models tr√™n d·ªØ li·ªáu sparse.

In [133]:
# Kh√¥ng ch·ªânh tokenizer trong grid ƒë·ªÉ tr√°nh l·ªói set_params
BASE_TFIDF = TfidfVectorizer(
    preprocessor=preprocess_text_fin,
    tokenizer=word_tokenize,   # g·∫Øn v·ªõi ti·ªÅn x·ª≠ l√Ω text ti·∫øng Anh
    token_pattern=None,
    use_idf=True
)

tfidf_space = {
    # (1,1) gi·ªØ unigram cho t·ª´ kh√≥a ƒë∆°n; (1,2) th√™m bigram ƒë·ªÉ b·∫Øt c·ª•m gi√†u ng·ªØ nghƒ©a
    # v√¨ c√¢u ng·∫Øn n√™n kh√¥ng d√πng trigram ƒë·ªÉ tr√°nh lo√£ng & overfit.
    "tfidf__ngram_range": [(1,1), (1,2)],

    # L·ªçc t·ª´ qu√° hi·∫øm (typo/nhi·ªÖu) v√¨ data ch·ªâ ~5.8k c√¢u ‚Üí gi·∫£m chi·ªÅu & variance.
    "tfidf__min_df": [3, 5, 10],

    # Lo·∫°i t·ª´ qu√° ph·ªï bi·∫øn (g·∫ßn stopword) nh∆∞ng KH√îNG x√≥a stopwords c·ª©ng
    # ƒë·ªÉ gi·ªØ ph·ªß ƒë·ªãnh / ƒë∆°n v·ªã t√†i ch√≠nh (e.g., "not", "%", "per cent").
    "tfidf__max_df": [0.90, 0.95],

    # C√¢n b·∫±ng gi·ªØa ƒë·ªô ph·ªß t·ª´ v·ª±ng t√†i ch√≠nh v√† nguy c∆° overfit/chi ph√≠ t√≠nh.
    "tfidf__max_features": [5000, 10000, 15000],

    # Gi·∫£m ‚Äúburstiness‚Äù khi m·ªôt t·ª´ l·∫∑p nhi·ªÅu trong c√πng vƒÉn b·∫£n ‚Üí tr·ªçng s·ªë ·ªïn ƒë·ªãnh h∆°n.
    "tfidf__sublinear_tf": [True],

    # L2 gi√∫p linear models ho·∫°t ƒë·ªông t·ªët tr√™n vector sparse, chu·∫©n h√≥a ƒë·ªô d√†i t√†i li·ªáu.
    "tfidf__norm": ["l2"],
}

# 2) Logistic Regression + TF-IDF (baseline m·∫°nh)

**L√Ω do ch·ªçn & tham s·ªë:**

* LR r·∫•t h·ª£p v·ªõi vector TF-IDF high-dimensional; nhanh, ·ªïn ƒë·ªãnh.
* **`C`** ƒëi·ªÅu khi·ªÉn regularization: d·ªØ li·ªáu 5.8k ‚Üí c·∫ßn qu√©t t·ª´ ch·∫∑t ƒë·∫øn l·ªèng.
* **`solver='liblinear'` + `multi_class='ovr'`**: ·ªïn v·ªõi sparse v√† 3 l·ªõp.
* **`class_weight='balanced'`**: b√π l·ªách l·ªõp

In [134]:
from sklearn.linear_model import LogisticRegression

pipe_lr = Pipeline([
    ("tfidf", BASE_TFIDF),
    ("clf", OneVsRestClassifier(
        LogisticRegression(
            penalty="l2",                                    # L2 ·ªïn ƒë·ªãnh tr√™n high-dimensional TF-IDF
            solver="liblinear",                              # Ph√π h·ª£p sparse + multi-class OVR quy m√¥ v·ª´a
            class_weight="balanced",                         # C√¢n l·ªách l·ªõp 54/32/14 theo 1/freq
            max_iter=2000,                                   # ƒê·∫£m b·∫£o h·ªôi t·ª• khi feature nhi·ªÅu (bigram, 10k+)
            random_state=RANDOM_STATE
        ),
        n_jobs=-1
    ))
])


param_grid_lr = {
    **tfidf_space,
    # Qu√©t t·ª´ regularization m·∫°nh ‚Üí y·∫øu ƒë·ªÉ t√¨m ƒëi·ªÉm c√¢n b·∫±ng bias/variance v·ªõi ~5.8k c√¢u
    "clf__estimator__C": [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
}

# 3) Linear SVM (LinearSVC) + TF-IDF (m·∫°nh v·ªõi d·ªØ li·ªáu sparse)

**L√Ω do & tham s·ªë:**

* Linear SVM th∆∞·ªùng cho bi√™n t·ªët tr√™n TF-IDF.
* **`C`** qu√©t r·ªông ƒë·ªÉ t√¨m margin t·ªëi ∆∞u.
* **`loss='squared_hinge'`** l√† chu·∫©n cho LinearSVC.
* **`class_weight='balanced'`** x·ª≠ l√Ω l·ªách.


In [135]:
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
pipe_svm = Pipeline([
    ("tfidf", BASE_TFIDF),
    ("clf", CalibratedClassifierCV(
        estimator=LinearSVC(
            class_weight="balanced",   # b√π l·ªách l·ªõp 54/32/14 theo 1/freq
            max_iter=5000,             # nhi·ªÅu feature ‚Üí c·∫ßn ƒë·ªß v√≤ng ƒë·ªÉ h·ªôi t·ª•
            random_state=42          # th√™m n·∫øu mu·ªën t√°i l·∫≠p
        ),
        cv=3,                          # ‚ö†Ô∏è calibration c≈©ng CV ‚Üí gi·∫£m t·ª´ 5 xu·ªëng 3 ƒë·ªÉ ti·∫øt ki·ªám th·ªùi gian
        method="sigmoid"               # Platt scaling: nhanh, ·ªïn ƒë·ªãnh; ƒë·ªß t·ªët cho 5.8k m·∫´u
    ))
])
param_grid_svm = {
    **tfidf_space,
    # ƒêi·ªÅu ch·ªânh ƒë·ªô r·ªông margin; qu√©t r·ªông ƒë·ªÉ tr√°nh under/over-regularize
    "clf__estimator__C": [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
}

# 4) Multinomial Naive Bayes + TF-IDF (baseline c·ª±c nhanh)

**L√Ω do & tham s·ªë:**

* MNB ph√π h·ª£p x√°c su·∫•t t·ª´; nhanh, nh·∫π ƒë·ªÉ l√†m baseline.
* **`alpha`** (Laplace/Lidstone) r·∫•t quan tr·ªçng v√¨ t·ª´ hi·∫øm xu·∫•t hi·ªán nhi·ªÅu trong text ng·∫Øn ‚Üí qu√©t log-scale.
* **`fit_prior`** b·∫≠t/t·∫Øt ƒë·ªÉ xem t√°c ƒë·ªông c·ªßa ph√¢n b·ªë l·ªõp l·ªách.


In [136]:
from sklearn.naive_bayes import MultinomialNB

pipe_mnb = Pipeline([
    ("tfidf", BASE_TFIDF),
    ("clf", MultinomialNB()) # Baseline r·∫•t nhanh cho text; d√πng x√°c su·∫•t t·ª´

])

param_grid_mnb = {
    **tfidf_space,
    # L√†m m∆∞·ª£t x√°c su·∫•t cho t·ª´ hi·∫øm (r·∫•t ph·ªï bi·∫øn trong c√¢u ng·∫Øn) ‚Üí qu√©t log-scale
    "clf__alpha": [0.01, 0.05, 0.1, 0.5, 1.0],
    # Th·ª≠ d√πng/kh√¥ng d√πng prior v√¨ d·ªØ li·ªáu l·ªách l·ªõp; xem prior c√≥ gi√∫p ·ªïn ƒë·ªãnh hay l√†m thi√™n l·ªách
    "clf__fit_prior": [True, False],
}

# 5) Random Forest + TF-IDF

**L√Ω do & tham s·ªë:**

* RF kh√¥ng l√Ω t∆∞·ªüng n·∫øu **ch·ªâ** TF-IDF (qu√° high-dim). Tuy nhi√™n, n·∫øu sau n√†y anh **g·ªôp th√™m feature th·ªëng k√™** (ƒë·ªô d√†i c√¢u, s·ªë ti·ªÅn, c√≥/kh√¥ng d·∫•u %) th√¨ RF ph√°t huy.
* D√π v·∫≠y, ƒë·ªÉ ho√†n ch·ªânh benchmark, ta v·∫´n cho grid g·ªçn, c√≥ **`class_weight='balanced_subsample'`** ƒë·ªÉ b√π l·ªách.


In [137]:
from sklearn.ensemble import RandomForestClassifier

pipe_rf = Pipeline([
    ("tfidf", BASE_TFIDF),
    ("clf", RandomForestClassifier(
        n_jobs=-1,
        class_weight="balanced_subsample",
        random_state=42
    ))
])

param_grid_rf = {
    **tfidf_space,
}

## 6) Gradient Boosting + TF-IDF (boosting c√¢y nh·∫π)

**L√Ω do & tham s·ªë:**

* GBC th∆∞·ªùng k√©m thu·∫≠n l·ª£i h∆°n linear models tr√™n TF-IDF thu·∫ßn, nh∆∞ng ƒë√°ng th·ª≠ cho benchmark; n·∫øu th√™m feature th·ªëng k√™/kh√°c mi·ªÅn, GBC c·∫£i thi·ªán r√µ.

* Kh√¥ng c√≥ class_weight, ta regularize b·∫±ng subsample + min_samples_leaf ƒë·ªÉ b·ªõt overfit l·ªõp l·ªõn.

In [138]:
from sklearn.ensemble import GradientBoostingClassifier

pipe_gb = Pipeline([
    ("tfidf", BASE_TFIDF),
    ("clf", GradientBoostingClassifier(
        random_state=RANDOM_STATE
    ))
])

param_grid_gb = {
    **tfidf_space,
    # C·∫∑p (learning_rate, n_estimators) ƒëi·ªÅu khi·ªÉn bias/variance:
    # lr nh·ªè + nhi·ªÅu c√¢y ‚Üí t·ªïng qu√°t h∆°n; lr l·ªõn + √≠t c√¢y ‚Üí nhanh nh∆∞ng d·ªÖ overfit.
    # "clf__n_estimators": [200, 400],
    # "clf__learning_rate": [0.05, 0.1],
    # # C√¢y n√¥ng (2‚Äì3) ph√π h·ª£p d·ªØ li·ªáu nhi·ªÖu/sparse, tr√°nh h·ªçc quan h·ªá gi·∫£
    # "clf__max_depth": [2, 3],
    # # Stochastic boosting (subsample<1) gi·∫£m variance & c·∫£i thi·ªán t·ªïng qu√°t h√≥a
    # "clf__subsample": [0.8, 1.0],
    # # Tr√°nh l√° qu√° nh·ªè d·ªÖ b√°m nhi·ªÖu c·ªßa l·ªõp l·ªõn
    # "clf__min_samples_leaf": [1, 2],
    # # Gi·ªõi h·∫°n s·ªë feature m·ªói split ƒë·ªÉ b·ªõt nhi·ªÖu khi high-dim TF-IDF
    # "clf__max_features": [None, "sqrt"],
}

# 5) GridSearchCV

In [139]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
def run_gs(pipe, grid, X_train, y_train):
    gs = GridSearchCV(
        estimator=pipe,
        param_grid=grid,
        scoring="f1_macro",
        cv=cv,
        n_jobs=-1,
        verbose=1
    )
    gs.fit(X_train, y_train)
    return gs

# 6) Preprocessing 

In [140]:
def preprocess_csv(csv_file: str) -> pd.DataFrame:
    df = pd.read_csv(csv_file)
    df["labels"] = LabelEncoder().fit_transform(df["Sentiment"])
    df.drop_duplicates(subset=["Sentence"], keep="first", inplace=True)
    # df["Sentence"] = df["Sentence"].astype(str).apply(preprocess_text_fin)
    return df

In [141]:
df = preprocess_csv(DATA_PATH)
X_train, X_test, y_train, y_test = train_test_split(
    np.array(df["Sentence"]),
    np.array(df["labels"]), 
    test_size=0.25, 
    random_state=42
    )

In [142]:

gs_lr = run_gs(pipe_lr, param_grid_lr, X_train, y_train)

Fitting 5 folds for each of 252 candidates, totalling 1260 fits


In [143]:
gs_svm = run_gs(pipe_svm, param_grid_svm, X_train, y_train)

Fitting 5 folds for each of 252 candidates, totalling 1260 fits


In [144]:
gs_mnb = run_gs(pipe_mnb, param_grid_mnb, X_train, y_train)

Fitting 5 folds for each of 360 candidates, totalling 1800 fits


In [145]:
gs_rf = run_gs(pipe_rf, param_grid_rf, X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [146]:
gs_gb = run_gs(pipe_gb, param_grid_gb, X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [147]:
import numpy as np
import pandas as pd
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    classification_report, confusion_matrix, roc_auc_score
)
from sklearn.utils.multiclass import unique_labels

# 1) T√≥m t·∫Øt k·∫øt qu·∫£ Cross-Validation (best) c·ªßa nhi·ªÅu GridSearch
def summarize_cv_results(gridsearch_dict):
    rows = []
    for name, gs in gridsearch_dict.items():
        rows.append({
            "model": name,
            "cv_best_f1_macro": gs.best_score_,
            "best_params": gs.best_params_
        })
    df = pd.DataFrame(rows).sort_values("cv_best_f1_macro", ascending=False)
    print("\n=== CV Best Summary (sorted by f1_macro) ===")
    print(df.to_string(index=False))
    return df

# 2) ƒê√°nh gi√° chi ti·∫øt tr√™n t·∫≠p test cho 1 GridSearch ƒë√£ fit
def evaluate_on_test(gs, X_test, y_test, model_name="Model"):
    best_est = gs.best_estimator_
    y_pred   = best_est.predict(X_test)

    # Th·ª≠ l·∫•y score cho ROC-AUC macro (∆∞u ti√™n predict_proba, fallback decision_function)
    y_score = None
    has_proba = hasattr(best_est, "predict_proba")
    has_decision = hasattr(best_est, "decision_function")
    if has_proba:
        try:
            y_score = best_est.predict_proba(X_test)
        except Exception:
            y_score = None
    if (y_score is None) and has_decision:
        try:
            y_score = best_est.decision_function(X_test)
        except Exception:
            y_score = None

    # Metrics c∆° b·∫£n
    acc   = accuracy_score(y_test, y_pred)
    f1m   = f1_score(y_test, y_pred, average="macro")
    precm = precision_score(y_test, y_pred, average="macro", zero_division=0)
    recm  = recall_score(y_test, y_pred, average="macro", zero_division=0)

    # ROC-AUC macro (n·∫øu c√≥ y_score nhi·ªÅu l·ªõp)
    rocauc = None
    if y_score is not None:
        try:
            rocauc = roc_auc_score(y_test, y_score, multi_class="ovo", average="macro")
        except Exception:
            rocauc = None

    # In t·ªïng quan
    print(f"\n================= {model_name} =================")
    print("Best params:", gs.best_params_)
    print(f"CV best f1_macro: {gs.best_score_:.4f}")
    print(f"Test Accuracy   : {acc:.4f}")
    print(f"Test F1-macro   : {f1m:.4f}")
    print(f"Test Precision-m: {precm:.4f}")
    print(f"Test Recall-m   : {recm:.4f}")
    if rocauc is not None:
        print(f"Test ROC-AUC-m  : {rocauc:.4f}")

    # Classification report
    print("\nClassification report:")
    print(classification_report(y_test, y_pred, digits=4, zero_division=0))

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    labels = unique_labels(y_test, y_pred)
    print("Confusion matrix (rows=true, cols=pred):")
    print(pd.DataFrame(cm, index=[f"true_{l}" for l in labels],
                          columns=[f"pred_{l}" for l in labels]).to_string())
    return {
        "model": model_name,
        "cv_best_f1_macro": gs.best_score_,
        "test_accuracy": acc,
        "test_f1_macro": f1m,
        "test_precision_macro": precm,
        "test_recall_macro": recm,
        "test_roc_auc_macro": rocauc,
        "best_params": gs.best_params_
    }

# 3) Ch·∫°y ƒë√°nh gi√° cho t·∫•t c·∫£ m√¥ h√¨nh + b·∫£ng t·ªïng h·ª£p test
def evaluate_all(gs_dict, X_test, y_test):
    _ = summarize_cv_results(gs_dict)
    test_rows = []
    for name, gs in gs_dict.items():
        res = evaluate_on_test(gs, X_test, y_test, model_name=name)
        test_rows.append(res)
    test_df = pd.DataFrame(test_rows).sort_values("test_f1_macro", ascending=False)
    print("\n=== Test Summary (sorted by f1_macro) ===")
    print(test_df[["model","cv_best_f1_macro","test_f1_macro","test_accuracy","test_precision_macro","test_recall_macro","test_roc_auc_macro"]].to_string(index=False))
    return test_df


## ‚úÖ K·∫øt lu·∫≠n t·ªïng quan

* **M√¥ h√¨nh t·ªët nh·∫•t**: **Logistic Regression (LR)**

  * **CV F1-macro**: 0.6517 ‚Üí **Test F1-macro**: **0.6943** (cao nh·∫•t)
  * **Test Accuracy**: 0.7423 | **ROC-AUC macro**: 0.8774 (kh·∫£ nƒÉng x·∫øp h·∫°ng t·ªët ‚Üí t·ªëi ∆∞u ng∆∞·ª°ng c√≤n d∆∞ ƒë·ªãa)
* **Th·ª© h·∫°ng ti·∫øp theo**: **MultinomialNB** (0.6581) ‚âà **LinearSVM** (0.6488) > **RandomForest** (0.6209) ‚âà **GradBoost** (0.6176)

## üìå Di·ªÖn gi·∫£i nhanh theo l·ªõp

* **LR** c√¢n b·∫±ng t·ªët nh·∫•t gi·ªØa c√°c l·ªõp; **l·ªõp 0** v·∫´n l√† ƒëi·ªÉm y·∫øu c·ªßa m·ªçi m√¥ h√¨nh nh∆∞ng LR c√≥ **recall l·ªõp 0 \~0.60** (t·ªët h∆°n SVM/RF/GB).
* **SVM** accuracy t∆∞∆°ng ƒë∆∞∆°ng LR nh∆∞ng **macro-F1 th·∫•p** do **recall l·ªõp 0 s·ª•t (0.315)**.
* **MNB** l√† baseline nhanh v√† kh√° c√¢n b·∫±ng, nh∆∞ng k√©m LR m·ªôt ch√∫t.
* **RF/GBoost** cho accuracy ·ªïn nh∆∞ng **macro-F1 th·∫•p** ‚Äî ƒë·∫∑c tr∆∞ng c·ªßa c√¢y tr√™n TF-IDF sparse.


In [152]:
gs_dict = {
    "Logistic Regression": gs_lr,
    "LinearSVM": gs_svm,
    "MultinomialNB": gs_mnb,
    "RandomForest": gs_rf,
    "GradBoost": gs_gb,
}
test_summary = evaluate_all(gs_dict, X_test, y_test)


=== CV Best Summary (sorted by f1_macro) ===
              model  cv_best_f1_macro                                                                                                                                                                                         best_params
Logistic Regression          0.665786                   {'clf__estimator__C': 2.0, 'tfidf__max_df': 0.9, 'tfidf__max_features': 5000, 'tfidf__min_df': 3, 'tfidf__ngram_range': (1, 2), 'tfidf__norm': 'l2', 'tfidf__sublinear_tf': True}
      MultinomialNB          0.636856 {'clf__alpha': 0.5, 'clf__fit_prior': False, 'tfidf__max_df': 0.9, 'tfidf__max_features': 5000, 'tfidf__min_df': 3, 'tfidf__ngram_range': (1, 1), 'tfidf__norm': 'l2', 'tfidf__sublinear_tf': True}
          LinearSVM          0.617239                   {'clf__estimator__C': 0.5, 'tfidf__max_df': 0.9, 'tfidf__max_features': 5000, 'tfidf__min_df': 3, 'tfidf__ngram_range': (1, 2), 'tfidf__norm': 'l2', 'tfidf__sublinear_tf': True}
       RandomFores

In [153]:
from pathlib import Path
from joblib import dump
import time

# === Th∆∞ m·ª•c l∆∞u ===
SAVE_DIR = Path("../../models")
SAVE_DIR.mkdir(parents=True, exist_ok=True)

def save_selfcontained(name: str, model_or_gs, with_ts: bool = False):
    pipe = model_or_gs.best_estimator_ if hasattr(model_or_gs, "best_estimator_") else model_or_gs
    assert hasattr(pipe, "predict"), "Object ph·∫£i l√† pipeline/estimator ƒë√£ fit."
    suffix = f"_{time.strftime('%Y%m%d-%H%M%S')}" if with_ts else ""
    path = SAVE_DIR / f"selfcontained_{name}{suffix}.joblib"
    dump(pipe, path, compress=3)
    print(f"‚úÖ Saved: {path}")

# === L∆∞u t·∫•t c·∫£ m√¥ h√¨nh ===
save_selfcontained("logreg",        gs_lr)        # Logistic Regression
save_selfcontained("linearsvm_cal", gs_svm)       # LinearSVC ƒë√£ b·ªçc CalibratedClassifierCV
save_selfcontained("mnb",           gs_mnb)       # MultinomialNB
save_selfcontained("rf",            gs_rf)        # RandomForest
save_selfcontained("gboost",        gs_gb)        # GradientBoosting

‚úÖ Saved: ..\..\models\selfcontained_logreg.joblib
‚úÖ Saved: ..\..\models\selfcontained_linearsvm_cal.joblib
‚úÖ Saved: ..\..\models\selfcontained_mnb.joblib
‚úÖ Saved: ..\..\models\selfcontained_rf.joblib
‚úÖ Saved: ..\..\models\selfcontained_gboost.joblib
