# LR

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix
import joblib

In [8]:
# 1) Load data: the pre-split train / validation / test CSV files
train = pd.read_csv("data/train.csv")
valid = pd.read_csv("data/valid.csv")
test = pd.read_csv("data/test.csv")

# 2) Pick columns (handle missing)
# TfidfVectorizer raises on NaN / non-string documents, so missing texts are
# replaced by empty strings and every value is coerced to str up front.
# Labels are passed through unchanged.
X_train = train["text_clean"].fillna("").astype(str)
y_train = train["label"]
X_valid = valid["text_clean"].fillna("").astype(str)
y_valid = valid["label"]
X_test = test["text_clean"].fillna("").astype(str)
y_test = test["label"]

In [19]:
# 3) Pipeline: TF-IDF features -> logistic regression
# Hyperparameters not fixed here (n-grams, min_df/max_df, sublinear_tf, C)
# are left at their defaults so the grid search can set them.
pipe = Pipeline([
    ("tfidf", TfidfVectorizer(
        lowercase=True,
        strip_accents="unicode"
    )),
    ("clf", LogisticRegression(
        solver="saga",
        penalty="l2",
        max_iter=3000,
        n_jobs=-1,
        class_weight="balanced",
        # saga shuffles the training data; pin the seed so repeated runs of
        # the notebook (and every CV fold) give the same coefficients.
        random_state=42
    ))
])

In [20]:
# 4) Param grid
# Vectorizer options and classifier regularisation strength are declared
# separately, then merged into the single mapping GridSearchCV expects.
tfidf_space = {
    "tfidf__ngram_range": [(1, 1), (1, 2)],
    "tfidf__min_df": [1, 2, 5],
    "tfidf__max_df": [0.9, 0.95, 1.0],
    "tfidf__sublinear_tf": [True, False],
}
clf_space = {"clf__C": [0.25, 0.5, 1.0, 2.0, 4.0]}
param_grid = {**tfidf_space, **clf_space}

In [21]:
# 5) GridSearchCV
# Candidates are ranked by macro-F1 (multi-class). Switch scoring to
# "f1_weighted" if the classes are imbalanced and the overall,
# support-weighted score matters more than per-class balance.
search_settings = dict(scoring="f1_macro", cv=5, n_jobs=-1, verbose=2)
grid = GridSearchCV(pipe, param_grid, **search_settings)

# Cross-validated search on the training split only.
grid.fit(X_train, y_train)

best_cv_score = grid.best_score_
best_cv_params = grid.best_params_
print("\nBest CV score (f1_macro):", best_cv_score)
print("Best params:", best_cv_params)

Fitting 5 folds for each of 180 candidates, totalling 900 fits

Best CV score (f1_macro): 0.6756785211529684
Best params: {'clf__C': 0.5, 'tfidf__max_df': 0.9, 'tfidf__min_df': 5, 'tfidf__ngram_range': (1, 2), 'tfidf__sublinear_tf': True}


In [25]:
# 6) Evaluate on valid set
# best_estimator_ is the pipeline GridSearchCV selected during the search.
best_model = grid.best_estimator_
pred = best_model.predict(X_valid)

# Compute every metric first, then report them in one place.
holdout_acc = accuracy_score(y_valid, pred)
holdout_f1 = f1_score(y_valid, pred, average="macro")
holdout_report = classification_report(y_valid, pred)
holdout_cm = confusion_matrix(y_valid, pred)

print("\nHoldout Accuracy:", holdout_acc)
print("Holdout Macro F1:", holdout_f1)
print("\nClassification report:\n", holdout_report)
print("Confusion matrix:\n", holdout_cm)



Holdout Accuracy: 0.6933997509339975
Holdout Macro F1: 0.695131974274536

Classification report:
               precision    recall  f1-score   support

           0       0.68      0.71      0.69      1188
           1       0.63      0.62      0.63      1472
           2       0.77      0.76      0.77      1355

    accuracy                           0.69      4015
   macro avg       0.69      0.70      0.70      4015
weighted avg       0.69      0.69      0.69      4015

Confusion matrix:
 [[ 838  282   68]
 [ 323  910  239]
 [  73  246 1036]]


In [26]:
# 7) Evaluate on test set
# Same selected pipeline as the validation step, scored on the test split.
best_model = grid.best_estimator_
pred = best_model.predict(X_test)

test_metrics = (
    ("\nTest Accuracy:", accuracy_score(y_test, pred)),
    ("Test Macro F1:", f1_score(y_test, pred, average="macro")),
    ("\nClassification report:\n", classification_report(y_test, pred)),
    ("Confusion matrix:\n", confusion_matrix(y_test, pred)),
)
for caption, value in test_metrics:
    print(caption, value)



Test Accuracy: 0.6800298804780877
Test Macro F1: 0.6819684140376904

Classification report:
               precision    recall  f1-score   support

           0       0.68      0.67      0.68      1188
           1       0.61      0.62      0.62      1473
           2       0.76      0.75      0.75      1355

    accuracy                           0.68      4016
   macro avg       0.68      0.68      0.68      4016
weighted avg       0.68      0.68      0.68      4016

Confusion matrix:
 [[ 801  311   76]
 [ 309  914  250]
 [  72  267 1016]]


In [27]:
# 1) Load data: the pre-split train / validation / test CSV files
train = pd.read_csv("data/train.csv")
valid = pd.read_csv("data/valid.csv")
test = pd.read_csv("data/test.csv")

# 2) Pick columns (handle missing)
# TfidfVectorizer raises on NaN / non-string documents, so missing texts are
# replaced by empty strings and every value is coerced to str up front.
# Labels are passed through unchanged.
X_train = train["text_clean"].fillna("").astype(str)
y_train = train["label"]
X_valid = valid["text_clean"].fillna("").astype(str)
y_valid = valid["label"]
X_test = test["text_clean"].fillna("").astype(str)
y_test = test["label"]

# 4) Pipeline: TF-IDF -> Logistic Regression (multiclass)
# Fixed-hyperparameter baseline (no search): uni+bigram TF-IDF with
# sublinear term frequency, feeding a default-regularised classifier.
tfidf_step = TfidfVectorizer(
    lowercase=True,
    strip_accents="unicode",
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    sublinear_tf=True,
)
logreg_step = LogisticRegression(max_iter=2000, n_jobs=-1)

model = Pipeline(steps=[("tfidf", tfidf_step), ("clf", logreg_step)])

# 5) Train
model.fit(X_train, y_train)

# 6) Evaluate
# Score the fitted baseline on the validation split.
pred = model.predict(X_valid)
baseline_metrics = (
    ("Accuracy:", accuracy_score(y_valid, pred)),
    ("Macro F1:", f1_score(y_valid, pred, average="macro")),
    ("\nClassification report:\n", classification_report(y_valid, pred)),
    ("Confusion matrix:\n", confusion_matrix(y_valid, pred)),
)
for caption, value in baseline_metrics:
    print(caption, value)

# 7) Save model
# The whole fitted pipeline (vectorizer + classifier) is written as one file.
model_path = "logreg_tfidf.joblib"
joblib.dump(model, model_path)
print("Saved to logreg_tfidf.joblib")


Accuracy: 0.6861768368617683
Macro F1: 0.6879467874219563

Classification report:
               precision    recall  f1-score   support

           0       0.70      0.64      0.67      1188
           1       0.61      0.65      0.63      1472
           2       0.77      0.76      0.76      1355

    accuracy                           0.69      4015
   macro avg       0.69      0.69      0.69      4015
weighted avg       0.69      0.69      0.69      4015

Confusion matrix:
 [[ 765  353   70]
 [ 272  959  241]
 [  55  269 1031]]
Saved to logreg_tfidf.joblib


# SVM

In [12]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# =========================
# Load data
# =========================
train = pd.read_csv("data/train.csv")
valid = pd.read_csv("data/valid.csv")
test  = pd.read_csv("data/test.csv")

# TfidfVectorizer raises on NaN / non-string documents, so missing texts are
# replaced by empty strings and every value is coerced to str up front.
# Labels are passed through unchanged.
X_train = train["text_clean"].fillna("").astype(str)
y_train = train["label"]
X_valid = valid["text_clean"].fillna("").astype(str)
y_valid = valid["label"]
X_test  = test["text_clean"].fillna("").astype(str)
y_test  = test["label"]

# =========================
# Pipeline: TF-IDF + Linear SVM
# =========================
# Hyperparameters not fixed here (n-grams, min_df/max_df, sublinear_tf, C)
# are left at their defaults so the grid search can set them.
pipe = Pipeline([
    ("tfidf", TfidfVectorizer(
        lowercase=True,
        strip_accents="unicode"
    )),
    ("clf", LinearSVC(
        class_weight="balanced",
        max_iter=5000,
        # The dual solver shuffles the data; pin the seed so repeated runs of
        # the notebook (and every CV fold) are reproducible.
        random_state=42
    ))
])

# =========================
# Param grid (mirrors the LR grid)
# =========================
# Built key by key; the insertion order matches the LR search space.
param_grid = {}
param_grid["tfidf__ngram_range"] = [(1, 1), (1, 2)]
param_grid["tfidf__min_df"] = [1, 2, 5]
param_grid["tfidf__max_df"] = [0.9, 0.95, 1.0]
param_grid["tfidf__sublinear_tf"] = [True, False]
param_grid["clf__C"] = [0.25, 0.5, 1.0, 2.0, 4.0]

# =========================
# GridSearch
# =========================
# 5-fold cross-validated search on the training split, ranked by macro-F1.
# fit() returns the search object itself, so build and fit are chained.
grid = GridSearchCV(
    pipe,
    param_grid,
    scoring="f1_macro",
    cv=5,
    n_jobs=-1,
    verbose=2,
).fit(X_train, y_train)

best_cv_score = grid.best_score_
best_cv_params = grid.best_params_
print("Best CV Macro-F1:", best_cv_score)
print("Best params:", best_cv_params)

# =========================
# Evaluate on valid
# =========================
# best_estimator_ is the pipeline GridSearchCV selected during the search.
best_model = grid.best_estimator_
pred = best_model.predict(X_valid)

valid_acc = accuracy_score(y_valid, pred)
valid_f1 = f1_score(y_valid, pred, average="macro")
valid_report = classification_report(y_valid, pred)
valid_cm = confusion_matrix(y_valid, pred)

print("Valid Accuracy:", valid_acc)
print("Valid Macro F1:", valid_f1)
print("\nClassification report:\n", valid_report)
print("Confusion matrix:\n", valid_cm)

# =========================
# Evaluate on test
# =========================
# Reuses best_model from the validation step; only the split changes.
pred = best_model.predict(X_test)

test_metrics = (
    ("Test Accuracy:", accuracy_score(y_test, pred)),
    ("Test Macro F1:", f1_score(y_test, pred, average="macro")),
    ("\nClassification report:\n", classification_report(y_test, pred)),
    ("Confusion matrix:\n", confusion_matrix(y_test, pred)),
)
for caption, value in test_metrics:
    print(caption, value)


Fitting 5 folds for each of 180 candidates, totalling 900 fits
Best CV Macro-F1: 0.6758208266951533
Best params: {'clf__C': 0.25, 'tfidf__max_df': 0.9, 'tfidf__min_df': 1, 'tfidf__ngram_range': (1, 2), 'tfidf__sublinear_tf': True}
Valid Accuracy: 0.687173100871731
Valid Macro F1: 0.6863049492832762

Classification report:
               precision    recall  f1-score   support

           0       0.66      0.71      0.69      1188
           1       0.65      0.56      0.60      1472
           2       0.74      0.80      0.77      1355

    accuracy                           0.69      4015
   macro avg       0.68      0.69      0.69      4015
weighted avg       0.68      0.69      0.68      4015

Confusion matrix:
 [[ 840  259   89]
 [ 344  831  297]
 [  80  187 1088]]
Test Accuracy: 0.6795318725099602
Test Macro F1: 0.6794786076149442

Classification report:
               precision    recall  f1-score   support

           0       0.68      0.70      0.69      1188
           1      

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Fixed-hyperparameter baseline (no search): uni+bigram TF-IDF with sublinear
# term frequency, feeding a linear SVM with default regularisation.
model = Pipeline([
    ("tfidf", TfidfVectorizer(
        lowercase=True,
        strip_accents="unicode",
        ngram_range=(1, 2),
        min_df=2,
        max_df=0.95,
        sublinear_tf=True
    )),
    ("clf", LinearSVC(
        max_iter=5000,
        # The dual solver shuffles the data; pin the seed so the baseline
        # numbers are reproducible across runs.
        random_state=42
    ))
])

# Train the baseline on the training split.
model.fit(X_train, y_train)

# Score it on the validation split.
pred = model.predict(X_valid)
baseline_acc = accuracy_score(y_valid, pred)
baseline_f1 = f1_score(y_valid, pred, average="macro")
baseline_report = classification_report(y_valid, pred)

print("Accuracy:", baseline_acc)
print("Macro F1:", baseline_f1)
print("\nClassification report:\n", baseline_report)


Accuracy: 0.6547945205479452
Macro F1: 0.6558016780969845

Classification report:
               precision    recall  f1-score   support

           0       0.65      0.66      0.65      1188
           1       0.58      0.55      0.57      1472
           2       0.73      0.76      0.75      1355

    accuracy                           0.65      4015
   macro avg       0.65      0.66      0.66      4015
weighted avg       0.65      0.65      0.65      4015



# Roberta-Twitter

In [None]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)

# =========================
# Config
# =========================
# Evaluation data and the pretrained checkpoint that will score it.
TEST_CSV_PATH = "data/test.csv"
task = "sentiment"
MODEL_NAME = f"cardiffnlp/twitter-roberta-base-{task}"

# Tokenizer truncation length and DataLoader batch size.
MAX_LENGTH = 256
BATCH_SIZE = 32

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# =========================
# Dataset
# =========================
class TextDataset(Dataset):
    """Minimal map-style dataset that serves raw texts by position."""

    def __init__(self, texts):
        # Stored as given (no copy, no tokenization).
        self.texts = texts

    def __len__(self):
        """Return how many texts are stored."""
        n_texts = len(self.texts)
        return n_texts

    def __getitem__(self, idx):
        """Return the untokenized text at position ``idx``."""
        text = self.texts[idx]
        return text

def collate_fn(batch_texts, tokenizer):
    """Tokenize one batch of raw texts.

    Args:
        batch_texts: the texts of a single batch, passed to ``tokenizer`` as-is.
        tokenizer: callable invoked with the texts and the encoding options.

    Returns:
        Whatever ``tokenizer`` returns for the batch; "pt" tensors are
        requested, padded within the batch and truncated to ``MAX_LENGTH``.
    """
    encode_options = {
        "return_tensors": "pt",
        "padding": True,
        "truncation": True,
        "max_length": MAX_LENGTH,
    }
    return tokenizer(batch_texts, **encode_options)

# =========================
# Load data
# =========================
df = pd.read_csv(TEST_CSV_PATH)

# Texts: missing values become empty strings, everything is coerced to str.
text_column = df["text_clean"].fillna("").astype(str)
X = text_column.tolist()

# Labels: integer class ids as a NumPy array.
label_column = df["label"].astype(int)
y_true = label_column.to_numpy()

# =========================
# Load model/tokenizer
# =========================
# Both come from the same pretrained checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.to(device)  # moves the module's parameters in place
model.eval()      # switch to evaluation mode for inference

# =========================
# Inference (batched)
# =========================
def _tokenize_batch(texts):
    """Adapter so the DataLoader can call collate_fn with the loaded tokenizer."""
    return collate_fn(texts, tokenizer)


# shuffle=False keeps predictions aligned with the row order of the CSV.
loader = DataLoader(
    TextDataset(X),
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=_tokenize_batch,
)

all_probs = []
with torch.no_grad():
    for encoded in loader:
        # Move every tensor of the encoded batch to the model's device.
        inputs = {name: tensor.to(device) for name, tensor in encoded.items()}
        logits = model(**inputs).logits          # (B, 3)
        logits = logits.detach().cpu().numpy()
        all_probs.append(softmax(logits, axis=1))  # (B, 3)

probs = np.concatenate(all_probs, axis=0)  # (N, 3)
# NOTE(review): the argmax index is later compared directly with df["label"];
# this assumes the dataset's class ids follow the model's output order -- confirm.
y_pred = np.argmax(probs, axis=1)          # (N,)

# =========================
# Metrics
# =========================
macro_options = {"average": "macro", "zero_division": 0}

acc = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average="macro")
prec = precision_score(y_true, y_pred, **macro_options)
rec = recall_score(y_true, y_pred, **macro_options)

# ROC-AUC multi-class (OvR) needs probabilities rather than hard predictions.
# If scikit-learn raises ValueError, warn and report None instead of failing.
auc = None
try:
    auc = roc_auc_score(y_true, probs, multi_class="ovr", average="macro")
except ValueError as e:
    print(f"[WARN] ROC-AUC cannot be computed: {e}")

summary_rows = (
    ("Accuracy   :", acc),
    ("Macro F1   :", f1),
    ("Macro Prec :", prec),
    ("Macro Recall:", rec),
    ("Macro ROC-AUC (OVR):", auc),
)
for caption, value in summary_rows:
    print(caption, value)

Accuracy   : 0.636203349985307
Macro F1   : 0.6371563833433259
Macro Prec : 0.659878647015379
Macro Recall: 0.6313676681735534
Macro ROC-AUC (OVR): 0.8192323538565706
