In [2]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, f1_score
from tqdm import tqdm
import pickle
import os

# ==========================
# CONFIG
# ==========================
# Hugging Face checkpoint name used for both the tokenizer and the classifier.
MODEL_NAME = "bert-base-uncased"
# Names of the input-text column and the target-label column in the CSV files.
TEXT_COL = "text"
LABEL_COL = "emotion"   # switch to "risk" when training the risk classifier
# Mini-batch size shared by the train, validation and test loaders.
BATCH_SIZE = 16
# Every text is truncated/padded to exactly this many tokens.
MAX_LEN = 128
# Upper bound on training epochs; early stopping may end training sooner.
EPOCHS = 5
# Learning rate passed to AdamW.
LR = 2e-5
# Number of consecutive epochs without a validation macro-F1 improvement
# that triggers early stopping.
PATIENCE = 2
# Output paths for the best model weights and the pickled LabelEncoder.
OUTPUT_MODEL = "best_bert_model.pt"
OUTPUT_ENCODER = "label_encoder.pkl"
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", DEVICE)

# ==========================
# LOAD DATA
# ==========================
# Pre-split train / validation / test sets, read from paths relative to the
# current working directory. Later code reads the TEXT_COL and LABEL_COL
# columns of each frame.
train_df = pd.read_csv("../data/train/train.csv")
val_df   = pd.read_csv("../data/val/val.csv")
test_df  = pd.read_csv("../data/test/test.csv")

# ==========================
# LABEL ENCODING
# ==========================
# Learn the class-name -> integer-id mapping from the training labels only,
# then overwrite the label column of every split with those ids (train first,
# then validation, then test).
le = LabelEncoder().fit(train_df[LABEL_COL])
for split_df in (train_df, val_df, test_df):
    split_df[LABEL_COL] = le.transform(split_df[LABEL_COL])

num_labels = len(le.classes_)
print("Labels:", le.classes_)

# Persist the fitted encoder so inference can map ids back to class names.
with open(OUTPUT_ENCODER, "wb") as encoder_file:
    pickle.dump(le, encoder_file)

# ==========================
# CLASS WEIGHTS
# ==========================
# "balanced" weights are computed from the encoded training labels so that
# under-represented classes contribute more to the loss. The result is stored
# as a float tensor on DEVICE for use by the loss function.
encoded_train_labels = train_df[LABEL_COL]
class_weights = torch.tensor(
    compute_class_weight(
        class_weight="balanced",
        classes=np.unique(encoded_train_labels),
        y=encoded_train_labels,
    ),
    dtype=torch.float,
).to(DEVICE)

# ==========================
# TOKENIZER
# ==========================
# Tokenizer loaded from the same checkpoint name as the model (MODEL_NAME);
# it is used by TextDataset.__getitem__ and by predict().
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# ==========================
# DATASET
# ==========================
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Texts are kept as plain strings and tokenized lazily in ``__getitem__``
    with the module-level ``tokenizer``, truncated/padded to ``MAX_LEN``
    tokens.

    Args:
        df: DataFrame holding the text column and a numeric (already
            encoded) label column.
        text_col: Name of the text column. Defaults to ``TEXT_COL``.
        label_col: Name of the label column. Defaults to ``LABEL_COL``.
    """

    def __init__(self, df, text_col=None, label_col=None):
        text_col = TEXT_COL if text_col is None else text_col
        label_col = LABEL_COL if label_col is None else label_col
        # Replace missing texts with "" first: astype(str) alone would turn
        # NaN/None into the literal word "nan", which would then be tokenized
        # as if it were real content.
        self.texts = df[text_col].fillna("").astype(str).tolist()
        self.labels = df[label_col].tolist()

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized example at ``idx``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (1-D tensors of
            length ``MAX_LEN``) and ``labels`` (scalar ``torch.long`` tensor).
        """
        enc = tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=MAX_LEN,
            return_tensors="pt"
        )
        return {
            # return_tensors="pt" adds a leading batch axis of size 1; drop it
            # so the DataLoader can stack items into (batch, MAX_LEN).
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

# ==========================
# DATALOADER
# ==========================
# One dataset per split. Only the training loader shuffles; validation and
# test are iterated in file order.
train_dataset = TextDataset(train_df)
val_dataset = TextDataset(val_df)
test_dataset = TextDataset(test_df)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# ==========================
# MODEL
# ==========================
# Pretrained checkpoint with a classification head sized to the label set,
# moved to DEVICE.
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
).to(DEVICE)

# AdamW over all model parameters.
optimizer = AdamW(model.parameters(), lr=LR)

# Cross-entropy loss weighted per class with the balanced class weights.
criterion = torch.nn.CrossEntropyLoss(weight=class_weights)

# ==========================
# TRAINING LOOP + EARLY STOPPING
# ==========================
# Start below any attainable macro-F1 (which lies in [0, 1]) so the first
# epoch always writes a checkpoint. Starting at 0 with a strict ">" meant a
# run whose validation F1 never rose above 0 saved nothing, and the test
# section then either failed on a missing file or silently loaded a stale
# checkpoint left over from an earlier run.
best_f1 = float("-inf")
counter = 0  # consecutive epochs without a validation improvement

for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch+1}/{EPOCHS}")
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        outputs = model(
            input_ids=batch["input_ids"].to(DEVICE),
            attention_mask=batch["attention_mask"].to(DEVICE)
        )
        # Labels are not passed to the model on purpose: the loss is computed
        # here with the class-weighted criterion.
        loss = criterion(outputs.logits, batch["labels"].to(DEVICE))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Mean of the per-batch losses over the epoch.
    avg_loss = total_loss / len(train_loader)
    print("Train loss:", avg_loss)

    # Validation
    model.eval()
    preds, labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            outputs = model(
                input_ids=batch["input_ids"].to(DEVICE),
                attention_mask=batch["attention_mask"].to(DEVICE)
            )
            preds.extend(outputs.logits.argmax(dim=1).tolist())
            labels.extend(batch["labels"].tolist())

    # Macro averaging weights every class equally regardless of its support.
    macro_f1 = f1_score(labels, preds, average="macro")
    print("Validation Macro F1:", macro_f1)

    if macro_f1 > best_f1:
        # New best epoch: reset the patience counter and checkpoint weights.
        best_f1 = macro_f1
        counter = 0
        torch.save(model.state_dict(), OUTPUT_MODEL)
        print("✅ Saved best model")
    else:
        counter += 1
        if counter >= PATIENCE:
            print("⛔ Early stopping")
            break

# ==========================
# TEST & REPORT
# ==========================
print("\n===============================")
print(" TEST RESULTS ")
print("===============================")
# Restore the best checkpoint. map_location keeps this consistent with
# predict() and lets a checkpoint written on a GPU machine be loaded when
# DEVICE is the CPU.
model.load_state_dict(torch.load(OUTPUT_MODEL, map_location=DEVICE))
model.eval()
preds, labels = [], []
with torch.no_grad():
    for batch in test_loader:
        outputs = model(
            input_ids=batch["input_ids"].to(DEVICE),
            attention_mask=batch["attention_mask"].to(DEVICE)
        )
        preds.extend(outputs.logits.argmax(dim=1).tolist())
        labels.extend(batch["labels"].tolist())

# Pin the report to the full label set. Without `labels`, the report raises
# when a class is absent from both the test labels and the predictions,
# because the number of observed classes no longer matches target_names.
# Names are stringified because encoder classes are not guaranteed to be
# strings (e.g. a numeric label column).
class_ids = list(range(len(le.classes_)))
class_names = [str(name) for name in le.classes_]
print(classification_report(labels, preds, labels=class_ids, target_names=class_names))

# ==========================
# INFERENCE FUNCTION
# ==========================
# Most recently loaded inference artifacts. Lets repeated predict() calls
# reuse the model instead of rebuilding it and re-reading the checkpoint from
# disk on every call. Only one entry is kept so at most one extra model stays
# in memory.
_PREDICT_CACHE = {"key": None, "artifacts": None}


def _load_inference_artifacts(model_path, encoder_path):
    """Return ``(model, label_encoder)`` for the given files, reusing the last load.

    The cache key includes both files' modification times, so retraining
    (which rewrites the files) invalidates the cached model.
    """
    key = (
        encoder_path,
        os.path.getmtime(encoder_path),
        model_path,
        os.path.getmtime(model_path),
    )
    if _PREDICT_CACHE["key"] != key:
        # NOTE: pickle.load executes arbitrary code from the file. Only point
        # encoder_path at files written by this script or another trusted
        # source.
        with open(encoder_path, "rb") as f:
            label_encoder = pickle.load(f)
        classifier = BertForSequenceClassification.from_pretrained(
            MODEL_NAME,
            num_labels=len(label_encoder.classes_)
        )
        classifier.load_state_dict(torch.load(model_path, map_location=DEVICE))
        classifier.to(DEVICE)
        classifier.eval()
        _PREDICT_CACHE["key"] = key
        _PREDICT_CACHE["artifacts"] = (classifier, label_encoder)
    return _PREDICT_CACHE["artifacts"]


def predict(text, model_path=OUTPUT_MODEL, encoder_path=OUTPUT_ENCODER):
    """Predict the class name for a single text with the saved model.

    Args:
        text: Input text to classify.
        model_path: Path to the saved model ``state_dict``.
        encoder_path: Path to the pickled ``LabelEncoder``.

    Returns:
        The entry of the encoder's ``classes_`` with the highest logit.

    Raises:
        FileNotFoundError: If ``model_path`` or ``encoder_path`` does not exist.
    """
    classifier, label_encoder = _load_inference_artifacts(model_path, encoder_path)

    # Same tokenization settings as TextDataset uses for training.
    enc = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN,
        return_tensors="pt"
    ).to(DEVICE)

    with torch.no_grad():
        pred_id = classifier(**enc).logits.argmax(dim=1).item()
    return label_encoder.classes_[pred_id]

# ==========================
# TEST INFERENCE
# ==========================
# Smoke test of the saved artifacts. The printed label name follows LABEL_COL
# so the message stays correct when the script is switched to another target
# column; with LABEL_COL = "emotion" the output is unchanged.
example_text = "I feel very stressed about my exams."
print(f"Predicted {LABEL_COL}:", predict(example_text))


Device: cpu
Labels: ['angry' 'anxious' 'happy' 'neutral' 'sad' 'stressed']


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/5


  0%|          | 3/9242 [01:11<61:32:21, 23.98s/it]


KeyboardInterrupt: 