In [5]:
!pip install --upgrade pip

!pip install transformers tokenizers

print("Successfully upgraded pip and installed transformers & tokenizers.")

Successfully upgraded pip and installed transformers & tokenizers.


In [2]:
!pip install torch

Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch)
  Downloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-nvjitlink-cu12==12.4.127 (from torch)
  Downloading nvidia_nvjitlink_cu12-12.4.127-py3-n

In [37]:

import pandas as pd
import numpy as np
import re
import random
import os
import unicodedata
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW                
from transformers import get_scheduler, AutoTokenizer, AutoConfig, AutoModel 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

from datetime import timedelta


In [12]:
def set_seed(seed=42):
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

set_seed()

In [13]:
df = pd.read_csv("/kaggle/input/mainnn/Data AI.csv", sep=";", engine="python", encoding="utf-8")
sentences = df['Sentence'].tolist()
labels = df['Emotion'].tolist()
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

train_texts, val_texts, train_labels, val_labels = train_test_split(sentences, encoded_labels, test_size=0.2, random_state=42)


In [14]:
# Tokenizer
model_name = "vinai/phobert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
config = AutoConfig.from_pretrained(model_name)

train_encodings = tokenizer(train_texts, truncation=True, padding='max_length', max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding='max_length', max_length=128)


config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

In [16]:
# Dataset
class PhoBERTDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['label'] = torch.tensor(self.labels[idx])
        return item

train_dataset = PhoBERTDataset(train_encodings, train_labels)
val_dataset = PhoBERTDataset(val_encodings, val_labels)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128)

In [17]:
# Model
class PhoBERTClassifier(nn.Module):
    def __init__(self, num_labels):
        super().__init__()
        self.phobert = AutoModel.from_pretrained(model_name, config=config)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.phobert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.phobert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.last_hidden_state[:, 0]
        return self.classifier(self.dropout(pooled))

# Init
num_classes = len(label_encoder.classes_)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = PhoBERTClassifier(num_classes).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(train_loader)*10)

2025-06-23 09:16:39.035060: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750670199.216353      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750670199.271760      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

In [18]:
# Training & Evaluation
history = {"train_loss": [], "val_loss": [], "val_acc": []}
best_acc = 0
best_model_state_dict = None
best_y_true, best_y_pred = [], []


model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

In [19]:
def evaluate(model, dataloader):
    model.eval()
    y_true, y_pred = [], []
    total_loss = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, labels)
            preds = torch.argmax(outputs, dim=1)
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(preds.cpu().numpy())
            total_loss += loss.item()
    acc = accuracy_score(y_true, y_pred)
    return total_loss / len(dataloader), acc, y_true, y_pred

In [20]:
def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(timedelta(seconds=elapsed_rounded))

def train_one_epoch(model, loader, optimizer, scheduler, loss_fn, device, epoch):
    model.train()
    total_loss = 0
    pbar = tqdm(enumerate(loader), total=len(loader), desc=f"Epoch {epoch:02d} | Training")
    for i, batch in pbar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        current_lr = optimizer.param_groups[0]['lr']
        avg_loss = total_loss / (i + 1)
        pbar.set_postfix(avg_loss=f"{avg_loss:.4f}", lr=f"{current_lr:.1e}")
    return total_loss / len(loader)

print("Starting training...")
total_t0 = time.time()

history = {"train_loss": [], "val_loss": [], "val_acc": []}
best_acc = 0
best_model_state_dict = None

for epoch in range(1, 12):
    train_loss = train_one_epoch(model, train_loader, optimizer, scheduler, loss_fn, device, epoch)
    val_loss, val_acc, _, _ = evaluate(model, val_loader)
    
    print(f"Epoch {epoch:02d} Summary | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

    history["train_loss"].append(train_loss)
    history["val_loss"].append(val_loss)
    history["val_acc"].append(val_acc)

    if val_acc > best_acc:
        best_acc = val_acc
        best_model_state_dict = model.state_dict()
        print(f"----> Improved accuracy to {best_acc:.4f}. Best model updated. <----")

total_training_time = format_time(time.time() - total_t0)
print("\n--- Training Complete ---")
print(f"Total training took: {total_training_time}")

Starting training...


Epoch 01 | Training:   0%|          | 0/235 [00:00<?, ?it/s]

Epoch 01 Summary | Train Loss: 1.0081 | Val Loss: 0.6054 | Val Acc: 0.7992
----> Improved accuracy to 0.7992. Best model updated. <----


Epoch 02 | Training:   0%|          | 0/235 [00:00<?, ?it/s]

Epoch 02 Summary | Train Loss: 0.5389 | Val Loss: 0.5035 | Val Acc: 0.8341
----> Improved accuracy to 0.8341. Best model updated. <----


Epoch 03 | Training:   0%|          | 0/235 [00:00<?, ?it/s]

Epoch 03 Summary | Train Loss: 0.4078 | Val Loss: 0.4484 | Val Acc: 0.8547
----> Improved accuracy to 0.8547. Best model updated. <----


Epoch 04 | Training:   0%|          | 0/235 [00:00<?, ?it/s]

Epoch 04 Summary | Train Loss: 0.3228 | Val Loss: 0.4480 | Val Acc: 0.8601
----> Improved accuracy to 0.8601. Best model updated. <----


Epoch 05 | Training:   0%|          | 0/235 [00:00<?, ?it/s]

Epoch 05 Summary | Train Loss: 0.2603 | Val Loss: 0.4341 | Val Acc: 0.8637
----> Improved accuracy to 0.8637. Best model updated. <----


Epoch 06 | Training:   0%|          | 0/235 [00:00<?, ?it/s]

Epoch 06 Summary | Train Loss: 0.2139 | Val Loss: 0.4463 | Val Acc: 0.8652
----> Improved accuracy to 0.8652. Best model updated. <----


Epoch 07 | Training:   0%|          | 0/235 [00:00<?, ?it/s]

Epoch 07 Summary | Train Loss: 0.1803 | Val Loss: 0.4672 | Val Acc: 0.8649


Epoch 08 | Training:   0%|          | 0/235 [00:00<?, ?it/s]

Epoch 08 Summary | Train Loss: 0.1570 | Val Loss: 0.4668 | Val Acc: 0.8658
----> Improved accuracy to 0.8658. Best model updated. <----


Epoch 09 | Training:   0%|          | 0/235 [00:00<?, ?it/s]

Epoch 09 Summary | Train Loss: 0.1408 | Val Loss: 0.4561 | Val Acc: 0.8702
----> Improved accuracy to 0.8702. Best model updated. <----


Epoch 10 | Training:   0%|          | 0/235 [00:00<?, ?it/s]

Epoch 10 Summary | Train Loss: 0.1302 | Val Loss: 0.4592 | Val Acc: 0.8693


Epoch 11 | Training:   0%|          | 0/235 [00:00<?, ?it/s]

Epoch 11 Summary | Train Loss: 0.1235 | Val Loss: 0.4592 | Val Acc: 0.8693

--- Training Complete ---
Total training took: 1:07:37


In [21]:
print(f" Best Validation Accuracy: {best_acc:.4f}")
best_model_path = "/kaggle/working/phobert_best_model1.pt"
torch.save(best_model_state_dict, best_model_path)
print(f"Best model saved to: {best_model_path}")

 Best Validation Accuracy: 0.8702
Best model saved to: /kaggle/working/phobert_best_model1.pt


In [40]:
def get_predictions_and_probabilities(model, dataloader, device):
    model.eval()
    y_true, y_pred, y_proba = [], [], []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs, dim=1)
            probabilities = torch.nn.functional.softmax(outputs, dim=1)
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(preds.cpu().numpy())
            y_proba.extend(probabilities.cpu().numpy())
    return np.array(y_true), np.array(y_pred), np.array(y_proba)

In [41]:
# The comprehensive display function (Corrected for the warning)
def display_all_results(y_true, y_pred, y_proba, class_labels, history):
    print("              CLASSIFICATION REPORT ON TEST SET")
    print(classification_report(y_true, y_pred, target_names=class_labels, digits=4))

    fig = plt.figure(figsize=(24, 30))
    gs = fig.add_gridspec(4, 2)

    ax1 = fig.add_subplot(gs[0, 0])
    ax1.plot(history['train_loss'], 'b-o', label='Train Loss')
    ax1.plot(history['val_loss'], 'r-o', label='Val Loss')
    ax1.set_title('Loss over Epochs', fontsize=16)
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Loss')
    ax1.legend()

    ax2 = fig.add_subplot(gs[0, 1])
    ax2.plot(history['val_acc'], 'g-o', label='Val Accuracy')
    ax2.set_title('Validation Accuracy over Epochs', fontsize=16)
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('Accuracy')
    ax2.legend()
    
    ax3 = fig.add_subplot(gs[1, :])
    report_dict = classification_report(y_true, y_pred, target_names=class_labels, output_dict=True)
    report_df = pd.DataFrame(report_dict).iloc[:-1, :].T
    sns.heatmap(report_df, annot=True, cmap='viridis', fmt='.4f', ax=ax3, annot_kws={"size": 12})
    ax3.set_title('Classification Report Heatmap', fontsize=16)
    
    ax4 = fig.add_subplot(gs[2, 0])
    cm = confusion_matrix(y_true, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
    disp.plot(cmap='Blues', ax=ax4, xticks_rotation=45)
    ax4.set_title('Confusion Matrix', fontsize=16)

    ax5 = fig.add_subplot(gs[2, 1])
    report_df_metrics = report_df[['precision', 'recall', 'f1-score']].drop(['accuracy', 'macro avg', 'weighted avg'])
    report_df_metrics.plot(kind='barh', ax=ax5)
    ax5.set_title('Per-Class Performance Metrics', fontsize=16)
    ax5.set_xlabel('Score')
    ax5.grid(axis='x', linestyle='--')
    
    ax6 = fig.add_subplot(gs[3, :])
    y_true_bin = label_binarize(y_true, classes=range(len(class_labels)))

    cmap = plt.colormaps['tab10'] 
    colors = [cmap(i) for i in np.linspace(0, 1, len(class_labels))]
    
    for i, class_name in enumerate(class_labels):
        fpr, tpr, _ = roc_curve(y_true_bin[:, i], y_proba[:, i])
        roc_auc = auc(fpr, tpr)
        ax6.plot(fpr, tpr, color=colors[i], lw=2, label=f'{class_name} (AUC = {roc_auc:.4f})')
        
    ax6.plot([0, 1], [0, 1], 'k--', lw=2, label='Chance (AUC = 0.50)')
    ax6.set_xlim([0.0, 1.0])
    ax6.set_ylim([0.0, 1.05])
    ax6.set_xlabel('False Positive Rate', fontsize=12)
    ax6.set_ylabel('True Positive Rate', fontsize=12)
    ax6.set_title('Receiver Operating Characteristic (ROC) Curves', fontsize=16)
    ax6.legend(loc="lower right")
    ax6.grid(True)

    plt.tight_layout()
    plt.show()

In [None]:

print("Loading best model for final evaluation...")
model.load_state_dict(best_model_state_dict)

y_true_final, y_pred_final, y_proba_final = get_predictions_and_probabilities(
    model, 
    val_loader,
    device
)

# 4. Define your class labels from the encoder
class_names = list(label_encoder.classes_)

# 5. Display everything!
display_all_results(y_true_final, y_pred_final, y_proba_final, class_names, history)

Loading best model for final evaluation...


Evaluating:   0%|          | 0/59 [00:00<?, ?it/s]

              CLASSIFICATION REPORT ON TEST SET
              precision    recall  f1-score   support

     buồn bã     0.8600    0.8970    0.8781      1233
     ghê tởm     0.8654    0.8730    0.8692      1134
  ngạc nhiên     0.8361    0.8280    0.8320      1041
      sợ hãi     0.8700    0.8744    0.8722      1194
    tức giận     0.8903    0.8932    0.8917      1535
      vui vẻ     0.8826    0.8418    0.8617      1384

    accuracy                         0.8693      7521
   macro avg     0.8674    0.8679    0.8675      7521
weighted avg     0.8694    0.8693    0.8692      7521



In [43]:
def predict_with_confidence(text, model, tokenizer, label_encoder, device):
    # Set the model to evaluation mode
    model.eval()
    
    encoded_text = tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors='pt' 
    )
    
    input_ids = encoded_text['input_ids'].to(device)
    attention_mask = encoded_text['attention_mask'].to(device)
    
    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
        
        probabilities = F.softmax(outputs, dim=1)
        
        confidence, predicted_idx = torch.max(probabilities, dim=1)
        
        predicted_label = label_encoder.inverse_transform(predicted_idx.cpu().numpy())[0]
        
        all_probs = probabilities.cpu().numpy().flatten()
        class_labels = label_encoder.classes_
        confidence_per_label = {label: prob for label, prob in zip(class_labels, all_probs)}
        
    return predicted_label, confidence.item(), confidence_per_label

In [44]:

print("Loading best model for prediction...")
model.load_state_dict(best_model_state_dict)
model.to(device)

input_text = "trời ơi sản phẩm này tốt ngoài sức tưởng tượng của mình"

predicted_label, confidence, all_confidences = predict_with_confidence(
    input_text, 
    model, 
    tokenizer, 
    label_encoder, 
    device
)

print(f"\nInput Text: '{input_text}'")
print("-" * 50)
print(f"==> Final Prediction: {predicted_label.upper()} (Confidence: {confidence:.2%})")
print("-" * 50)
print("Confidence Breakdown:")
sorted_confidences = sorted(all_confidences.items(), key=lambda item: item[1], reverse=True)
for label, prob in sorted_confidences:
    print(f"- {label:<12}: {prob:.2%}")

Loading best model for prediction...

Input Text: 'trời ơi sản phẩm này tốt ngoài sức tưởng tượng của mình'
--------------------------------------------------
==> Final Prediction: VUI VẺ (Confidence: 60.06%)
--------------------------------------------------
Confidence Breakdown:
- vui vẻ      : 60.06%
- ngạc nhiên  : 23.34%
- buồn bã     : 15.71%
- sợ hãi      : 0.48%
- tức giận    : 0.23%
- ghê tởm     : 0.18%
