In [1]:
!pip install underthesea

Collecting underthesea
  Downloading underthesea-6.8.4-py3-none-any.whl.metadata (15 kB)
Collecting python-crfsuite>=0.9.6 (from underthesea)
  Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Collecting underthesea-core==1.0.4 (from underthesea)
  Downloading underthesea_core-1.0.4-cp311-cp311-manylinux2010_x86_64.whl.metadata (1.7 kB)
Downloading underthesea-6.8.4-py3-none-any.whl (20.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m80.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading underthesea_core-1.0.4-cp311-cp311-manylinux2010_x86_64.whl (657 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m657.8/657.8 kB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m47.7

In [2]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from underthesea import word_tokenize, text_normalize
import os
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [3]:
# Locations of the Kaggle input files used by this notebook.
stopword_path = '/kaggle/input/vietnamese-stopwords.txt'
data_path = '/kaggle/input/tiktok_comments_balanced.csv'

# Read the stopword list: one entry per line, surrounding whitespace removed,
# blank lines ignored. A set is used because it is only queried for membership.
with open(stopword_path, 'r', encoding='utf-8') as stopword_file:
    stripped_lines = (raw_line.strip() for raw_line in stopword_file)
    stopwords = {entry for entry in stripped_lines if entry}

In [4]:
def preprocess_text(text):
    """Normalize, word-segment and stopword-filter a piece of text.

    The text is passed through `text_normalize`, segmented with
    `word_tokenize(..., format="text")` and split on whitespace. Every token
    whose lowercase form is in the notebook-level `stopwords` set is dropped.

    Returns:
        The surviving tokens joined with single spaces.
    """
    normalized = text_normalize(text)
    segmented = word_tokenize(normalized, format="text")
    kept = []
    for word in segmented.split():
        if word.lower() in stopwords:
            continue
        kept.append(word)
    return ' '.join(kept)


In [5]:
# Load only the two columns the model needs. The path comes from `data_path`
# (defined alongside `stopword_path`) instead of a second hard-coded copy, so
# the dataset location is configured in exactly one place.
comments = pd.read_csv(data_path, usecols=['emotion_label', 'text'])

# Rows without a label cannot be used for supervised training.
comments = comments.dropna(subset=['emotion_label'])

# Store the labels as integers so they can serve as class indices.
comments['emotion_label'] = comments['emotion_label'].astype(int)
print(comments.head())

# Quick sanity checks: row count, container type of the text column, label set.
print(f"Số lượng comments: {comments.shape[0]}")
print(f"comments {type(comments['text'].tolist())}")
unique_emotions = np.sort(comments['emotion_label'].unique())
print(unique_emotions)

                                                text  emotion_label
0                                            sợ thật              3
1               ối dồi ôi fpt shop cháy tới đó mệt_á              2
2                       xem mà khóc thương_k chịu dc              2
3               đàn_bà sống thọ hơn đàn_ông là vậy k              0
4  cô ấy già đi nhiều quá vẫn nhớ ảnh chụp cô lúc...              2
Số lượng comments: 14453
comments <class 'list'>
[0 1 2 3 4]


In [6]:
# Pretrained checkpoint to fine-tune and the number of target classes.
model_name = "vinai/phobert-base"
num_labels = 5

# Tokenizer is loaded with the slow (Python) implementation.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

# Sequence-classification model with `num_labels` outputs; the classification
# head is newly initialized and has to be trained.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)

config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

2025-05-21 08:07:31.749208: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747814851.968790      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747814852.027929      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Force every entry to a string, then run the normalization / segmentation /
# stopword pipeline. NOTE: this overwrites `comments['text']` in place, so
# re-running this cell would preprocess already-preprocessed text.
comments['text'] = comments['text'].astype(str)
comments['text'] = comments['text'].apply(preprocess_text)

# Call the tokenizer directly: `batch_encode_plus` is deprecated in favour of
# `__call__`, which takes the same arguments for a list of texts.
# Every comment is padded / truncated to 128 token ids.
encoded_comments = tokenizer(
    comments['text'].tolist(),
    add_special_tokens=True,
    padding='max_length',
    max_length=128,
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt'
)

# Tensors consumed by the dataset cell below.
input_ids = encoded_comments['input_ids']
attention_masks = encoded_comments['attention_mask']
labels = torch.tensor(comments['emotion_label'].values)


print(f"attention_masks{attention_masks}")
print(f"labels{labels}")

model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

attention_maskstensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
labelstensor([3, 2, 2,  ..., 3, 4, 0])


In [8]:
# Wrap the encoded tensors so each item is (input_ids, attention_mask, label).
dataset = TensorDataset(input_ids, attention_masks, labels)

# 90 % train / 10 % validation.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# Seed the split explicitly. Without a fixed generator the validation subset
# changes on every execution, so metrics are not reproducible and re-running
# this cell after training would put already-seen rows into validation.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

batch_size = 16

# Only the training loader shuffles; the validation loader keeps the order of
# `val_dataset.indices`, which later cells rely on to look texts up again.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=batch_size, num_workers=0)
print(val_loader)

<torch.utils.data.dataloader.DataLoader object at 0x7cd2c8d92f10>


In [9]:
# Prefer the GPU when one is visible to PyTorch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Move the model's parameters to the chosen device (the returned module is
# the cell's displayed value).
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(258, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [10]:
# Fine-tune the classifier and track per-epoch training loss, validation loss
# and validation accuracy.
optimizer = AdamW(model.parameters(), lr=5e-5)

train_losses = []
val_accuracies = []
val_losses = []

num_epochs = 1

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}", leave=False):
        # Batch tensors get their own names: unpacking into `input_ids` /
        # `labels` would overwrite the full-dataset tensors of the same name
        # that the dataset cell is built from.
        batch_input_ids, batch_attention_mask, batch_labels = [x.to(model.device) for x in batch]
        optimizer.zero_grad()
        outputs = model(batch_input_ids, attention_mask=batch_attention_mask, labels=batch_labels)
        # `loss` is intentionally kept as a notebook-level name; the checkpoint
        # cell stores its last value.
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    # ---- validation pass (no gradients, dropout disabled) ----
    model.eval()
    true_labels = []
    pred_labels = []
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Validation Epoch {epoch+1}", leave=False):
            batch_input_ids, batch_attention_mask, batch_labels = [x.to(model.device) for x in batch]
            outputs = model(batch_input_ids, attention_mask=batch_attention_mask, labels=batch_labels)
            val_loss += outputs.loss.item()

            # Predicted class = index of the largest logit for each example.
            batch_preds = torch.argmax(outputs.logits, dim=1)
            pred_labels.extend(batch_preds.cpu().tolist())
            true_labels.extend(batch_labels.cpu().tolist())

    avg_val_loss = val_loss / len(val_loader)
    val_losses.append(avg_val_loss)

    accuracy = accuracy_score(true_labels, pred_labels)
    val_accuracies.append(accuracy)

    print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Val Accuracy: {accuracy:.4f}")

Training Epoch 1:   0%|          | 0/813 [00:00<?, ?it/s]

Validation Epoch 1:   0%|          | 0/91 [00:00<?, ?it/s]

Epoch 1 | Train Loss: 1.1643 | Val Loss: 1.0239 | Val Accuracy: 0.6086


In [11]:
# Evaluate the model on the validation loader and report macro-averaged metrics.
model.eval()

# Filled in validation-loader order; later cells index these lists positionally.
true_labels = []
predictions = []

with torch.no_grad():  # No need to track gradients
    for batch in tqdm(val_loader, desc="Evaluating"):
        # Use batch-specific names so the full-dataset `input_ids` / `labels`
        # tensors defined earlier in the notebook are not overwritten.
        batch_input_ids, batch_attention_mask, batch_labels = batch
        batch_input_ids = batch_input_ids.to(model.device)
        batch_attention_mask = batch_attention_mask.to(model.device)

        outputs = model(batch_input_ids, attention_mask=batch_attention_mask)

        # Predicted class = index of the largest logit for each example.
        batch_preds = torch.argmax(outputs.logits, dim=1)

        # Labels are only compared on the host, so they never need to leave the CPU.
        predictions.extend(batch_preds.cpu().tolist())
        true_labels.extend(batch_labels.tolist())

# Calculate evaluation metrics
accuracy = accuracy_score(true_labels, predictions)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='macro')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


Evaluating:   0%|          | 0/91 [00:00<?, ?it/s]

Accuracy: 0.6086
Precision: 0.6060
Recall: 0.6070
F1 Score: 0.6016


In [12]:
# Directory under the Kaggle working area that receives the fine-tuned artifacts.
save_directory = '/kaggle/working/phobert-base'
os.makedirs(save_directory, exist_ok=True)

# Both objects expose `save_pretrained`; write the model first, then the tokenizer.
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_directory)

print(f"Model and tokenizer saved to {save_directory}")

Model and tokenizer saved to /kaggle/working/phobert-base


In [13]:
# Reload the fine-tuned artifacts from the directory they were just saved to
# (`save_directory`) rather than repeating the literal path.
# The freshly loaded model is moved to `device` so it sits on the same device
# as the model it replaces.
model = AutoModelForSequenceClassification.from_pretrained(save_directory).to(device)
# Use the same slow-tokenizer setting as the initial tokenizer load.
tokenizer = AutoTokenizer.from_pretrained(save_directory, use_fast=False)

In [14]:
# Persist a resumable training checkpoint next to the saved model files:
# epoch count, model weights, optimizer state and the most recent batch loss.
checkpoint_path = os.path.join(save_directory, 'training_checkpoint.pth')
training_state = {
    'epoch': num_epochs,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': loss.item(),
}
torch.save(training_state, checkpoint_path)

In [15]:
# Restore the training checkpoint. `map_location` places the stored tensors on
# the device the current model lives on, so the checkpoint also loads on a
# machine whose devices differ from the one it was saved on.
checkpoint = torch.load(
    '/kaggle/working/phobert-base/training_checkpoint.pth',
    map_location=model.device,
)
model.load_state_dict(checkpoint['model_state_dict'])

# `model` was replaced by a freshly loaded instance after the optimizer was
# created, so the existing optimizer still holds the previous model's
# parameters. Rebuild it around the current parameters before restoring its
# state; otherwise resumed training would update a discarded model.
optimizer = AdamW(model.parameters(), lr=5e-5)
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# Epoch counter stored in the checkpoint, for resuming the epoch loop.
start_epoch = checkpoint['epoch']

In [16]:
# Human-readable name for each class id, used when displaying predictions
# (edit the list if the label scheme changes). Position in the list == class id.
label_names = dict(enumerate([
    'Vui vẻ',
    'Tức giận',
    'Buồn bã',
    'Sợ hãi',
    'Trung lập',
]))

In [17]:
# Text of every validation example, in the same order as `predictions` and
# `true_labels` (the validation loader does not shuffle, so position i in those
# lists corresponds to `val_dataset.indices[i]`). Looked up once and reused.
val_texts = comments['text'].iloc[val_dataset.indices].tolist()
val_results = list(zip(predictions, true_labels, val_texts))

print("\n--- Examples of Predictions ---")
# Slice instead of `range(10)` so a validation set with fewer than 10 examples
# does not raise an IndexError.
for pred, true, text in val_results[:10]:
    status = "✅ Correct" if pred == true else "❌ Incorrect"
    print(f"{status} | Text: {text}\n  Predicted: {label_names[pred]}, Actual: {label_names[true]}\n")

# (predicted, actual, text) triples split by whether the prediction was right.
correct = [(p, t, txt) for p, t, txt in val_results if p == t]
incorrect = [(p, t, txt) for p, t, txt in val_results if p != t]

print(f"Tổng số đúng: {len(correct)}")
print(f"Tổng số sai: {len(incorrect)}")



--- Examples of Predictions ---
✅ Correct | Text: tiktok
  Predicted: Vui vẻ, Actual: Vui vẻ

✅ Correct | Text: dương
  Predicted: Trung lập, Actual: Trung lập

✅ Correct | Text: run mắc cười
  Predicted: Sợ hãi, Actual: Sợ hãi

❌ Incorrect | Text: t rep m xui
  Predicted: Buồn bã, Actual: Tức giận

✅ Correct | Text: sợ
  Predicted: Sợ hãi, Actual: Sợ hãi

✅ Correct | Text: ghê
  Predicted: Sợ hãi, Actual: Sợ hãi

✅ Correct | Text: vnpt xe cẩu anh_hùng
  Predicted: Vui vẻ, Actual: Vui vẻ

✅ Correct | Text: nam mô_a di_đà phật
  Predicted: Trung lập, Actual: Trung lập

✅ Correct | Text: sapa k ae
  Predicted: Trung lập, Actual: Trung lập

✅ Correct | Text: eo sợ thế_thua
  Predicted: Sợ hãi, Actual: Sợ hãi

Tổng số đúng: 880
Tổng số sai: 566


In [18]:
def predict_text(text, max_length=128):
    """Predict and print the emotion label for a single raw comment.

    The text goes through the same `preprocess_text` pipeline as the training
    data, is encoded with the notebook-level `tokenizer`, and is classified by
    the notebook-level `model`.

    Args:
        text: Raw comment string.
        max_length: Length the encoded sequence is padded / truncated to.
            Defaults to 128, the length used when encoding the training data,
            so inference no longer truncates earlier than training did.

    Returns:
        None. The predicted label name (looked up in `label_names`) is printed.
    """
    model.eval()
    text = preprocess_text(text)
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        padding='max_length',
        max_length=max_length,
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    # Inputs must live on the same device as the model's weights.
    input_ids = encoded['input_ids'].to(model.device)
    attention_mask = encoded['attention_mask'].to(model.device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Single example in the batch, so `.item()` yields the class id.
        predicted_class = torch.argmax(logits, dim=1).item()

    print(f"\n🔮 Dự đoán cảm xúc: {label_names[predicted_class]}")

In [19]:
# Try the classifier on one hand-written comment: echo the raw input, then
# let `predict_text` print the predicted emotion.
user_input = "nghe bạn nam bảo : khổ thế nhờ đã dịch cô vít thì chớ mà xót🥺"
print(user_input)
predict_text(user_input)

nghe bạn nam bảo : khổ thế nhờ đã dịch cô vít thì chớ mà xót🥺

🔮 Dự đoán cảm xúc: Buồn bã


In [20]:
# Recursively archive the saved model directory into /kaggle/working/phobert-base.zip.
!zip -r /kaggle/working/phobert-base.zip /kaggle/working/phobert-base


  adding: kaggle/working/phobert-base/ (stored 0%)
  adding: kaggle/working/phobert-base/tokenizer_config.json (deflated 77%)
  adding: kaggle/working/phobert-base/bpe.codes (deflated 59%)
  adding: kaggle/working/phobert-base/model.safetensors (deflated 16%)
  adding: kaggle/working/phobert-base/added_tokens.json (stored 0%)
  adding: kaggle/working/phobert-base/config.json (deflated 54%)
  adding: kaggle/working/phobert-base/special_tokens_map.json (deflated 57%)
  adding: kaggle/working/phobert-base/training_checkpoint.pth (deflated 30%)
  adding: kaggle/working/phobert-base/vocab.txt (deflated 55%)
