In [43]:
!pip install transformers torch scikit-learn pandas



In [44]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

# C·∫•u h√¨nh thi·∫øt b·ªã
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [45]:
df = pd.read_csv("/kaggle/input/tiktok-comments/tiktok_comments_balanced.csv")
df.head()

Unnamed: 0,line_number,text,emotion_label
0,2679,s·ª£ th·∫≠t,3
1,26129,·ªëi d·ªìi √¥i fpt shop ch√°y t·ªõi ƒë√≥ m·ªát_√°,2
2,20113,xem m√† kh√≥c th∆∞∆°ng_k ch·ªãu dc,2
3,17380,ƒë√†n_b√† s·ªëng th·ªç h∆°n ƒë√†n_√¥ng l√† v·∫≠y k,0
4,18200,c√¥ ·∫•y gi√† ƒëi nhi·ªÅu qu√° v·∫´n nh·ªõ ·∫£nh ch·ª•p c√¥ l√∫c...,2


In [46]:
label_mapping = {
    0: "Vui v·∫ª",        # t√≠ch c·ª±c
    1: "T·ª©c gi·∫≠n",      # negative (gi·∫≠n d·ªØ, b·ª±c t·ª©c)
    2: "Bu·ªìn",          # sadness
    3: "S·ª£ h√£i",        # lo l·∫Øng, ho·∫£ng lo·∫°n
    4: "Trung l·∫≠p"      # neutral
}
df["label_text"] = df["emotion_label"].map(label_mapping)

le = LabelEncoder()
df['label'] = le.fit_transform(df['label_text'])
num_classes = len(le.classes_)


df

Unnamed: 0,line_number,text,emotion_label,label_text,label
0,2679,s·ª£ th·∫≠t,3,S·ª£ h√£i,1
1,26129,·ªëi d·ªìi √¥i fpt shop ch√°y t·ªõi ƒë√≥ m·ªát_√°,2,Bu·ªìn,0
2,20113,xem m√† kh√≥c th∆∞∆°ng_k ch·ªãu dc,2,Bu·ªìn,0
3,17380,ƒë√†n_b√† s·ªëng th·ªç h∆°n ƒë√†n_√¥ng l√† v·∫≠y k,0,Vui v·∫ª,4
4,18200,c√¥ ·∫•y gi√† ƒëi nhi·ªÅu qu√° v·∫´n nh·ªõ ·∫£nh ch·ª•p c√¥ l√∫c...,2,Bu·ªìn,0
...,...,...,...,...,...
14448,26549,nh√¨n s·ª£ th·∫ø,3,S·ª£ h√£i,1
14449,19664,kh·ªï th√¢n hai nh√† b√™n c·∫°nh,2,Bu·ªìn,0
14450,27967,h√£i lun vk ∆°i,3,S·ª£ h√£i,1
14451,7682,v≈©,4,Trung l·∫≠p,2


In [47]:
# Chia d·ªØ li·ªáu
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'], df['label'], test_size=0.2, stratify=df['label'], random_state=42
)

# Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
MAX_LEN = 64

def tokenize(texts):
    input_ids, attention_masks = [], []
    for text in texts:
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=MAX_LEN,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

# Token h√≥a
train_ids, train_masks = tokenize(train_texts.tolist())
val_ids, val_masks = tokenize(val_texts.tolist())

train_labels = torch.tensor(train_labels.tolist())
val_labels = torch.tensor(val_labels.tolist())

# DataLoader
train_data = TensorDataset(train_ids, train_masks, train_labels)
val_data = TensorDataset(val_ids, val_masks, val_labels)

train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(val_data, batch_size=32)

In [48]:
# Load BERT v√† LSTM
bert = BertModel.from_pretrained("bert-base-multilingual-cased").to(device)
bert.eval() 

# ƒê·ªãnh nghƒ©a LSTM classifier th·ªß t·ª•c
lstm = nn.LSTM(input_size=768, hidden_size=128, num_layers=1,
               batch_first=True, bidirectional=True).to(device)
dropout = nn.Dropout(0.3).to(device)
fc = nn.Linear(128 * 2, num_classes).to(device)

params = list(lstm.parameters()) + list(fc.parameters())
optimizer = optim.Adam(params, lr=2e-4)
criterion = nn.CrossEntropyLoss()

In [49]:
# Hu·∫•n luy·ªán
EPOCHS = 10
for epoch in range(EPOCHS):
    lstm.train()
    fc.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        input_ids, attention_mask, labels = [b.to(device) for b in batch]

        with torch.no_grad():
            bert_output = bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = bert_output.last_hidden_state  # (B, T, 768)

        lstm_out, _ = lstm(sequence_output)
        final_output = lstm_out[:, -1, :]
        dropped = dropout(final_output)
        logits = fc(dropped)

        loss = criterion(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1} - Loss: {total_loss/len(train_loader):.4f}")

Epoch 1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 362/362 [00:43<00:00,  8.40it/s]


Epoch 1 - Loss: 1.3652


Epoch 2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 362/362 [00:43<00:00,  8.34it/s]


Epoch 2 - Loss: 1.1975


Epoch 3: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 362/362 [00:42<00:00,  8.50it/s]


Epoch 3 - Loss: 1.1255


Epoch 4: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 362/362 [00:43<00:00,  8.39it/s]


Epoch 4 - Loss: 1.0668


Epoch 5: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 362/362 [00:42<00:00,  8.46it/s]


Epoch 5 - Loss: 1.0215


Epoch 6: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 362/362 [00:42<00:00,  8.46it/s]


Epoch 6 - Loss: 0.9661


Epoch 7: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 362/362 [00:42<00:00,  8.43it/s]


Epoch 7 - Loss: 0.9195


Epoch 8: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 362/362 [00:42<00:00,  8.42it/s]


Epoch 8 - Loss: 0.8613


Epoch 9: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 362/362 [00:43<00:00,  8.41it/s]


Epoch 9 - Loss: 0.8158


Epoch 10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 362/362 [00:43<00:00,  8.41it/s]

Epoch 10 - Loss: 0.7595





**ƒê√°nh gi√°**

In [50]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np

lstm.eval()
fc.eval()

all_preds = []
all_labels = []

with torch.no_grad():
    for batch in tqdm(val_loader, desc="ƒê√°nh gi√°"):
        input_ids, attention_mask, labels = [b.to(device) for b in batch]

        bert_output = bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = bert_output.last_hidden_state

        lstm_out, _ = lstm(sequence_output)
        final_output = lstm_out[:, -1, :]
        logits = fc(dropout(final_output))

        preds = torch.argmax(logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# ƒê√°nh gi√°
acc = accuracy_score(all_labels, all_preds)
print(f"\n‚úÖ Accuracy: {acc:.4f}\n")

print("üìã Classification Report:")
print(classification_report(all_labels, all_preds, target_names=le.classes_.astype(str)))

# (Tu·ª≥ ch·ªçn) Confusion Matrix
print("üìä Confusion Matrix:")
print(confusion_matrix(all_labels, all_preds))


ƒê√°nh gi√°: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 91/91 [00:10<00:00,  8.68it/s]


‚úÖ Accuracy: 0.5358

üìã Classification Report:
              precision    recall  f1-score   support

        Bu·ªìn       0.53      0.52      0.53       626
      S·ª£ h√£i       0.82      0.60      0.69       551
   Trung l·∫≠p       0.50      0.61      0.55       551
    T·ª©c gi·∫≠n       0.56      0.48      0.51       611
      Vui v·∫ª       0.39      0.48      0.43       552

    accuracy                           0.54      2891
   macro avg       0.56      0.54      0.54      2891
weighted avg       0.56      0.54      0.54      2891

üìä Confusion Matrix:
[[326  14  85  76 125]
 [ 68 331  50  52  50]
 [ 45  10 336  40 120]
 [ 92  34  71 291 123]
 [ 80  17 129  61 265]]





Test model

In [51]:
def predict_sentiment(texts):
    if isinstance(texts, str):
        texts = [texts]

    # Token h√≥a
    input_ids, attention_masks = tokenize(texts)
    input_ids, attention_masks = input_ids.to(device), attention_masks.to(device)

    # Tr√≠ch ƒë·∫∑c tr∆∞ng t·ª´ BERT
    with torch.no_grad():
        bert_output = bert(input_ids=input_ids, attention_mask=attention_masks)
        sequence_output = bert_output.last_hidden_state

        lstm_out, _ = lstm(sequence_output)
        final_output = lstm_out[:, -1, :]
        logits = fc(dropout(final_output))
        preds = torch.argmax(logits, dim=1)

    return le.inverse_transform(preds.cpu().numpy())


In [52]:
test_inputs = [
    "T·ª± nhi√™n ra ƒë∆∞·ªùng c√°i k ƒë·ª•ng ch·∫°m j ai c·ªßng s·ª£ ngangüòë",
    "L√∫c ƒë·∫ßu th√¨ ""T√πng B√≤ ƒë√¢m, xong 1 l√∫c sau th√¨ ""v√†o t√π nha""üòÇ",
    "b·∫Øt qu√° nhanh ü•∞ C√¥ng An Vi·ªát Nam m√¨nh qu√° gi·ªèi, xu·∫•t s·∫Øc ü•∞"
]

predicted_labels = predict_sentiment(test_inputs)

for text, label in zip(test_inputs, predicted_labels):
    print(f"üìù \"{text}\" ‚Üí üí¨ D·ª± ƒëo√°n: {label}")


üìù "T·ª± nhi√™n ra ƒë∆∞·ªùng c√°i k ƒë·ª•ng ch·∫°m j ai c·ªßng s·ª£ ngangüòë" ‚Üí üí¨ D·ª± ƒëo√°n: S·ª£ h√£i
üìù "L√∫c ƒë·∫ßu th√¨ T√πng B√≤ ƒë√¢m, xong 1 l√∫c sau th√¨ v√†o t√π nhaüòÇ" ‚Üí üí¨ D·ª± ƒëo√°n: T·ª©c gi·∫≠n
üìù "b·∫Øt qu√° nhanh ü•∞ C√¥ng An Vi·ªát Nam m√¨nh qu√° gi·ªèi, xu·∫•t s·∫Øc ü•∞" ‚Üí üí¨ D·ª± ƒëo√°n: Vui v·∫ª
