### Bài 1

In [1]:
!pip install pyvi

Collecting pyvi
  Downloading pyvi-0.1.1-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting sklearn-crfsuite (from pyvi)
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn-crfsuite->pyvi)
  Downloading python_crfsuite-0.9.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Downloading pyvi-0.1.1-py2.py3-none-any.whl (8.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m66.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Downloading python_crfsuite-0.9.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m79.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite, pyvi
Successfully installed python-crfsuite-0.9.11 pyvi-0.1.1 sklearn-crfsuite-0.5.0


In [23]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import numpy as np
from google.colab import drive
from pyvi import ViTokenizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import f1_score, classification_report

drive.mount('/content/drive')
base_path = '/content/drive/MyDrive/UIT-VSFC data'

train_df = pd.read_json(f'{base_path}/UIT-VSFC-train.json')
dev_df   = pd.read_json(f'{base_path}/UIT-VSFC-dev.json')
test_df  = pd.read_json(f'{base_path}/UIT-VSFC-test.json')

labelid = {
    "negative": 0,
    "neutral": 1,
    "positive": 2
}

y_train = train_df["sentiment"].map(labelid).values
y_dev   = dev_df["sentiment"].map(labelid).values
y_test  = test_df["sentiment"].map(labelid).values

X_train_text = train_df['sentence'].values
X_dev_text   = dev_df['sentence'].values
X_test_text  = test_df['sentence'].values

def preprocess_text(text_list):
    return [ViTokenizer.tokenize(str(text)) for text in text_list]

X_train_text = preprocess_text(X_train_text)
X_dev_text   = preprocess_text(X_dev_text)
X_test_text  = preprocess_text(X_test_text)

vocab_size = 12000
max_len = 100
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train_text)

X_train_seq = tokenizer.texts_to_sequences(X_train_text)
X_dev_seq   = tokenizer.texts_to_sequences(X_dev_text)
X_test_seq  = tokenizer.texts_to_sequences(X_test_text)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='pre')
X_dev_pad   = pad_sequences(X_dev_seq, maxlen=max_len, padding='pre')
X_test_pad  = pad_sequences(X_test_seq, maxlen=max_len, padding='pre')

X_train_pad = np.array(X_train_pad).astype(np.int64)
X_dev_pad   = np.array(X_dev_pad).astype(np.int64)
X_test_pad  = np.array(X_test_pad).astype(np.int64)

y_train = np.array(y_train).astype(np.int64)
y_dev   = np.array(y_dev).astype(np.int64)
y_test  = np.array(y_test).astype(np.int64)

train_data = TensorDataset(torch.tensor(X_train_pad), torch.tensor(y_train))
dev_data   = TensorDataset(torch.tensor(X_dev_pad), torch.tensor(y_dev))
test_data  = TensorDataset(torch.tensor(X_test_pad), torch.tensor(y_test))

batch_size = 64
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
dev_loader   = DataLoader(dev_data, batch_size=batch_size)
test_loader  = DataLoader(test_data, batch_size=batch_size)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
class LstmModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, n_layers, n_labels, padding_idx=0):
        super(LstmModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=n_layers,
            batch_first=True,
            dropout=0.2
        )

        self.classifier = nn.Linear(hidden_size, n_labels)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        embeds = self.embedding(x)
        lstm_out, (hidden, cell) = self.lstm(embeds)
        last_hidden = hidden[-1]
        out = self.dropout(last_hidden)
        logits = self.classifier(out)
        return logits

In [28]:
VOCAB_SIZE = vocab_size
EMBEDDING_DIM = 128
HIDDEN_SIZE = 256
N_LAYERS = 5
N_LABELS = 3
LR = 0.001
epochs=20

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = LstmModel(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_SIZE, N_LAYERS, N_LABELS)
model.to(device)

optimizer = optim.Adam(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()

def train_and_evaluate(model, train_loader, dev_loader, epochs):
    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for text, labels in train_loader:
            text, labels = text.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(text)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)

        model.eval()
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for text, labels in dev_loader:
                text, labels = text.to(device), labels.to(device)
                outputs = model(text)
                preds = torch.argmax(outputs, dim=1)

                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        epoch_f1 = f1_score(all_labels, all_preds, average='weighted')
        print(f"Epoch {epoch+1}/{epochs}")
        print(f"Train Loss: {avg_train_loss:.4f}")
        print(f"F1: {epoch_f1:.12f}")
        print("-" * 30)

train_and_evaluate(model, train_loader, dev_loader, epochs)

Epoch 1/20
Train Loss: 0.5109
F1: 0.863097510887
------------------------------
Epoch 2/20
Train Loss: 0.3372
F1: 0.869259657246
------------------------------
Epoch 3/20
Train Loss: 0.2652
F1: 0.912155847344
------------------------------
Epoch 4/20
Train Loss: 0.2307
F1: 0.909451276361
------------------------------
Epoch 5/20
Train Loss: 0.2025
F1: 0.917021240413
------------------------------
Epoch 6/20
Train Loss: 0.1777
F1: 0.910768065187
------------------------------
Epoch 7/20
Train Loss: 0.1614
F1: 0.907937126380
------------------------------
Epoch 8/20
Train Loss: 0.1294
F1: 0.917524534055
------------------------------
Epoch 9/20
Train Loss: 0.1158
F1: 0.907533743782
------------------------------
Epoch 10/20
Train Loss: 0.1034
F1: 0.911162861669
------------------------------
Epoch 11/20
Train Loss: 0.0920
F1: 0.920521109734
------------------------------
Epoch 12/20
Train Loss: 0.0821
F1: 0.906519145492
------------------------------
Epoch 13/20
Train Loss: 0.0694
F1: 0.

In [29]:
def evaluate_model(model, test_loader):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for text, labels in test_loader:
            text, labels = text.to(device), labels.to(device)
            outputs = model(text)
            preds = torch.argmax(outputs, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    f1 = f1_score(all_labels, all_preds, average='weighted')
    print(f"Đánh giá trên tập test:")
    print(f"F1-Score (Weighted): {f1:.4f}")
    print(classification_report(all_labels, all_preds, target_names=['Tiêu cực', 'Trung tính', 'Tích cực']))

evaluate_model(model, test_loader)

Đánh giá trên tập test:
F1-Score (Weighted): 0.8952
              precision    recall  f1-score   support

    Tiêu cực       0.91      0.92      0.92      1409
  Trung tính       0.47      0.46      0.47       167
    Tích cực       0.92      0.92      0.92      1590

    accuracy                           0.90      3166
   macro avg       0.77      0.77      0.77      3166
weighted avg       0.89      0.90      0.90      3166

