In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%pip install transformers
%pip install torch
%pip install pandas
%pip install datasets

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [None]:
from datasets import load_dataset
dataset = load_dataset("imdb", split='train[:1000]')
test_data = load_dataset("imdb", split='test[:200]')

In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel

# Load model directly
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

def preprocess(data):
    inputs = tokenizer(data['text'], padding=True, truncation=True, return_tensors="pt", max_length=128)
    labels = torch.tensor(data['label'])
    return inputs, labels

train_inputs, train_labels = preprocess(dataset)
test_inputs, test_labels = preprocess(test_data)

class TransformerClassifier(nn.Module):
    def __init__(self, num_labels):
        super(TransformerClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained("vinai/phobert-base")
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0]  # Take the CLS token output
        return self.classifier(pooled_output)

model = TransformerClassifier(num_labels=2)

from torch.utils.data import DataLoader, TensorDataset

train_dataset = TensorDataset(train_inputs['input_ids'], train_inputs['attention_mask'], train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

test_dataset = TensorDataset(test_inputs['input_ids'], test_inputs['attention_mask'], test_labels)
test_dataloader = DataLoader(test_dataset, batch_size=8)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(10):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        input_ids, attention_mask, labels = [x.to(device) for x in batch]
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_dataloader)}")

model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask, labels = [x.to(device) for x in batch]
        outputs = model(input_ids, attention_mask)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f"Accuracy: {accuracy * 100:.2f}%")


Epoch 1, Loss: 0.02079708417505026
Epoch 2, Loss: 0.0016345793046057225
Epoch 3, Loss: 0.0009931858177296817
Epoch 4, Loss: 0.0007268807697109878
Epoch 5, Loss: 0.0005843806103803217
Epoch 6, Loss: 0.0005018800520338118
Epoch 7, Loss: 0.0004361661018338054
Epoch 8, Loss: 0.0003964835526421666
Epoch 9, Loss: 0.0003597970535047352
Epoch 10, Loss: 0.00033509825682267547
Accuracy: 100.00%


---

In [1]:
!pip install --upgrade transformers accelerate




In [12]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Tải dữ liệu
train_data = load_dataset("imdb", split='train[:1000]')
test_data = load_dataset("imdb", split='test[:200]')

# Khởi tạo tokenizer
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
roberta_tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# Tokenize dữ liệu
def tokenize_data(data, tokenizer):
    return data.map(lambda e: tokenizer(e['text'], truncation=True, padding='max_length'), batched=True)

train_data_bert = tokenize_data(train_data, bert_tokenizer)
test_data_bert = tokenize_data(test_data, bert_tokenizer)
train_data_roberta = tokenize_data(train_data, roberta_tokenizer)
test_data_roberta = tokenize_data(test_data, roberta_tokenizer)

# Đảm bảo dữ liệu đã được đặt lại dạng PyTorch tensor
train_data_bert.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_data_bert.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
train_data_roberta.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_data_roberta.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

In [15]:
import torch
from torch.utils.data import Dataset

class IMDbDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __getitem__(self, idx):
        item = {
            'input_ids': self.dataset['input_ids'][idx],
            'attention_mask': self.dataset['attention_mask'][idx],
            'labels': self.dataset['label'][idx]
        }
        return item

    def __len__(self):
        return len(self.dataset)

train_dataset_bert = IMDbDataset(train_data_bert)
test_dataset_bert = IMDbDataset(test_data_bert)
train_dataset_roberta = IMDbDataset(train_data_roberta)
test_dataset_roberta = IMDbDataset(test_data_roberta)


In [27]:
from sklearn.metrics import accuracy_score
import numpy as np


In [28]:
# Hàm tính toán độ chính xác
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=-1)
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc}

In [29]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Khởi tạo mô hình BERT
bert_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Định nghĩa Trainer
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer_bert = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=train_dataset_bert,
    eval_dataset=test_dataset_bert,
    compute_metrics=compute_metrics,
)

trainer_bert.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.001425,1.0
2,No log,0.000721,1.0
3,No log,0.0006,1.0


TrainOutput(global_step=189, training_loss=0.01980894583242911, metrics={'train_runtime': 304.4982, 'train_samples_per_second': 9.852, 'train_steps_per_second': 0.621, 'total_flos': 789333166080000.0, 'train_loss': 0.01980894583242911, 'epoch': 3.0})

In [30]:
# Khởi tạo mô hình RoBERTa
roberta_model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

trainer_roberta = Trainer(
    model=roberta_model,
    args=training_args,
    train_dataset=train_dataset_roberta,
    eval_dataset=test_dataset_roberta,
    compute_metrics=compute_metrics,
)

trainer_roberta.train()


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.000212,1.0
2,No log,0.000137,1.0
3,No log,0.000121,1.0


TrainOutput(global_step=189, training_loss=0.02813162374748755, metrics={'train_runtime': 306.8951, 'train_samples_per_second': 9.775, 'train_steps_per_second': 0.616, 'total_flos': 789333166080000.0, 'train_loss': 0.02813162374748755, 'epoch': 3.0})

In [31]:
# Đánh giá mô hình BERT
bert_results = trainer_bert.evaluate()
bert_accuracy = bert_results['eval_accuracy']

# Đánh giá mô hình RoBERTa
roberta_results = trainer_roberta.evaluate()
roberta_accuracy = roberta_results['eval_accuracy']

In [33]:
print("BERT:", bert_results)
print("RoBERTa:", roberta_results)

BERT: {'eval_loss': 0.0006004861206747591, 'eval_accuracy': 1.0, 'eval_runtime': 6.0315, 'eval_samples_per_second': 33.159, 'eval_steps_per_second': 2.155, 'epoch': 3.0}
RoBERTa: {'eval_loss': 0.00012144901847932488, 'eval_accuracy': 1.0, 'eval_runtime': 6.412, 'eval_samples_per_second': 31.192, 'eval_steps_per_second': 2.027, 'epoch': 3.0}
