In [None]:
import pandas as pd
# Path to the IMDB reviews CSV. This is a machine-specific absolute path; adjust it
# when running the notebook anywhere else.
DATASET_CSV = r'C:\Users\nguye\Downloads\sentiment-analysis-svm-distilbert\imdb_dataset.csv'

# Load the full dataset and preview the first rows.
df = pd.read_csv(DATASET_CSV)
df.head()

Unnamed: 0,text,label,type
0,Story of a man who has unnatural feelings for ...,0,train
1,Airport '77 starts as a brand new luxury 747 p...,0,train
2,This film lacked something I couldn't put my f...,0,train
3,"Sorry everyone,,, I know this is supposed to b...",0,train
4,When I was little my parents took me along to ...,0,train


In [None]:
import re
import html
import unicodedata

def clean_text_bert(text):
    """Normalize a raw review string for an uncased BERT-style tokenizer.

    Steps, in order:
      1. Replace HTML tags (anything matching ``<...>``) with a space.
      2. Decode HTML entities (``&amp;`` -> ``&``, ``&lt;`` -> ``<``, ...).
      3. Apply Unicode NFKC normalization.
      4. Collapse whitespace runs into single spaces and trim both ends.
      5. Collapse a run of the same ``,``, ``.``, ``!`` or ``?`` into one character.
      6. Lowercase the result.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``.
            ``None`` and float NaN (how pandas represents a missing cell) are
            treated as empty text.

    Returns:
        str: The cleaned, lowercased text (``""`` for missing input).
    """
    # Missing values would otherwise be stringified to "none" / "nan" and fed to
    # the model as if they were real review text. NaN is the only float != itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text)
    # Strip tags BEFORE decoding entities: decoding first turns an escaped
    # "&lt; ... &gt;" in ordinary text into "<...>", which the tag pattern would
    # then delete together with everything between the two signs.
    text = re.sub(r'<.*?>', ' ', text)
    text = html.unescape(text)
    text = unicodedata.normalize("NFKC", text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'([,.!?])\1+', r'\1', text)  # collapse repeated punctuation
    text = text.lower()  # appropriate when using an uncased model such as bert-base-uncased
    return text


In [None]:
# Clean every review; the result is a Series aligned with df's index.
bert_data = df['text'].map(clean_text_bert)

In [None]:
# Spot-check the cleaned reviews (for a long Series pandas prints a truncated head/tail view).
print(bert_data)

0        story of a man who has unnatural feelings for ...
1        airport '77 starts as a brand new luxury 747 p...
2        this film lacked something i couldn't put my f...
3        sorry everyone, i know this is supposed to be ...
4        when i was little my parents took me along to ...
                               ...                        
49995    i was extraordinarily impressed by this film. ...
49996    although i'm not a golf fan, i attended a snea...
49997    from the start of "the edge of love", the view...
49998    this movie, with all its complexity and subtle...
49999    i've seen this story before but my kids haven'...
Name: text, Length: 50000, dtype: object


In [None]:
# Pair each cleaned review with its original label; both Series share df's index.
bert_df = pd.DataFrame(dict(text=bert_data, label=df['label']))


In [None]:
# Preview the first rows of the cleaned text/label frame.
bert_df.head()

Unnamed: 0,text,label
0,story of a man who has unnatural feelings for ...,0
1,airport '77 starts as a brand new luxury 747 p...,0
2,this film lacked something i couldn't put my f...,0
3,"sorry everyone, i know this is supposed to be ...",0
4,when i was little my parents took me along to ...,0


In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install transformers scikit-learn tqdm


Looking in indexes: https://download.pytorch.org/whl/cu121


In [None]:

from sklearn.model_selection import StratifiedKFold
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
import torch
import numpy as np
from tqdm import tqdm
from torch.optim import AdamW
from transformers.optimization import get_linear_schedule_with_warmup

# ---------------------------------------------------------------------------
# Stratified k-fold cross-validation of a DistilBERT sequence classifier.
#
# For every fold a fresh pretrained model is fine-tuned on the training split
# and scored on the held-out split; per-fold accuracies are collected in
# `accuracies` and summarised at the end.
# ---------------------------------------------------------------------------

# Tunable settings, defined once so that related values cannot drift apart
# (e.g. the scheduler length and the epoch loop must use the same epoch count).
MODEL_NAME = 'distilbert-base-uncased'
SEED = 42
N_SPLITS = 5
NUM_EPOCHS = 3
BATCH_SIZE = 16
MAX_LENGTH = 128        # inputs are truncated / padded to at most this many tokens
LEARNING_RATE = 2e-5

tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
accuracies = []

for fold, (train_idx, val_idx) in enumerate(skf.split(bert_df['text'], bert_df['label'])):
    print(f"\n===== Fold {fold+1} =====")

    # Seed torch per fold: the fold split is already seeded, but the newly
    # initialised classifier head and the DataLoader shuffling draw from torch's
    # global RNG and would otherwise differ on every run.
    torch.manual_seed(SEED + fold)

    X_train, X_val = bert_df['text'].iloc[train_idx], bert_df['text'].iloc[val_idx]
    y_train, y_val = bert_df['label'].iloc[train_idx], bert_df['label'].iloc[val_idx]

    # Tokenize both splits up front (padded tensors, truncated to MAX_LENGTH)
    train_enc = tokenizer(list(X_train), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='pt')
    val_enc   = tokenizer(list(X_val), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='pt')

    # Dataset & DataLoader. Labels are cast to long explicitly so a label column
    # that was parsed as another dtype still yields integer class indices.
    train_dataset = torch.utils.data.TensorDataset(
        train_enc['input_ids'], train_enc['attention_mask'],
        torch.tensor(y_train.values, dtype=torch.long)
    )
    val_dataset = torch.utils.data.TensorDataset(
        val_enc['input_ids'], val_enc['attention_mask'],
        torch.tensor(y_val.values, dtype=torch.long)
    )

    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE)

    # Model + optimizer: start every fold from the same pretrained checkpoint
    model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    # The scheduler is stepped once per batch, so it spans batches-per-epoch * epochs
    total_steps = len(train_loader) * NUM_EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    # Training
    for epoch in range(NUM_EPOCHS):
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            b_input_ids, b_mask, b_labels = [t.to(device) for t in batch]
            optimizer.zero_grad()

            # Passing `labels` makes the model return the loss alongside the logits
            outputs = model(input_ids=b_input_ids, attention_mask=b_mask, labels=b_labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1} | Loss: {total_loss/len(train_loader):.4f}")

    # Evaluation on the held-out split
    model.eval()
    preds, true_labels = [], []

    with torch.no_grad():
        for batch in val_loader:
            b_input_ids, b_mask, b_labels = [t.to(device) for t in batch]
            outputs = model(input_ids=b_input_ids, attention_mask=b_mask)
            logits = outputs.logits  # raw class scores (before softmax)
            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())  # predicted labels
            true_labels.extend(b_labels.cpu().numpy())  # ground-truth labels

    # Fraction of predictions that match the true label; stored as a plain float
    # so the summary list prints as numbers rather than np.float64(...) wrappers.
    acc = float(np.mean(np.array(preds) == np.array(true_labels)))
    print(f" Fold {fold+1} Accuracy: {acc:.4f}")
    accuracies.append(acc)
print("\n Accuracy từng fold:", accuracies)
print(" Accuracy trung bình:", np.mean(accuracies))



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]


===== Fold 1 =====


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 2500/2500 [07:30<00:00,  5.55it/s]


Epoch 1 | Loss: 0.3230


Epoch 2: 100%|██████████| 2500/2500 [07:35<00:00,  5.49it/s]


Epoch 2 | Loss: 0.1947


Epoch 3: 100%|██████████| 2500/2500 [07:35<00:00,  5.49it/s]


Epoch 3 | Loss: 0.1089
 Fold 1 Accuracy: 0.8977

===== Fold 2 =====


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 2500/2500 [07:34<00:00,  5.50it/s]


Epoch 1 | Loss: 0.3216


Epoch 2: 100%|██████████| 2500/2500 [07:35<00:00,  5.49it/s]


Epoch 2 | Loss: 0.1921


Epoch 3: 100%|██████████| 2500/2500 [07:35<00:00,  5.49it/s]


Epoch 3 | Loss: 0.1034
 Fold 2 Accuracy: 0.8882

===== Fold 3 =====


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 2500/2500 [07:34<00:00,  5.49it/s]


Epoch 1 | Loss: 0.3247


Epoch 2: 100%|██████████| 2500/2500 [07:35<00:00,  5.49it/s]


Epoch 2 | Loss: 0.1936


Epoch 3: 100%|██████████| 2500/2500 [07:35<00:00,  5.49it/s]


Epoch 3 | Loss: 0.1075
 Fold 3 Accuracy: 0.8888

===== Fold 4 =====


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 2500/2500 [07:34<00:00,  5.50it/s]


Epoch 1 | Loss: 0.3196


Epoch 2: 100%|██████████| 2500/2500 [07:35<00:00,  5.49it/s]


Epoch 2 | Loss: 0.1946


Epoch 3: 100%|██████████| 2500/2500 [07:35<00:00,  5.49it/s]


Epoch 3 | Loss: 0.1050
 Fold 4 Accuracy: 0.8981

===== Fold 5 =====


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 2500/2500 [07:34<00:00,  5.50it/s]


Epoch 1 | Loss: 0.3181


Epoch 2: 100%|██████████| 2500/2500 [07:35<00:00,  5.49it/s]


Epoch 2 | Loss: 0.1899


Epoch 3: 100%|██████████| 2500/2500 [07:35<00:00,  5.49it/s]


Epoch 3 | Loss: 0.1020
 Fold 5 Accuracy: 0.8945

 Accuracy từng fold: [np.float64(0.8977), np.float64(0.8882), np.float64(0.8888), np.float64(0.8981), np.float64(0.8945)]
 Accuracy trung bình: 0.8934599999999999


In [None]:
# Mount Google Drive at /content/drive. The google.colab module is imported here,
# so this cell is expected to run in a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Disabled save step: the code below is wrapped in a string literal, so nothing is
# executed -- the cell only evaluates the string and echoes it as its output.
# NOTE(review): if this is re-enabled after the cross-validation cell, `model` would
# be the model from the last fold that ran (trained on that fold's training split
# only), not a model fitted on the whole dataset -- confirm that is what should be saved.
'''save_path = "/content/drive/MyDrive/bert_sentiment_model"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print(" Model đã được lưu vào Google Drive:", save_path)'''


'save_path = "/content/drive/MyDrive/bert_sentiment_model"\nmodel.save_pretrained(save_path)\ntokenizer.save_pretrained(save_path)\nprint(" Model đã được lưu vào Google Drive:", save_path)'