In [31]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset
import time
import numpy as np

In [32]:
df = pd.read_csv('emails_en.csv')
df.head()

Unnamed: 0,index,email_from,data,label
0,0,['lehoangminh.ai.tech@gmail.com'],Unprecedented shocking promotion: shop without...,spam
1,1,['lehoangminh.ai.tech@gmail.com'],Get 5 million VND instantly with just a few si...,spam
2,2,['lehoangminh.ai.tech@gmail.com'],You have won the special prize from our promot...,spam
3,3,['lehoangminh.ai.tech@gmail.com'],Secret information just for you: a huge cash o...,spam
4,4,['lehoangminh.ai.tech@gmail.com'],Click the link to receive your prize instantly...,spam


In [33]:
label_encoder = LabelEncoder()
df['label_id'] = label_encoder.fit_transform(df['label'])
print(label_encoder.classes_) # tương đương các nhãn sau khi labelencode

['advertising' 'entertainment' 'friends' 'spam' 'study' 'work']


In [34]:
df.head()

Unnamed: 0,index,email_from,data,label,label_id
0,0,['lehoangminh.ai.tech@gmail.com'],Unprecedented shocking promotion: shop without...,spam,3
1,1,['lehoangminh.ai.tech@gmail.com'],Get 5 million VND instantly with just a few si...,spam,3
2,2,['lehoangminh.ai.tech@gmail.com'],You have won the special prize from our promot...,spam,3
3,3,['lehoangminh.ai.tech@gmail.com'],Secret information just for you: a huge cash o...,spam,3
4,4,['lehoangminh.ai.tech@gmail.com'],Click the link to receive your prize instantly...,spam,3


In [35]:
# Load tokenizer & model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=6) 

emails_split = []
labels_split = []

for i in range(len(df)):
    text = df.loc[i, "data"]
    label_id = df.loc[i, "label_id"]
    sentences = [s.strip() for s in text.split('.') if s.strip()]
    emails_split.extend(sentences)
    labels_split.extend([label_id] * len(sentences))

print("Tổng số câu sau khi tách:", len(emails_split))
print("Ví dụ:", emails_split[:3], labels_split[:3])

train_texts, test_texts, train_labels, test_labels = train_test_split(
    emails_split,
    labels_split,
    test_size=0.2,
    random_state=42,
    stratify=labels_split  # cân bằng theo nhãn
)

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)

class EmailDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = EmailDataset(train_encodings, train_labels)
test_dataset = EmailDataset(test_encodings, test_labels)

print(f"Train size: {len(train_dataset)}, Test size: {len(test_dataset)}")

# Thiết lập tham số huấn luyện
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=8,
    per_device_train_batch_size=8,
    logging_dir='./logs',
    logging_steps=8,
    weight_decay=0.05,
    no_cuda=False  
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)

# Train
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tổng số câu sau khi tách: 436
Ví dụ: ['Unprecedented shocking promotion: shop without paying upfront', 'You also get many attractive gifts when you register today', 'Limited quantity, so the offer is only for the fastest customers'] [3, 3, 3]
Train size: 348, Test size: 88


Step,Training Loss
10,1.8435
20,1.7776
30,1.776
40,1.7079
50,1.425
60,1.246
70,0.9658
80,0.8402
90,0.6876
100,0.4782


TrainOutput(global_step=440, training_loss=0.3271097704641182, metrics={'train_runtime': 52.6, 'train_samples_per_second': 66.16, 'train_steps_per_second': 8.365, 'total_flos': 73324284432480.0, 'train_loss': 0.3271097704641182, 'epoch': 10.0})

In [37]:
# Labels
label_names = ['advertising', 'entertainment', 'friends', 'spam', 'study', 'work']

# Load model & tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("./results/checkpoint-440")
model.eval()

# Test data
texts = [
    "Congratulations! You have been selected to receive a $1000 gift card.",
    "Huge weekend sale: 50% off all items.",
    "New action movie released this week with amazing reviews.",
    "Hey, let's meet this weekend for coffee and catch up.",
    "The final exam schedule has been posted.",
    "Reminder: project meeting at 9am tomorrow."
]

# ----------- Đo thời gian batch inference -------------
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=128)

start = time.time()
with torch.no_grad():
    outputs = model(**inputs)
end = time.time()

batch_time = end - start
avg_time_per_sample = batch_time / len(texts)

print(f"Batch size: {len(texts)}")
print(f"Tổng thời gian dự đoán: {batch_time:.4f} giây")
print(f"Thời gian trung bình mỗi sample: {avg_time_per_sample:.4f} giây")

# ----------- Đo thời gian từng sample riêng lẻ -------------
times = []
for t in texts:
    inp = tokenizer(t, return_tensors="pt", truncation=True, padding=True, max_length=128)
    start = time.time()
    with torch.no_grad():
        _ = model(**inp)
    end = time.time()
    times.append(end - start)

print(f"\nDự đoán từng sample:")
print(f"Thời gian trung bình mỗi sample: {sum(times)/len(times):.4f} giây")
print(f"Tốc độ cao nhất: {min(times):.4f} giây, chậm nhất: {max(times):.4f} giây")


Batch size: 6
Tổng thời gian dự đoán: 0.1192 giây
Thời gian trung bình mỗi sample: 0.0199 giây

Dự đoán từng sample:
Thời gian trung bình mỗi sample: 0.0309 giây
Tốc độ cao nhất: 0.0166 giây, chậm nhất: 0.0555 giây


In [38]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted", zero_division=0)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

predictions = trainer.predict(test_dataset)
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = predictions.label_ids

print("\n===== Classification Report =====")
print(classification_report(y_true, y_pred, target_names=label_encoder.classes_))

print("\n===== Confusion Matrix =====")
print(confusion_matrix(y_true, y_pred))


===== Classification Report =====
               precision    recall  f1-score   support

  advertising       0.83      0.71      0.77        14
entertainment       0.80      0.86      0.83        14
      friends       0.86      0.86      0.86        14
         spam       0.78      0.88      0.82        16
        study       0.87      0.87      0.87        15
         work       0.86      0.80      0.83        15

     accuracy                           0.83        88
    macro avg       0.83      0.83      0.83        88
 weighted avg       0.83      0.83      0.83        88


===== Confusion Matrix =====
[[10  0  1  2  0  1]
 [ 1 12  0  0  0  1]
 [ 0  2 12  0  0  0]
 [ 1  1  0 14  0  0]
 [ 0  0  1  1 13  0]
 [ 0  0  0  1  2 12]]
