In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
import time

In [4]:
df = pd.read_csv('30_emails_en.csv')
df.head()

Unnamed: 0,index,email_from,data,label
0,0,['lehoangminh.ai.tech@gmail.com'],Unprecedented shocking promotion: shop without...,spam
1,1,['lehoangminh.ai.tech@gmail.com'],Get 5 million VND instantly with just a few si...,spam
2,2,['lehoangminh.ai.tech@gmail.com'],You have won the special prize from our promot...,spam
3,3,['lehoangminh.ai.tech@gmail.com'],Secret information just for you: a huge cash o...,spam
4,4,['lehoangminh.ai.tech@gmail.com'],Click the link to receive your prize instantly...,spam


In [5]:
label_encoder = LabelEncoder()
df['label_encode'] = label_encoder.fit_transform(df['label'])
print(label_encoder.classes_) # tương đương các nhãn sau khi labelencode

['advertising' 'entertainment' 'friends' 'spam' 'study' 'work']


In [5]:
df.head()

Unnamed: 0,index,email_from,data,label,label_encode
0,0,['lehoangminh.ai.tech@gmail.com'],Unprecedented shocking promotion: shop without...,spam,3
1,1,['lehoangminh.ai.tech@gmail.com'],Get 5 million VND instantly with just a few si...,spam,3
2,2,['lehoangminh.ai.tech@gmail.com'],You have won the special prize from our promot...,spam,3
3,3,['lehoangminh.ai.tech@gmail.com'],Secret information just for you: a huge cash o...,spam,3
4,4,['lehoangminh.ai.tech@gmail.com'],Click the link to receive your prize instantly...,spam,3


In [None]:
# Load tokenizer & model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=6) 

# Email
emails_split = []
label = []
real_label = label_encoder.classes_
for id in range(len(df)): 
    email_text = df.iloc[id, 2].split('.')
    label_item = df.iloc[id, 4]
    label.extend([label_item] * len(email_text))
    emails_split += email_text

labels = torch.tensor(label, dtype=torch.long)
# Tokenize
inputs = tokenizer(emails_split, padding=True, truncation=True, return_tensors="pt")

# Forward
# outputs = model(**inputs, labels=torch.tensor(labels))
# Chuẩn bị dataset cho Trainer

class EmailDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Tạo dataset
train_dataset = EmailDataset(inputs, labels)

# Thiết lập tham số huấn luyện
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=20,
    per_device_train_batch_size=8,
    logging_dir='./logs',
    logging_steps=20,
    no_cuda=True  
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)

# Train
trainer.train()
# loss = outputs.loss
# print(loss, logits)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item['labels'] = torch.tensor(self.labels[idx])


Step,Training Loss
20,1.7964
40,1.5256
60,1.2148
80,0.8939
100,0.623
120,0.5753
140,0.3402
160,0.5015
180,0.4068
200,0.3835


TrainOutput(global_step=380, training_loss=0.6154835688440423, metrics={'train_runtime': 284.0694, 'train_samples_per_second': 10.49, 'train_steps_per_second': 1.338, 'total_flos': 36754645518720.0, 'train_loss': 0.6154835688440423, 'epoch': 20.0})

In [15]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Các sample test
sample_list = [
    ("Congratulations! You have been selected to receive a $1000 gift card. Click the link below to claim your prize now. Hurry, this offer expires soon!", "spam"),
    ("Huge weekend sale: 50% off all items. Visit our store today to grab the best deals!", "advertising"),
    ("New action movie released this week with amazing reviews. Don't miss it!", "entertainment"),
    ("Hey, let's meet this weekend for coffee and catch up.", "friends"),
    ("The final exam schedule has been posted. Please check the portal and prepare accordingly.", "study"),
    ("Reminder: project meeting at 9am tomorrow in the main conference room.", "work")
]

texts = [t for t, _ in sample_list]
true_labels = [lbl for _, lbl in sample_list]

# Label mapping
label_names = ['advertising', 'entertainment', 'friends', 'spam', 'study', 'work']

# Load model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("./results/checkpoint-380")
model.eval()

# Tokenize
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=128)

with torch.no_grad():
    outputs = model(**inputs)
    probs = torch.softmax(outputs.logits, dim=-1)
    preds = torch.argmax(probs, dim=-1).tolist()

# Map số → tên nhãn
pred_labels = [label_names[i] for i in preds]

# In kết quả
for text, pred, true in zip(texts, pred_labels, true_labels):
    print(f"\nEmail: {text[:60]}...")
    print(f"Predicted: {pred} | True: {true}")



Email: Congratulations! You have been selected to receive a $1000 g...
Predicted: spam | True: spam

Email: Huge weekend sale: 50% off all items. Visit our store today ...
Predicted: advertising | True: advertising

Email: New action movie released this week with amazing reviews. Do...
Predicted: entertainment | True: entertainment

Email: Hey, let's meet this weekend for coffee and catch up....
Predicted: friends | True: friends

Email: The final exam schedule has been posted. Please check the po...
Predicted: study | True: study

Email: Reminder: project meeting at 9am tomorrow in the main confer...
Predicted: work | True: work


In [None]:
# Labels
label_names = ['advertising', 'entertainment', 'friends', 'spam', 'study', 'work']

# Load model & tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("./results/checkpoint-380")
model.eval()

# Test data
texts = [
    "Congratulations! You have been selected to receive a $1000 gift card.",
    "Huge weekend sale: 50% off all items.",
    "New action movie released this week with amazing reviews.",
    "Hey, let's meet this weekend for coffee and catch up.",
    "The final exam schedule has been posted.",
    "Reminder: project meeting at 9am tomorrow."
]

# ----------- Đo thời gian batch inference -------------
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=128)

start = time.time()
with torch.no_grad():
    outputs = model(**inputs)
end = time.time()

batch_time = end - start
avg_time_per_sample = batch_time / len(texts)

print(f"Batch size: {len(texts)}")
print(f"Tổng thời gian dự đoán: {batch_time:.4f} giây")
print(f"Thời gian trung bình mỗi sample: {avg_time_per_sample:.4f} giây")

# ----------- Đo thời gian từng sample riêng lẻ -------------
times = []
for t in texts:
    inp = tokenizer(t, return_tensors="pt", truncation=True, padding=True, max_length=128)
    start = time.time()
    with torch.no_grad():
        _ = model(**inp)
    end = time.time()
    times.append(end - start)

print(f"\nDự đoán từng sample:")
print(f"Thời gian trung bình mỗi sample: {sum(times)/len(times):.4f} giây")
print(f"Tốc độ cao nhất: {min(times):.4f} giây, chậm nhất: {max(times):.4f} giây")


Batch size: 6
Tổng thời gian dự đoán: 0.1007 giây
Thời gian trung bình mỗi sample: 0.0168 giây

Dự đoán từng sample:
Thời gian trung bình mỗi sample: 0.0242 giây
Tốc độ cao nhất: 0.0155 giây, chậm nhất: 0.0299 giây
