In [1]:
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import BertTokenizer
import pandas as pd
import numpy as np
from sklearn.utils import resample

In [2]:
print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("Device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")


Torch version: 2.5.1+cu121
CUDA available: True
Device name: NVIDIA GeForce RTX 3050 Laptop GPU


### `–ó–∞–≤–∞–Ω—Ç–∞–∂–µ–Ω–Ω—è –¥–∞—Ç–∞—Å–µ—Ç—É`

In [None]:
data = pd.read_csv("../data/spam.csv", encoding='latin-1')

data = data[['v1', 'v2']]
data.columns = ['label', 'message']

data['label'] = data['label'].map({'ham': 0, 'spam': 1})

print("–§–∞–π–ª –∑–∞–≤–∞–Ω—Ç–∞–∂–µ–Ω–æ!")
print("–†–æ–∑–º—ñ—Ä –¥–∞—Ç–∞—Å–µ—Ç—É:", data.shape)
print(data.sample(5))


–§–∞–π–ª –∑–∞–≤–∞–Ω—Ç–∞–∂–µ–Ω–æ!
–†–æ–∑–º—ñ—Ä –¥–∞—Ç–∞—Å–µ—Ç—É: (5572, 2)
      label                                            message
1011      0        I just got home babe, are you still awake ?
4833      0                      I hope your pee burns tonite.
4577      1  Urgent! call 09066350750 from your landline. Y...
2570      0                   From 5 to 2 only my work timing.
2354      0               R we going with the  &lt;#&gt;  bus?


pandas.core.frame.DataFrame

In [4]:
for a in data['label'].unique():
    print(f"–ö–ª–∞—Å {a} –º–∞—î {data[data['label'] == a].shape[0]} –ø—Ä–∏–∫–ª–∞–¥—ñ–≤") 

–ö–ª–∞—Å 0 –º–∞—î 4825 –ø—Ä–∏–∫–ª–∞–¥—ñ–≤
–ö–ª–∞—Å 1 –º–∞—î 747 –ø—Ä–∏–∫–ª–∞–¥—ñ–≤


### `–¢–æ–∫–µ–Ω—ñ–∑–∞—Ü—ñ—è`

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_data(texts, tokenizer, max_len=64):
    return tokenizer(texts.tolist(), padding=True, truncation=True, max_length=max_len, return_tensors="pt")

### `–ü—ñ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞—Ç–∞—Å–µ—Ç—É —Ç–∞ –±–∞–ª–∞–Ω—Å—É–≤–∞–Ω–Ω—è –∫–ª–∞—Å—ñ–≤`

In [6]:
majority = data[data['label'] == 0]  # ham
minority = data[data['label'] == 1]  # spam

print("–î–æ –±–∞–ª–∞–Ω—Å—É–≤–∞–Ω–Ω—è:")
print("ham:", len(majority), "| spam:", len(minority))

# Oversampling –º–µ–Ω—à–æ–≥–æ –∫–ª–∞—Å—É (spam)
minority_upsampled = resample(
    minority,
    replace=True,                # –∑ –ø–æ–≤—Ç–æ—Ä–µ–Ω–Ω—è–º
    n_samples=len(majority),     # –¥–æ –∫—ñ–ª—å–∫–æ—Å—Ç—ñ ham
    random_state=42
)

balanced_data = pd.concat([majority, minority_upsampled]).sample(frac=1, random_state=42).reset_index(drop=True)

balanced_data.to_csv('../data/balanced_data.csv', index=False)


print("\n–ü—ñ—Å–ª—è –±–∞–ª–∞–Ω—Å—É–≤–∞–Ω–Ω—è:")
print(balanced_data['label'].value_counts())

train_texts, test_texts, train_labels, test_labels = train_test_split(
    balanced_data['message'], balanced_data['label'], test_size=0.2, random_state=42, stratify=balanced_data['label']
)

train_encodings = tokenize_data(train_texts, tokenizer)
test_encodings = tokenize_data(test_texts, tokenizer)

class SpamDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels.iloc[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = SpamDataset(train_encodings, train_labels)
test_dataset = SpamDataset(test_encodings, test_labels)


–î–æ –±–∞–ª–∞–Ω—Å—É–≤–∞–Ω–Ω—è:
ham: 4825 | spam: 747

–ü—ñ—Å–ª—è –±–∞–ª–∞–Ω—Å—É–≤–∞–Ω–Ω—è:
label
0    4825
1    4825
Name: count, dtype: int64


### `–ê—Ä—Ö—ñ—Ç–µ–∫—Ç—É—Ä–∞ –º–æ–¥–µ–ª—ñ`
#### `PositionalEncoding, Embedding, TransformerEncoderLayer(2 —à–∞—Ä–∏), Linear`

In [7]:
# –ü–æ–∑–∏—Ü—ñ–π–Ω–µ –∫–æ–¥—É–≤–∞–Ω–Ω—è
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return x

class SimpleTransformerClassifier(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_layers=2, num_classes=2, dim_feedforward=256):

        super(SimpleTransformerClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            batch_first=False  # permute(1,0,2)
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(d_model, num_classes)
    
    def forward(self, input_ids):
        x = self.embedding(input_ids)
        x = self.pos_encoder(x)
        x = x.permute(1, 0, 2)  # (seq_len, batch, d_model)
        x = self.transformer(x)
        x = x.mean(dim=0)       
        out = self.fc(x)
        return out


### `–¢—Ä–µ–Ω—É–≤–∞–Ω–Ω—è –º–æ–¥–µ–ª—ñ —Ç–∞ —Ç—é–Ω—ñ–Ω–≥ –≥—ñ–ø–µ—Ä–ø–∞—Ä–∞–º–µ—Ç—Ä—ñ–≤`

In [None]:
from itertools import product

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)

d_models = [128, 256, 512]  
nheads = [2, 4, 8]           
num_epochs = 3

best_f1 = 0
best_params = None

for d_m, nh in product(d_models, nheads):
    print(f"\n-> –ù–∞–≤—á–∞–Ω–Ω—è –º–æ–¥–µ–ª—ñ –∑ –ø–∞—Ä–∞–º–µ—Ç—Ä–∞–º–∏ d_model={d_m}, nhead={nh}")
    model = SimpleTransformerClassifier(
        vocab_size=tokenizer.vocab_size,
        d_model=d_m,
        nhead=nh,
        num_layers=2
    ).to(device)

    optimizer = optim.Adam(model.parameters(), lr=1e-4)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"  Epoch {epoch+1}/{num_epochs}, avg loss = {total_loss / len(train_loader):.4f}")


-> –ù–∞–≤—á–∞–Ω–Ω—è –º–æ–¥–µ–ª—ñ –∑ –ø–∞—Ä–∞–º–µ—Ç—Ä–∞–º–∏ d_model=128, nhead=2




  Epoch 1/3, avg loss = 0.3016
  Epoch 2/3, avg loss = 0.1069


### `–û—Ü—ñ–Ω–∫–∞ —Ç–∞ –∑–±–µ—Ä–µ–∂–µ–Ω–Ω—è –º–æ–¥–µ–ª—ñ`

In [None]:
model.eval()
preds, true_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids)
        preds += torch.argmax(outputs, dim=1).cpu().tolist()
        true_labels += labels.cpu().tolist()

acc = accuracy_score(true_labels, preds)
prec = precision_score(true_labels, preds)
rec = recall_score(true_labels, preds)
f1 = f1_score(true_labels, preds)

print(f"  üîπ Acc: {acc:.5f} | Prec: {prec:.5f} | Rec: {rec:.5f} | F1: {f1:.5f}")

if f1 > best_f1:
    best_f1 = f1
    best_params = (d_m, nh)
    torch.save(model.state_dict(), "best_simple_transformer.pt")
    print("  ‚úÖ –ù–æ–≤—ñ –Ω–∞–π–∫—Ä–∞—â—ñ –≤–∞–≥–∏ –∑–±–µ—Ä–µ–∂–µ–Ω–æ!")

# —Ü–µ–π print —É–∂–µ –ø—ñ—Å–ª—è –≤—Å—ñ—Ö –ø–µ—Ä–µ–≤—ñ—Ä–æ–∫
print(f"\nüèÜ –ù–∞–π–∫—Ä–∞—â–∞ –∫–æ–º–±—ñ–Ω–∞—Ü—ñ—è: d_model={best_params[0]}, nhead={best_params[1]} –∑ F1={best_f1:.6f}")

  üîπ Acc: 0.99689 | Prec: 0.99382 | Rec: 1.00000 | F1: 0.99690
  ‚úÖ –ù–æ–≤—ñ –Ω–∞–π–∫—Ä–∞—â—ñ –≤–∞–≥–∏ –∑–±–µ—Ä–µ–∂–µ–Ω–æ!

üèÜ –ù–∞–π–∫—Ä–∞—â–∞ –∫–æ–º–±—ñ–Ω–∞—Ü—ñ—è: d_model=512, nhead=8 –∑ F1=0.996901
