In [None]:
import pandas as pd
import numpy as np
import pickle
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, roc_auc_score
import warnings
warnings.filterwarnings('ignore')



In [None]:
# Load the pickled train/test split.
# NOTE: pickle.load can execute arbitrary code -- only open files you trust.
with open('data/train_test_split.pkl', 'rb') as f:
    data = pickle.load(f)

# Pull the four partitions out of the loaded mapping once the file is closed.
X_train, X_test, y_train, y_test = (
    data[key] for key in ('X_train', 'X_test', 'y_train', 'y_test')
)

# Load the previously saved results mapping; the BERT entry is added to it below.
with open('data/classical_results.pkl', 'rb') as f:
    classical = pickle.load(f)
results = classical['results']

print("Data loaded")

Data loaded


In [None]:
# Seed torch's RNG before building the model: the classification head created
# for num_labels=2 is not part of the pretrained checkpoint and is randomly
# initialized, so an unseeded run starts from different weights every time.
torch.manual_seed(42)

# Tokenizer and model must be loaded from the same checkpoint; naming it once
# keeps the two calls from drifting apart.
MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

In [None]:
class SMSDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Sequence of input texts. Anything exposing ``.tolist()``
            (pandas Series, NumPy array) or any plain iterable (list, tuple).
        labels: Sequence of integer class labels, aligned with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding='max_length',
            max_length=..., return_tensors='pt')`` that returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length each encoded sequence is truncated/padded to.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = self._to_list(texts)
        self.labels = self._to_list(labels)
        # Fail fast here instead of with an IndexError part-way through training.
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    @staticmethod
    def _to_list(values):
        """Return ``values`` as a plain list, using ``.tolist()`` when available."""
        return values.tolist() if hasattr(values, 'tolist') else list(values)

    def __len__(self):
        """Return the number of (text, label) pairs."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return model-ready tensors.

        Returns:
            dict with 1-D ``'input_ids'`` and ``'attention_mask'`` tensors and
            a scalar ``torch.long`` ``'labels'`` tensor.
        """
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {
            # return_tensors='pt' yields a leading batch dimension; flatten it
            # away so each item is a 1-D tensor.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [None]:
# Wrap each partition in an SMSDataset; indices are reset to 0..n-1 first.
train_dataset = SMSDataset(
    texts=X_train.reset_index(drop=True),
    labels=y_train.reset_index(drop=True),
    tokenizer=tokenizer,
)
test_dataset = SMSDataset(
    texts=X_test.reset_index(drop=True),
    labels=y_test.reset_index(drop=True),
    tokenizer=tokenizer,
)

In [None]:
# Fine-tuning configuration for the Trainer.
training_args = TrainingArguments(
    output_dir='./bert_sms_spam',
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    # Disable external experiment trackers. Left unset, an installed wandb
    # integration stops training with an interactive login prompt, which
    # blocks unattended "Restart & Run All" execution.
    report_to='none',
)

In [None]:
# Hold out part of the training data for the per-epoch evaluation. With
# load_best_model_at_end=True the Trainer picks its final checkpoint from the
# eval_dataset scores, so evaluating on test_dataset here would leak the test
# set into model selection and bias the test metrics reported afterwards.
VAL_FRACTION = 0.1
n_val = max(1, int(len(train_dataset) * VAL_FRACTION))
train_subset, val_subset = torch.utils.data.random_split(
    train_dataset,
    [len(train_dataset) - n_val, n_val],
    # Fixed generator seed so the split is identical on every run.
    generator=torch.Generator().manual_seed(42),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=val_subset,
)

trainer.train()

[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"
[34m[1mwandb[0m: Using W&B in offline mode.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,0.0237,0.049088
2,0.001,0.049724


TrainOutput(global_step=558, training_loss=0.05996016027759694, metrics={'train_runtime': 11869.9425, 'train_samples_per_second': 0.751, 'train_steps_per_second': 0.047, 'total_flos': 586342986869760.0, 'train_loss': 0.05996016027759694, 'epoch': 2.0})

In [None]:
# Run inference on the test dataset; .predictions carries the model's raw scores.
predictions = trainer.predict(test_dataset)

# Hard labels: index of the highest score in each row.
bert_pred = np.argmax(predictions.predictions, axis=1)

# Soft scores: softmax across the class axis, keeping the column for class index 1.
bert_pred_proba = (
    torch.tensor(predictions.predictions)
    .softmax(dim=1)[:, 1]
    .numpy()
)

# AUC is computed from the probabilities, accuracy from the hard labels.
bert_auc = roc_auc_score(y_test, bert_pred_proba)
bert_accuracy = accuracy_score(y_test, bert_pred)

# Store the BERT entry in the shared results mapping.
results['BERT'] = {
    'accuracy': bert_accuracy,
    'auc': bert_auc,
    'pred': bert_pred,
    'proba': bert_pred_proba,
}
print(f"BERT - Accuracy: {bert_accuracy:.4f}, AUC: {bert_auc:.4f}")

BERT - Accuracy: 0.9910, AUC: 0.9968


In [None]:
# Persist the results mapping together with the test labels in a single pickle.
with open('data/all_results.pkl', 'wb') as f:
    pickle.dump({'results': results, 'y_test': y_test}, f)

print("All results saved to data/all_results.pkl")

All results saved to data/all_results.pkl
