In [None]:
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
!pip install transformers

In [None]:
import pandas as pd
import numpy as np
import torch
import random
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset
from tqdm.notebook import tqdm

In [None]:
class Config:
    """Central place for every tunable value used by the notebook."""

    # --- Paths and model checkpoint ---
    MODEL_NAME = "asafaya/bert-mini-arabic"      # Hugging Face checkpoint to fine-tune
    DATA_FILE = 'saudi_spam_ham_dataset.csv'     # CSV read by the data-loading cell
    OUTPUT_DIR = "./haseen-spam-classifier"      # checkpoints and exported model go here

    # --- Reproducibility ---
    SEED = 42                # passed to set_seed() and to both train_test_split calls

    # --- Tokenization ---
    MAX_LEN = 128            # max_length handed to the tokenizer

    # --- Batch sizes, one per DataLoader ---
    TRAIN_BATCH_SIZE = 16
    VALID_BATCH_SIZE = 32
    TEST_BATCH_SIZE = 32

    # --- Optimisation ---
    EPOCHS = 5               # upper bound on the number of training epochs
    LEARNING_RATE = 2e-5     # AdamW learning rate

    # --- Early stopping ---
    PATIENCE = 2             # consecutive non-improving epochs tolerated before stopping

def set_seed(seed_value):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and PyTorch
    (CPU, plus all CUDA devices when CUDA is available). It also switches
    cuDNN to deterministic mode with autotuning disabled, because seeding
    alone does not stop cuDNN from choosing non-deterministic kernels.

    Args:
        seed_value: Integer seed applied to all generators.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed_value)

    # Harmless on CPU-only machines; on GPU this trades a little speed for
    # run-to-run repeatability of convolution/attention kernels.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Fix all RNG seeds before any data splitting or model initialisation happens.
set_seed(Config.SEED)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Load the raw CSV; it must provide a 'text' column and a 'label' column.
raw_df = pd.read_csv(Config.DATA_FILE)

# Rows without text cannot be tokenized and rows without a label cannot be
# trained on, so drop them explicitly instead of letting a missing label be
# silently encoded as ham.
df = raw_df.dropna(subset=['text', 'label'])
n_dropped = len(raw_df) - len(df)
if n_dropped:
    print(f"Dropped {n_dropped} rows with missing text or label.")

# Normalise case and surrounding whitespace before comparing, so variants such
# as 'Spam' or ' spam' are not silently mapped to the ham class.
label_text = df['label'].astype(str).str.strip().str.lower()

# Final frame: text as str, label encoded as 1 (spam) / 0 (everything else).
df = pd.DataFrame({
    'text': df['text'].astype(str),
    'label': (label_text == 'spam').astype(int),
}).reset_index(drop=True)

In [None]:
# Step 1: hold out 20% of the full dataset as the test set, keeping the
# spam/ham ratio intact via stratification.
train_val_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=Config.SEED,
)

# Step 2: carve a validation set out of the remaining 80% (20% of it, i.e.
# 16% of the full dataset); the rest is the training set.
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.2,
    stratify=train_val_df['label'],
    random_state=Config.SEED,
)

print(f"Data split: Train={len(train_df)}, Validation={len(val_df)}, Test={len(test_df)}")

Data split: Train=5305, Validation=1327, Test=1659


In [None]:
# The tokenizer and the classifier are both loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(Config.MODEL_NAME)

# Two output labels (ham / spam); the model is moved to the selected device
# right away so the training cell can use it directly.
model = AutoModelForSequenceClassification.from_pretrained(
    Config.MODEL_NAME,
    num_labels=2,
).to(device)
print(f"Model '{Config.MODEL_NAME}' loaded successfully.")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at asafaya/bert-mini-arabic and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model 'asafaya/bert-mini-arabic' loaded successfully.


In [None]:
def create_dataset(df, tokenizer, max_len):
    """Tokenize a split and bundle it with its labels as a ``TensorDataset``.

    Args:
        df: DataFrame with a 'text' column and a 'label' column.
        tokenizer: Callable tokenizer; invoked once on the full list of texts
            with truncation and padding enabled and PyTorch tensors requested.
        max_len: Value forwarded to the tokenizer as ``max_length``.

    Returns:
        TensorDataset whose items are ``(input_ids, attention_mask, label)``.
    """
    texts = df['text'].tolist()
    batch = tokenizer(
        texts,
        max_length=max_len,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
    label_tensor = torch.tensor(df['label'].values)
    return TensorDataset(batch['input_ids'], batch['attention_mask'], label_tensor)

# Tokenize each split once up front so the loaders only serve ready tensors.
train_dataset = create_dataset(df=train_df, tokenizer=tokenizer, max_len=Config.MAX_LEN)
val_dataset = create_dataset(df=val_df, tokenizer=tokenizer, max_len=Config.MAX_LEN)
test_dataset = create_dataset(df=test_df, tokenizer=tokenizer, max_len=Config.MAX_LEN)

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(
    train_dataset, shuffle=True, batch_size=Config.TRAIN_BATCH_SIZE
)
val_loader = DataLoader(
    val_dataset, shuffle=False, batch_size=Config.VALID_BATCH_SIZE
)
test_loader = DataLoader(
    test_dataset, shuffle=False, batch_size=Config.TEST_BATCH_SIZE
)
print("DataLoaders created.")

DataLoaders created.


In [None]:
# Fine-tune the classifier with AdamW and a linearly decaying learning rate,
# keep the checkpoint with the lowest validation loss, and stop early once the
# validation loss has failed to improve for Config.PATIENCE consecutive epochs.
# NOTE: this cell continues from the model's current weights; re-run the
# model-loading cell first to restart training from the pretrained checkpoint.
optimizer = torch.optim.AdamW(model.parameters(), lr=Config.LEARNING_RATE)
total_steps = len(train_loader) * Config.EPOCHS  # one scheduler step per batch
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)

best_val_loss = float('inf')
epochs_no_improve = 0
os.makedirs(Config.OUTPUT_DIR, exist_ok=True)
best_model_path = os.path.join(Config.OUTPUT_DIR, "best_model.pt")

for epoch in range(Config.EPOCHS):
    # ---- Training pass ----
    model.train()
    total_train_loss = 0.0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1} [Train]", leave=False):
        optimizer.zero_grad()
        input_ids, attention_mask, labels = [t.to(device) for t in batch]
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        # Cap the gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        # The reported loss is a per-batch mean; weight it by the batch size so
        # the short final batch does not count as much as a full one.
        total_train_loss += loss.item() * labels.size(0)

    avg_train_loss = total_train_loss / len(train_loader.dataset)

    # ---- Validation pass (no gradients, dropout disabled) ----
    model.eval()
    total_val_loss = 0.0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch + 1} [Val]", leave=False):
            input_ids, attention_mask, labels = [t.to(device) for t in batch]
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            # Same sample weighting as above: this value drives checkpointing
            # and early stopping, so it should be a true per-sample average.
            total_val_loss += outputs.loss.item() * labels.size(0)

    avg_val_loss = total_val_loss / len(val_loader.dataset)
    print(f"Epoch {epoch + 1}/{Config.EPOCHS} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

    # ---- Checkpointing and early stopping ----
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        epochs_no_improve = 0
        torch.save(model.state_dict(), best_model_path)
        print("-> Validation loss improved, saving best model.")
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= Config.PATIENCE:
            print(f"-> Early stopping triggered after {Config.PATIENCE} epochs with no improvement.")
            break

Epoch 1 [Train]:   0%|          | 0/332 [00:00<?, ?it/s]

Epoch 1 [Val]:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch 1/5 | Train Loss: 0.1176 | Val Loss: 0.0149
-> Validation loss improved, saving best model.


Epoch 2 [Train]:   0%|          | 0/332 [00:00<?, ?it/s]

Epoch 2 [Val]:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch 2/5 | Train Loss: 0.0052 | Val Loss: 0.0147
-> Validation loss improved, saving best model.


Epoch 3 [Train]:   0%|          | 0/332 [00:00<?, ?it/s]

Epoch 3 [Val]:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch 3/5 | Train Loss: 0.0030 | Val Loss: 0.0118
-> Validation loss improved, saving best model.


Epoch 4 [Train]:   0%|          | 0/332 [00:00<?, ?it/s]

Epoch 4 [Val]:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch 4/5 | Train Loss: 0.0015 | Val Loss: 0.0165


Epoch 5 [Train]:   0%|          | 0/332 [00:00<?, ?it/s]

Epoch 5 [Val]:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch 5/5 | Train Loss: 0.0013 | Val Loss: 0.0132
-> Early stopping triggered after 2 epochs with no improvement.


In [None]:
# Score the held-out test set using the checkpoint with the lowest validation
# loss (not the weights left in memory after the final training epoch).
print("\n--- Evaluating on Test Set with Best Model ---")
# map_location lets a checkpoint saved from a GPU run load on whatever device
# this session is using, including a CPU-only machine.
model.load_state_dict(torch.load(best_model_path, map_location=device))
model.eval()

all_preds = []
all_labels = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing"):
        input_ids, attention_mask, labels = [t.to(device) for t in batch]
        # Labels are not passed to the model here: only logits are needed.
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())

print("\n--- Final Classification Report ---")
# labels=[0, 1] pins the class order so that 'ham' always names class 0 and
# 'spam' class 1, even if one class is absent from the predictions or targets.
print(classification_report(all_labels, all_preds, labels=[0, 1], target_names=['ham', 'spam']))
print(f"Final Test Accuracy: {accuracy_score(all_labels, all_preds):.4f}")


--- Evaluating on Test Set with Best Model ---


Testing:   0%|          | 0/52 [00:00<?, ?it/s]


--- Final Classification Report ---
              precision    recall  f1-score   support

         ham       1.00      1.00      1.00       661
        spam       1.00      1.00      1.00       998

    accuracy                           1.00      1659
   macro avg       1.00      1.00      1.00      1659
weighted avg       1.00      1.00      1.00      1659

Final Test Accuracy: 0.9994


In [None]:
# Export the model and its tokenizer side by side in one directory, using
# their save_pretrained() methods.
final_model_dir = os.path.join(Config.OUTPUT_DIR, "final_model_for_deployment")
for artifact in (model, tokenizer):
    artifact.save_pretrained(final_model_dir)
print(f"\nFinal model and tokenizer for deployment saved to: {final_model_dir}")


Final model and tokenizer for deployment saved to: ./haseen-spam-classifier/final_model_for_deployment
