In [5]:
!pip install transformers torch pandas scikit-learn sentencepiece
!pip install accelerate -U

import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm



In [10]:
df = pd.read_csv('saudi_spam_ham_dataset.csv')
df['label'] = df['label'].apply(lambda x: 1 if x == 'spam' else 0)
df = df[['text', 'label']]

train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label']
)

In [20]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [24]:
model_name = "asafaya/bert-mini-arabic"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

model.safetensors:   0%|          | 0.00/46.6M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at asafaya/bert-mini-arabic and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
def create_dataset(df, tokenizer, max_len=128):
    encodings = tokenizer(
        df['text'].tolist(),
        truncation=True,
        padding=True,
        max_length=max_len,
        return_tensors="pt"
    )
    return TensorDataset(
        encodings['input_ids'],
        encodings['attention_mask'],
        torch.tensor(df['label'].values)
    )

train_dataset = create_dataset(train_df, tokenizer)
test_dataset = create_dataset(test_df, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [26]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
EPOCHS = 3
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [27]:
for epoch in range(EPOCHS):
    print(f"--- Epoch {epoch + 1}/{EPOCHS} ---")
    model.train()
    total_loss = 0

    progress_bar = tqdm(train_loader, desc="Training")
    for batch in progress_bar:
        batch = [t.to(device) for t in batch]
        input_ids, attention_mask, labels = batch

        model.zero_grad()

        outputs = model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix({'training_loss': f'{loss.item():.3f}'})

    avg_train_loss = total_loss / len(train_loader)
    print(f"Average Training Loss: {avg_train_loss:.4f}")

--- Epoch 1/3 ---


Training: 100%|██████████| 414/414 [00:10<00:00, 40.82it/s, training_loss=0.005]


Average Training Loss: 0.1037
--- Epoch 2/3 ---


Training: 100%|██████████| 414/414 [00:09<00:00, 45.85it/s, training_loss=0.003]


Average Training Loss: 0.0055
--- Epoch 3/3 ---


Training: 100%|██████████| 414/414 [00:08<00:00, 48.20it/s, training_loss=0.002]

Average Training Loss: 0.0033





In [28]:
print("\n--- Evaluating ---")
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating"):
        batch = [t.to(device) for t in batch]
        input_ids, attention_mask, labels = batch

        outputs = model(
            input_ids,
            attention_mask=attention_mask
        )

        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).cpu().numpy()

        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())


--- Evaluating ---


Evaluating: 100%|██████████| 104/104 [00:00<00:00, 164.93it/s]


In [29]:
print("\n--- Classification Report ---")
print(classification_report(all_labels, all_preds, target_names=['ham', 'spam']))

accuracy = accuracy_score(all_labels, all_preds)
print(f"Accuracy: {accuracy:.4f}")


--- Classification Report ---
              precision    recall  f1-score   support

         ham       1.00      1.00      1.00       659
        spam       1.00      1.00      1.00       997

    accuracy                           1.00      1656
   macro avg       1.00      1.00      1.00      1656
weighted avg       1.00      1.00      1.00      1656

Accuracy: 1.0000


In [30]:
OUTPUT_DIR = "./spam_classifier_model"
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"\nModel and tokenizer saved to {OUTPUT_DIR}")


Model and tokenizer saved to ./spam_classifier_model


In [36]:
try:
    train_texts = set(train_df['text'].tolist())
    test_texts = set(test_df['text'].tolist())

    common_texts = train_texts.intersection(test_texts)

    if len(common_texts) > 0:
        print(f"WARNING: Found {len(common_texts)} overlapping messages between the training and testing sets!")
        print("-----------------------------------------------------------------")
        print("This is data leakage and likely explains an unrealistically high accuracy score.")
    else:
        print("Congratulations! No data leakage was found between the training and testing sets.")
        print("-----------------------------------------------------------------")
        print("This is a strong indicator that your data partition is clean and the model's performance is genuine.")

except NameError:
    print("Error: Please ensure that 'train_df' and 'test_df' variables are defined in your current session before running this code.")

Congratulations! No data leakage was found between the training and testing sets.
-----------------------------------------------------------------
This is a strong indicator that your data partition is clean and the model's performance is genuine.
