1. Setup

In [17]:
%pip install --upgrade --quiet datasets
%pip install --upgrade --quiet evaluate

In [18]:
from datasets import load_dataset
import pandas as pd
from transformers import GPT2Tokenizer
import torch
from transformers import GPT2ForSequenceClassification
import evaluate
import numpy as np
from transformers import TrainingArguments
from transformers import Trainer


2. Load & Inspect Dataset

In [19]:
# load the dataset directly from the source
raw = load_dataset("ucirvine/sms_spam")

# This dataset has only a 'train' split
train_ds = raw["train"].select(range(4000))
val_ds = raw["train"].select(range(4000, 5000))

print(train_ds.features)

{'sms': Value('string'), 'label': ClassLabel(names=['ham', 'spam'])}


In [20]:
raw

DatasetDict({
    train: Dataset({
        features: ['sms', 'label'],
        num_rows: 5574
    })
})

In [30]:
# Filtrer les spams (label == 1)
spams = raw['train'].filter(lambda x: x['label'] == 1)

# Afficher les premiers spams
print(spams[:5])

Filter:   0%|          | 0/5574 [00:00<?, ? examples/s]

{'sms': ["Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\n", "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv\n", 'WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.\n', 'Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030\n', 'SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info\n'], 'label': [1, 1, 1, 1, 1]}


3. Tokenization

In [22]:
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Set pad token

def tokenize_fn(examples):
    return tokenizer(
        examples["sms"],
        padding="max_length",
        truncation=True,
        max_length=64
    )

train_tok = train_ds.map(tokenize_fn, batched=True)
val_tok = val_ds.map(tokenize_fn, batched=True)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

4. Model Initialization

In [23]:
model = GPT2ForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    pad_token_id=tokenizer.eos_token_id
)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


5. Metrics Definition

In [24]:
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

def compute_metrics(pred):
    logits, labels = pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy":  accuracy.compute(predictions=preds, references=labels)["accuracy"],
        "precision": precision.compute(predictions=preds, references=labels)["precision"],
        "recall":    recall.compute(predictions=preds, references=labels)["recall"],
        "f1":        f1.compute(predictions=preds, references=labels)["f1"]
    }

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

📌 Why track precision and recall alongside accuracy?

Accuracy can be misleading in imbalanced datasets. If spam only makes up 10% of messages, a model that always predicts "ham" can achieve 90% accuracy—yet it's useless.

    Precision tells us how many predicted spams are actually spam.

    Recall tells us how many actual spams were correctly identified.

📌 High accuracy but low recall?

The model is missing a lot of spam, even though it seems accurate overall. This may result in users receiving unwanted messages.

6. TrainingArguments Configuration

In [25]:
training_args = TrainingArguments(
    output_dir="./gpt2-spam-classifier",
    do_train=True,
    do_eval=True,
    eval_steps=500,
    save_steps=500,
    logging_dir="./logs",
    logging_steps=500,

    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,

    report_to=None,
    save_total_limit=1,
)

📌 What does weight_decay do?

It helps prevent overfitting by penalizing large weights.

    Use higher values if the model overfits quickly.

    Use lower values if the model underfits or struggles to learn.

7. Train & Evaluate

In [26]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    compute_metrics=compute_metrics,
)

trainer.train()

# Evaluate
metrics = trainer.evaluate()
print(metrics)



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mk_benyahia[0m ([33mk_benyahia-pstb[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,0.1485
1000,0.0309
1500,0.0062


{'eval_loss': 0.048849839717149734, 'eval_accuracy': 0.994, 'eval_precision': 1.0, 'eval_recall': 0.9568345323741008, 'eval_f1': 0.9779411764705882, 'eval_runtime': 3.9712, 'eval_samples_per_second': 251.813, 'eval_steps_per_second': 31.477, 'epoch': 3.0}


📌 Interpret your results:

Expect:

{"eval_loss": 0.05, "eval_accuracy": 0.994, "eval_precision": 1, "eval_recall": 0.96, "eval_f1": 0.98}

These would mean:

    The model performs very well.

    High precision: few false positives (ham marked as spam).

    High recall: it catches most of the actual spam.

    High F1-score: good balance.

In [31]:
# Répertoire de sauvegarde
save_directory = "./gpt2-spam-model"

# Sauvegarder le modèle
model.save_pretrained(save_directory)

# Sauvegarder le tokenizer
tokenizer.save_pretrained(save_directory)

('./gpt2-spam-model/tokenizer_config.json',
 './gpt2-spam-model/special_tokens_map.json',
 './gpt2-spam-model/vocab.json',
 './gpt2-spam-model/merges.txt',
 './gpt2-spam-model/added_tokens.json')

In [32]:
from transformers import GPT2ForSequenceClassification, GPT2Tokenizer

# Chemin vers le modèle sauvegardé
save_directory = "./gpt2-spam-model"

# Recharger modèle et tokenizer
model = GPT2ForSequenceClassification.from_pretrained(save_directory)
tokenizer = GPT2Tokenizer.from_pretrained(save_directory)


In [33]:

from transformers import TextClassificationPipeline

pipeline = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

Device set to use cuda:0


In [29]:
from transformers import TextClassificationPipeline

# Créer le mapping inverse : LABEL_0 → ham, LABEL_1 → spam
id2label = {0: "ham", 1: "spam"}

# Créer un pipeline de classification
pipeline = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    return_all_scores=False,
    device=0 if torch.cuda.is_available() else -1,
)

# Exemple de SMS à tester
sms_test = [
    "Congratulations! You've won a free iPhone. Click here to claim now!",
    "Salut, on se voit ce soir pour dîner ?",
    "You have been selected for a cash prize. Call now!",
]

# Prédiction et affichage lisible
for sms in sms_test:
    pred = pipeline(sms, top_k=1)[0]  # top_k=1 remplace return_all_scores=False
    label_id = int(pred["label"].split("_")[-1])  # Extraire le 0 ou 1 de "LABEL_0"
    label_name = id2label[label_id]  # Convertir en "ham"/"spam"
    print(f'SMS: "{sms}"\n → Prédiction: {label_name.upper()} (score: {pred["score"]:.4f})\n')

Device set to use cuda:0


SMS: "Congratulations! You've won a free iPhone. Click here to claim now!"
 → Prédiction: SPAM (score: 0.9925)

SMS: "Salut, on se voit ce soir pour dîner ?"
 → Prédiction: HAM (score: 0.9999)

SMS: "You have been selected for a cash prize. Call now!"
 → Prédiction: SPAM (score: 0.9972)

