<a href="https://colab.research.google.com/github/Naomie25/DI-Bootcamp/blob/main/Week7_Day2_DailyChallenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Setup

In [1]:
%pip install --quiet datasets evaluate transformers[sentencepiece]

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h

2. Load & Inspect Dataset :

In [2]:
from datasets import Dataset
from datasets import load_dataset
import pandas as pd

# Load the UCI SMS Spam dataset (sms_spam) from Hugging Face hub
df = pd.read_parquet("hf://datasets/ucirvine/sms_spam/plain_text/train-00000-of-00001.parquet")

full_dataset=Dataset.from_pandas(df)

# We'll use 4,000 for train, 1,000 for validation
train_ds = full_dataset.select(range(4000))
val_ds   = full_dataset.select(range(4000,5000))

# print the features of the train dataset. It should show 'sms' and 'label'
df.head()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Unnamed: 0,sms,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...\n,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


3. Tokenization :

In [3]:
from transformers import GPT2Tokenizer


model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# GPT-2 has no pad token by default—set it to eos
tokenizer.pad_token = tokenizer.eos_token

def tokenize_fn(examples):
    # returns input_ids, attention_mask; keep max_length small for SMS
    return tokenizer(
        examples["sms"],
        padding="max_length",
        truncation=True,
        max_length=64
    )
train_tok = train_ds.map(tokenize_fn, batched=True)
val_tok   = val_ds.map(tokenize_fn, batched=True)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

4. Model Initialization

In [4]:
import torch
from transformers import GPT2ForSequenceClassification

model = GPT2ForSequenceClassification.from_pretrained( # Load GPT-2 with sequence classification head
    model_name,
    num_labels=2,           # spam vs. ham
    pad_token_id=tokenizer.eos_token_id
)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


5. Metrics Definition

In [5]:
import evaluate
import numpy as np

accuracy  = evaluate.load("accuracy")
precision = evaluate.load("precision")  #  load precision metric
recall    = evaluate.load("recall")     # load recall metric
f1        = evaluate.load("f1")         # load F1-score metric

def compute_metrics(pred):
    logits, labels = pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy":  accuracy.compute(predictions=preds, references=labels)["accuracy"],
        "precision": precision.compute(predictions=preds, references=labels)["precision"], # apply the function used for accurracy but for precision
        "recall":    recall.compute(predictions=preds, references=labels)["recall"], # apply the function used for accurracy but for recall
        "f1":        f1.compute(predictions=preds, references=labels)["f1"]# apply the function used for accurracy but for F1
    }

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

6. Training Arguments Configuration

In [6]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",           # where to save checkpoints and logs
    do_train=True,                    # enable training
    do_eval=True,                     # enable evaluation during training
    eval_steps=500,                   # evaluate every 500 steps
    save_steps=500,                   # save checkpoint every 500 steps
    logging_dir="./logs",             # TensorBoard logs location
    logging_steps=500,                # log training metrics every 500 steps

    per_device_train_batch_size=8,   # batch size per GPU/CPU for training
    per_device_eval_batch_size=8,    # batch size per GPU/CPU for evaluation
    num_train_epochs=3,              # total number of training epochs
    learning_rate=5e-5,              # typical fine-tuning learning rate
    weight_decay=0.01,               # regularization to prevent overfitting

    report_to=None,                  # disable reporting (e.g., to WandB)
    save_total_limit=1,              # keep only the last checkpoint
)


In [7]:
from transformers import Trainer

# Train
trainer = Trainer(
    model=model,                        # GPT2ForSequenceClassification model
    args=training_args,                 # TrainingArguments defined earlier
    train_dataset=train_tok,           #tokenized training dataset
    eval_dataset=val_tok,              #  Tokenized validation dataset
    compute_metrics=compute_metrics,   # a function that returns accuracy, precision, etc.
)

trainer.train()

# Evaluate
metrics = trainer.evaluate()  # ✅ Runs evaluation on val_tok
print(metrics)




<IPython.core.display.Javascript object>

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mnaomiemarciano25[0m ([33mnaoteam[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,0.1392
1000,0.08
1500,0.0365


{'eval_loss': 0.04078679531812668, 'eval_accuracy': 0.995, 'eval_precision': 0.9926470588235294, 'eval_recall': 0.9712230215827338, 'eval_f1': 0.9818181818181818, 'eval_runtime': 3.9822, 'eval_samples_per_second': 251.115, 'eval_steps_per_second': 31.389, 'epoch': 3.0}
