# Lightweight Fine-Tuning Project

Description of choices in this soluton.

**PEFT technique:** LoRA (Low-Rank Adaptation) implemented through the Hugging Face PEFT framework.

**Model:** The pre-trained gpt2 model is adapted for sequence classification tasks using LoRA.

**Evaluation approach:** The Hugging Face Trainer class is used for evaluation.

**Fine-tuning dataset:** The fine-tuning is performed on the SMS Spam dataset from the Hugging Face datasets library.

## Loading and Evaluating a Foundation Model

TODO: In the cells below, load your chosen pre-trained Hugging Face model and evaluate its performance prior to fine-tuning. This step includes loading an appropriate tokenizer and dataset.

In [1]:
# Install the required version of datasets in case you have an older version
# You will need to choose "Kernel > Restart Kernel" from the menu after executing this cell
! pip install -q "datasets==2.15.0"

In [2]:
# Load the sms_spam dataset
# See: https://huggingface.co/datasets/sms_spam

from datasets import load_dataset

# The sms_spam dataset only has a train split, so we use the train_test_split method to split it into train and test
full_dataset = load_dataset("sms_spam", split="train")

# Use half the dataset
dataset = full_dataset.select(range(len(full_dataset) // 2)).train_test_split(
    test_size=0.2, shuffle=True, seed=23
)
#full_dataset=None

splits = ["train", "test"]

# View the dataset characteristics
dataset["train"]
dataset["test"]

Dataset({
    features: ['sms', 'label'],
    num_rows: 558
})

In [3]:
# Inspect the first example. Do you think this is spam or not?
print(dataset["train"][0])
print(dataset["test"][0])

print (f"train: {len(dataset['train'])}")
print (f"test:  {len(dataset['test'])}")

{'sms': "Haha good to hear, I'm officially paid and on the market for an 8th\n", 'label': 0}
{'sms': 'Sorry my roommates took forever, it ok if I come by now?\n', 'label': 0}
train: 2229
test:  558


In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Let's use a lambda function to tokenize all the examples
tokenized_dataset = {}
for split in splits:
    print (split)
    tokenized_dataset[split] = dataset[split].map(
        lambda x: tokenizer(x["sms"], truncation=True), batched=True
    )

# Inspect the available columns in the dataset
tokenized_dataset["train"]

train
test


Map:   0%|          | 0/558 [00:00<?, ? examples/s]

Dataset({
    features: ['sms', 'label', 'input_ids', 'attention_mask'],
    num_rows: 2229
})

In [5]:
from transformers import AutoModelForSequenceClassification

# Load the pre-trained gpt2 model
base_model = AutoModelForSequenceClassification.from_pretrained(
    "gpt2",
    num_labels=2,
    id2label={0: "not spam", 1: "spam"},
    label2id={"not spam": 0, "spam": 1},
)

# GPT-2 model uses the pad token
base_model.config.pad_token_id = tokenizer.pad_token_id

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
import numpy as np
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}

In [7]:
# Pretrained model Trainer
base_trainer = Trainer(
    model=base_model,
    args=TrainingArguments(
        output_dir="./data/base/spam_not_spam",
        learning_rate=5e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=1,
        weight_decay=0.01,
        load_best_model_at_end=True,
    ),
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)


In [8]:
# Evaluate the performance of pretrained model gpt2

In [9]:
pretrained_metrics = base_trainer.evaluate()
print(pretrained_metrics)

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 0.8761538863182068, 'eval_accuracy': 0.6810035842293907, 'eval_runtime': 2.8256, 'eval_samples_per_second': 197.48, 'eval_steps_per_second': 12.387}


In [10]:
#base_trainer = None

## Performing Parameter-Efficient Fine-Tuning

TODO: In the cells below, create a PEFT model from your loaded model, run a training loop, and save the PEFT model weights.

In [11]:
# Create a LoRA configuration

In [12]:
from peft import LoraConfig
# Define LoRA configuration
config = LoraConfig(
    r=8,  # Attention rank
    lora_alpha=32,  # Scaling factor
    target_modules=["c_fc", "c_proj"],  # Adjusted for gpt2
    lora_dropout=0.1,  # Dropout for regularization
    bias="none",  # No additional biases
    task_type="SEQ_CLS",  # Sequence classification task
)

In [13]:
# Convert the Base model to PEFT model

In [14]:
from peft import get_peft_model

# Apply LoRA configuration
model = get_peft_model(base_model, config)

# confirm that model is using parameter-efficient fine-turning
model.print_trainable_parameters()



trainable params: 887,808 || all params: 125,327,616 || trainable%: 0.7083897614393303


In [15]:
# GPT-2 model uses the pad token
model.config.pad_token_id = tokenizer.pad_token_id

In [16]:
print(model)

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): GPT2ForSequenceClassification(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): Conv1D()
              (c_proj): Linear(
                in_features=768, out_features=768, bias=True
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=768, bias=False)
                )
                (lora_embedding_A): ParameterDict()
 

In [17]:
import shutil
shutil.rmtree("./data/spam_not_spam")

In [18]:
# Train the model
trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="./data/spam_not_spam",
        learning_rate=5e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=1,
        weight_decay=0.01,
        load_best_model_at_end=True,
    ),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

# Run the training
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.063128,0.973118


TrainOutput(global_step=279, training_loss=0.2288560764763945, metrics={'train_runtime': 24.7451, 'train_samples_per_second': 90.078, 'train_steps_per_second': 11.275, 'total_flos': 59898880760832.0, 'train_loss': 0.2288560764763945, 'epoch': 1.0})

In [19]:
# Evaluate the model

In [20]:
results = trainer.evaluate()
print(f"Evaluation results: {results}")

Evaluation results: {'eval_loss': 0.06312833726406097, 'eval_accuracy': 0.9731182795698925, 'eval_runtime': 2.5522, 'eval_samples_per_second': 218.637, 'eval_steps_per_second': 13.714, 'epoch': 1.0}


###  ⚠️ IMPORTANT ⚠️

Due to workspace storage constraints, you should not store the model weights in the same directory but rather use `/tmp` to avoid workspace crashes which are irrecoverable.
Ensure you save it in /tmp always.

In [21]:
# Saving the model
model.save_pretrained("/tmp/lora_gpt2")

In [22]:
tokenizer.save_pretrained("/tmp/lora_gpt2")

('/tmp/lora_gpt2/tokenizer_config.json',
 '/tmp/lora_gpt2/special_tokens_map.json',
 '/tmp/lora_gpt2/vocab.json',
 '/tmp/lora_gpt2/merges.txt',
 '/tmp/lora_gpt2/added_tokens.json',
 '/tmp/lora_gpt2/tokenizer.json')

## Performing Inference with a PEFT Model

TODO: In the cells below, load the saved PEFT model weights and evaluate the performance of the trained PEFT model. Be sure to compare the results to the results from prior to fine-tuning.

In [23]:
# Reload saved LoRA weights
from peft import AutoPeftModelForSequenceClassification

In [24]:
# Load the PEFT model
inference_model = AutoPeftModelForSequenceClassification.from_pretrained(
    "/tmp/lora_gpt2",
    num_labels=2,
    id2label={0: "not spam", 1: "spam"},
    label2id={"not spam": 0, "spam": 1},
)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
tokenizer.pad_token = tokenizer.eos_token
inference_model = inference_model.to("cuda")
# GPT-2 model uses the pad token
inference_model.config.pad_token_id = tokenizer.pad_token_id

### Evaluate the inference_model

In [26]:
# Inference model Trainer
inference_trainer = Trainer(
    model=inference_model,
    args=TrainingArguments(
        output_dir="./data/inference/spam_not_spam",
        learning_rate=5e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=1,
        weight_decay=0.01,
        load_best_model_at_end=True,
    ),
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)


In [27]:
# Display the results of trainer of inference model

In [28]:
inference_results = inference_trainer.evaluate()
print(f"Evaluation inference results: {inference_results}")

Evaluation inference results: {'eval_loss': 0.06312833726406097, 'eval_accuracy': 0.9731182795698925, 'eval_runtime': 2.5692, 'eval_samples_per_second': 217.185, 'eval_steps_per_second': 13.623}


In [29]:
# Compare before and after reloading values

In [30]:
print(f"Before reload results: {results}")

Before reload results: {'eval_loss': 0.06312833726406097, 'eval_accuracy': 0.9731182795698925, 'eval_runtime': 2.5522, 'eval_samples_per_second': 218.637, 'eval_steps_per_second': 13.714, 'epoch': 1.0}


In [31]:
assert(inference_results==results,"Dont match")
print("Match")

Match


  assert(inference_results==results,"Dont match")


In [32]:
# Perform inference
import torch
from transformers import AutoTokenizer

# Ensure the model is on the GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
inference_model = inference_model.to(device)  # Move model to GPU

tokenizer = AutoTokenizer.from_pretrained("gpt2")
question="Is this message spam?"
inputs = tokenizer(question, return_tensors="pt")
inputs = {key: value.to(device) for key, value in inputs.items()}
outputs = inference_model(**inputs)
print(outputs)

SequenceClassifierOutputWithPast(loss=None, logits=tensor([[-0.4028, -2.3404]], device='cuda:0', grad_fn=<IndexBackward0>), past_key_values=((tensor([[[[-1.4879e+00,  1.9890e+00,  1.0759e+00,  ..., -5.7608e-01,
           -8.6697e-01,  1.4378e+00],
          [-1.9078e+00,  2.4359e+00,  1.6746e+00,  ..., -1.2324e+00,
           -1.5769e+00,  2.0689e+00],
          [-2.0367e+00,  2.4628e+00,  3.6078e-01,  ..., -9.7354e-01,
           -6.1322e-01,  2.4231e+00],
          [-1.7821e+00,  3.0103e+00,  1.6854e+00,  ..., -5.8921e-01,
           -1.8405e+00,  1.5849e+00],
          [-2.5361e+00,  2.9027e+00,  1.9364e+00,  ..., -1.0628e+00,
           -2.3543e+00,  1.9235e+00]],

         [[ 2.8395e-01, -1.2671e-01, -5.6960e-01,  ..., -8.9820e-01,
            1.1233e+00,  1.6004e+00],
          [-2.3522e-01, -9.6090e-01, -2.0498e+00,  ..., -1.1942e-01,
            4.0640e+00, -2.8279e-01],
          [-1.3925e+00, -1.2548e+00, -2.3218e-01,  ...,  5.4722e-01,
            5.4904e+00,  1.5760e+00],


In [33]:
# get logits
# get class
predicted_class = torch.argmax(outputs.logits, dim=-1).item()
answer = inference_model.config.id2label[predicted_class]

print (f"Result for input:{question} is {answer}")

Result for input:Is this message spam? is not spam


In [34]:
# Saving the model
inference_model.save_pretrained("./tmp/lora_gpt2")

In [35]:
#done.