## Logging into huggingface

In [1]:
# login to huggingface snippet

from huggingface_hub import login



login("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")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Enabling GPU usage

In [2]:
import torch

# Check if a GPU is available

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

print(f"Using device: {device}")

Using device: cuda


# Required Imports

In [3]:
from transformers import (

    AutoTokenizer,

    AutoModelForSequenceClassification,

    Trainer,

    TrainingArguments,

    DataCollatorWithPadding,

)



from datasets import load_dataset

import numpy as np

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Loading the Model and the Model's Tokenizer

In [None]:


model_name = "meta-llama/Llama-3.2-1B"

dataset = load_dataset("glue", "sst2")

tokenizer = AutoTokenizer.from_pretrained(model_name)



# loading model - the Llama-3.2-1B model is not meant for classification, so the AutoModelForSequenceClassification adds a classification head to the loaded model

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)



print("Model (after adding classification head) and tokenizer loaded.")

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model (after adding classification head) and tokenizer loaded.


## Setting the tokenizer padding manually

In [6]:
if tokenizer.pad_token is None:

    tokenizer.pad_token = tokenizer.eos_token  # Use the eos_token as the pad_token

    model.resize_token_embeddings(len(tokenizer))  # Resize embeddings to accommodate new tokens



print("Tokenizer padding set.")

Tokenizer padding set.


# Calculating Total Model Parameters Before Adding the Classication Layer and Fine-tuning:

### Without the classification head: The LLaMA model, being a language model, is designed to predict the next token in a sequence, not to directly output class labels for tasks like sentiment analysis. To fine-tune LLaMA for classification tasks, such as SST-2, a classification head (a linear layer) must be added to the model. This head maps the contextual embeddings generated by the LLaMA model (specifically from the [CLS] token or its equivalent) into logits corresponding to the target classes. Without this classification layer, the model outputs sequence-level embeddings that are not suitable for computing classification loss or making predictions. Fine-tuning the model with this added layer allows it to adapt its parameters to the classification task, making it capable of generating the necessary class logits for tasks like sentiment analysis.

### Without adding this layer, we cannot use the model for a classification task. Hence, for this assignment, when we are comparing the number of parameters before and after fine-tuning, we are considering the model WITHOUT adding the classification layer, and with the classification layer. Fine-tuning by itself does not change the model's parameters. But adding the classification layer adds to the number of parameters, which is needed for fine-tuning the model on classification tasks. Hence, we compared the total model parameters of the base model (calculated below) and the parameters of the fine-tuned model (calculated after fine-tuning). But if we calculate the parameters after adding the layer before fine-tuning and after fine-tuning, the number of parameters will be the same.



## In short, fine-tuning by itself does not change the model's parameters, but adding the classification layer (which is needed for fine-tuning for a classification task) does.

In [7]:
from transformers import AutoModelForCausalLM

from tabulate import tabulate



# this is the original model - before we added the classification layer:

base_model = AutoModelForCausalLM.from_pretrained(model_name) 



def print_model_parameters_tabular(model):

    parameters = []

    total_params = 0

    trainable_params = 0



    # Collect model parameter details

    for name, param in model.named_parameters():

        num_params = param.numel()

        total_params += num_params

        if param.requires_grad:

            trainable_params += num_params

        parameters.append([name, list(param.shape), param.requires_grad, f"{num_params:,}"])



    # Print parameter details in a more compact, line-by-line format

    print(f"Model Parameters for {type(model).__name__}:\n")

    for param in parameters:

        name, shape, requires_grad, num_elements = param

        print(f"Parameter Name: {name}")

        print(f"  Shape: {shape}")

        print(f"  Requires Grad: {requires_grad}")

        print(f"  Total Elements: {num_elements}")

        print("-" * 50)  # Separator line for clarity

    

    # Print summary

    print(f"\nTotal Parameters: {total_params:,}")

    print(f"Trainable Parameters: {trainable_params:,}")

    print(f"Non-Trainable Parameters: {total_params - trainable_params:,}")



# Example usage

print_model_parameters_tabular(base_model)

print("Model parameters before fine-tuning (before adding classification head) printed.")

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

Model Parameters for LlamaForCausalLM:

Parameter Name: model.embed_tokens.weight
  Shape: [128256, 2048]
  Requires Grad: True
  Total Elements: 262,668,288
--------------------------------------------------
Parameter Name: model.layers.0.self_attn.q_proj.weight
  Shape: [2048, 2048]
  Requires Grad: True
  Total Elements: 4,194,304
--------------------------------------------------
Parameter Name: model.layers.0.self_attn.k_proj.weight
  Shape: [512, 2048]
  Requires Grad: True
  Total Elements: 1,048,576
--------------------------------------------------
Parameter Name: model.layers.0.self_attn.v_proj.weight
  Shape: [512, 2048]
  Requires Grad: True
  Total Elements: 1,048,576
--------------------------------------------------
Parameter Name: model.layers.0.self_attn.o_proj.weight
  Shape: [2048, 2048]
  Requires Grad: True
  Total Elements: 4,194,304
--------------------------------------------------
Parameter Name: model.layers.0.mlp.gate_proj.weight
  Shape: [8192, 2048]
  Requi



Total Model Parameters Without the Classification Layer (i.e., before fine-tuning the model for classification): 1235814400.

Yes, this number matches with the parameters reported in the official documentation of the model

# Preparing the SST-2 Dataset:

1. Creating test and train splits

In [8]:
# Perform an 80:20 train-test split

split_dataset = dataset["train"].train_test_split(test_size=0.2, seed=1)

train_dataset = split_dataset["train"]

test_dataset = split_dataset["test"]

eval_dataset = dataset["validation"]

print("Training data: ", train_dataset)

print("Testing data: ", test_dataset)

print("Dataset divided into training and testing portions.")

Training data:  Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 53879
})
Testing data:  Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 13470
})
Dataset divided into training and testing portions.


2. Tokenizing the dataset:

In [9]:
# Tokenize the dataset

def preprocess_function(examples):

    return tokenizer(examples["sentence"], truncation=True, max_length=128)



train_dataset = train_dataset.map(preprocess_function, batched=True)

test_dataset = test_dataset.map(preprocess_function, batched=True)

eval_dataset = eval_dataset.map(preprocess_function, batched=True)



print("Tokenized training data: ", train_dataset)

print("Tokenized testing data: ", test_dataset)

print("Data tokenized.")

Map:   0%|          | 0/53879 [00:00<?, ? examples/s]

Map:   0%|          | 0/13470 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Tokenized training data:  Dataset({
    features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
    num_rows: 53879
})
Tokenized testing data:  Dataset({
    features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
    num_rows: 13470
})
Data tokenized.


In [11]:
# Data collator for padding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

print("Instantiated data collator.")

Instantiated data collator.


3. Function to Compute Evaluation Metrics

In [12]:
def compute_metrics(eval_pred):

    logits, labels = eval_pred

    predictions = np.argmax(logits, axis=-1)

    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="binary")

    acc = accuracy_score(labels, predictions)

    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}



print("Function to compute metrics written.")

Function to compute metrics written.


Shifting to GPU Before Training

In [13]:
model.to(device)

print("GPU in use for training.")

GPU in use for training.


# Evaluating the model's performance before fine-tuning:

In [14]:
pip install evaluate

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Note: you may need to restart the kernel to use updated packages.


In [16]:
import evaluate

print("Imported evaluate.")

Imported evaluate.


In [17]:
# Define training arguments

training_args = TrainingArguments(

    output_dir="./results",          # Directory for model checkpoints

    per_device_eval_batch_size=1,  # Evaluation batch size

    do_eval=True,                   # Perform evaluation

    logging_dir='./logs',           # Logging directory

    report_to="none",               # Disable reporting to other systems (e.g., WandB)

)



# Create the Trainer

trainer = Trainer(

    model=model,      # Your model with the classification head

    args=training_args,              # Training arguments

    eval_dataset=test_dataset,          # Evaluation dataset

    tokenizer=tokenizer,

    data_collator=data_collator,

    compute_metrics=compute_metrics, # Metrics function

)



# Evaluate the model

results = trainer.evaluate()



# Print the metrics

print(results)

# these results will be used for comparison with the model performance after fine-tuning



print("Model evaluated without fine-tuning on test data and results stored for comparison with performance of fine-tuned model.")

{'eval_loss': 1.7600301504135132, 'eval_model_preparation_time': 0.003, 'eval_accuracy': 0.4414996288047513, 'eval_precision': 0.5531914893617021, 'eval_recall': 0.006902960307978229, 'eval_f1': 0.01363576766749705, 'eval_runtime': 262.2883, 'eval_samples_per_second': 51.356, 'eval_steps_per_second': 51.356}
Model evaluated without fine-tuning on test data and results stored for comparison with performance of fine-tuned model.


# Freezing all Model Layers Except the Classification Layer Before Fine-tuning

(not changing the parameters of the pretrained model due to computation and memory constraints - so we will only be changing the 4096 parameters of the classification layer)

In [18]:
# Freeze all layers of the base model

for param in model.model.parameters():

    param.requires_grad = False



# Keep the classification head (score layer) trainable

for param in model.score.parameters():

    param.requires_grad = True



print("All parameters except classification layer parameters frozen.")

All parameters except classification layer parameters frozen.


# Setting Up the Trainer

In [19]:
# Define training arguments

training_args = TrainingArguments(

    output_dir="./results",

    evaluation_strategy="epoch",

    save_strategy="epoch",

    learning_rate=2e-5,

    per_device_train_batch_size=1, # batch size is kept 1 due to computation and memory constraints

    per_device_eval_batch_size=1,

    num_train_epochs=3,

    weight_decay=0.01,

    logging_dir="./logs",

    load_best_model_at_end=True,

    metric_for_best_model="accuracy",

)



# Initialize Trainer

trainer = Trainer(

    model=model,

    args=training_args,

    train_dataset=train_dataset,

    eval_dataset=eval_dataset,

    tokenizer=tokenizer,

    data_collator=data_collator,

    compute_metrics=compute_metrics,

)

print("Trainer initialised.")



Trainer initialised.


# Fine-tuning:

In [20]:
# Fine-tune the model

trainer.train()

print("Model fine-tuned.")


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011115281066668103, max=1.0…

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5557,0.604033,0.819954,0.815385,0.835586,0.825362
2,0.6067,0.543783,0.848624,0.873206,0.822072,0.846868
3,0.5058,0.527815,0.857798,0.873832,0.842342,0.857798


Model fine-tuned.


# Comparing model performance before and after fine-tuning:

In [21]:
#saving the model first:

trainer.save_model('./finetuned_model_sst2')

print("model saved in working directory.")



import pandas as pd

from tabulate import tabulate



# Your evaluation results dictionaries

evaluation_results_before = {

    'eval_accuracy': 0.4414996288047513, 

    'eval_precision': 0.5531914893617021, 

    'eval_recall': 0.006902960307978229, 

    'eval_f1': 0.01363576766749705

}



evaluation_results_after_fine_tuning = {

    'fine_tune_accuracy': 0.857798, 

    'fine_tune_precision': 0.873832, 

    'fine_tune_recall': 0.842342, 

    'fine_tune_f1': 0.857798

}



# Convert dictionaries to DataFrames

df_before = pd.DataFrame(list(evaluation_results_before.items()), columns=["Metric", "Before Fine-Tuning"])

df_after = pd.DataFrame(list(evaluation_results_after_fine_tuning.items()), columns=["Metric", "After Fine-Tuning"])



# Rename the 'Metric' column to match before merging

df_after['Metric'] = df_after['Metric'].apply(lambda x: x.replace('fine_tune_', 'eval_'))



# Merge the two DataFrames on the 'Metric' column

df_comparison = pd.merge(df_before, df_after, on="Metric")



# Print the comparison table using tabulate for better formatting

print("\nComparison Results:")

print(tabulate(df_comparison, headers='keys', tablefmt='pipe', showindex=False))



print("Comparison before and after fine-tuning done.")


model saved in working directory.

Comparison Results:
| Metric         |   Before Fine-Tuning |   After Fine-Tuning |
|:---------------|---------------------:|--------------------:|
| eval_accuracy  |           0.4415     |            0.857798 |
| eval_precision |           0.553191   |            0.873832 |
| eval_recall    |           0.00690296 |            0.842342 |
| eval_f1        |           0.0136358  |            0.857798 |
Comparison before and after fine-tuning done.


### As can be seen, the model performs better in terms of all metrics suitable for classification - accuracy, precision, recall and f1-score after fine-tuning. This is because fine-tuning allows a pre-trained model to adapt to a specific task, using its pre-existing general knowledge and improving its performance on the task by learning task-specific patterns and features. In this case, while the Llama model is highly effective for tasks such as text generation, question answering, language modeling, and other tasks involving natural language inference, they are not specifically designed for classification tasks. So without adding a classification layer and fine-tuning, it wouldn't be suitable for classification task, evident in the model's performance before fine-tuning (after adding classification layer). But fine-tuning greatly improved the model's performance on the SST-2 dataset.

# Calculating Parameters After Fine-Tuning:

In [None]:
from tabulate import tabulate



def print_model_parameters_tabular(model):

    parameters = []

    total_params = 0

    trainable_params = 0



    # Collect model parameter details

    for name, param in model.named_parameters():

        num_params = param.numel()

        total_params += num_params

        if param.requires_grad:

            trainable_params += num_params

        parameters.append([name, list(param.shape), param.requires_grad, f"{num_params:,}"])



    # Print parameter details in a more compact, line-by-line format

    print(f"Model Parameters for {type(model).__name__}:\n")

    for param in parameters:

        name, shape, requires_grad, num_elements = param

        print(f"Parameter Name: {name}")

        print(f"  Shape: {shape}")

        print(f"  Requires Grad: {requires_grad}")

        print(f"  Total Elements: {num_elements}")

        print("-" * 50)  # Separator line for clarity

    

    # Print summary

    print(f"\nTotal Parameters: {total_params:,}")

    print(f"Trainable Parameters: {trainable_params:,}")

    print(f"Non-Trainable Parameters: {total_params - trainable_params:,}")



# Example usage

print_model_parameters_tabular(model)

print("Parameters after fine-tuning calculated.")

Model Parameters for LlamaForSequenceClassification:

Parameter Name: model.embed_tokens.weight
  Shape: [128256, 2048]
  Requires Grad: False
  Total Elements: 262,668,288
--------------------------------------------------
Parameter Name: model.layers.0.self_attn.q_proj.weight
  Shape: [2048, 2048]
  Requires Grad: False
  Total Elements: 4,194,304
--------------------------------------------------
Parameter Name: model.layers.0.self_attn.k_proj.weight
  Shape: [512, 2048]
  Requires Grad: False
  Total Elements: 1,048,576
--------------------------------------------------
Parameter Name: model.layers.0.self_attn.v_proj.weight
  Shape: [512, 2048]
  Requires Grad: False
  Total Elements: 1,048,576
--------------------------------------------------
Parameter Name: model.layers.0.self_attn.o_proj.weight
  Shape: [2048, 2048]
  Requires Grad: False
  Total Elements: 4,194,304
--------------------------------------------------
Parameter Name: model.layers.0.mlp.gate_proj.weight
  Shape: [

## Total model parameters after adding the classification layer and fine-tuning: 1235818496

### As can be seen, 4096 extra parameters were added after making the model suitable for classification on adding the classification layer and fine-tuning the model on a classification dataset. (as explained above, the 4096 parameters are due to the addition of the classification layer, not due to fine-tuning)

# Uploading the model to Huggingface

In [23]:
# Repository name on Hugging Face Hub (e.g., "username/model_name")

repo_name = "jiya14desai/Llama-3.2-1B_fine-tuned_on_classification"



# Push to Hugging Face Hub

model.push_to_hub(repo_name)

# tokenizer.push_to_hub(repo_name)

model.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/jiya14desai/Llama-3.2-1B_fine-tuned_on_classification/commit/1be21fd06ad33e619e8710308964d818e9f93ee7', commit_message='Upload LlamaForSequenceClassification', commit_description='', oid='1be21fd06ad33e619e8710308964d818e9f93ee7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/jiya14desai/Llama-3.2-1B_fine-tuned_on_classification', endpoint='https://huggingface.co', repo_type='model', repo_id='jiya14desai/Llama-3.2-1B_fine-tuned_on_classification'), pr_revision=None, pr_num=None)

### The final fine-tuned model is available on https://huggingface.co/jiya14desai/Llama-3.2-1B_fine-tuned_on_classification

# Testing the fine-tuned model

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the fine-tuned model and tokenizer
model_name = "jiya14desai/Llama-3.2-1B_fine-tuned_on_classification"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Use the standard <eos> token as the pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.resize_token_embeddings(len(tokenizer))  # Resize embeddings for new pad token

# Input texts
texts = [
    "This movie is great! A must-watch.",
    "This could be the greatest flop movie of all times.",
    "Average script but good performance by the actors.",
    "A fun one-time watch.",
    "Very mid. Forgettable.",
    "Read the book, don't watch the movie.",
    "Don't watch if you don't have a good taste in art. Only for admirers of art."
]

# Predict for each input
for text in texts:
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    outputs = model(**inputs)
    logits = outputs.logits
    predictions = logits.argmax(dim=-1)
    print(f"Input: {text}")
    print(f"Predicted class: {predictions.item()}\n")

config.json:   0%|          | 0.00/946 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

Input: This movie is great! A must-watch.
Predicted class: 1

Input: This could be the greatest flop movie of all times.
Predicted class: 0

Input: Average script but good performance by the actors.
Predicted class: 1

Input: A fun one-time watch.
Predicted class: 1

Input: Very mid. Forgettable.
Predicted class: 0

Input: Read the book, don't watch the movie.
Predicted class: 0

Input: Don't watch if you don't have a good taste in art. Only for admirers of art.
Predicted class: 0

