<a href="https://colab.research.google.com/github/HarshithaChilupuri/LLM-sFineTuning/blob/main/Untitled29.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [62]:
!pip install peft
!pip install evaluate # install the missing package



In [63]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

In [64]:
model_checkpoint = 'distilbert-base-uncased'

#define label maps
id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative":0, "Positive":1}

#generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels =2, id2label=id2label, label2id = label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [65]:
#load dataset
dataset = load_dataset("shawhin/imdb-truncated")


In [66]:
dataset #Considered a dataset of only 1000 records and can select datsets with much lower values once we fine tune the model

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

In [67]:
#Preprocess data
#create tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

#Converting text to numerical form for Machine understandability
# create tokenize function
def tokenize_function(examples):
    # extract text
    text = examples["text"]

    #tokenize and truncate text
    tokenizer.truncation_side = "left" #truncating the long sequnces of data
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )

    return tokenized_inputs




In [68]:
   # add pad token if none exists
if tokenizer.pad_token is None: #padding the data if its a short sequence
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

To get a tokenized version of the dataset I've chosen

In [69]:
# tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [70]:
#create datacollator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
data_collator

DataCollatorWithPadding(tokenizer=DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='left', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [71]:
#Evaluation Metrics
import evaluate
accuracy = evaluate.load("accuracy")
accuracy


EvaluationModule(name: "accuracy", module_type: "metric", features: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)}, usage: """
Args:
    predictions (`list` of `int`): Predicted labels.
    references (`list` of `int`): Ground truth labels.
    normalize (`boolean`): If set to False, returns the number of correctly classified samples. Otherwise, returns the fraction of correctly classified samples. Defaults to True.
    sample_weight (`list` of `float`): Sample weights Defaults to None.

Returns:
    accuracy (`float` or `int`): Accuracy score. Minimum possible value is 0. Maximum possible value is 1.0, or the number of examples input, if `normalize` is set to `True`.. A higher score means higher accuracy.

Examples:

    Example 1-A simple example
        >>> accuracy_metric = evaluate.load("accuracy")
        >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0])
        >>> print(results)
    

Using Evaluation Metrics to Monitor the Performance of the LLM Model

In [72]:
def compute_metrics(p):
  predictions, labels = p
  predictions = np.argmax(predictions, axis =1)
  return{"accuracy":accuracy.compute(predictions=predictions,references=labels)}

In [73]:
# define list of examples
text_list = ["The most greatest of all time","It was good.", "Not a fan, don't recommed.", "Better than the first one.", "This is not worth watching even once.", "This one is a pass.","I Love This"]

print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
    # tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt")

    # compute logits
    logits = model(inputs).logits

    # convert logits to label
    predictions = torch.argmax(logits)

    print(text + " - " + id2label[predictions.tolist()])

Untrained model predictions:
----------------------------
The most greatest of all time - Negative
It was good. - Negative
Not a fan, don't recommed. - Negative
Better than the first one. - Negative
This is not worth watching even once. - Negative
This one is a pass. - Negative
I Love This - Negative


Evaluating the untrained Base Model befoe Fine tuning it and its as good as flipping a coin duh

In [74]:
#Training the Model using Peft for our Lora Model
peft_config = LoraConfig(task_type="SEQ_CLS", #Sequence Classification
                        r=4, #intrinsic rank
                        lora_alpha=32, #this is like a learning rate
                        lora_dropout=0.01, #probability of the dropout rate
                        target_modules = ['q_lin']) #applying loar to query layer, yay!

In [75]:
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, r=4, target_modules={'q_lin'}, lora_alpha=32, lora_dropout=0.01, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False))

In [81]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307


In [82]:
#hyperparameters declared
lr = 1e-3
batch_size = 2
num_epochs = 3

In [83]:
#defining training arguments
training_args = TrainingArguments(
    output_dir= model_checkpoint + "-lora-text-classification2",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)



#Plug Everything to this Trainer Class

In [84]:
# a creater trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer, #r u there
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics,
)

In [85]:
# train model is under-training
trainer.train() #heyy

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6479,0.715791,{'accuracy': 0.848}
2,0.3973,0.591255,{'accuracy': 0.881}
3,0.1686,0.746752,{'accuracy': 0.887}


Trainer is attempting to log a value of "{'accuracy': 0.848}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.881}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.887}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


TrainOutput(global_step=1500, training_loss=0.40459475708007814, metrics={'train_runtime': 6661.6957, 'train_samples_per_second': 0.45, 'train_steps_per_second': 0.225, 'total_flos': 276964641291312.0, 'train_loss': 0.40459475708007814, 'epoch': 3.0})

In [86]:
# evaluate the model
trainer.evaluate()

Trainer is attempting to log a value of "{'accuracy': 0.881}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.5912553668022156,
 'eval_accuracy': {'accuracy': 0.881},
 'eval_runtime': 839.8018,
 'eval_samples_per_second': 1.191,
 'eval_steps_per_second': 0.595,
 'epoch': 3.0}

In [87]:
!pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
model.to('cpu') # moving to mps for Mac (can alternatively do 'cpu')

print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("cpu") # moving to mps for Mac (can alternatively do 'cpu')

    logits = model(inputs).logits
    predictions = torch.max(logits,1).indices

    print(text + " - " + id2label[predictions.tolist()[0]])

Looking in indexes: https://download.pytorch.org/whl/nightly/cpu
Trained model predictions:
--------------------------
The most greatest of all time - Positive
It was good. - Positive
Not a fan, don't recommed. - Negative
Better than the first one. - Positive
This is not worth watching even once. - Negative
This one is a pass. - Negative
I Love This - Positive


In [92]:
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

# Assuming tokenizer and model are already loaded
# id2label maps prediction to labels

# Example text list for evaluation
text_list = ["It was not bad","The movie was fantastic!", "The product is defective.", "Excellent service!", "Not worth the price."]

# Ground truth labels (for example purposes, binary: 1 for positive, 0 for negative)
true_labels = [0, 1, 0, 1, 0]

# Store predictions
predicted_labels = []

# Move model to CPU (or MPS for Mac)
model.to('cpu')

print("Trained model predictions:")
print("--------------------------")

for text in text_list:
    # Tokenize the text
    inputs = tokenizer.encode(text, return_tensors="pt").to("cpu")  # Adjust device as necessary

    # Get model predictions
    with torch.no_grad():
        logits = model(inputs).logits

    # Get predicted label (class with highest score)
    predictions = torch.argmax(logits, dim=1).item()
    predicted_labels.append(predictions)

    # Print the prediction
    print(f"{text} - Predicted label: {id2label[predictions]}")

# Convert to numpy arrays for evaluation
predicted_labels = np.array(predicted_labels)
true_labels = np.array(true_labels)

# Calculate accuracy, precision, recall, and F1 score
accuracy = accuracy_score(true_labels, predicted_labels)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predicted_labels, average='binary')

print(f"\nAccuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")


Trained model predictions:
--------------------------
It was not bad - Predicted label: Negative
The movie was fantastic! - Predicted label: Positive
The product is defective. - Predicted label: Negative
Excellent service! - Predicted label: Positive
Not worth the price. - Predicted label: Negative

Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-score: 1.0000
