<a href="https://colab.research.google.com/github/JhanviMistry/LoRA/blob/main/Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import math

import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, default_data_collator

from peft import PeftModel
import bitsandbytes as bnb



In [None]:
# Hub id of the base checkpoint and local directory of the fine-tuned LoRA adapter.
model_name = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
adapter_path = './LoRA_tinyllama_tuned_adapter_model'

# 4-bit NF4 quantization with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Keyword arguments shared by both loads of the base checkpoint.
_load_kwargs = dict(
    quantization_config=bnb_config,
    device_map='auto',
    trust_remote_code=True,
)

# Reference model: the unmodified base checkpoint, put into evaluation mode.
base_model = AutoModelForCausalLM.from_pretrained(model_name, **_load_kwargs)
base_model = base_model.eval()

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# A second, separate copy of the base checkpoint that the adapter is attached to,
# so that base_model itself is never wrapped or merged.
temp_model = AutoModelForCausalLM.from_pretrained(model_name, **_load_kwargs)

# Attach the adapter, merge it into the underlying model, and switch to evaluation mode.
tuned_model = PeftModel.from_pretrained(temp_model, adapter_path)
tuned_model = tuned_model.merge_and_unload()
tuned_model = tuned_model.eval()

In [None]:
#tokenize a batch of texts
def tokenize(batch):
  """Tokenize a batch of question/answer rows into causal-LM features.

  Each row is rendered as an "### Instruction / ### Response" prompt, then
  padded or truncated to exactly 256 tokens.

  Args:
    batch: Mapping with parallel 'question' and 'answer' sequences of strings.

  Returns:
    The tokenizer output with an added 'labels' tensor. Labels are a copy of
    'input_ids' in which every padded position is replaced by -100.
  """
  # NOTE(review): this template has a space before the newline that precedes
  # "### Response:", while the generation prompt in this notebook does not.
  # Confirm which form the adapter was trained with.
  texts = [
      f"### Instruction:\n{instruction} \n### Response:\n{out}"
      for instruction, out in zip(batch['question'], batch['answer'])
  ]

  tokens = tokenizer(
      texts,
      padding='max_length',
      max_length=256,
      truncation=True,
      return_tensors='pt',  # PyTorch tensors
  )

  # Mask padded positions so the loss (and therefore the perplexity) is only
  # computed over real tokens. The attention mask is used rather than the pad
  # token id because the pad token may coincide with a real token such as EOS.
  labels = tokens['input_ids'].clone()
  labels[tokens['attention_mask'] == 0] = -100
  tokens['labels'] = labels

  return tokens

In [None]:
# Load the first 200 GSM8K rows and tokenize them for the perplexity evaluation.
# NOTE(review): this evaluates on a slice of the *train* split; if the adapter
# was fine-tuned on the same rows, consider the 'test' split instead.
eval_ds = load_dataset('openai/gsm8k', 'main', split='train[:200]')
# Map over the dataset itself (the original called the builtin `eval`, which has no `.map`).
eval_ds = eval_ds.map(tokenize, batched=True, remove_columns=['question', 'answer'])
eval_ds = eval_ds.with_format('torch')

In [None]:
# Iterate the tokenized evaluation set in batches of 8, collated with default_data_collator.
eval_loader = DataLoader(eval_ds, batch_size=8, collate_fn=default_data_collator)

In [None]:
@torch.no_grad()  # evaluation only: no gradients are needed
def compute_perplexity(model):
  """Compute the perplexity of `model` over the batches of `eval_loader`.

  Perplexity is the exponential of the cross-entropy loss; the lower it is,
  the better the model predicts the evaluation text. Here the loss is the
  unweighted mean of the per-batch losses returned by the model.

  Args:
    model: Causal language model that accepts the batch fields as keyword
      arguments and returns an output with a `.loss` attribute.

  Returns:
    The perplexity as a float.

  Raises:
    ValueError: If `eval_loader` yields no batches.
  """
  losses = []
  for batch in eval_loader:
    # Move every tensor in the batch to the device the model lives on.
    batch = {k: v.to(model.device) for k, v in batch.items()}
    # **batch unpacks to model(input_ids=..., attention_mask=..., labels=...);
    # because labels are supplied, the output carries the loss for this batch.
    losses.append(model(**batch).loss.item())

  if not losses:
    raise ValueError('eval_loader produced no batches')

  return math.exp(sum(losses) / len(losses))

In [None]:
# Report perplexity for the base model and for the LoRA-tuned model side by side.
print(f'Base Model Perplexity: {compute_perplexity(base_model):.2f}')
print(f'Tuned Model Perplexity: {compute_perplexity(tuned_model):.2f}')

In [None]:
import random

# Untokenized copy of the same 200 GSM8K rows, used for qualitative comparison.
raw_data = load_dataset('openai/gsm8k', 'main', split='train[:200]')
# Reference answers, index-aligned with raw_data['question'].
refs = raw_data['answer']

def generate(model, instruction):
  """Generate a response to `instruction` with `model`.

  Args:
    model: Causal language model exposing `.generate` and `.device`.
    instruction: The question text to place in the prompt.

  Returns:
    The decoded output sequence with special tokens removed. The decoded
    sequence is the one returned by `model.generate`, so it is expected to
    start with the prompt followed by up to 256 newly generated tokens.
  """
  # NOTE(review): unlike the template in `tokenize`, there is no space before
  # the newline that precedes "### Response:". Confirm which form the adapter
  # was trained with.
  prompt = f'### Instruction:\n{instruction}\n### Response:\n'
  inputs = tokenizer(prompt, return_tensors='pt').to(model.device)

  with torch.no_grad():
    output = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_new_tokens=256,
    )

  return tokenizer.decode(output[0], skip_special_tokens=True)


# Backward-compatible alias for the original, misspelled function name.
genarate = generate

In [None]:
# Display the first question of the raw evaluation slice.
raw_data['question'][0]

In [None]:
# Print the reference answer for the first question.
print(refs[0])

In [None]:
# Print the base model's generation for the first question.
print(generate(base_model, raw_data['question'][0]))

In [None]:
# Print the tuned model's generation for the same question, for comparison.
print(generate(tuned_model, raw_data['question'][0]))