<a href="https://colab.research.google.com/github/Hamo0434/LLama-2-form-scratch/blob/main/fine_tuned_model_falcon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import pandas as pd
import numpy
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import time
# %pip install evaluate
import evaluate
from datasets import load_dataset

In [None]:
pip install -U datasets huggingface_hub fsspec

In [None]:
# pip install -U datasets huggingface_hub fsspec

In [None]:
# Load the DialogSum dataset ("knkarthick/dialogsum"); later cells read its
# 'train', 'validation' and 'test' splits and the 'dialogue'/'summary' columns.
dataset = load_dataset("knkarthick/dialogsum")

In [None]:
import os

# Request a 60 second download timeout for Hugging Face Hub files.
# NOTE(review): huggingface_hub may read this variable at import time; since
# transformers/datasets are imported in an earlier cell, confirm it takes effect here.
os.environ.update({'HF_HUB_DOWNLOAD_TIMEOUT': '60'})

# Checkpoint used as the base model everywhere in this notebook.
model_name = 'google/flan-t5-base'

# Weights are loaded in bfloat16; the tokenizer comes from the same checkpoint.
original_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def print_number_of_model_trained_parameters(model):
  """Build a short report of how many of a model's parameters are trainable.

  Args:
    model: any object exposing ``named_parameters()`` that yields
      ``(name, parameter)`` pairs, where each parameter provides ``numel()``
      and a ``requires_grad`` flag.

  Returns:
    A three-line string with the trainable parameter count, the total
    parameter count and the trainable percentage (two decimals). Despite the
    name, nothing is printed; the caller decides what to do with the text.
  """
  all_parameters = 0
  trained_parameters = 0
  for _, parameter in model.named_parameters():
    count = parameter.numel()
    all_parameters += count
    if parameter.requires_grad:
      trained_parameters += count
  # A model without parameters would otherwise raise ZeroDivisionError.
  percentage = 100 * trained_parameters / all_parameters if all_parameters else 0.0
  return (f"trained_params {trained_parameters} \nall model parameters: {all_parameters}\npercentage of trainable model parameters: {percentage:.2f}%")

# Show the trainable-parameter report for the freshly loaded base model.
original_model_report = print_number_of_model_trained_parameters(original_model)
print(original_model_report)

In [None]:
# Zero-shot baseline: summarize one test dialogue with the untouched base model.
index = 150
example = dataset['test'][index]  # fetch the row once instead of once per column
dialogue = example['dialogue']
summary = example['summary']

# The stray apostrophe after "text" was a typo that leaked into the prompt.
prompt = f''' summarize the following text
{dialogue}
summary :
'''
inputs = tokenizer(prompt, return_tensors='pt')
generated_ids = original_model.generate(
    inputs['input_ids'],
    max_new_tokens=150,
)
# generate() returns a batch; the single prompt is at position 0.
output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Same 99-character separator the previous '-'.join(...) expression produced.
dash_line = '-' * 99
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

In [None]:
def tokenize_function(example):
  """Tokenize a batch of DialogSum rows into model inputs and labels.

  Args:
    example: a batch (as produced by ``dataset.map(..., batched=True)``) with
      'dialogue' and 'summary' lists of strings.

  Returns:
    The same batch with two added entries: 'input_ids' (prompt-wrapped
    dialogues) and 'labels' (summaries), both padded to max length and
    truncated.
  """
  start_prompt = 'summarize this text'
  end_prompt = '\n Summary'
  prompt = [start_prompt + dialogue + end_prompt for dialogue in example['dialogue']]
  example['input_ids'] = tokenizer(prompt, padding='max_length', truncation=True, return_tensors='pt').input_ids
  labels = tokenizer(example['summary'], padding='max_length', truncation=True, return_tensors='pt').input_ids
  # Padding positions must not contribute to the loss: label value -100 is
  # ignored by the seq2seq loss, whereas the raw pad token id is not.
  labels[labels == tokenizer.pad_token_id] = -100
  example['labels'] = labels
  return example

tokenized_dataset = dataset.map(tokenize_function, batched=True)
# Keep only the tensors the Trainer consumes.
tokenized_dataset = tokenized_dataset.remove_columns(['topic', 'id', 'dialogue', 'summary'])

## Checking the shapes of the data

In [None]:
# Report (rows, columns) for every tokenized split.
shape_lines = [
    f"Training set : {tokenized_dataset['train'].shape}",
    f"Testing set : {tokenized_dataset['test'].shape}",
    f"Validation set : {tokenized_dataset['validation'].shape}",
]
print('The shapes of datasets :')
for shape_line in shape_lines:
  print(shape_line)

Fine-tune the model on the new dataset

In [None]:
import copy

# Each run writes to its own timestamped directory.
output_dir = f'./dialogue-summary-training-{int(time.time())}'
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=1,
    learning_rate=1e-5,
    weight_decay=0.01,
    logging_steps=1,
    # Demo setting: a positive max_steps takes precedence over num_train_epochs,
    # so this run performs a single optimisation step.
    max_steps=1,
)

trainer = Trainer(
    # Train a copy: the Trainer updates the model it is given in place, and
    # `original_model` is used later as the untouched baseline for comparison.
    model=copy.deepcopy(original_model),
    args=training_arguments,
    train_dataset=tokenized_dataset['train'],
    # Evaluate on the validation split; the test split is reserved for the
    # ROUGE comparison further down.
    eval_dataset=tokenized_dataset['validation'],
)

Now we are ready to fine-tune the model


In [None]:
# Run full fine-tuning with the `trainer` configured in the previous cell.
trainer.train()

Save the model


In [None]:
# Persist the fine-tuned weights, then load them back as a standalone model
# so that later cells can compare it with the base model.
trained_model_dir = "./trained_model"
trainer.save_model(output_dir=trained_model_dir)
trained_model = AutoModelForSeq2SeqLM.from_pretrained(
    trained_model_dir,
)

Evaluation of the model

In [None]:
# Put the prompt and both models on the same device before generating.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device)
trained_model = trained_model.to(device)
original_model = original_model.to(device)

# Greedy decoding (one beam), at most 150 new tokens, shared by both models.
generation_config = GenerationConfig(max_new_tokens=150, num_beams=1)

# Base model output.
output_original_model = original_model.generate(
    input_ids=input_ids,
    generation_config=generation_config,
)
original_model_text_output = tokenizer.decode(output_original_model[0], skip_special_tokens=True)

# Fine-tuned model output.
output_trained_model = trained_model.generate(
    input_ids=input_ids,
    generation_config=generation_config,
)
trained_model_text_output = tokenizer.decode(output_trained_model[0], skip_special_tokens=True)

human_baseline_summary = summary
dash_line = '-' * 50
report_sections = [
    f'BASELINE HUMAN SUMMARY:\n{human_baseline_summary}',
    f'ORIGINAL MODEL:\n{original_model_text_output}',
    f'TRAINED MODEL:\n{trained_model_text_output}',
]
for report_section in report_sections:
  print(dash_line)
  print(report_section)

In [None]:
# The ROUGE metric needs the `rouge_score` package; uncomment to install it.
# !pip install rouge_score
# Load the ROUGE metric through the `evaluate` library.
rouge = evaluate.load('rouge')


In [None]:
# Summarize the first ten test dialogues with the base and fine-tuned models.
dialogues = dataset['test'][0:10]['dialogue']
human_baseline_summaries = dataset['test'][0:10]['summary']

original_model_summaries = []
instruct_model_summaries = []

# Loop invariants: device and decoding settings do not change per dialogue.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
generation_config = GenerationConfig(max_new_tokens=150, num_beams=1)

for dialogue in dialogues:
  # The dialogue has to be part of the prompt; previously every prompt was the
  # same bare instruction with no text to summarize.
  prompt = f"summarize the following text\n{dialogue}\nsummary :\n"
  input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device)

  # Generate output using the original model
  output_original_model = original_model.generate(input_ids=input_ids, generation_config=generation_config)
  original_model_text_output = tokenizer.decode(output_original_model[0], skip_special_tokens=True)
  original_model_summaries.append(original_model_text_output)

  # Generate output using the trained model and decode *this* generation
  # (not the stale `output_trained_model` left over from an earlier cell).
  instruct_model_outputs = trained_model.generate(input_ids=input_ids, generation_config=generation_config)
  instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)
  instruct_model_summaries.append(instruct_model_text_output)

zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, instruct_model_summaries))
# pandas is imported as `pd` at the top of the notebook.
df = pd.DataFrame(zipped_summaries, columns=['human_baseline_summaries', 'original_model_summaries', 'instruct_model_summaries'])
df

In [None]:
# Score both models with the `evaluate` ROUGE metric. The separate `rouge`
# package used here before is never installed in this notebook, and importing
# it rebound the `rouge` name to an object with a different API.
rouge = evaluate.load('rouge')

original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)
instruct_model_results = rouge.compute(
    predictions=instruct_model_summaries,
    references=human_baseline_summaries[0:len(instruct_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)
print('ORIGINAL MODEL:')
print(original_model_results)
print('INSTRUCT MODEL:')
print(instruct_model_results)

In [None]:
pip install peft

In [None]:
from peft import LoraConfig , get_peft_model , TaskType

In [None]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA adapter settings for the sequence-to-sequence FLAN-T5 model: rank-32
# adapters on the modules named "q" and "v", with no bias terms trained.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,  # FLAN-T5
    r=32,  # Rank
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q", "v"],
)

In [None]:
import copy

# get_peft_model injects the adapter layers into the model it receives, so
# wrap a copy and keep `original_model` untouched as the comparison baseline.
peft_model = get_peft_model(
    copy.deepcopy(original_model),
    lora_config,
)
print(print_number_of_model_trained_parameters(peft_model))

In [None]:
# Timestamped output directory for the PEFT run.
run_stamp = int(time.time())
output_dir = f'./dialogue-summary-training-{run_stamp}'

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=1,
    learning_rate=1e-3,  # Higher learning rate than full fine-tuning.
    auto_find_batch_size=True,
    logging_steps=1,
)

# Only a training set is supplied; no evaluation dataset is configured.
peft_trainer = Trainer(
    args=peft_training_args,
    model=peft_model,
    train_dataset=tokenized_dataset["train"],
)

In [None]:
# Directory that receives the trained adapter and the tokenizer files.
peft_model_path = "./peft-dialogue-summary-checkpoint/"

# Train first, then write the trained model followed by the tokenizer.
peft_trainer.train()
for artifact in (peft_trainer.model, tokenizer):
  artifact.save_pretrained(peft_model_path)


In [None]:
# Import the PeftModel class. `from peft import peft_model` imports the
# submodule of that name, which left `PeftModel` undefined for the call below.
from peft import PeftModel, PeftConfig

# Reload a fresh base model and attach the saved adapter for inference.
path_base_model = "google/flan-t5-base"
peft_model_base = AutoModelForSeq2SeqLM.from_pretrained(path_base_model, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(path_base_model)
peft_model = PeftModel.from_pretrained(
    peft_model_base,
    peft_model_path,
    is_trainable=False,  # inference only
)

In [None]:
# Compare base, fully fine-tuned and PEFT models on a single test example.
index = 150
test_example = dataset['test'][index]
dialogue = test_example['dialogue']
base_line_human_summary = test_example['summary']

prompt = f"""
Summarize the following conversation

{dialogue}

Summary:
"""

# Ensure that input_ids and the models are on the same device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
for candidate_model in (original_model, trained_model, peft_model):
  candidate_model.to(device)

# Greedy decoding with up to 200 new tokens for every model.
comparison_generation_config = GenerationConfig(max_new_tokens=200, num_beams=1)

original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=comparison_generation_config)
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
print(original_model_text_output)

instruct_model_outputs = trained_model.generate(input_ids=input_ids, generation_config=comparison_generation_config)
instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=comparison_generation_config)
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

comparison_sections = [
    f'BASELINE HUMAN SUMMARY: \n{base_line_human_summary}',
    f'ORIGINAL MODEL: \n{original_model_text_output}',
    f'TRAINED MODEL: \n{instruct_model_text_output}',
    f'PEFT MODEL: \n{peft_model_text_output}',
]
for comparison_section in comparison_sections:
  print(dash_line)
  print(comparison_section)

In [None]:
# Summarize the first ten test dialogues with all three models.
dialogues = dataset['test'][0:10]['dialogue']
human_baseline_summaries = dataset['test'][0:10]['summary']
original_model_summaries = []
instruct_model_summaries = []
peft_model_summaries = []

# Loop invariants: device and decoding settings do not change per dialogue.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_generation_config = GenerationConfig(max_new_tokens=200, num_beams=1)

for dialogue in dialogues:
  prompt = f"""
  summarize the following conversation
  {dialogue}
  Summary:

  """
  # Ensure that input_ids and the models are on the same device
  input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

  original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=batch_generation_config)
  original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
  original_model_summaries.append(original_model_text_output)

  # The "instruct" column must come from the fine-tuned model; it was
  # previously generated with `original_model`, duplicating the baseline.
  instruct_model_outputs = trained_model.generate(input_ids=input_ids, generation_config=batch_generation_config)
  instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)
  instruct_model_summaries.append(instruct_model_text_output)

  peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=batch_generation_config)
  peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)
  peft_model_summaries.append(peft_model_text_output)

# Include the PEFT summaries, which were collected but left out of the table.
zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, instruct_model_summaries, peft_model_summaries))

df = pd.DataFrame(zipped_summaries, columns=['human_baseline_summaries', 'original_model_summaries', 'instruct_model_summaries', 'peft_model_summaries'])
df

In [None]:
# ROUGE scores of each model's summaries against the human references.
rouge = evaluate.load('rouge')

# Options shared by all three computations.
rouge_options = dict(use_aggregator=True, use_stemmer=True)

original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    **rouge_options,
)
instruct_model_results = rouge.compute(
    predictions=instruct_model_summaries,
    references=human_baseline_summaries[0:len(instruct_model_summaries)],
    **rouge_options,
)
peft_model_results = rouge.compute(
    predictions=peft_model_summaries,
    references=human_baseline_summaries[0:len(peft_model_summaries)],
    **rouge_options,
)

score_report = [
    ('ORIGINAL MODEL:', original_model_results),
    ('INSTRUCT MODEL:', instruct_model_results),
    ('PEFT MODEL:', peft_model_results),
]
for model_title, model_scores in score_report:
  print(model_title)
  print(model_scores)