In [None]:
# NOTE(review): the next cell installs torchdata and transformers again at different
# pinned versions (torchdata==0.5.1, transformers==4.27.2) — confirm this cell is still needed.
%pip install torchdata==0.10.1
%pip install --upgrade transformers
%pip install --upgrade sentence-transformers

In [None]:
# Install the pinned training stack (datasets, torch, torchdata, transformers, evaluate,
# rouge_score, loralib, peft) plus wandb.
# NOTE(review): torchdata==0.5.1 and transformers==4.27.2 below conflict with the preceding
# cell, which installs torchdata==0.10.1 and upgrades transformers — confirm which set of
# versions is intended and drop the other.
%pip install -U datasets==2.17.0

%pip install --upgrade pip
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1 --quiet

%pip install \
    transformers==4.27.2 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 \
    peft==0.3.0 --quiet

%pip install  wandb


In [None]:
# Run nvidia-smi in the shell to show the GPU(s) available to this runtime.
!nvidia-smi

In [None]:
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.
from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

In [None]:
# Notebook-wide identifiers: the W&B project name, the base model checkpoint,
# and the Hugging Face dataset id (the latter two are also used as W&B run tags).
PROJECT = "Dialogue-Summarizer-Flan-T5"
MODEL_NAME = 'google/flan-t5-large'
DATASET = "knkarthick/dialogsum"

In [None]:
import wandb
# Start the notebook's W&B run, tagged with the model and dataset identifiers.
wandb.init(
    project=PROJECT,
    notes="Dialogue Summarizer using Flan T5",
    tags=[MODEL_NAME, DATASET],
)

In [None]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer, DataCollatorForSeq2Seq
import torch
import time
import evaluate
import pandas as pd
import numpy as np

from tqdm import tqdm
tqdm.pandas()

# **I. Load Dataset and Flan-T5**

In [None]:
# Reuse the shared DATASET constant so the dataset id is defined in exactly one place
# (the same id is used to tag the W&B run).
huggingface_dataset_name = DATASET

dataset = load_dataset(huggingface_dataset_name)

In [None]:
# Log every raw split to W&B as a table named "dialogsum_<split>".
with wandb.init(project=PROJECT, job_type="dataset"):
    # Build all three tables first, then log them one by one.
    split_tables = {
        split: wandb.Table(data=dataset[split].to_pandas())
        for split in ("train", "validation", "test")
    }
    for split, split_table in split_tables.items():
        wandb.log({f"dialogsum_{split}": split_table})


In [None]:
# Show a couple of test dialogues together with their human-written summaries.
example_indices = [40, 200]

# Separator line of 99 dashes, reused by later cells.
dash_line = '-' * 99

for example_number, index in enumerate(example_indices, start=1):
    sample = dataset['test'][index]
    print(dash_line)
    print('Example ', example_number)
    print(dash_line)
    print('INPUT DIALOGUE:')
    print(sample['dialogue'])
    print(dash_line)
    print('BASELINE HUMAN SUMMARY:')
    print(sample['summary'])
    print(dash_line)
    print()

In [None]:
# Load FLAN-T5 with AutoModelForSeq2SeqLM.from_pretrained().
# The checkpoint name comes from the shared MODEL_NAME constant so the model that is
# tagged in W&B and the model that is actually loaded cannot drift apart.
model_name = MODEL_NAME
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
# Load the tokenizer that belongs to the FLAN-T5 checkpoint. Tokenization splits text into
# the smaller units the model consumes; use_fast=True requests the fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

In [None]:
# Sanity-check the tokenizer: encode one sentence, then decode the ids back to text.
sentence = "What time is it, Tom?"

sentence_encoded = tokenizer(sentence, return_tensors='pt')
first_sequence_ids = sentence_encoded["input_ids"][0]
sentence_decoded = tokenizer.decode(first_sequence_ids, skip_special_tokens=True)

print('ENCODED SENTENCE:')
print(first_sequence_ids)
print('\nDECODED SENTENCE:')
print(sentence_decoded)

In [None]:
# Determine the number of model parameters and how many of them are trainable.
def print_number_of_trainable_model_parameters(model, tag="original_model"):
    """Count total and trainable parameters, log them to W&B, and return a summary.

    Args:
        model: Object exposing ``named_parameters()``; every yielded parameter must
            provide ``numel()`` and ``requires_grad``.
        tag: Key under which the three values are logged to Weights & Biases.

    Returns:
        A three-line string with the trainable count, the total count, and the
        percentage of parameters that are trainable.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        param_count = param.numel()
        all_model_params += param_count
        if param.requires_grad:
            trainable_model_params += param_count

    # A model without parameters would otherwise raise ZeroDivisionError.
    if all_model_params:
        trainable_percentage = 100 * trainable_model_params / all_model_params
    else:
        trainable_percentage = 0.0

    with wandb.init(project=PROJECT, job_type="log_parameters"):
        # Log all three values in one call; the percentage is now an actual ratio
        # (the previous code logged 100 * trainable count without dividing by the total).
        wandb.log({f'{tag}': {
            "trainable_model_params": trainable_model_params,
            "all_model_params": all_model_params,
            "percentage_of_trainable_model_parameters": trainable_percentage,
        }})

    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {trainable_percentage}%"

print(print_number_of_trainable_model_parameters(original_model))

In [None]:
# Explore how well the base LLM summarizes a dialogue: plain inference on the sample
# test dialogues, feeding the raw dialogue with no instruction around it.
for example_number, index in enumerate(example_indices, start=1):
    sample = dataset['test'][index]
    dialogue = sample['dialogue']
    summary = sample['summary']

    inputs = tokenizer(dialogue, return_tensors='pt')
    generated_ids = original_model.generate(
        inputs["input_ids"],
        max_new_tokens=50,
    )
    output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    print(dash_line)
    print('Example ', example_number)
    print(dash_line)
    print(f'INPUT PROMPT:\n{dialogue}')
    print(dash_line)
    print(f'BASELINE HUMAN SUMMARY:\n{summary}')
    print(dash_line)
    print(f'MODEL GENERATION - WITHOUT PROMPT ENGINEERING:\n{output}\n')


# **II. Processing the Dataset**

In [None]:
def tokenize_function(example):
    """Tokenize a batch of dialogues and summaries for seq2seq fine-tuning.

    Args:
        example: Batch mapping with a "dialogue" list and a "summary" list
            (as passed by ``Dataset.map(..., batched=True)``).

    Returns:
        The same mapping with ``input_ids``, ``attention_mask`` and ``labels`` added.
        Padding positions in ``labels`` are set to -100 so the loss ignores them.
    """
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]

    model_inputs = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt")
    label_ids = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids

    example['input_ids'] = model_inputs.input_ids
    # Keep the mask so the encoder does not attend to the max_length padding.
    example['attention_mask'] = model_inputs.attention_mask
    # -100 is the ignore index of the cross-entropy loss; without this the model is
    # also trained to predict the padding that fills every label row.
    example['labels'] = label_ids.masked_fill(label_ids == tokenizer.pad_token_id, -100)

    return example

# The dataset has three splits (train, validation, test); map() runs tokenize_function
# over every split in batches, after which the raw text columns are dropped.
raw_columns = ['id', 'topic', 'dialogue', 'summary']
tokenized_datasets = dataset.map(tokenize_function, batched=True).remove_columns(raw_columns)

In [None]:
#tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True)

In [None]:
# Preview the first rows of the tokenized training split as a DataFrame.
tokenized_datasets['train'].to_pandas().head()

In [None]:
# Log every tokenized split to W&B as a table named "dialogsum_<split>_tokenized".
with wandb.init(project=PROJECT, job_type="dataset"):
    # Build all three tables first, then log them one by one.
    tokenized_tables = {
        split: wandb.Table(data=tokenized_datasets[split].to_pandas())
        for split in ("train", "validation", "test")
    }
    for split, tokenized_table in tokenized_tables.items():
        wandb.log({f"dialogsum_{split}_tokenized": tokenized_table})

In [None]:
# Report the shape of every tokenized split, then print the DatasetDict itself.
print("Shapes of the datasets:")
for label, split in (("Training", "train"), ("Validation", "validation"), ("Test", "test")):
    print(f"{label}: {tokenized_datasets[split].shape}")

print(tokenized_datasets)

# **III. Perform Parameter Efficient Fine-Tuning (PEFT: LoRA)**

In [None]:
from types import SimpleNamespace
from pathlib import Path
from tqdm.notebook import tqdm
from datetime import datetime
from peft import LoraConfig, get_peft_model, TaskType


In [None]:

# Single place for every tunable value of the PEFT run: the Trainer hyperparameters
# (read when TrainingArguments is built) and the LoRA adapter settings (read when
# LoraConfig is built).
config2 = SimpleNamespace(
    # Training hyperparameters
    learning_rate=1e-4,               # Lowered for stability with LoRA
    gradient_accumulation_steps=2,    # Effective batch size = auto-detected * 2
    num_train_epochs=5,               # Reduced to avoid overfitting
    save_steps=500,                   # ~Half an epoch for 5,000 examples, batch size ~8
    save_strategy='steps',
    eval_steps=500,                   # Evaluate midway through each epoch
    logging_steps=500,                # Log midway through each epoch
    evaluation_strategy="steps",
    warmup_steps=100,                 # Shortened for faster adaptation
    save_total_limit=3,               # Keep 3 checkpoints
    load_best_model_at_end=True,      # Revert to best model
    # Timestamped so that every run writes to its own directory.
    output_dir = f'./peft-dialogue-summary-large-training-final2-{str(int(time.time()))}',
    # LoRA adapter settings
    r=16,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM, # FLAN-T5
    auto_find_batch_size=True,
)

In [None]:
# Build the LoRA adapter configuration from the values stored in config2.
lora_fields = ("r", "lora_alpha", "target_modules", "lora_dropout", "bias", "task_type")
lora_config = LoraConfig(**{field: getattr(config2, field) for field in lora_fields})

In [None]:
# Add LoRA adapter layers/parameters to the original LLM to be trained.
# NOTE(review): PEFT appears to inject the adapters into the model object it is given —
# confirm that original_model can still serve as the un-tuned baseline in later cells.
peft_model = get_peft_model(original_model, lora_config)
# Pass an explicit tag: with the default, these counts were logged under "original_model".
print(print_number_of_trainable_model_parameters(peft_model, tag="peft_model"))

In [None]:
# Collator that batches the tokenized examples for the seq2seq PEFT model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=peft_model)

In [None]:
# Reuse the directory from config2. The previous code built a second, differently
# timestamped path here that was never passed to the Trainer, so `output_dir` did not
# point at the directory where checkpoints are actually written.
output_dir = config2.output_dir

peft_training_args = TrainingArguments(
    auto_find_batch_size=config2.auto_find_batch_size,
    output_dir=output_dir,
    learning_rate=config2.learning_rate,
    gradient_accumulation_steps=config2.gradient_accumulation_steps,
    num_train_epochs=config2.num_train_epochs,
    save_steps=config2.save_steps,
    save_strategy=config2.save_strategy,  # must not be "no": the best checkpoint is picked from saved ones
    eval_steps=config2.eval_steps,
    logging_steps=config2.logging_steps,
    evaluation_strategy=config2.evaluation_strategy,
    warmup_steps=config2.warmup_steps,
    save_total_limit=config2.save_total_limit,
    load_best_model_at_end=config2.load_best_model_at_end,
    report_to="wandb",
    run_name=f"PEFT_tuning-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",
)

# Train on the tokenized train split and evaluate on the validation split.
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
)

In [None]:
# Train inside a dedicated W&B run, then save the adapter and the tokenizer side by side.
train_run = wandb.init(project=PROJECT, job_type="train")
with train_run:
    peft_trainer.train()

peft_model_path = "./peft-dialogue-summary-large-checkpoint-final2"
for artifact in (peft_trainer.model, tokenizer):
    artifact.save_pretrained(peft_model_path)

In [None]:
# Save the trained model and the tokenizer to peft_model_path.
# NOTE(review): this repeats the save at the end of the training cell above with the
# same path — confirm whether this cell is still needed.
peft_model_path="./peft-dialogue-summary-large-checkpoint-final2"
peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

In [None]:
#Evaluation
from peft import PeftModel, PeftConfig

# Reload the base checkpoint through MODEL_NAME so the adapter is always attached to the
# same checkpoint name the notebook is configured with (previously hard-coded twice here).
peft_model_base = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Attach the saved LoRA adapter for inference only.
peft_model = PeftModel.from_pretrained(peft_model_base,
                                       peft_model_path,
                                       torch_dtype=torch.bfloat16,
                                       is_trainable=False)

In [None]:
# Move the model to the device selected at the top of the notebook instead of a
# hard-coded "cuda" string.
peft_model = peft_model.to(device)

In [None]:
# Define the W&B table that stores example generations (column headers spelled correctly).
columns = ["index", "dialogue", "prompt", "human_summary", "peft_model_output"]
table3 = wandb.Table(columns=columns)

In [None]:
# Qualitative comparison: for a few test dialogues, generate a summary with both models,
# print the results per example, and record each example in the W&B table.
# NOTE(review): original_model was passed to get_peft_model earlier — confirm it still
# represents the un-tuned baseline here.
lindex = [100, 200, 300]

# Built once; the settings do not change between examples.
generation_config = GenerationConfig(max_new_tokens=200, num_beams=1)

for index in lindex:
    dialogue = dataset['test'][index]['dialogue']
    baseline_human_summary = dataset['test'][index]['summary']

    # Same prompt layout that tokenize_function used for training.
    prompt = f"Summarize the following conversation.\n\n{dialogue}\n\nSummary: "

    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=generation_config)
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

    peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=generation_config)
    peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

    # Fill the table; it was previously logged without any rows.
    table3.add_data(index, dialogue, prompt, baseline_human_summary, peft_model_text_output)

    # Print inside the loop so every example is shown, not only the last one.
    print(dash_line)
    print(f'TEST EXAMPLE INDEX: {index}')
    print(dash_line)
    print(f'BASELINE HUMAN SUMMARY:\n{baseline_human_summary}')
    print(dash_line)
    print(f'ORIGINAL MODEL:\n{original_model_text_output}')
    print(dash_line)
    print(f'PEFT MODEL: {peft_model_text_output}')

In [None]:
# Open a W&B run of job type "examples" and log table3 under the key "peft_model".
with wandb.init(project=PROJECT, job_type="examples"):

   wandb.log({"peft_model": table3})

In [None]:
# Load the ROUGE metric through the evaluate library.
rouge = evaluate.load('rouge')

In [None]:
# Evaluate the model quantitatively (ROUGE): generate summaries for the first ten test
# dialogues with both models and collect them next to the human references.
dialogues = dataset['test'][0:10]['dialogue']
human_baseline_summaries = dataset['test'][0:10]['summary']

original_model_summaries = []
peft_model_summaries = []

# Built once; the settings do not change between dialogues.
generation_config = GenerationConfig(max_new_tokens=200)

for dialogue in dialogues:
    # Same prompt layout that tokenize_function used for training.
    prompt = f"Summarize the following conversation.\n\n{dialogue}\n\nSummary: "

    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=generation_config)
    original_model_summaries.append(tokenizer.decode(original_model_outputs[0], skip_special_tokens=True))

    peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=generation_config)
    peft_model_summaries.append(tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True))

zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, peft_model_summaries))

df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries', 'original_model_summaries', 'peft_model_summaries'])
df

In [None]:
# Open a W&B run of job type "examples" and log the summaries DataFrame as a table.
with wandb.init(project=PROJECT, job_type="examples"):
  table2= wandb.Table(data=df)
  wandb.log({"outputs_original_instruct_peft_model": table2})

In [None]:
# Score both sets of generated summaries against the human references with ROUGE.
rouge = evaluate.load('rouge')

rouge_options = dict(use_aggregator=True, use_stemmer=True)

original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    **rouge_options,
)

peft_model_results = rouge.compute(
    predictions=peft_model_summaries,
    references=human_baseline_summaries[0:len(peft_model_summaries)],
    **rouge_options,
)

for heading, scores in (('ORIGINAL MODEL:', original_model_results), ('PEFT MODEL:', peft_model_results)):
    print(heading)
    print(scores)

In [None]:
# Recompute ROUGE from the columns of the summaries DataFrame.
human_baseline_summaries, original_model_summaries, peft_model_summaries = (
    df[column].values
    for column in ('human_baseline_summaries', 'original_model_summaries', 'peft_model_summaries')
)

rouge_options = dict(use_aggregator=True, use_stemmer=True)

original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    **rouge_options,
)

peft_model_results = rouge.compute(
    predictions=peft_model_summaries,
    references=human_baseline_summaries[0:len(peft_model_summaries)],
    **rouge_options,
)

for heading, scores in (('ORIGINAL MODEL:', original_model_results), ('PEFT MODEL:', peft_model_results)):
    print(heading)
    print(scores)

In [None]:
# Define the W&B table for the ROUGE comparison: one row per metric with the
# original-model score and the improvement of the PEFT model.
columns = ["metric", "original_model","improvement"]
table10 = wandb.Table(columns=columns)

In [None]:
# Report, per ROUGE metric, how much the PEFT model scores above the original model.
# The heading previously said "HUMAN BASELINE", but the difference is computed against
# the original model's scores.
print("Absolute percentage improvement of PEFT MODEL over ORIGINAL MODEL")
with wandb.init(project=PROJECT, job_type="metrics"):
    # Pair scores by metric name rather than by position so the two result dicts
    # cannot be misaligned.
    for key, peft_score in peft_model_results.items():
        original_score = original_model_results[key]
        improvement = peft_score - original_score
        print(f'{key}: {improvement*100:.2f}% original = {original_score} peft = {peft_score}')
        table10.add_data(key, original_score, f"{improvement*100:.2f}%")

    wandb.log({"Rouge Metrics": table10})

In [None]:
# Write the summaries DataFrame to CSV. The target directory is created first;
# to_csv does not create missing parent directories.
results_path = "./results/results_flan_large_final2_t5.csv"
Path(results_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(results_path, index=False)

In [None]:
# Close the active W&B run.
wandb.finish()