In [7]:
%pip install --upgrade pip
%pip install --disable-pip-version-check torch==1.13.1 torchdata==0.5.1 --quiet 

^C
[31mERROR: Operation cancelled by user[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [8]:
%pip install transformers==4.27.2 datasets==2.11.0 \
    evaluate==0.4.0 \
    rouge_score==0.1.2\
    loralib==0.1.1 \
    peft==0.3.0 --quiet

Note: you may need to restart the kernel to use updated packages.


In [9]:
import torch 
import time 
import evaluate  ## for calculating rouge score
import pandas as pd
import numpy as np

from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer

### Load Dataset and LLM

In [10]:
huggingface_dataset_name = "knkarthick/dialogsum"

dataset = load_dataset(huggingface_dataset_name)

dataset

Found cached dataset csv (/home/sagemaker-user/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-cd36827d3490488d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
})

In [11]:
model_name = 'google/flan-t5-base'

# bfloat16 mean we are using the small version of flan-t5
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)

tokenizer = AutoTokenizer.from_pretrained(model_name)

It is possible to pull out the number of parameters from the model and find out how many of them are trainable.

In [12]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f'trainable model parameters: {trainable_model_params}\n \
            all model parameters: {all_model_params} \n \
            percentage of trainable model parameters: {(trainable_model_params / all_model_params) * 100} %'



In [13]:
print(print_number_of_trainable_model_parameters(original_model))

trainable model parameters: 247577856
             all model parameters: 247577856 
             percentage of trainable model parameters: 100.0 %


# 1. Preprocess the Dialog-Summary Dataset

We need to convert dialog-summary into explicit instruction for LLM. Prepend an instruction to the start of the dialog with Summarize the following conversation and to the start of the summary with summary as follows:
<br>
Training prompt (dialogue):

```
Summarize the following conversation.

    Chris: This is his part of conversation
    Antje: This is her part of conversation
    
Summary:
```

Training response (summary):

```
Both Chrish and Antje participated in the conversation.
```

Then preprocess the prompt-response dataset into token and pull out their input_ids (1 per token)

In [14]:
def tokeninze_function(example):
    start_prompt = 'Summarize the following conversation. \n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]
    example['input_ids'] = tokenizer(prompt, padding='max_length', truncation=True, 
                                     return_tensors='pt').input_ids
    example['labels'] = tokenizer(example['summary'], padding='max_length', truncation=True, 
                                 return_tensors='pt').input_ids
    
    return example

# The Dataseta ctually contains 3 diff splits: train, validation, and test.
# The tokenize_function code is handling all data across all splits in batches
tokenize_datasets = dataset.map(tokeninze_function, batched=True)
tokenize_datasets = tokenize_datasets.remove_columns(['id', 'topic', 'dialogue',
                                                     'summary'])

Loading cached processed dataset at /home/sagemaker-user/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-cd36827d3490488d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-1ac4b8d3502cfcf7.arrow
Loading cached processed dataset at /home/sagemaker-user/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-cd36827d3490488d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-7b58069a688c8eb7.arrow


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

To save some time in the lab, you will subsample the dataset:

In [15]:
tokenize_datasets = tokenize_datasets.filter(lambda exmaple, index: index % 100 == 0, 
                                            with_indices=True)

Loading cached processed dataset at /home/sagemaker-user/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-cd36827d3490488d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-ac3cd98dc41c44d5.arrow
Loading cached processed dataset at /home/sagemaker-user/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-cd36827d3490488d/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-00ac8b56c956cb88.arrow


Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Check the shape of all three dataset

In [16]:
print(f'Training: {tokenize_datasets["train"].shape}')
print(f'Valdiation: {tokenize_datasets["validation"].shape}')
print(f'Test: {tokenize_datasets["test"].shape}')
print(tokenize_datasets)

Training: (125, 2)
Valdiation: (5, 2)
Test: (15, 2)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 125
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 15
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 5
    })
})


Use the Hugging face builtin `Trainer` library. Pass the preprocessed dataset with reference to original model. 

In [17]:
output_dir = f'./dialogue-summary-training-{str(int(time.time()))}'

In [18]:
# import os
# os.environ['WANDB_DISABLED'] = 'true'

## 2. Parameter Efficient Fine Tuning

### 2.1. PEFT using LoRA technique
LoRA required setting up a new layer of adapter. PEFT freezes the underlying LLM parameters and train only adapter.

In [19]:


lora_config = LoraConfig(r=32, #rank 32,
                         lora_alpha=32, ## LoRA Scaling factor 
                         target_modules=['q', 'v'], ## The modules(for example, attention blocks) to apply the LoRA update matrices.
                         lora_dropout = 0.05,
                         bias='none',
                         task_type=TaskType.SEQ_2_SEQ_LM ## flan-t5
)

## target_modules='q', This represents the value projection layer in the transformer model. The value projection layer transforms input tokens into value vectors,
# which are the actual values that are attended to based on the attention scores computed from query and key vectors.

## target_modules='v',This typically refers to the query projection layer in a transformer-based model. The query projection layer is responsible for transforming 
# input tokens into query vectors, which are used to attend to other tokens in the sequence during self-attention mechanism.

Add LoRA parameter to original model

In [20]:
peft_model = get_peft_model(original_model, lora_config)

print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 3538944
             all model parameters: 251116800 
             percentage of trainable model parameters: 1.4092820552029972 %


### 2.2. Train PEFT Adapter

In [21]:
output_dir = f'./dialogue-summary-training-{str(int(time.time()))}'
## this is we are again back to the hugging face trainer module
peft_training_args = TrainingArguments(output_dir=output_dir,
                                       auto_find_batch_size=True,
                                       learning_rate=1e-3,
                                       num_train_epochs=1,
                                       logging_steps=1,
                                       max_steps=1,
                                        report_to='none' ## can be wandb, but we are reporint to noe
                )

## this is same except we are using PEFT model instead of regular
peft_trainer = Trainer(model=peft_model, 
                      args=peft_training_args,
                      train_dataset=tokenize_datasets['train']
                 )

In [22]:
peft_trainer.train()



Step,Training Loss
1,51.25


TrainOutput(global_step=1, training_loss=51.25, metrics={'train_runtime': 51.2322, 'train_samples_per_second': 0.156, 'train_steps_per_second': 0.02, 'total_flos': 5565031907328.0, 'train_loss': 51.25, 'epoch': 0.06})

In [23]:
peft_model_path = './peft-dialogue-summary-checkpoint-local'

peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)


('./peft-dialogue-summary-checkpoint-local/tokenizer_config.json',
 './peft-dialogue-summary-checkpoint-local/special_tokens_map.json',
 './peft-dialogue-summary-checkpoint-local/spiece.model',
 './peft-dialogue-summary-checkpoint-local/added_tokens.json',
 './peft-dialogue-summary-checkpoint-local/tokenizer.json')

In [24]:
!ls -alh ./peft-dialogue-summary-checkpoint-local/adapter_model.bin

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
-rw-r--r-- 1 sagemaker-user users 14M Mar 21 17:27 ./peft-dialogue-summary-checkpoint-local/adapter_model.bin


This is just a 14MB model 

Inferencing from PEFT fine tuned model

In [28]:
peft_model_base = AutoModelForSeq2SeqLM.from_pretrained('google/flan-t5-base', torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-base')

peft_model = PeftModel.from_pretrained(peft_model_base, 
                                      './peft-dialogue-summary-checkpoint-local',
                                      torch_dtype=torch.bfloat16,
                                      is_trainable=False) ## is_trainable mean just a forward pass jsut to get a sumamry

In [29]:
index = 200 ## randomly pick index
dialogue = dataset['test'][index]['dialogue']
human_baseline_summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

input_ids = tokenizer(prompt, return_tensors='pt').input_ids

original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)


peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

print(f'Human Baseline summary: \n{human_baseline_summary}\n')
print(f'Original Model Output \n{original_model_text_output}\n')
print(f'Peft Model Output \n{peft_model_text_output}\n')

Human Baseline summary: 
#Person1# teaches #Person2# how to upgrade software and hardware in #Person2#'s system.

Original Model Output 
#Person1#: I'm thinking about upgrading my computer.

Peft Model Output 
#Person1#: I'm thinking of upgrading my computer.



### 2.3. Evaluate model quantitavely

In [30]:
dialogue = dataset['test'][0:10]['dialogue']
human_baseline_summaries = dataset['test'][0:10]['summary']

original_model_summaries = []
peft_model_summaries = []

for _, dialogue in enumerate(dialogue):
    prompt = f"""
    Summarize the following conversations. 

    {dialogue}

    Summary: """

    input_ids = tokenizer(prompt, return_tensors='pt').input_ids

    original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
    original_model_summaries.append(original_model_text_output)
    

    peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
    peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)
    peft_model_summaries.append(peft_model_text_output)


zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries,
                           peft_model_summaries))

df = pd.DataFrame(zipped_summaries, columns=['human_baseline_summaries', 'original_model_summaries', 'peft_model_summaries'])
df

Unnamed: 0,human_baseline_summaries,original_model_summaries,peft_model_summaries
0,Ms. Dawson helps #Person1# to write a memo to ...,This memo should be typed up and distributed t...,#Person1#: I need to take a dictation for you....
1,In order to prevent employees from wasting tim...,This memo will be distributed to all employees...,#Person1#: I need to take a dictation for you....
2,Ms. Dawson takes a dictation for #Person1# abo...,#Person1: I need to take a dictamenum for ever...,#Person1#: I need to take a dictation for you....
3,#Person2# arrives late because of traffic jam....,The traffic in this city is terrible.,The traffic jam at the Carrefour intersection ...
4,#Person2# decides to follow #Person1#'s sugges...,The traffic jam near the Carrefour intersectio...,The traffic jam at the Carrefour intersection ...
5,#Person2# complains to #Person1# about the tra...,#Person1#: I'm finally here!,The traffic jam at the Carrefour intersection ...
6,#Person1# tells Kate that Masha and Hero get d...,Masha and Hero are having a separation for 2 m...,Masha and Hero are getting divorced.
7,#Person1# tells Kate that Masha and Hero are g...,Masha and Hero are divorced.,Masha and Hero are getting divorced.
8,#Person1# and Kate talk about the divorce betw...,Masha and Hero are divorced.,Masha and Hero are getting divorced.
9,#Person1# and Brian are at the birthday party ...,Brian is having a party with his friends.,Brian's birthday is coming up.


Calculate rouge Metrics score

In [31]:
rouge = evaluate.load('rouge')

In [32]:
original_model_results = rouge.compute(predictions=original_model_summaries, 
                                       references=human_baseline_summaries[0: len(original_model_summaries)],
                                      use_aggregator=True,
                                      use_stemmer=True)

peft_model_results = rouge.compute(predictions=peft_model_summaries, 
                                    references=human_baseline_summaries[0: len(peft_model_summaries)],
                                    use_aggregator=True,
                                    use_stemmer=True)

print(f'Original Model: \n{original_model_results}\n') 
print(f'PEFT Model: \n{peft_model_results}\n') 

Original Model: 
{'rouge1': 0.20830532190588022, 'rouge2': 0.07178140096618357, 'rougeL': 0.17268616035826098, 'rougeLsum': 0.17717816419240226}

PEFT Model: 
{'rouge1': 0.2350105883395357, 'rouge2': 0.10417391304347826, 'rougeL': 0.2156123822571191, 'rougeLsum': 0.22044670728881255}



PEFT rouge score is better than the original model. It uses the less resources and improves in comparison to original model. <br> 
These are just a few examples but you can  imagine its impact at scale and how much it saves in terms of resources, and times. 