In [1]:
import torch
from transformers import BertTokenizer

# Load Pretrained BERT Tokenizer



In [2]:
from transformers import TrainingArguments, Trainer

In [3]:
pip install datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

Loading the model

In [44]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-base")

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [45]:
model = model.to("cuda")

Loading the dataset

In [46]:
from datasets import load_dataset

ds = load_dataset("knkarthick/samsum")

Explore the dataset

In [47]:
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
})

In [48]:
print(ds['train']['id'][0])

13818513


In [49]:
print(ds['train']['dialogue'][6054])

None


In [50]:
print(ds['train']['summary'][6054])

problem with visualization of the content


In [51]:
# Function to calculate average lengths of dialogues and summaries
def calculate_avg_lengths(dataset):
    total_dialogue_length = 0
    total_summary_length = 0
    num_samples = len(dataset)
    # cnt = 0
    # Iterate over the dataset and sum the lengths
    for entry in dataset:
        # print(cnt)
        if entry['dialogue']!= None:
          total_dialogue_length += len(entry['dialogue'].split())  # Count words in dialogue
        total_summary_length += len(entry['summary'].split())  # Count words in summary
        # cnt += 1
    # Calculate average lengths
    avg_dialogue_length = total_dialogue_length / num_samples
    avg_summary_length = total_summary_length / num_samples

    return avg_dialogue_length, avg_summary_length

# Calculate average lengths for the training dataset
avg_dialogue_len, avg_summary_len = calculate_avg_lengths(ds['train'])

print(f"Average length of dialogue: {avg_dialogue_len:.2f} words")
print(f"Average length of summary: {avg_summary_len:.2f} words")


Average length of dialogue: 93.79 words
Average length of summary: 20.32 words


Tokenize function

In [52]:
# Instead of using remove, try selecting all elements EXCEPT the one you want to remove
ds = ds.filter(lambda example, idx: idx != 6054, with_indices=True)

In [70]:
def tokenize_function(example):
    start_prompt = "Summarize the following conversation.\n\n"
    end_prompt = "\n\nSummary: "

    # Generate prompts for each dialogue
    prompt_texts = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]

    # Tokenize dialogues and summaries separately
    tokenized_inputs = tokenizer(prompt_texts, padding="max_length", truncation=True, max_length = 512)
    tokenized_labels = tokenizer(example["summary"], padding="max_length", truncation=True, max_length = 512)

    # Assign input_ids and labels correctly
    example["input_ids"] = tokenized_inputs["input_ids"]
    example["labels"] = tokenized_labels["input_ids"]

    return example


In [71]:
train_dataset = ds['train'].map(tokenize_function, batched=True)
test_dataset = ds['test'].map(tokenize_function, batched=True)

Map:   0%|          | 0/14731 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

In [72]:
eval_dataset = ds['validation'].map(tokenize_function, batched=True)

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

In [73]:
train_dataset

Dataset({
    features: ['id', 'dialogue', 'summary', 'input_ids', 'labels'],
    num_rows: 14731
})

In [74]:
train_tokenized_dataset = train_dataset.remove_columns(['id', 'dialogue', 'summary'])
test_tokenized_dataset = test_dataset.remove_columns(['id', 'dialogue', 'summary'])

In [75]:
print(train_tokenized_dataset[0])

{'input_ids': [0, 38182, 3916, 2072, 5, 511, 1607, 4, 50118, 50118, 10127, 5219, 35, 38, 17241, 1437, 15269, 4, 1832, 47, 236, 103, 116, 50118, 39237, 35, 9136, 328, 50118, 10127, 5219, 35, 38, 581, 836, 47, 3859, 48433, 50118, 50118, 47977, 35, 1437, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [76]:
tokenizer.decode(train_tokenized_dataset[0]['input_ids'], skip_special_tokens=True)

"Summarize the following conversation.\n\nAmanda: I baked  cookies. Do you want some?\nJerry: Sure!\nAmanda: I'll bring you tomorrow :-)\n\nSummary: "

In [77]:
eval_tokenized_dataset = eval_dataset.remove_columns(['id', 'dialogue', 'summary'])

Helper function to calculate trainable parameters of the model


In [78]:
def trainable_param(model):
  return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [79]:
def all_param(model):
  return sum(p.numel() for p in model.parameters())

Peft model

In [80]:
from peft import get_peft_model, LoraConfig, TaskType

In [81]:
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=4, lora_alpha=8, lora_dropout=0.1
)

In [82]:
peft_model = get_peft_model(model, peft_config)
print(trainable_param(peft_model), all_param(peft_model))

221184 139641600


In [83]:
peft_model = peft_model.to("cuda")

In [84]:
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback

In [105]:
output_dir = f'./a'

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=2e-4, # Higher learning rate than full fine-tuning.
    num_train_epochs=3,
    logging_steps=100,
    per_device_train_batch_size=8,  # Batch size per GPU for training
    per_device_eval_batch_size=8,   # Batch size per GPU for evaluation
    gradient_accumulation_steps=2,  # Accumulate gradients over multiple batches
    optim="adamw_hf",
)

peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset = train_tokenized_dataset,
    eval_dataset = eval_tokenized_dataset
)

In [115]:
peft_trainer.train()

Step,Training Loss
100,5.2393
200,5.2377
300,5.2381
400,5.2398
500,5.2317
600,5.2377
700,5.2321
800,5.2326
900,5.2327
1000,5.2316


TrainOutput(global_step=2763, training_loss=5.230563318043397, metrics={'train_runtime': 1083.7597, 'train_samples_per_second': 40.777, 'train_steps_per_second': 2.549, 'total_flos': 1.3503069787521024e+16, 'train_loss': 5.230563318043397, 'epoch': 3.0})

Save model

In [87]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [118]:
model.save_pretrained("/content/drive/MyDrive/TheLongTran/Samsum")

In [119]:

tokenizer.save_pretrained("/content/drive/MyDrive/TheLongTran/samsum_tokenize")

('/content/drive/MyDrive/TheLongTran/samsum_tokenize/tokenizer_config.json',
 '/content/drive/MyDrive/TheLongTran/samsum_tokenize/special_tokens_map.json',
 '/content/drive/MyDrive/TheLongTran/samsum_tokenize/vocab.json',
 '/content/drive/MyDrive/TheLongTran/samsum_tokenize/merges.txt',
 '/content/drive/MyDrive/TheLongTran/samsum_tokenize/added_tokens.json',
 '/content/drive/MyDrive/TheLongTran/samsum_tokenize/tokenizer.json')

Evaluate

In [110]:
base_tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
base_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-base")

In [111]:
base_model.to("cuda")

BartForConditionalGeneration(
  (model): BartModel(
    (shared): BartScaledWordEmbedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_lay

In [112]:
!pip install evaluate rouge_score



In [93]:
import evaluate

In [94]:
rouge = evaluate.load('rouge')

In [95]:
import pandas as pd
from transformers import GenerationConfig

In [96]:
eval_tokenized_dataset[0]['input_ids']

[0,
 38182,
 3916,
 2072,
 5,
 511,
 1607,
 4,
 50118,
 50118,
 250,
 35,
 12289,
 1560,
 6,
 32,
 47,
 3610,
 3859,
 17,
 27,
 29,
 1390,
 116,
 50118,
 387,
 35,
 38,
 17,
 27,
 119,
 1256,
 686,
 38,
 524,
 4,
 653,
 17,
 27,
 29,
 62,
 116,
 50118,
 250,
 35,
 2615,
 47,
 213,
 19,
 162,
 7,
 5,
 3477,
 5159,
 116,
 4,
 50118,
 387,
 35,
 653,
 109,
 47,
 236,
 7,
 109,
 116,
 50118,
 250,
 35,
 38,
 236,
 7,
 120,
 10,
 20830,
 13,
 127,
 979,
 4,
 50118,
 387,
 35,
 280,
 40,
 146,
 123,
 98,
 1372,
 4,
 50118,
 250,
 35,
 8976,
 6,
 52,
 17,
 27,
 548,
 3373,
 24,
 171,
 498,
 4,
 38,
 206,
 37,
 17,
 27,
 29,
 1227,
 122,
 4,
 50118,
 387,
 35,
 280,
 17,
 27,
 29,
 205,
 4,
 4833,
 3009,
 10,
 2335,
 16,
 10,
 1828,
 696,
 4,
 2011,
 519,
 10,
 1928,
 25606,
 45406,
 1437,
 50118,
 250,
 35,
 38,
 581,
 120,
 123,
 65,
 9,
 167,
 410,
 3678,
 4,
 50118,
 387,
 35,
 509,
 14,
 351,
 75,
 1733,
 62,
 350,
 380,
 131,
 45406,
 50118,
 250,
 35,
 178,
 3529,
 350,
 203,
 131,
 12,

In [116]:
dialogues = ds['test']['dialogue']
human_baseline_summaries = ds['test']['summary']

original_model_summaries = []
instruct_model_summaries = []

for _, dialogue in enumerate(dialogues):
    prompt = str(dialogue)
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")

    original_model_outputs = base_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=30))
    original_model_text_output = base_tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
    original_model_summaries.append(original_model_text_output)

    instruct_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=30))
    instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)
    instruct_model_summaries.append(instruct_model_text_output)

zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, instruct_model_summaries))

df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries', 'original_model_summaries', 'instruct_model_summaries'])
df

Unnamed: 0,human_baseline_summaries,original_model_summaries,instruct_model_summaries
0,Hannah needs Betty's number but Amanda doesn't...,"Hannah: Hey, do you have Betty's number?Amanda...",Hannah's phone is missing. Amanda has it. Aman...
1,Eric and Rob are going to watch a stand-up on ...,Eric: MACHINE! I love it!Rob: That's so gr8!Er...,Rob is funny about the train. He loves the tra...
2,Lenny can't decide which trousers to buy. Bob ...,"Lenny: Babe, can you help me with something?Bo...",Bob wants to pick the best outfit. Lenny likes...
3,Emma will be home soon and she will let Will k...,"Will: hey babe, what do you want for dinner to...",Emma is hungry tonight. Will will pick Emma up...
4,Jane is in Warsaw. Ollie and Jane has a party....,"Ollie: Hi , are you in Warsaw?Jane: yes, just ...",Ollie is in Warsaw for lunch. Jane will be the...
...,...,...,...
814,Benjamin didn't come to see a basketball game ...,Alex: Were you able to attend Friday night's b...,Benjamin was unable to attend Friday night's b...
815,The audition starts at 7.30 P.M. in Antena 3.,Jamilla: remember that the audition starts at ...,Jamilla: Antena 3 starts at 7:30 P.M.Jamilla: ...
816,"Marta sent a file accidentally,","Marta: <file_gif>Marta: Sorry girls, I clicked...",Marta: Thanks Marta for clicking the gallery.M...
817,There was a meet-and-greet with James Charles ...,Cora: Have you heard how much fuss British med...,Cora and her sister are surprised about the me...


In [117]:
original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

instruct_model_results = rouge.compute(
    predictions=instruct_model_summaries,
    references=human_baseline_summaries[0:len(instruct_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

print('ORIGINAL MODEL:')
print(original_model_results)
print('INSTRUCT MODEL:')
print(instruct_model_results)

ORIGINAL MODEL:
{'rouge1': 0.2924926335963355, 'rouge2': 0.07657073408143249, 'rougeL': 0.2323151004087458, 'rougeLsum': 0.2324179894968585}
INSTRUCT MODEL:
{'rouge1': 0.3735312462770852, 'rouge2': 0.13857327343132458, 'rougeL': 0.29613868100870855, 'rougeLsum': 0.2961882210736978}
