# S-1 Set up Kernel, Load Required Dependencies, Dataset and LLM

In [None]:
%pip install --upgrade pip
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1 --quiet

%pip install \
    transformers==4.27.2 \
    datasets==2.11.0 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 \
    peft==0.3.0 --quiet

In [2]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np


In [3]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
base_model = "facebook/bart-base"
max_input_length = 1024
max_target_length = 150

tokenizer = AutoTokenizer.from_pretrained(base_model)
model = AutoModelForSeq2SeqLM.from_pretrained(base_model)
#

In [5]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(model))


trainable model parameters: 139420416
all model parameters: 139420416
percentage of trainable model parameters: 100.00%


In [None]:
from datasets import load_dataset

dataset = load_dataset("cnn_dailymail",'3.0.0')
#

In [7]:
dataset.shuffle()

sampled_dataset = dataset
sampled_dataset['train'] = dataset['train'].shard(num_shards=50, index=0)
sampled_dataset['validation'] = dataset['validation'].shard(num_shards=40, index=0)
sampled_dataset['test'] = dataset['test'].shard(num_shards=40, index=0)
sampled_dataset
#

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 5743
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 335
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 288
    })
})

In [8]:

index=200

article = sampled_dataset['test'][index]['article']
highlights = sampled_dataset['test'][index]['highlights']

# prompt = f"""
# Summarize the following article.

# {article}

# Summary:
# """
inputs = tokenizer(article,  max_length=max_input_length,return_tensors="pt",truncation=True)
output = tokenizer.decode(
    model.generate(
    inputs["input_ids"],
    max_new_tokens=max_target_length,
    )[0],
    skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{article}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{highlights}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:
President Barack Obama made an unscheduled stop to the Bob Marley museum in Jamaica last night while on a visit to the country for a meeting with Caribbean leaders. The first president to visit Jamaica in three decades, Obama arrived in Kingston yesterday evening and was met by Prime Minister Portia Simpson-Miller, U.S. Ambassador to Jamaica Luis Moreno and a dozen other dignitaries. Obama promptly embraced the local color  by making an unscheduled late night visit to the museum dedicated to the island's most famous son. Scroll down for video . Barack Obama made an unscheduled stop at the Bob Marley museum while on an official visit to Jamaica . The U.S. President walks around the museum with tour guide Natasha Clark during his unannounced visit . On a tour of the house where the reggae legend lived until his death in 1981, Obama, looking relaxed in shirtsleeves, was shown 

# S2-A Full Fine Tunning


## Fine Tuning the model and then pushing it to hugging_face

In [None]:
def tokenize_function(example):
    # start_prompt = 'Summarize the following article.\n\n'
    # end_prompt = '\n\nSummary: '
    # prompt = [start_prompt + article + end_prompt for article in example["article"]]
    example['input_ids'] = tokenizer(example["article"],  max_length=max_input_length, truncation=True).input_ids
    example['labels'] = tokenizer(example["highlights"],max_length=max_target_length,  truncation=True).input_ids

    return example

# The dataset actually contains 3 diff splits: train, validation, test.
# The tokenize_function code is handling all data across all splits in batches.
tokenized_datasets = sampled_dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['article', 'highlights', 'id',])

In [10]:
print(f"Shapes of the datasets:")
print(f"Training: {tokenized_datasets['train'].shape}")
print(f"Validation: {tokenized_datasets['validation'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")

print(tokenized_datasets)

Shapes of the datasets:
Training: (5743, 2)
Validation: (335, 2)
Test: (288, 2)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 5743
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 335
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 288
    })
})


In [11]:
batch_size = 8
num_train_epochs = 5
logging_steps = len(tokenized_datasets["train"]) // batch_size
model_name = "bart-base"

args = Seq2SeqTrainingArguments(
    output_dir=f"./{model_name}-finetuned-cnn-news",
    evaluation_strategy="epoch",
    learning_rate=5.6e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    logging_steps=logging_steps,
    push_to_hub=True,
    hub_strategy="every_save"
)
#


In [None]:
from huggingface_hub import notebook_login

notebook_login()
#

In [None]:
from datasets import load_metric

rouge_score = load_metric("rouge")
#

In [14]:
def compute_metrics(eval_pred):
    '''Computes metrics that can be used while training'''
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode reference summaries into text
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    # Compute ROUGE scores
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # median scores, we get the fmeasure as percentage
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    return {k: round(v, 4) for k, v in result.items()}
    #

In [15]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
# tokenized_datasets = tokenized_datasets.remove_columns(
#     sampled_dataset["train"].column_names
# )
# Get features for data collator
features = [tokenized_datasets["train"][i] for i in range(2)]
data_collator(features)
#

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[    0,   574,  4524,  ...,     1,     1,     1],
        [    0,  2620, 11350,  ...,  1441,   479,     2]]), 'labels': tensor([[    0, 29345, 10997,   999,  3028,  7312, 20152,  1516,   984,   844,
           448, 13016,    25,    37,  4072,   504,   302,   479, 50118, 22138,
          2701,   161,    37,    34,   117,   708,     7,   856,  3961,  1334,
            39,  1055,   409,   479, 50118, 28243, 20152,    18,  1107,    31,
            78,   292, 10997,  3541,    33,    57,   547,    11,  2416,  1391,
           479,     2,  -100,  -100,  -100,  -100,  -100],
        [    0, 34057,  5699,   334,  4259,     7,    70,  7780,   408,     6,
          6069,     9,  7220,  2194,   479, 50118, 42686, 32483,   161,    69,
           334,    16,   291,   135,  7780,    42,    76,   479, 50118, 39594,
          1269,    35,  7780,  1159,    40,    28, 14948,    88,    22, 17894,
          8656,   113,   301,   479, 50118,  3762,  1294,   161,    37,   685,
         

In [16]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
#

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/hardikJ11/bart-base-finetuned-cnn-news into local empty directory.


In [17]:
import torch
torch.cuda.empty_cache()

In [18]:
# model.push_in_progress =False
trainer.train()



Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,3.7005,2.987193,21.7279,9.0406,17.392,20.0627
2,2.937,2.859016,21.3056,8.5254,17.2338,20.0403
3,2.2642,2.674428,21.277,9.6162,17.7775,20.1688
4,1.5774,2.701996,21.7458,9.846,18.1649,20.7067
5,1.0174,2.855997,21.8948,9.7157,17.9348,20.5347


TrainOutput(global_step=3590, training_loss=2.297552512150289, metrics={'train_runtime': 5047.2762, 'train_samples_per_second': 5.689, 'train_steps_per_second': 0.711, 'total_flos': 1.741374456127488e+16, 'train_loss': 2.297552512150289, 'epoch': 5.0})

In [19]:
trainer.evaluate()

{'eval_loss': 2.855997085571289,
 'eval_rouge1': 21.8948,
 'eval_rouge2': 9.7157,
 'eval_rougeL': 17.9348,
 'eval_rougeLsum': 20.5347,
 'eval_runtime': 52.9487,
 'eval_samples_per_second': 6.327,
 'eval_steps_per_second': 0.793,
 'epoch': 5.0}

In [None]:
trainer.push_to_hub(commit_message="Training complete", tags="summarization")

## Loading the fine-tuned model from hugging_face

In [None]:
# from transformers import pipeline
# hub_model_id = "hardikJ11/t5-small-finetuned-cnn-news"
# summarizer = pipeline("summarization", model=hub_model_id)

# from transformers import pipeline
# summarizer = pipeline("summarization", model="hardikJ11/t5-small-finetuned-cnn-news")

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

instruct_tokenizer = AutoTokenizer.from_pretrained("hardikJ11/bart-base-finetuned-cnn-news")
instruct_model = AutoModelForSeq2SeqLM.from_pretrained("hardikJ11/bart-base-finetuned-cnn-news")

In [23]:
device="cuda:0"
instruct_model.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=1e-05,

In [24]:
def generate_summary(index):
    device="cuda:0"
    article = sampled_dataset['test'][index]['article']
    highlights = sampled_dataset['test'][index]['highlights']

    prompt = f"""
    Summarize the following article.

    {article}

    Summary:
    """
    inputs = instruct_tokenizer(article,  max_length=max_input_length,return_tensors="pt",truncation=True)
    output = instruct_tokenizer.decode(
        instruct_model.generate(
        inputs["input_ids"].to(device),
        max_new_tokens=max_target_length,
        )[0],
        skip_special_tokens=True
    )
    # output = summarizer(prompt, max_length = 100, min_length = 50, do_sample = False)[0]["summary_text"]

    dash_line = '-'.join('' for x in range(100))
    print(dash_line)
    print(f'INPUT PROMPT:\n{article}')
    print(dash_line)
    print(f'BASELINE HUMAN SUMMARY:\n{highlights}\n')
    print(dash_line)
    print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

In [None]:
generate_summary(50)

---------------------------------------------------------------------------------------------------
INPUT PROMPT:
Lionel Messi, Neymar and Luis Suarez is a star-studded strike force that almost every team in the world would dearly love to have, but when the going gets tough, Barcelona turn to Jeremy Mathieu and Gerard Pique. The French centre-back flung himself through the air to reach Xavi's free kick at the back post, heading home from close range, finally breaching Celta Vigo's stern defence. It was Mathieu who had opened the scoring in the El Clasico, in similar fashion. Pique, meanwhile, just a minute before Mathieu scored, had made an incredible, game-saving tackle on Celta striker Charles, to prevent him from opening the scoring. Jeremy Mathieu (right) heads home in the 73rd minute to hand Barcelona a 1-0 lead against Celta Vigo . Celta Vigo goalkeeper Alvarez Conde (right) is unable to stop the powerful header from Mathieu . Celta Vigo: Alvarez Conde, Mallo Novegil, Cabral, Fon

#   S-3 Evaluate the Model Quantitatively (with ROUGE Metric)

In [25]:
device="cuda:0"
index = 100
article = sampled_dataset['test'][index]['article']
baseline_human_summary = sampled_dataset['test'][index]['highlights']

prompt = f"""
Summarize the following article.

{article}

Summary: """

input_ids = tokenizer(article, max_length=max_input_length, return_tensors="pt", truncation=True).input_ids.to(device)


original_model_outputs = model.generate(input_ids=input_ids, generation_config=GenerationConfig( max_new_tokens=max_target_length, num_beams=1))
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

instruct_model_text_output = instruct_tokenizer.decode(
    instruct_model.generate(
    input_ids,
    max_new_tokens=max_target_length,
    )[0],
    skip_special_tokens=True
)
# instruct_model_text_output = summarizer(prompt, max_length = 100, min_length = 50, do_sample = False)[0]["summary_text"]

print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{baseline_human_summary}')
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(dash_line)
print(f'INSTRUCT MODEL:\n{instruct_model_text_output}')
#

---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
Bayern Munich beat Porto 6-1 in the Champions League on Tuesday .
Pep Guardiola's team have now scored 115 goals this season .
Robert Lewandowski scored twice and is their top marksman .
Hamburg are Bayern's favourite opposition with 11 strikes against them .
---------------------------------------------------------------------------------------------------
ORIGINAL MODEL:
Bayern Munich thrnced Porto to reach the Champions League semi-finals on Tuesday night.
Thomas Muller and Thomas Muller score in opening goal against Shakhtar Donetsk.
Bayern Munich have five Bundesliga matches to play for the Bundesliga side.
Bayern Munich have five Bundesliga matches to play for the last campaign.
---------------------------------------------------------------------------------------------------
INSTRUCT MODEL:
Bayern Munich thrnced Porto to reach the Champions League semi-fin

In [26]:
articles = dataset['test'][0:10]['article']
human_baseline_summaries = dataset['test'][0:10]['highlights']

original_model_summaries = []
instruct_model_summaries = []

for idx, article in enumerate(articles):
    prompt = f"""
    Summarize the following article.

    {article}

    Summary: """

    input_ids = tokenizer(article, max_length=max_input_length, return_tensors='pt', truncation=True).input_ids.to(device)

    original_model_outputs = model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=max_target_length, num_beams=1))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

    instruct_model_text_output = tokenizer.decode(
    instruct_model.generate(
    input_ids,
    max_new_tokens=max_target_length,
    )[0],
    skip_special_tokens=True
    )
    # instruct_model_text_output = summarizer(prompt, max_length = 100, min_length = 50, do_sample = False)[0]["summary_text"]


    original_model_summaries.append(original_model_text_output)
    instruct_model_summaries.append(instruct_model_text_output)

zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, instruct_model_summaries))

df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries', 'original_model_summaries', 'instruct_model_summaries'])
df

Unnamed: 0,human_baseline_summaries,original_model_summaries,instruct_model_summaries
0,Membership gives the ICC jurisdiction over all...,The court is based in the Netherlands.\nIt is ...,Palestinians signed the Rome Statute in Januar...
1,High temperatures are recorded on the northern...,The temperature was recorded at the Argentine ...,The temperature was recorded at Argentina's Es...
2,Ted Cruz has built a brand as a stalwart conse...,"Sen. Ted Cruz says he will pray with the ""Sena...",Sen. Ted Cruz pledges to energize Iowa's Chris...
3,"Prosecutor: Carlos Colina, 32, will be arraign...","Carlos Colina, 32, will be arraigned April 14 ...","Carlos Colina, 32, will be arraigned April 14 ..."
4,Cuba pulled off a diplomatic coup by gaining a...,Cuba pulls out diplomatic coup by marshaling s...,Cuba pulls off diplomatic coup by marshaling t...
5,Aaron Hernandez has been found guilty in Odin ...,New England Patriots star Aaron Hernandez is d...,New England Patriots star Aaron Hernandez will...
6,"Thabo Sefolosha says he ""experienced a signifi...",NBA player Thabo Sefolosha says his season-end...,The Atlanta Hawks describe his season-ending l...
7,"Former rap mogul Marion ""Suge"" Knight will be ...",NEW: Judge Ronald Coen lowered Knight's bail t...,NEW: Judge Ronald Coen lowered Knight's bail t...
8,"A selection of notes from British artist's 1,5...",London-based artist has collected beautiful pi...,London-based artist has scoured streets of cit...
9,A small boat carrying about 50 migrants left f...,NEW: At least 21 killed during shipwreck off t...,Helicopter hit a reef and sank as it tries to ...


In [None]:
rouge = evaluate.load('rouge')

original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

instruct_model_results = rouge.compute(
    predictions=instruct_model_summaries,
    references=human_baseline_summaries[0:len(instruct_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)


print('ORIGINAL MODEL:')
print(original_model_results)
print('INSTRUCT MODEL:')
print(instruct_model_results)

In [28]:
print("Absolute percentage improvement of INSTRUCT MODEL over ORIGINAL MODEL")

improvement = (np.array(list(instruct_model_results.values())) - np.array(list(original_model_results.values())))/np.array(list(original_model_results.values()))
for key, value in zip(instruct_model_results.keys(), improvement):
    print(f'{key}: {value*100:.2f}%')

Absolute percentage improvement of INSTRUCT MODEL over ORIGINAL MODEL
rouge1: 14.38%
rouge2: 43.64%
rougeL: 15.85%
rougeLsum: 20.85%
