<a href="https://colab.research.google.com/github/MahdiFaourr/MahdiFaourr/blob/main/News_Summary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Upgrade pip itself before installing the notebook's dependencies.
!pip install --upgrade pip
# Pinned PyTorch packages, installed quietly and without the pip version check.
!pip install --disable-pip-version-check \
   torch==1.13.1\
   torchdata==0.5.1 --quiet
# Pinned Hugging Face, ROUGE and LoRA/PEFT packages used by the cells below.
!pip install \
  transformers==4.27.2 \
  datasets==2.11.0 \
  evaluate==0.4.0 \
  rouge_score==0.1.2 \
  loralib==0.1.1 \
  peft==0.3.0 --quiet
# Helper used to download the Kaggle dataset.
# NOTE(review): unlike the packages above this one is not version-pinned — consider pinning it.
!pip install opendatasets

In [None]:
import opendatasets as od
import pandas as pd
import numpy as np
import evaluate
import torch
from datasets import DatasetDict, Dataset,load_dataset
from transformers import AutoModelForSeq2SeqLM,AutoTokenizer,TrainingArguments,Trainer,GenerationConfig
from peft import LoraConfig,TaskType,get_peft_model,PeftModel

In [None]:
# Download the CNN/DailyMail newspaper text summarization dataset from Kaggle via opendatasets.
# NOTE(review): presumably this prompts for Kaggle credentials on first use — confirm.
od.download("https://www.kaggle.com/datasets/gowrishankarp/newspaper-text-summarization-cnn-dailymail")

In [None]:
# Read the training CSV of the downloaded dataset into a DataFrame and preview the first rows.
# NOTE(review): the absolute /content/... path looks Colab-specific — confirm before running elsewhere.
data=pd.read_csv("/content/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/train.csv")
data.head()

In [None]:
# Dimensions (rows, columns) of the loaded frame.
data.shape

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Remove the 'id' column, which the rest of the notebook never reads.
# errors='ignore' keeps this cell idempotent: because it rebinds `data`, re-running it
# would otherwise raise a KeyError once the column has already been dropped.
data=data.drop(columns='id',errors='ignore')

In [None]:
# Rows 0-9999 are used for fine-tuning; rows 10000-19999 are held out for evaluation.
train_data,test_data=data.iloc[:10000],data.iloc[10000:20000]

In [None]:
# Checkpoint identifier shared by the model and tokenizer loading cells.
model_name='google/flan-t5-base'
# Load the pretrained checkpoint as a sequence-to-sequence language model.
base_model=AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
# Load the tokenizer for the same checkpoint, requesting the fast implementation.
tokenizer=AutoTokenizer.from_pretrained(model_name,use_fast=True)

In [None]:
def print_number_of_trainable_parameters(model):
  """Summarize how many parameters of ``model`` are trainable.

  Walks ``model.named_parameters()`` and returns a two-line string: the number
  of elements in parameters whose ``requires_grad`` flag is set, followed by the
  number of elements across all parameters. Despite the name, the function
  returns the string; printing is left to the caller.
  """
  sizes=[(weights.numel(),weights.requires_grad) for _,weights in model.named_parameters()]
  total=sum(count for count,_ in sizes)
  trainable=sum(count for count,needs_grad in sizes if needs_grad)
  return f"trainable model params:{trainable}\nall model parameters:{total}"
# Report trainable vs. total parameter counts for the loaded base model.
print(print_number_of_trainable_parameters(base_model))

trainable model params:247577856
all model parameters:247577856


In [None]:
# Zero-shot baseline: ask the untuned base model to summarize one article and
# print its output next to the human-written highlights.
dash_line='-'*100  # separator reused by later cells
context=data['article'][1]
summary=data['highlights'][1]
prompt=f""" Summarize the following Context.
{context}
Summary:"""
# Named `model_inputs` rather than `input` so the built-in input() is not shadowed.
model_inputs=tokenizer(prompt,return_tensors='pt')
# Pass the attention mask explicitly together with the token ids.
generated_ids=base_model.generate(
    input_ids=model_inputs['input_ids'],
    attention_mask=model_inputs['attention_mask'],
    max_new_tokens=300,
)
output=tokenizer.decode(generated_ids[0],skip_special_tokens=True)
print(dash_line)
print(f"Human Summary:\n{summary}")
print(dash_line)
# Label typo ("Geneartion") corrected.
print(f"Model Generation-Zero Shot:\n{output}")

----------------------------------------------------------------------------------------------------
Human Summary:
Criminal complaint: Cop used his role to help cocaine traffickers .
Ralph Mata, an internal affairs lieutenant, allegedly helped group get guns .
He also arranged to pay two assassins in a murder plot, a complaint alleges .
----------------------------------------------------------------------------------------------------
Model Geneartion-Zero Shot:
A criminal complaint alleges that a Miami-Dade police officer helped a drug trafficking organization plan a murder plot and get guns.


In [None]:
def make_prompt(example_indices,index_to_summarize,dataset=None):
  """Build a few-shot summarization prompt.

  Every worked example and the final query use the same template
  ("Summarize the following Context." / article / "Summary:"), with blocks
  separated by a blank line. The prompt ends right after the last "Summary:"
  so the model continues with the summary of the target article.

  Args:
    example_indices: indices of rows used as solved examples (article plus
      highlights). May be empty, which yields a zero-shot prompt.
    index_to_summarize: index of the row whose article the model should summarize.
    dataset: optional mapping/DataFrame exposing 'article' and 'highlights'
      columns indexable by the given indices. Defaults to the notebook-level
      `data` frame.

  Returns:
    The assembled prompt string.
  """
  source=data if dataset is None else dataset
  instruction="Summarize the following Context."
  # Built with explicit "\n" rather than an indented triple-quoted literal so that the
  # source-code indentation does not leak into the prompt text.
  shots=[
    f"{instruction}\n{source['article'][index]}\nSummary:\n{source['highlights'][index]}"
    for index in example_indices
  ]
  query=f"{instruction}\n{source['article'][index_to_summarize]}\nSummary:"
  return "\n\n".join(shots+[query])


In [None]:
# One-shot baseline: row 0 serves as the worked example, row 1 is the article to summarize.
# Named `one_shot_inputs` rather than `input` so the built-in input() is not shadowed.
one_shot_inputs=tokenizer(make_prompt([0],1),return_tensors='pt')
# Pass the attention mask explicitly together with the token ids.
generated_ids=base_model.generate(
    input_ids=one_shot_inputs['input_ids'],
    attention_mask=one_shot_inputs['attention_mask'],
    max_new_tokens=300,
)
output=tokenizer.decode(generated_ids[0],skip_special_tokens=True)
print(dash_line)
print(f"Human Summary:\n{data['highlights'][1]}")
print(dash_line)
print(f"Model Generation-One Shot:\n{output}")

----------------------------------------------------------------------------------------------------
Human Summary:
Criminal complaint: Cop used his role to help cocaine traffickers .
Ralph Mata, an internal affairs lieutenant, allegedly helped group get guns .
He also arranged to pay two assassins in a murder plot, a complaint alleges .
----------------------------------------------------------------------------------------------------
Model Generation-One Shot:
A criminal complaint unsealed in U.S. District Court in New Jersey accuses Ralph Mata of helping a drug trafficking organization in exchange for money and gifts.


In [None]:
# Plain Python lists of the training articles and their reference highlights.
train_contexts,train_summaries=(train_data[column].tolist() for column in ('article','highlights'))

In [None]:
# Plain Python lists of the held-out articles and their reference highlights.
test_contexts,test_summaries=(test_data[column].tolist() for column in ('article','highlights'))

In [None]:
# Tokenize training articles and highlights with identical settings:
# padding and truncation enabled, max_length=512, PyTorch tensors returned.
train_tokenizer_kwargs=dict(padding=True,truncation=True,max_length=512,return_tensors='pt')
tokenized_train_contexts=tokenizer(train_contexts,**train_tokenizer_kwargs)
tokenized_train_summaries=tokenizer(train_summaries,**train_tokenizer_kwargs)

In [None]:
# Tokenize held-out articles and highlights with identical settings:
# padding and truncation enabled, max_length=512, PyTorch tensors returned.
test_tokenizer_kwargs=dict(padding=True,truncation=True,max_length=512,return_tensors='pt')
tokenized_test_contexts=tokenizer(test_contexts,**test_tokenizer_kwargs)
tokenized_test_summaries=tokenizer(test_summaries,**test_tokenizer_kwargs)

In [None]:
# Assemble the tensors the Trainer feeds to the model.
# The summaries were padded during tokenization; padded label positions are set to -100,
# the index the Transformers seq2seq loss ignores, so padding does not contribute to the loss.
train_labels=tokenized_train_summaries['input_ids'].clone()
train_labels[train_labels==tokenizer.pad_token_id]=-100
tokenized_training_dataset={
    "input_ids":tokenized_train_contexts['input_ids'],
    # Keep the mask so the encoder does not attend to the padding added to the articles.
    "attention_mask":tokenized_train_contexts['attention_mask'],
    'labels':train_labels
}

In [None]:
# Inspect the assembled training tensors.
tokenized_training_dataset

In [None]:
# Assemble the held-out tensors in the same layout as the training ones.
# Padded label positions are set to -100, the index the Transformers seq2seq loss ignores,
# so padding does not contribute to the evaluation loss.
test_labels=tokenized_test_summaries['input_ids'].clone()
test_labels[test_labels==tokenizer.pad_token_id]=-100
tokenized_testing_dataset={
    "input_ids":tokenized_test_contexts['input_ids'],
    # Keep the mask so the encoder does not attend to the padding added to the articles.
    "attention_mask":tokenized_test_contexts['attention_mask'],
    'labels':test_labels
}

In [None]:
# Wrap the tokenized dictionaries in Hugging Face Datasets, each stored in its own
# single-split DatasetDict ("train" and "test" respectively).
train_split=Dataset.from_dict(tokenized_training_dataset)
test_split=Dataset.from_dict(tokenized_testing_dataset)
train_dataset=DatasetDict({"train":train_split})
test_dataset=DatasetDict({"test":test_split})

In [None]:
# Display the training DatasetDict summary.
train_dataset

In [None]:
# Display the test DatasetDict summary.
test_dataset

In [None]:
# LoRA adapter configuration for sequence-to-sequence fine-tuning.
lora_config=LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=['q','v'],  # module names that receive adapters
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias='none',
)

In [None]:
# get_peft_model injects the LoRA layers into the model it receives (in place). Wrapping
# `base_model` directly would therefore leave no untouched baseline for the later
# base-vs-PEFT comparison, so the adapters are attached to a freshly loaded copy instead.
peft_model=get_peft_model(AutoModelForSeq2SeqLM.from_pretrained(model_name),lora_config)
# Report how few parameters remain trainable once the adapters are attached.
print(print_number_of_trainable_parameters(peft_model))

trainable model params:884736
all model parameters:248462592


In [None]:
# Directory that receives the Trainer outputs.
output_dir='./news-summary-peft-model'

# Hyper-parameters for the LoRA fine-tuning run.
# NOTE(review): num_train_epochs=5 and max_steps=1 are both set; per the Transformers
# documentation a positive max_steps takes precedence — confirm that a single
# optimisation step is really intended.
training_args=TrainingArguments(
    output_dir=output_dir,
    max_steps=1,
    num_train_epochs=5,
    learning_rate=1e-5,
    weight_decay=0.01,
    logging_steps=1,
)

# Trainer wired to the PEFT-wrapped model and the tokenized train/test splits.
trainer_kwargs=dict(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset['train'],
    eval_dataset=test_dataset['test'],
)
trainer=Trainer(**trainer_kwargs)


In [None]:
# Run fine-tuning with the configured Trainer.
trainer.train()

In [None]:
# Persist the fine-tuned adapter. The Trainer exposes the wrapped model as `trainer.model`;
# there is no `trainer.save` attribute, so the previous `trainer.save.model...` call
# raised an AttributeError.
trainer.model.save_pretrained('./news-summary-peft-model')

In [None]:
# PeftModel.from_pretrained expects the base model as its first argument and the adapter
# directory as its second. A fresh copy of the checkpoint is loaded for this purpose so the
# saved adapter is applied to clean weights rather than to an already-wrapped model.
peft_model=PeftModel.from_pretrained(
    AutoModelForSeq2SeqLM.from_pretrained(model_name),
    './news-summary-peft-model',
)
# Inference only from here on: switch to eval mode so dropout layers are inactive.
peft_model.eval()

In [None]:
# Load the ROUGE metric through the evaluate library.
rouge=evaluate.load('rouge')

In [None]:
# Generate summaries for the first 10 held-out articles with both the base model and the
# fine-tuned PEFT model, then collect them next to the human-written highlights.
contexts=test_data['article'][0:10]
# Stored as a plain list so later consumers do not depend on the slice's row labels.
human_summaries=test_data['highlights'][0:10].tolist()
base_model_summaries=[]
peft_model_summaries=[]
for context in contexts:
  # Same "Context" wording as the zero-/one-shot prompts used earlier in the notebook
  # (the inputs are news articles, not conversations).
  prompt=f"""
Summarize the following Context.
{context}
Summary:"""
  inputs=tokenizer(prompt,return_tensors='pt')
  # Keyword arguments only: the PEFT seq2seq wrapper in the pinned peft 0.3.0 defines
  # generate(**kwargs), so a positional input_ids raises a TypeError.
  generation_kwargs=dict(
      input_ids=inputs['input_ids'],
      attention_mask=inputs['attention_mask'],
      max_new_tokens=300,
  )
  base_model_output=tokenizer.decode(base_model.generate(**generation_kwargs)[0],skip_special_tokens=True)
  base_model_summaries.append(base_model_output)
  peft_model_output=tokenizer.decode(peft_model.generate(**generation_kwargs)[0],skip_special_tokens=True)
  peft_model_summaries.append(peft_model_output)
zipped_summaries=list(zip(human_summaries,base_model_summaries,peft_model_summaries))
df=pd.DataFrame(zipped_summaries,columns=['human_summaries','base_model_summaries','peft_model_summaries'])
df.head()

In [None]:
# Score both sets of generated summaries against the human references with ROUGE
# (aggregated scores, stemming enabled) and print the two result dicts.
rouge_kwargs=dict(references=human_summaries,use_aggregator=True,use_stemmer=True)
base_model_results=rouge.compute(predictions=base_model_summaries,**rouge_kwargs)
peft_model_results=rouge.compute(predictions=peft_model_summaries,**rouge_kwargs)
print("Base Model",base_model_results,dash_line,"Peft Model:",peft_model_results,sep="\n")