# Import Tools

In [None]:
# Transformers
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments     # T5 Tokenizer and architecture
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments         # These will help us to fine-tune our model
from transformers import pipeline                                         # Pipeline
from transformers import DataCollatorForSeq2Seq                           # DataCollator to batch the data
import torch                                                              # PyTorch
import pandas as pd                                                       


# Other NLP libraries
from textblob import TextBlob                                             # This is going to help us fix spelling mistakes in texts
from sklearn.feature_extraction.text import TfidfVectorizer               # This is going to helps identify the most common terms in the corpus
import re                                                                 # This library allows us to clean text data
import nltk                                                               # Natural Language Toolkit
nltk.download('punkt')                                                    # This divides a text into a list of sentences

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Raise pandas' per-column display width to 1000 characters so that the
# dialogues and their summaries can be read in full when a frame is shown.
pd.set_option("display.max_colwidth", 1000)

In [None]:
# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU,
# and report which one was chosen.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    print("GPU is available. \nUsing GPU")
else:
    print("GPU is not available. \nUsing CPU")

GPU is available. 
Using GPU


# Load Datasets

In [None]:
# Read the three SAMSum splits from the working directory (train, validation, test, in that order).
train_data, validation_data, test_data = map(
    pd.read_csv,
    ("samsum-train.csv", "samsum-validation.csv", "samsum-test.csv"),
)

# Preview the first training rows.
train_data.head()


Unnamed: 0,id,dialogue,summary
0,13818513,Amanda: I baked cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-),Amanda baked cookies and will bring Jerry some tomorrow.
1,13728867,Olivia: Who are you voting for in this election? \r\nOliver: Liberals as always.\r\nOlivia: Me too!!\r\nOliver: Great,Olivia and Olivier are voting for liberals in this election.
2,13681000,"Tim: Hi, what's up?\r\nKim: Bad mood tbh, I was going to do lots of stuff but ended up procrastinating\r\nTim: What did you plan on doing?\r\nKim: Oh you know, uni stuff and unfucking my room\r\nKim: Maybe tomorrow I'll move my ass and do everything\r\nKim: We were going to defrost a fridge so instead of shopping I'll eat some defrosted veggies\r\nTim: For doing stuff I recommend Pomodoro technique where u use breaks for doing chores\r\nTim: It really helps\r\nKim: thanks, maybe I'll do that\r\nTim: I also like using post-its in kaban style",Kim may try the pomodoro technique recommended by Tim to get more stuff done.
3,13730747,"Edward: Rachel, I think I'm in ove with Bella..\r\nrachel: Dont say anything else..\r\nEdward: What do you mean??\r\nrachel: Open your fu**ing door.. I'm outside",Edward thinks he is in love with Bella. Rachel wants Edward to open his door. Rachel is outside.
4,13728094,"Sam: hey overheard rick say something\r\nSam: i don't know what to do :-/\r\nNaomi: what did he say??\r\nSam: he was talking on the phone with someone\r\nSam: i don't know who\r\nSam: and he was telling them that he wasn't very happy here\r\nNaomi: damn!!!\r\nSam: he was saying he doesn't like being my roommate\r\nNaomi: wow, how do you feel about it?\r\nSam: i thought i was a good rommate\r\nSam: and that we have a nice place\r\nNaomi: that's true man!!!\r\nNaomi: i used to love living with you before i moved in with me boyfriend\r\nNaomi: i don't know why he's saying that\r\nSam: what should i do???\r\nNaomi: honestly if it's bothering you that much you should talk to him\r\nNaomi: see what's going on\r\nSam: i don't want to get in any kind of confrontation though\r\nSam: maybe i'll just let it go\r\nSam: and see how it goes in the future\r\nNaomi: it's your choice sam\r\nNaomi: if i were you i would just talk to him and clear the air","Sam is confused, because he overheard Rick complaining about him as a roommate. Naomi thinks Sam should talk to Rick. Sam is not sure what to do."


# Data Preprocessing

In [None]:
# Show the raw text of the dialogue stored under index label 0.
train_data.loc[0, 'dialogue']

"Amanda: I baked  cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)"

In [None]:
# Draw fixed-size random subsets (8000 training rows, 500 validation rows) with
# seed 42 and renumber the rows from 0 so positional and label indexing agree.
train_data, validation_data = (
    frame.sample(n=size, random_state=42).reset_index(drop=True)
    for frame, size in ((train_data, 8000), (validation_data, 500))
)

In [None]:
import re

def clean_text(text):
  """Normalize a dialogue or summary into a single lower-cased line.

  Steps, in order:
    1. Missing values (``None`` or a float NaN) become the empty string, so a
       blank cell is not turned into the literal words "none" / "nan".
    2. Any other non-string value is converted with ``str()``.
    3. Markup tags such as ``<file_photo>`` or ``</b>`` are removed. A tag must
       start with a letter or underscore right after ``<`` (or ``</``), so chat
       text like ``<3`` or ``5 > 4`` is left alone.
    4. Every run of whitespace (including ``\\r\\n`` line breaks) is collapsed
       to one space. This happens after tag removal so that deleting a tag
       cannot leave a double space behind.
    5. Leading/trailing whitespace is stripped and the text is lower-cased.

  Args:
    text: The value to clean; normally a string.

  Returns:
    The cleaned string (possibly empty).
  """
  # `text != text` is only true for NaN.
  if text is None or (isinstance(text, float) and text != text):
    return ''
  if not isinstance(text, str):  # Check if 'text' is not a string
    text = str(text)             # Convert to string if not already
  text = re.sub(r'</?[A-Za-z_][^<>]*>', '', text)  # Remove markup tags only
  text = re.sub(r'\s+', ' ', text)                 # Collapse line breaks and extra spaces
  return text.strip().lower()                      # Trim and convert to lowercase

# Clean the text columns of both splits, one column at a time.
train_data['dialogue'] = train_data['dialogue'].map(clean_text)
validation_data['dialogue'] = validation_data['dialogue'].map(clean_text)

train_data['summary'] = train_data['summary'].map(clean_text)
validation_data['summary'] = validation_data['summary'].map(clean_text)

# Preview the first training rows after cleaning.
train_data.head()

Unnamed: 0,id,dialogue,summary
0,13811908,"violet: hi! i came across this austin's article and i thought that you might find it interesting violet: claire: hi! :) thanks, but i've already read it. :) claire: but thanks for thinking about me :)",violet sent claire austin's article.
1,13716431,"pat: so does anyone know when the stream is going to happen? lou: unfortunately, no, but would really like to. kevin: i don't think i'd be interested in this. pat: y? kevin: seeing all the blood and internal organs makes me dizzy. lou: so you're so gentle? pat: c'mon! srsly? kevin: yup. had the same thing since i was a child. lou: maybe it's time to change it? pat: yeah! give it a try!",pat and lou are waiting for the stream but kevin is not interested as it makes him dizzy.
2,13810214,"jane: jane: whaddya think? shona: this ur tinder profile thing? jane: yeah, i'm updating my profile tonite. kinda nervoous though... :( jane: what if i get another guy like john? o.o shona: john was a dickhead jane: preach sistah! shona: anyhoo - this time i've got u :d no slimeballs for you jane: not again *shudders* jane: you know he forgot my birthday??!! shona: wanker","jane is updating her tinder profile tonight and together with shona they don't want to find another guy like john, who forgot jane's birthday."
3,13729823,"adam: do u have a map of paris? tom: yes, why? a trip? adam: no, it's for lisa and her mum.",tom has a map of paris.
4,13681400,"frank: hi, how's the family? mike: great! sam's moved out, finally! we thought he'd never leave.... i'm turning his room into a home gym, got to start getting fit again! you all ok? frank: yes, freda is still in the office, me still slogging away at the college. mike: you two on speaking terms yet? frank: not so you'd notice. few words now and again, y'know! mike: bloody brexit! frank: yeah, you'd never have thought a few years ago that this would have happened. 30 years, never a cross word, then that bastard referendum happened! mike: what was she thinking? frank: no idea, perhaps she was in love with nigel bloody farage! or boris! she certainly believed all the crap they were spouting! i don't know why she didn't use her head. mike: yes, i mean, the eu has done so much around her. doesn't freda swim at the new sports centre? frank: exactly, areas like ours have benefited so much from eu money! it's a crying shame whats happened. i've tried getting through to her, but she's adaman...","mike is happy, because sam's moved out. mike and frank grumble about brexit. frank is in dispute with freda, because she voted for brexit. mike will talk to freda in order to help frank."


In [None]:
# Sanity check: (rows, columns) of the training frame after sampling and cleaning.
train_data.shape

(8000, 3)

# Tokenization

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
import pandas as pd
import os

# 't5-small' is a public checkpoint, so it is downloaded without passing a token.
# No access token is written into the notebook: a placeholder passed via
# `token=` is not a valid credential, and a real one would be leaked with the
# file. To authenticate anyway, provide the token outside the notebook (the
# HF_TOKEN environment variable or a Colab secret of that name).
tokenizer = T5Tokenizer.from_pretrained('t5-small')  # 't5-small' to match the intended model

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
#Preprocessing function for tokenization
def preprocess_function(examples):
  """Tokenize a dialogue/summary pair into model inputs with training labels.

  The dialogue is padded/truncated to 512 tokens and the summary to 128 tokens.
  Padding positions in the labels are replaced by -100, the index that the loss
  ignores; leaving the pad id there makes the model spend most of its loss on
  predicting padding and distorts the reported training/validation loss.

  Args:
    examples: Mapping with 'dialogue' and 'summary' entries. Each entry may be
      a single string (one DataFrame row) or a list of strings (a batch).

  Returns:
    The tokenizer output for the dialogue with an added "labels" entry holding
    the masked summary token ids.
  """
  #Tokenize dialogue and summary
  inputs = tokenizer(examples['dialogue'], padding = "max_length", max_length=512, truncation=True)
  targets = tokenizer(examples['summary'], padding = "max_length", max_length=128, truncation=True)

  pad_id = tokenizer.pad_token_id

  def _mask_padding(token_ids):
    # -100 marks positions that must not contribute to the loss.
    return [-100 if token == pad_id else token for token in token_ids]

  label_ids = targets["input_ids"]
  if label_ids and isinstance(label_ids[0], (list, tuple)):
    # Batched call: one id sequence per example.
    inputs["labels"] = [_mask_padding(sequence) for sequence in label_ids]
  else:
    inputs["labels"] = _mask_padding(label_ids)
  return inputs

# Tokenize both splits row by row; each resulting entry is the encoding of one row.
train_dataset, val_dataset = (
    frame.apply(preprocess_function, axis=1)
    for frame in (train_data, validation_data)
)

In [None]:
# Preview only the first 20 values of each field of the first encoded example;
# the full encoding holds over a thousand integers and floods the cell output.
{name: values[:20] for name, values in train_dataset[0].items()}

{'input_ids': [25208, 10, 7102, 55, 3, 23, 764, 640, 48, 403, 17, 77, 31, 7, 1108, 11, 3, 23, 816, 24, 25, 429, 253, 34, 1477, 25208, 10, 3, 7997, 15, 10, 7102, 55, 3, 10, 61, 2049, 6, 68, 3, 23, 31, 162, 641, 608, 34, 5, 3, 10, 61, 3, 7997, 15, 10, 68, 2049, 21, 1631, 81, 140, 3, 10, 61, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

# Fine Tuning Model

In [None]:
# Model: start from the public pre-trained T5-small checkpoint.
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",          # output directory for checkpoints
    num_train_epochs=6,              # number of training epochs
    per_device_train_batch_size=8,   # batch size per device during training
    per_device_eval_batch_size=8,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir="./logs",            # directory for storing logs
    logging_steps=50,                # how often to log training info
    save_steps=500,                  # how often to save a model checkpoint
    # Evaluation runs once per epoch. `eval_steps` is deliberately not set: it
    # only applies when eval_strategy="steps", so a value here would be ignored.
    eval_strategy="epoch",
    # Disable external experiment trackers. Otherwise an installed wandb
    # integration stops the run at an interactive API-key prompt, which breaks
    # "Restart & Run All". Set to "wandb" to opt back in.
    report_to="none",
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mtadapanenisriram333[0m ([33mtadapanenisriram333-vnrvjietofficial[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.449,0.420097
2,0.4455,0.405207
3,0.4368,0.398047
4,0.4058,0.397322
5,0.4126,0.39561
6,0.3886,0.394655


TrainOutput(global_step=6000, training_loss=0.7039729096094767, metrics={'train_runtime': 2154.7756, 'train_samples_per_second': 22.276, 'train_steps_per_second': 2.785, 'total_flos': 6496406470656000.0, 'train_loss': 0.7039729096094767, 'epoch': 6.0})

# Save and Load Model

In [None]:
# Write the model and the tokenizer into the same directory so that both can be
# reloaded from that single path with from_pretrained().
model.save_pretrained('./saved_summarization_model')
tokenizer.save_pretrained('./saved_summarization_model')

('./saved_summarization_model/tokenizer_config.json',
 './saved_summarization_model/special_tokens_map.json',
 './saved_summarization_model/spiece.model',
 './saved_summarization_model/added_tokens.json')

In [None]:
#Load the saved model and tokenizer
# This rebinds `model` and `tokenizer` to fresh objects read from disk.
# NOTE(review): the reloaded model presumably starts on the CPU; confirm it is
# moved to the GPU before inference when one is available.
model = T5ForConditionalGeneration.from_pretrained('./saved_summarization_model')
tokenizer = T5Tokenizer.from_pretrained('./saved_summarization_model')

# Summarization System

In [None]:
# Ensure the model is on the correct device (GPU if available).
# Reading `model.device` alone only reports where the model currently is; a
# model freshly reloaded from disk has not been moved to the GPU, so it is
# moved explicitly here.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

def summarize_dialogue(dialogue):
    """Generate a summary for one dialogue with the fine-tuned model.

    The dialogue is cleaned with ``clean_text``, tokenized (truncated to 512
    tokens) and decoded with 4-beam search up to 128 tokens.

    Args:
        dialogue: Raw dialogue text.

    Returns:
        The generated summary as a string with special tokens removed.
    """
    dialogue = clean_text(dialogue)

    # A single sequence needs no padding; padding it to 512 only adds work for
    # the encoder.
    inputs = tokenizer(dialogue, return_tensors="pt", truncation=True, max_length=512)

    # Ask the model where it lives at call time instead of relying on a global
    # captured earlier, which goes stale if the model is moved afterwards.
    target_device = model.device
    inputs = {key: value.to(target_device) for key, value in inputs.items()}

    # Generate summary. The attention mask is passed explicitly so generation
    # does not depend on the library inferring it from pad tokens.
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=128,
        num_beams=4,
        early_stopping=True
    )

    # Decode the best beam into text.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Evaluating the Model

In [None]:
!pip install rouge-score # Install the rouge-score package using pip

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=5830487d689e15e4ce1c1d3ee2ab278a998b2f49fc15053092ee95ba689e2569
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
from rouge_score import rouge_scorer
def evaluate_model(dialogues, summaries, num_examples=None): # Added num_examples parameter
    """Score generated summaries against references with ROUGE-1/2/L.

    Each dialogue is summarized with ``summarize_dialogue`` and compared with
    its reference summary; the F-measures are averaged over the evaluated
    examples and printed.

    Args:
        dialogues: Sliceable sequence of dialogue strings.
        summaries: Sliceable sequence of reference summaries, aligned with
            ``dialogues``. If it is shorter, the extra dialogues are skipped.
        num_examples: Maximum number of examples to evaluate. ``None`` means
            all; values below zero are treated as zero.

    Returns:
        Tuple ``(avg_rouge1, avg_rouge2, avg_rougeL)``. When there is nothing
        to evaluate, a notice is printed and ``(0.0, 0.0, 0.0)`` is returned
        instead of failing with a division by zero.
    """
    # Evaluate a specified number of examples or all examples if num_examples is None.
    # Clamping at zero keeps a negative count from slicing off trailing examples.
    available = len(dialogues)
    limit = available if num_examples is None else max(0, min(num_examples, available))
    pairs = list(zip(dialogues[:limit], summaries[:limit]))

    if not pairs:
        print("No examples to evaluate; returning 0.0 for every ROUGE score.")
        return 0.0, 0.0, 0.0

    metric_names = ('rouge1', 'rouge2', 'rougeL')
    scorer = rouge_scorer.RougeScorer(list(metric_names), use_stemmer=True)
    f_scores = {name: [] for name in metric_names}

    for dialogue, reference_summary in pairs:
        predicted_summary = summarize_dialogue(dialogue)
        scores = scorer.score(reference_summary, predicted_summary)
        for name in metric_names:
            f_scores[name].append(scores[name].fmeasure)

    avg_rouge1, avg_rouge2, avg_rougeL = (
        sum(f_scores[name]) / len(f_scores[name]) for name in metric_names
    )

    print(f"Average ROUGE-1 Score: {avg_rouge1}")
    print(f"Average ROUGE-2 Score: {avg_rouge2}")
    print(f"Average ROUGE-L Score: {avg_rougeL}")

    return avg_rouge1, avg_rouge2, avg_rougeL # Return all scores

# Pull the dialogue and summary columns of the validation frame out as plain lists.
validation_dialogues, validation_summaries = (
    validation_data[column].tolist() for column in ('dialogue', 'summary')
)

# Score the model on the full validation subset; the returned averages are shown as the cell output.
evaluate_model(validation_dialogues, validation_summaries)

Average ROUGE-1 Score: 0.46415185287756916
Average ROUGE-2 Score: 0.21892930774084707
Average ROUGE-L Score: 0.38036091447029424


(0.46415185287756916, 0.21892930774084707, 0.38036091447029424)

In [None]:
# Test with a sample input
sample_dialogue = """
Violet: Hey Claire! I was reading an article about Austin and thought you might find it interesting!
Violet: It's about the current trends in urban development and how cities are planning for the future.
Violet: Here, let me share the link: <file_other>
Claire: Oh wow, that sounds like an insightful read. But I've actually already read that one last week.
Claire: It was really interesting though, especially the part about sustainable architecture in cities.
Claire: You know, I've been following these urban planning discussions for a while now.
Violet: Oh, I didn’t know that! Well, I’ll look for something else then, maybe something about eco-friendly cities or tech innovations.
Claire: That would be awesome! Let me know if you find something cool.
Violet: Sure, I’ll keep you posted. Thanks for the feedback!
"""

# Summarize the sample dialogue defined above and print the result.
summary = summarize_dialogue(sample_dialogue)
print("Summary:", summary)


Summary: violet was reading an article about austin and thought it might be interesting. claire has already read that one last week. claire has been following urban planning discussions for a while now.


In [None]:
# Test with a dialogue on a different topic
sample_dialogue = """
John: Hey Sarah, have you seen the latest tech gadget reviews? I found this new smartwatch that's supposed to have amazing health tracking features.
John: It tracks heart rate, blood oxygen levels, sleep patterns, and even stress levels! It sounds like something right up your alley.
Sarah: That sounds really interesting! But I’ve been trying to cut down on tech distractions. I’ve heard these devices can be really overwhelming sometimes.
Sarah: I do think it’s cool that they can track so many health metrics though. I’m curious how accurate they really are.
John: Yeah, me too! There are also some new smartphones coming out with even better cameras and longer battery life. The new flagship model from XYZ brand has some insane specs.
Sarah: Ooh, I haven’t kept up with phones recently, but I’ve heard the camera quality is getting ridiculously good. It’s almost like a professional camera in your pocket now!
Sarah: Still, I feel like I’m fine with my current phone for now. I don’t really feel the need to upgrade unless something really groundbreaking comes out.
John: Totally understand that. It’s the same with me. But I think the battery life improvements are enough to make me consider it. I hate running out of battery when I’m out and about.
Sarah: That’s fair! I’m always worried about battery life too. Honestly, I think phones should last at least two full days on a single charge by now.
John: I agree! It’s so annoying when your phone dies in the middle of the day. I wonder if we’ll ever get to a point where we don’t have to charge our phones every day.
Sarah: That would be amazing! I think as tech improves, battery tech might also catch up. Let’s hope the next generation of phones can last longer!
"""

# Summarize the sample dialogue defined above and print the result.
summary = summarize_dialogue(sample_dialogue)
print("Summary:", summary)


Summary: john found a new smartwatch that tracks heart rate, blood oxygen levels, sleep patterns, and stress levels. sarah hasn't kept up with phones lately, but she's worried about battery life. john thinks phones should last at least two days on a single charge by now.


In [None]:
# Test with a dialogue on a current news topic
sample_dialogue = """
Reporter: In today's news, the latest climate change report reveals alarming global temperature rises. According to the Intergovernmental Panel on Climate Change (IPCC), the Earth’s temperature is on track to rise by 1.5°C within the next two decades.
Reporter: This is expected to lead to more frequent and severe heatwaves, flooding, and extreme weather events. Coastal cities are at particular risk due to rising sea levels.
Expert: The report emphasizes that immediate action is needed to prevent catastrophic consequences. We need to significantly reduce carbon emissions and transition to renewable energy sources.
Expert: If global temperatures increase by more than 1.5°C, we could face irreversible damage to ecosystems, agriculture, and water supply. It will have a devastating impact on biodiversity as well.
Reporter: The IPCC also stresses the importance of individual action. Governments must set stronger policies, but individuals can help by reducing waste, conserving water, and supporting green initiatives.
Expert: It's not just about the big changes; small actions like using public transportation, reducing meat consumption, and recycling can collectively make a significant difference.
Reporter: With the next UN Climate Summit coming up next month, world leaders will need to prioritize climate action. The stakes have never been higher for our planet’s future.
"""

# Summarize the sample text defined above and print the result.
summary = summarize_dialogue(sample_dialogue)
print("Summary:", summary)


Summary: the latest climate change report reveals alarming global temperature rises. the earth's temperature is on track to rise by 1.5°c within the next two decades. this is expected to lead to more frequent and severe heatwaves, flooding, and extreme weather events. coastal cities are at particular risk due to rising sea levels.


# Download Model To Your Machine


In [None]:
import shutil

# Directory that holds the saved model and tokenizer files.
model_dir = "./saved_summarization_model"

# Name of the zip file produced below.
output_zip_path = "saved_summarization_model.zip"

# Pack the contents of the model directory into a zip archive; make_archive
# appends the ".zip" extension to the base name itself.
shutil.make_archive("saved_summarization_model", "zip", model_dir)

'/content/saved_summarization_model.zip'

In [None]:
# prompt: Give me the saved_summarization_model to download as zip

import shutil
from google.colab import files

# Directory that holds the saved model and tokenizer files.
model_dir = "./saved_summarization_model"

# Name of the zip file produced below.
output_zip_path = "saved_summarization_model.zip"

# Pack the contents of the model directory into a zip archive.
shutil.make_archive("saved_summarization_model", "zip", model_dir)

# Hand the archive to the browser for download.
files.download(output_zip_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from IPython.display import FileLink

# Display a download link to the zip archive as this cell's output.
# `output_zip_path` must already be defined by one of the archive cells above.
FileLink(output_zip_path)