In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5


In [None]:
import pandas as pd
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments

In [None]:
# Read the SAMSum train and validation splits from CSV files
# (example paths relative to the working directory; adjust as needed).
train_data, validation_data = map(
    pd.read_csv, ("samsum-train.csv", "samsum-validation.csv")
)

# Preview the first rows of the training split
train_data.head()

Unnamed: 0,id,dialogue,summary
0,13818513,Amanda: I baked cookies. Do you want some?\r\...,Amanda baked cookies and will bring Jerry some...
1,13728867,Olivia: Who are you voting for in this electio...,Olivia and Olivier are voting for liberals in ...
2,13681000,"Tim: Hi, what's up?\r\nKim: Bad mood tbh, I wa...",Kim may try the pomodoro technique recommended...
3,13730747,"Edward: Rachel, I think I'm in ove with Bella....",Edward thinks he is in love with Bella. Rachel...
4,13728094,Sam: hey overheard rick say something\r\nSam:...,"Sam is confused, because he overheard Rick com..."


In [None]:
# Work on a fixed-size random subset of each split (presumably to keep fine-tuning
# time manageable); random_state makes the subset reproducible.
# min(...) keeps the cell usable with CSVs that have fewer rows than requested:
# DataFrame.sample raises ValueError when n exceeds the row count (replace=False).
train_data = train_data.sample(n=min(4000, len(train_data)), random_state=42).reset_index(drop=True)
validation_data = validation_data.sample(n=min(500, len(validation_data)), random_state=42).reset_index(drop=True)

# Data Preprocessing

In [None]:
# Clean the text by removing unwanted characters
import re

def clean_text(text):
    """Normalize a dialogue or summary string before tokenization.

    Markup-style tags (anything from "<" up to the next ">") are removed, every
    run of whitespace (including "\\r\\n" line breaks) is collapsed to a single
    space, and the result is stripped and lower-cased.

    Args:
        text: Raw text. ``None`` and float NaN (how pandas represents an empty
            CSV cell) are treated as empty text instead of raising.

    Returns:
        The cleaned, lower-cased string; "" for missing input.
    """
    # Missing values: NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Remove tags BEFORE collapsing whitespace: deleting a tag that sits between
    # two spaces would otherwise leave a double space in the output.
    # DOTALL lets a tag that spans a line break be matched as well.
    text = re.sub(r'<.*?>', '', text, flags=re.DOTALL)
    text = re.sub(r'\s+', ' ', text)  # line breaks, tabs and repeated spaces -> one space
    return text.strip().lower()

# Apply cleaning to the dialogue and summary columns of both splits
for _split in (train_data, validation_data):
    for _column in ("dialogue", "summary"):
        _split[_column] = _split[_column].apply(clean_text)

# Display a sample after cleaning (first rows only, instead of rendering the whole frame)
train_data.head()

Unnamed: 0,id,dialogue,summary
0,13862301,nathan: i'm kinda bored nathan: do you have an...,nathan asked jake about new anime titles that ...
1,13611700,daniel: btw have you started watching the seri...,"daniel and sally enjoy ""the americans"" but dan..."
2,13828479,ian: god damn! i'm not gonna make it on time! ...,ian will be there in about an hour. phil will ...
3,13729443,emily: i saw you at the beach yesterday. what ...,emily saw richard at the beach. he was with th...
4,13682077,matt: good morning :) sophie: hello :) :* matt...,sophie will come to matt's place for breakfast...
...,...,...,...
3995,13810138,adam: hey nina how are you? whats up? nina: i ...,"adam, nina's brother, invites nina for a get-t..."
3996,13717208,james: i have a proposal for you all nicky: ho...,"james needs a date for a wedding this weekend,..."
3997,13716945,"richard: hey guy, what time will you be here? ...",charles and michelle will be there at 17:34. r...
3998,13716574,"jim: hi guys derek: hi andy: hi, man jim: have...","""infinity war"" is a great film."


# Tokenization

In [None]:
# Load the tokenizer published with the "t5-small" checkpoint; the same checkpoint
# name is used for the model below so vocabulary and weights match.
tokenizer = T5Tokenizer.from_pretrained("t5-small")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
# Preprocessing function for tokenization
def preprocess_function(examples):
    """Tokenize dialogue/summary text into model inputs with loss-ready labels.

    Args:
        examples: Mapping-like object with "dialogue" and "summary" entries.
            Each entry may be a single string (one DataFrame row, as used
            below) or a list of strings (a batch).

    Returns:
        The tokenizer output for the dialogue (padded/truncated to 512 tokens)
        with an added "labels" entry holding the summary token ids
        (padded/truncated to 150 tokens), where every padding position is
        replaced by -100.
    """
    inputs = tokenizer(examples["dialogue"], padding="max_length", truncation=True, max_length=512)
    targets = tokenizer(examples["summary"], padding="max_length", truncation=True, max_length=150)

    # -100 is the label value the model's loss ignores. Without this replacement
    # the padding positions of every summary count as targets, so the loss is
    # dominated by predicting padding rather than the actual summary tokens.
    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        return [token_id if token_id != pad_id else -100 for token_id in token_ids]

    label_ids = targets["input_ids"]
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one id sequence per example.
        inputs["labels"] = [_mask_padding(sequence) for sequence in label_ids]
    else:
        inputs["labels"] = _mask_padding(label_ids)
    return inputs

# Apply the preprocessing row by row.
# .tolist() converts the resulting pandas Series into a plain Python list, so
# integer lookups such as train_dataset[0] are positional rather than going
# through the Series' index labels (which only coincide with positions while
# the frame has a fresh 0..n-1 index).
train_dataset = train_data.apply(preprocess_function, axis=1).tolist()
val_dataset = validation_data.apply(preprocess_function, axis=1).tolist()

In [None]:
# Inspect the first tokenized training example: the tokenizer output for the
# dialogue plus the "labels" entry added by preprocess_function.
train_dataset[0]

{'input_ids': [3, 29, 9, 6736, 10, 3, 23, 31, 51, 773, 9, 19682, 3, 29, 9, 6736, 10, 103, 25, 43, 136, 126, 21705, 24, 25, 54, 1568, 58, 2662, 1050, 10, 6865, 3, 10, 61, 2662, 1050, 10, 131, 5607, 140, 125, 773, 13, 21705, 33, 25, 1638, 16, 58, 3, 29, 9, 6736, 10, 424, 659, 11, 6613, 3, 29, 9, 6736, 10, 1066, 4092, 42, 11043, 3803, 3, 29, 9, 6736, 10, 59, 3, 9, 600, 1819, 13, 17201, 18, 89, 23, 2662, 1050, 10, 410, 25, 1605, 959, 45, 48, 1590, 87, 210, 3870, 5818, 58, 3, 29, 9, 6736, 10, 59, 780, 2662, 1050, 10, 207, 6, 24, 3231, 178, 28, 128, 1245, 931, 2662, 1050, 10, 166, 13, 66, 1077, 82, 1305, 126, 1764, 96, 6279, 97, 3, 23, 530, 3, 60, 18860, 920, 38, 3, 9, 12593, 15, 121, 2662, 1050, 10, 8, 2233, 845, 66, 81, 34, 3, 29, 9, 6736, 10, 24, 31, 7, 3, 9, 17056, 564, 2662, 1050, 10, 168, 17945, 68, 8, 21705, 19, 248, 2662, 1050, 10, 34, 65, 128, 19752, 8073, 68, 167, 13, 8, 97, 34, 11331, 7, 66, 13, 39, 5598, 2662, 1050, 10, 659, 6, 6613, 11, 11043, 1898, 3, 29, 9, 6736, 10, 2993, 147

# Fine-Tuning the Model

In [None]:
# Model: start from the pretrained "t5-small" checkpoint
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",          # output directory for checkpoints
    num_train_epochs=6,              # number of training epochs
    per_device_train_batch_size=8,   # batch size per device during training
    per_device_eval_batch_size=8,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir="./logs",            # directory for storing logs
    logging_steps=50,                # how often to log training info
    save_steps=500,                  # how often to save a model checkpoint
    # Evaluate once at the end of every epoch. The former `eval_steps=50` was
    # dropped: it only applies to the "steps" strategy and had no effect here.
    eval_strategy="epoch",
    # Disable third-party experiment trackers. With the default setting an
    # installed wandb integration asks for an API key interactively, which
    # blocks unattended "Run All" executions. Set to "wandb" or "tensorboard"
    # to opt back in.
    report_to="none",
)

# Trainer setup: fine-tune on the tokenized training split and report the
# validation loss on the tokenized validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model (the returned TrainOutput is displayed as the cell result)
trainer.train()

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mimadkhan01430[0m ([33mimadkhan01430-guru-ghasidas-vishwavidyalaya[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,0.4323,0.38174
2,0.3781,0.360181
3,0.3826,0.353632
4,0.3732,0.349299
5,0.3446,0.34898
6,0.3666,0.348536


TrainOutput(global_step=3000, training_loss=0.924922587076823, metrics={'train_runtime': 1191.7394, 'train_samples_per_second': 20.139, 'train_steps_per_second': 2.517, 'total_flos': 3248203235328000.0, 'train_loss': 0.924922587076823, 'epoch': 6.0})

In [None]:

# Save the fine-tuned model
# The model and the tokenizer are written to the same directory so that both can
# be restored from that single path with from_pretrained().
model.save_pretrained("./saved_summary_model")
tokenizer.save_pretrained("./saved_summary_model")

('./saved_summary_model/tokenizer_config.json',
 './saved_summary_model/special_tokens_map.json',
 './saved_summary_model/spiece.model',
 './saved_summary_model/added_tokens.json')

In [None]:
# Load the saved model and tokenizer
# This rebinds `model` and `tokenizer` to copies read back from disk, so the
# cells below exercise exactly what was saved.
# NOTE(review): the reloaded model is presumably placed on the CPU unless it is
# moved explicitly — confirm before relying on GPU inference.
model = T5ForConditionalGeneration.from_pretrained("./saved_summary_model")
tokenizer = T5Tokenizer.from_pretrained("./saved_summary_model")

# Summarization


In [None]:
# Ensure the model is on the correct device (GPU if available).
# Reading `model.device` alone only reports where the model currently is; the
# model has to be moved explicitly to actually run on the GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # make sure dropout is disabled for inference


def summarize_dialogue(dialogue):
    """Generate a summary for one dialogue with the fine-tuned model.

    The text is cleaned with clean_text (the same normalization applied to the
    training data), tokenized (truncated to 512 tokens), and summarized using
    beam search with 4 beams and a maximum output length of 150 tokens.

    Args:
        dialogue: Raw dialogue text.

    Returns:
        The decoded summary string with special tokens removed.
    """
    dialogue = clean_text(dialogue)

    # A single sequence needs no padding; padding it to 512 tokens only adds
    # positions the encoder has to process and then mask out again.
    inputs = tokenizer(dialogue, return_tensors="pt", truncation=True, max_length=512)

    # Move input tensors to the same device as the model
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Pass the attention mask explicitly instead of leaving it to generate()
    # to infer which positions are real tokens.
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=150,
        num_beams=4,
        early_stopping=True,
    )

    # Decode the best beam into text
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Test with a sample input
# A multi-speaker transcript ("Speaker: utterance" per line), the same layout as
# the dialogues used for fine-tuning.
sample_dialogue ="""
Reporter: In today's news, the latest climate change report reveals alarming global temperature rises. According to the Intergovernmental Panel on Climate Change (IPCC), the Earth’s temperature is on track to rise by 1.5°C within the next two decades.
Reporter: This is expected to lead to more frequent and severe heatwaves, flooding, and extreme weather events. Coastal cities are at particular risk due to rising sea levels.
Expert: The report emphasizes that immediate action is needed to prevent catastrophic consequences. We need to significantly reduce carbon emissions and transition to renewable energy sources.
Expert: If global temperatures increase by more than 1.5°C, we could face irreversible damage to ecosystems, agriculture, and water supply. It will have a devastating impact on biodiversity as well.
Reporter: The IPCC also stresses the importance of individual action. Governments must set stronger policies, but individuals can help by reducing waste, conserving water, and supporting green initiatives.
Expert: It's not just about the big changes; small actions like using public transportation, reducing meat consumption, and recycling can collectively make a significant difference.
Reporter: With the next UN Climate Summit coming up next month, world leaders will need to prioritize climate action. The stakes have never been higher for our planet’s future.
"""

# Run the full inference path (cleaning, tokenization, generation, decoding) and print the result.
summary = summarize_dialogue(sample_dialogue)
print("Summary:", summary)

Summary: the latest climate change report reveals alarming global temperature rises. the earth’s temperature is on track to rise by 1.5°c within the next two decades. experts need to reduce carbon emissions and transition to renewable energy sources.


In [None]:
import shutil

# Path to the directory containing the fine-tuned model
model_dir = "./saved_summary_model"

# Output zip file path
output_zip_path = "saved_summary_model.zip"

# Create a zip archive.
# make_archive() appends the ".zip" extension itself, so it is given the output
# path without it. Deriving base_name from output_zip_path keeps the archive
# that is written and the path used for the download link in sync.
archive_base = output_zip_path[: -len(".zip")]
shutil.make_archive(base_name=archive_base, format="zip", root_dir=model_dir)

'/content/saved_summary_model.zip'

In [None]:
from IPython.display import FileLink

# Display a download link
# FileLink is the cell's last expression, so the notebook renders it as a
# clickable link to the zip archive at output_zip_path.
FileLink(output_zip_path)