In [None]:
!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

In [None]:
import os
from transformers import pipeline, set_seed

import matplotlib.pyplot as plt
import pandas as pd
from datasets import load_dataset, load_metric

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import nltk
from nltk.tokenize import sent_tokenize

from tqdm import tqdm
import torch

# Fetch NLTK's "punkt" data package. Presumably needed by the sent_tokenize
# import above — TODO confirm: sent_tokenize is not called in the visible cells.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Transcript file names, generated from the "<series><number><session>"
# meeting-ID pattern instead of being typed out one by one.
# Covers ES2002-ES2016 and IS1000-IS1008 with sessions a-d; IS1002a and
# IS1005d are not part of the list.
file_names = [
    f'{series}{number}{session}.transcript.txt'
    for series, numbers in (('ES', range(2002, 2017)), ('IS', range(1000, 1009)))
    for number in numbers
    for session in 'abcd'
    if f'{series}{number}{session}' not in {'IS1002a', 'IS1005d'}
]

# Prefix every file name with the Drive folder that holds the transcripts
transcript_files = [f'/content/drive/MyDrive/transcript/{name}' for name in file_names]

In [None]:
# Reference-abstract paths, built from the meeting-ID pattern
# "<series><number><session>" rather than a hand-typed list.
base_path = '/content/drive/MyDrive/ami/'
suffix = '.ducref.abstract'

# ES2002-ES2016, IS1000-IS1009 and TS3003-TS3012, sessions a-d each, minus the
# three sessions that are not part of the list (IS1002a, IS1005d, TS3012c).
meeting_ids = [
    f'{series}{number}{session}'
    for series, numbers in (
        ('ES', range(2002, 2017)),
        ('IS', range(1000, 1010)),
        ('TS', range(3003, 3013)),
    )
    for number in numbers
    for session in 'abcd'
    if f'{series}{number}{session}' not in {'IS1002a', 'IS1005d', 'TS3012c'}
]

abstract_files = [f'{base_path}{meeting_id}{suffix}' for meeting_id in meeting_ids]

abstract_files[:5]  # Show the first five paths as a sanity check


['/content/drive/MyDrive/ami/ES2002a.ducref.abstract',
 '/content/drive/MyDrive/ami/ES2002b.ducref.abstract',
 '/content/drive/MyDrive/ami/ES2002c.ducref.abstract',
 '/content/drive/MyDrive/ami/ES2002d.ducref.abstract',
 '/content/drive/MyDrive/ami/ES2003a.ducref.abstract']

In [None]:
def read_file(file_path):
    """Return the full contents of ``file_path`` decoded as UTF-8 text."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

# Index the abstract paths by meeting ID (the part of the base name before the
# first '.') so each transcript is paired with the abstract of the SAME
# meeting. Pairing by position with zip() silently truncates to the shorter
# list and would mislabel rows if the two lists ever fell out of step.
abstract_by_meeting = {
    os.path.basename(path).split('.')[0]: path for path in abstract_files
}

# Column buffers for the DataFrame
file_names = []
transcripts = []
abstracts = []

for transcript_file in transcript_files:
    meeting_id = os.path.basename(transcript_file).split('.')[0]
    abstract_file = abstract_by_meeting.get(meeting_id)
    if abstract_file is None:
        # Make the gap visible instead of dropping the meeting silently.
        print(f"Skipping {meeting_id}: no matching abstract file")
        continue
    file_names.append(meeting_id)
    transcripts.append(read_file(transcript_file))
    abstracts.append(read_file(abstract_file))

# Report abstracts that were listed but never loaded (no transcript for them).
unused_abstracts = sorted(set(abstract_by_meeting) - set(file_names))
if unused_abstracts:
    print(f"{len(unused_abstracts)} abstract file(s) have no transcript and were not loaded")

# One row per meeting: ID, full transcript text, reference abstract text
dataset = pd.DataFrame({
    'File Name': file_names,
    'Transcript': transcripts,
    'Abstract': abstracts
})

dataset.head()  # Display the first few rows of the DataFrame


Unnamed: 0,File Name,Transcript,Abstract
0,ES2002a,"hi, i'm david and i'm supposed to be an indust...",The project manager introduced the upcoming pr...
1,ES2002b,"mm yeah. nope, we're all set. which which is t...",The project manager briefed the team on some n...
2,ES2002c,'s to do now is to decide how to fulfil what y...,The project manager recapped the decisions mad...
3,ES2002d,"no. ninja homer, made in japan. and there isn'...",The project manager recapped the decisions mad...
4,ES2003a,"uh, dave cochrane. user interface defin design...",The team members introduced themselves to each...


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Checkpoint to fine-tune
model_ckpt = "google/pegasus-cnn_dailymail"

# Run on the GPU when one is visible to PyTorch, otherwise on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Tokenizer and seq2seq model both come from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)
model_pegasus = model_pegasus.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  return self.fget.__get__(instance, owner)()
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def generate_batch_sized_chunks(list_of_elements, batch_size):
    """Lazily yield consecutive slices of ``list_of_elements``.

    Each slice holds ``batch_size`` items, except possibly the last one,
    which holds whatever remains.
    """
    total = len(list_of_elements)
    yield from (
        list_of_elements[start:start + batch_size]
        for start in range(0, total, batch_size)
    )

In [None]:
def calculate_metric_on_test_ds(dataset, metric, model, tokenizer,
                               batch_size=16, device=device,
                               column_text="article",
                               column_summary="highlights"):
    """Generate summaries for a dataset in batches and score them with ``metric``.

    Args:
        dataset: Object indexable by column name; ``dataset[column_text]`` and
            ``dataset[column_summary]`` must be sliceable sequences of strings.
        metric: Object exposing ``add_batch(predictions=..., references=...)``
            and ``compute()``.
        model: Seq2seq model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        batch_size: Number of examples summarized per ``generate`` call.
        device: Device the input tensors are moved to before generation.
        column_text: Name of the column holding the source documents.
        column_summary: Name of the column holding the reference summaries.

    Returns:
        Whatever ``metric.compute()`` returns after all batches were added.
    """
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(
        zip(article_batches, target_batches), total=len(article_batches)):

        inputs = tokenizer(article_batch, max_length=1024,  truncation=True,
                        padding="max_length", return_tensors="pt")

        # Beam search with 8 beams, capped at 128 generated tokens.
        # length_penalty is applied to the sequence length when beams are
        # scored (see the Transformers generation docs for its exact effect).
        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                         attention_mask=inputs["attention_mask"].to(device),
                         length_penalty=0.8, num_beams=8, max_length=128)

        # Decode the generated token ids back to text.
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                clean_up_tokenization_spaces=True)
               for s in summaries]

        # The model emits the literal marker "<n>" between sentences; turn it
        # into a space. (Replacing the empty string instead would insert a
        # space between every character and wreck the prediction.)
        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]

        metric.add_batch(predictions=decoded_summaries, references=target_batch)

    # Finally compute and return the scores accumulated over all batches.
    score = metric.compute()
    return score

In [None]:
from datasets import Dataset

# Fixed seed so the train/validation/test membership is the same on every run.
SPLIT_SEED = 42

# Convert the pandas DataFrame built earlier into a Hugging Face Dataset.
# The isinstance guard keeps the cell re-runnable: `dataset` is rebound here,
# so a second run must not try to convert an existing Dataset again.
if isinstance(dataset, pd.DataFrame):
    dataset = Dataset.from_pandas(dataset, preserve_index=False)

# 80% train; the remaining 20% is halved into test and validation.
train_testvalid = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)
train_dataset = train_testvalid['train']
test_dataset = test_valid['train']
valid_dataset = test_valid['test']

In [None]:
# Display the training split (its features and row count).
train_dataset

Dataset({
    features: ['File Name', 'Transcript', 'Abstract'],
    num_rows: 75
})

In [None]:
def convert_examples_to_features(example_batch):
    """Tokenize a batch of transcripts and abstracts into model features.

    Args:
        example_batch: Mapping with 'Transcript' and 'Abstract' entries, as
            passed by ``Dataset.map(..., batched=True)``.

    Returns:
        Dict with 'input_ids' and 'attention_mask' from the transcripts
        (truncated to 1024 tokens) and 'labels' holding the abstract token
        ids (truncated to 128 tokens).
    """
    input_encodings = tokenizer(example_batch['Transcript'], max_length=1024, truncation=True)

    # `text_target=` is the supported way to tokenize labels; it replaces the
    # deprecated `with tokenizer.as_target_tokenizer():` context manager.
    target_encodings = tokenizer(text_target=example_batch['Abstract'], max_length=128, truncation=True)

    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids']
    }

train_data = train_dataset.map(convert_examples_to_features, batched = True)

Map:   0%|          | 0/75 [00:00<?, ? examples/s]



In [None]:
# Display the tokenized training split (original columns plus input_ids,
# attention_mask and labels).
train_data

Dataset({
    features: ['File Name', 'Transcript', 'Abstract', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 75
})

In [None]:
# Tokenize the validation split with the same feature function used for training.
validation_data = valid_dataset.map(convert_examples_to_features, batched = True)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorForSeq2Seq

# Batch collator for seq2seq training, built from the tokenizer and the PEGASUS model.
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

In [None]:
pip install transformers[torch]



In [None]:
pip install accelerate -U



In [None]:
from transformers import TrainingArguments, Trainer

# Hyperparameters a reader is most likely to tune
learning_rate = 2e-5  # Peak learning rate reached after warmup
num_train_epochs = 15  # Adjust based on your dataset size and model performance
per_device_train_batch_size = 4  # Adjust based on your hardware
per_device_eval_batch_size = 4

# The schedule is expressed relative to the run length. The recorded run of
# this notebook finished after only 30 optimizer steps, so the previous fixed
# values (warmup_steps=400, logging_steps=50, eval_steps=1000) meant that
# warmup never completed and no training or validation loss was ever logged.
trainer_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    warmup_ratio=0.1,              # warm up over the first 10% of the steps
    weight_decay=0.01,
    logging_dir='./logs',
    logging_strategy='epoch',      # log the training loss once per epoch
    evaluation_strategy='epoch',   # evaluate on the validation set once per epoch
    learning_rate=learning_rate,
    save_strategy='no',
    gradient_accumulation_steps=8,
)

In [None]:
# Bundle the model, training arguments, tokenizer, collator and the
# train/validation sets into a Trainer.
trainer = Trainer(
    model=model_pegasus,
    args=trainer_args,
    train_dataset=train_data,
    eval_dataset=validation_data,
    data_collator=seq2seq_data_collator,
    tokenizer=tokenizer,
)

In [None]:
import torch
import gc
# Collect unreachable Python objects first so that any CUDA tensors they still
# hold are freed, then ask PyTorch to release its unused cached GPU memory.
# (Emptying the cache before collecting cannot release that memory.)
gc.collect()
torch.cuda.empty_cache()

89

In [None]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=30, training_loss=4.598560078938802, metrics={'train_runtime': 143.9486, 'train_samples_per_second': 7.815, 'train_steps_per_second': 0.208, 'total_flos': 2739212264669184.0, 'train_loss': 4.598560078938802, 'epoch': 12.63})

In [None]:
# Tokenize the held-out test split with the same feature function used for training.
test_data = test_dataset.map(convert_examples_to_features, batched = True)

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

In [None]:
# trust_remote_code=True is what the loader's own warning asks for: it says the
# argument will become mandatory for this metric in the next major `datasets`
# release.
rouge_metric = load_metric('rouge', trust_remote_code=True)

rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

# Summarize the test transcripts with the fine-tuned model and score them
# against the reference abstracts.
score = calculate_metric_on_test_ds(
    test_data, rouge_metric, trainer.model, tokenizer,
    batch_size=2, column_text='Transcript', column_summary='Abstract'
)

# Keep the mid F-measure of each ROUGE variant.
rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}

pd.DataFrame(rouge_dict, index=['pegasus'])

  rouge_metric = load_metric('rouge')
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
100%|██████████| 5/5 [00:09<00:00,  1.96s/it]


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.02061,0.0,0.020454,0.020567


In [None]:
# Write the fine-tuned model to the local "pegasus-model" directory.
model_pegasus.save_pretrained("pegasus-model")

Non-default generation parameters: {'max_length': 128, 'min_length': 32, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}


In [None]:
# Write the tokenizer files to the local "tokenizer" directory.
tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/spiece.model',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [None]:
def generate_summary(text, model, tokenizer, max_output_length=500):
    """Summarize ``text`` with ``model`` using beam search.

    Args:
        text: Source document to summarize.
        model: Seq2seq model exposing ``to`` and ``generate``.
        tokenizer: Tokenizer matching ``model``.
        max_output_length: Upper bound on the generated sequence length,
            passed to ``generate`` as ``max_length``.

    Returns:
        The decoded summary of the first generated sequence, with the "<n>"
        sentence markers replaced by spaces.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # Truncate over-long inputs to the tokenizer's limit. padding=True pads
    # only up to the longest input, so a single text is not padded to the
    # model maximum for nothing.
    input_encodings = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Beam search. No `temperature` is passed: it only influences sampling and
    # has no effect on beam search when sampling is not enabled.
    summary_ids = model.generate(input_encodings['input_ids'].to(device),
                                 attention_mask=input_encodings['attention_mask'].to(device),
                                 max_length=max_output_length,
                                 num_beams=6,
                                 repetition_penalty=1,  # 1 means no repetition penalty
                                 length_penalty=10,  # positive values favor longer outputs
                                 early_stopping=True)

    # Decode the best sequence and drop the model's "<n>" sentence markers so
    # the text matches what the ROUGE evaluation helper scores.
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return summary.replace("<n>", " ")

In [None]:
# Summarize every transcript of the test split, one call per transcript.
generated_summaries = [generate_summary(text, model_pegasus, tokenizer) for text in test_dataset['Transcript']]

In [None]:
# Print every generated summary beneath a dashed separator line.
for generated in generated_summaries:
    print("-------", generated, sep="\n")

-------
We're trying to lock ourselves into a particular kind of technology.<n>We don't look at the technology, but at end of the day we don't look at the technology.<n>i think twenty five to thirty five is is is fair to say that more than half your group are willing to at least try and use your technology.
-------
This is the first in a series of in-depth interviews with leading design experts.<n>In the first part of the series we look at how to design a new product.<n>The second part is a look at the technical aspects of a new product.
-------
Play-doh is designed to be used by left-handed people.<n>Play-doh is made out of rubber and comes with a mute button.<n>Play-doh comes in at under ten euros a unit.
-------
This week we look at the design of the mot mote remote control.<n>We go through the minutes of the last meeting and see which decisions were made.<n>We also go through the finance evaluation of the of the cost of the thing.
-------
kate, kate, kate, kate, kate, kate.<n>Kate,

In [None]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# One score dict per test example; the reference abstract is passed first and
# the generated summary second.
scores = [
    scorer.score(reference, generated)
    for reference, generated in zip(test_dataset['Abstract'], generated_summaries)
]

# Print the per-example ROUGE-1 F-measure. The loop variable is deliberately
# not named `score`, so it does not overwrite the aggregate `score` produced
# by the earlier evaluation cell.
for example_scores in scores:
    print(f"ROUGE-1: {example_scores['rouge1'].fmeasure:.4f}")


ROUGE-1: 0.1058
ROUGE-1: 0.1731
ROUGE-1: 0.1279
ROUGE-1: 0.2430
ROUGE-1: 0.0000
ROUGE-1: 0.2060
ROUGE-1: 0.2255
ROUGE-1: 0.1845
ROUGE-1: 0.0966
