In [None]:
!nvidia-smi

Wed Apr  5 18:57:29 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   73C    P0    33W /  70W |  12947MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

In [None]:

from huggingface_hub import notebook_login

notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
from transformers import pipeline, set_seed

import matplotlib.pyplot as plt

import pandas as pd
from datasets import load_dataset, load_metric
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import nltk
from nltk.tokenize import sent_tokenize

from tqdm import tqdm
import torch

nltk.download("punkt")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer


In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model_ckpt = "facebook/bart-large-cnn"

##uses a predefined tokenizer that pegasus can interpret
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

##This line loads the pre-trained PEGASUS model specified by model_ckpt
##using the from_pretrained() method of the AutoModelForSeq2SeqLM class.
##It then moves the model to the device specified by device (either "cuda" or "cpu").
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

##uses the Hugging Face Transformers library to load a pre-trained PEGASUS model
##for sequence-to-sequence language modeling, along with its corresponding tokenizer.

In [None]:
def generate_batch_sized_chunks(list_of_elements, batch_size):
    #split the dataset into smaller batches that we can process simultaneously
    #Yield successive batch-sized chunks from list_of_elements
    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i : i + batch_size]


##Computes summary accuracy "rogue score" using the pretrained model, tokenizer, and dataset
def calculate_metric_on_test_ds(dataset, metric, model, tokenizer, 
                               batch_size=16, device=device, 
                               column_text="article", 
                               column_summary="highlights"):
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(
        zip(article_batches, target_batches), total=len(article_batches)):
        
        inputs = tokenizer(article_batch, max_length=1024,  truncation=True, 
                        padding="max_length", return_tensors="pt")
        
        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                         attention_mask=inputs["attention_mask"].to(device), 
                         length_penalty=0.8, num_beams=8, max_length=128)
        ''' parameter for length penalty ensures that the model does not generate sequences that are too long. '''
        
        # Finally, we decode the generated texts, 
        # replace the <n> token, and add the decoded texts with the references to the metric.
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, 
                                clean_up_tokenization_spaces=True) 
               for s in summaries]      
        
        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]
        
        
        metric.add_batch(predictions=decoded_summaries, references=target_batch)
        
    #  Finally compute and return the ROUGE scores.
    score = metric.compute()
    return score

In [None]:
dataset_samsum = load_dataset("samsum")

split_lengths = [len(dataset_samsum[split])for split in dataset_samsum]

print(f"Split lengths: {split_lengths}")
print(f"Features: {dataset_samsum['train'].column_names}")
print("\nDialogue:")

print(dataset_samsum["test"][0]["dialogue"])

print("\nSummary:")

print(dataset_samsum["test"][0]["summary"])



  0%|          | 0/3 [00:00<?, ?it/s]

Split lengths: [14732, 819, 818]
Features: ['id', 'dialogue', 'summary']

Dialogue:
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye

Summary:
Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.


In [None]:
#Uses the Hugging Face Transformers library to create a summarization pipeline

pipe = pipeline('summarization', model = model_ckpt)

#Applies pipeline to dialogue column of dataset. 
#This means our pretrained model as is will summarize every column entry in dialogue
pipe_out = pipe(dataset_samsum['test'][0]['dialogue'] )

print(pipe_out)

Your max_length is set to 142, but you input_length is only 139. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=69)


[{'summary_text': 'Hannah asks Amanda for Betty\'s number. Amanda can\'t find it. Hannah asks Larry. Amanda asks Larry to text him. Hannah says she\'ll text him back. Hannah calls it a day and says she\'s going to go home. Hannah: "Bye bye"'}]


In [None]:
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

rouge_metric = load_metric('rouge')

#Calculate the rogue score for the test dataset using the pegasus model
score = calculate_metric_on_test_ds(dataset_samsum['test'], rouge_metric, model_pegasus, tokenizer, column_text = 'dialogue', column_summary='summary', batch_size=8 )


100%|██████████| 103/103 [11:04<00:00,  6.45s/it]


In [None]:
rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )

pd.DataFrame(rouge_dict, index = ['pegasus'])

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.300192,0.097804,0.226614,0.22661


In [None]:
#Prepares data for training

#Function takes in example batch of data as input, 
#Encodes the data batch using the pretrained tokenizer with a max length of 1024,
#returns a dictionary with the input token IDs, attention masks, and target token IDs

def convert_examples_to_features(example_batch):
    input_encodings = tokenizer(example_batch['dialogue'] , max_length = 1024, truncation = True )
    
    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(example_batch['summary'], max_length = 200, truncation = True )
        
    return {
        'input_ids' : input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids']
    }

#applies function to entire samsum dataset
dataset_samsum_pt = dataset_samsum.map(convert_examples_to_features, batched = True)



Map:   0%|          | 0/819 [00:00<?, ? examples/s]



In [None]:
#Data collector that can be used to collate and preprocess input and target sequences for Seq2Seq training.
from transformers import DataCollatorForSeq2Seq

seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)
from transformers import TrainingArguments, Trainer

#Training Arguments object is defined which contains varius hyperparameters and settings for training the model.
#output_dir: The output directory where the trained model checkpoints and logs will be saved.
#num_train_epochs: The number of training epochs to run.
#per_device_train_batch_size: The batch size to use for training.
#gradient_accumulation_steps: The number of gradient accumulation steps to take before backpropagation.
#logging_steps: The frequency (in steps) at which to log training metrics.
#evaluation_strategy: The evaluation strategy to use during training, which is set to evaluate every eval_steps steps.
#eval_steps: The frequency (in steps) at which to evaluate the model on the validation set.
#save_steps: The frequency (in steps) at which to save model checkpoints during training.

training_args = TrainingArguments(
    output_dir='bart-samsum', num_train_epochs=1, warmup_steps=500,
    per_device_train_batch_size=1, per_device_eval_batch_size=1,
    weight_decay=0.01, logging_steps=10,
    evaluation_strategy='steps', eval_steps=500, save_steps=1e6,
    gradient_accumulation_steps=16,
    push_to_hub=True
) 

In [None]:
#Creates trainer object
#model: The Pegasus model to be trained.
#args: The training arguments defined earlier.
#tokenizer: The tokenizer used to preprocess the input and target sequences.
#data_collator: The data collator used to collate and preprocess the input and target sequences.
#train_dataset: The preprocessed training dataset, which is a Dataset object created using the map method of a Dataset object representing the original dataset.
#eval_dataset: The preprocessed validation dataset, which is also a Dataset object created using the map method of a Dataset object representing the original dataset.

trainer = Trainer(model=model_pegasus, args= training_args,
                  tokenizer=tokenizer, data_collator=seq2seq_data_collator,
                  train_dataset=dataset_samsum_pt["train"], 
                  eval_dataset=dataset_samsum_pt["validation"])

Cloning https://huggingface.co/derekepp/bart-samsum into local empty directory.


In [None]:
trainer.train()

score = calculate_metric_on_test_ds(
    dataset_samsum['test'], rouge_metric, trainer.model, tokenizer, batch_size = 2, column_text = 'dialogue', column_summary= 'summary'
)

rouge_dict = dict((rn, score[rn].mid.fmeasure ) for rn in rouge_names )

pd.DataFrame(rouge_dict, index = [f'pegasus'] )



Step,Training Loss,Validation Loss
500,1.4698,1.443466


100%|██████████| 410/410 [13:34<00:00,  1.99s/it]


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.38775,0.191022,0.298044,0.297963


In [None]:
model_pegasus.push_to_hub("derekepp/bart-samsum")

pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/derekepp/bart-samsum/commit/8eca0cebd97c81e18a4baa0dd9913e5a0948c576', commit_message='Upload BartForConditionalGeneration', commit_description='', oid='8eca0cebd97c81e18a4baa0dd9913e5a0948c576', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
import re

In [None]:
gen_kwargs = {"length_penalty": 0.8, "num_beams":8, "max_length": 128}

def clean_transcript(transcript):
    # Remove timestamps
    transcript = re.sub(r'\d+:\d+', '', transcript)
    
    # Remove line breaks
    transcript = re.sub(r'\n', ' ', transcript)
    return transcript

 ##dataset_samsum["test"][0]["dialogue"]

sample_text = """phase four more like phase snore
0:05
that's right guys this one's been a long
0:07
time coming I was gonna make this video
0:08
last year after black panther came out
0:10
but I kind of got a little busy and you
0:12
know black panther 2 left me a little
0:14
more optimistic about the future of the
0:16
MCU it was one of the better movies in
0:19
phase four but then the movies got bad
0:22
again so here we go it's ramp time I'm
0:25
gonna talk about the current state of
0:26
the MCU and how I think that things
0:29
should change pretty soon or else I
0:31
don't see people sticking around for
0:32
much longer see when I look back on the
0:35
fourth phase of the MCU I do not have
0:37
strong feelings about most of the things
0:39
that I have watched it's disappointing
0:41
that right after end game they chose to
0:43
just flood the market with a plethora of
0:45
mediocre shows and movies the common
0:48
word I hear around the MCU nowadays is
0:51
mid and honestly I think it's probably
0:54
the most accurate use of that word
0:56
because mid doesn't really mean bad it
0:59
just means mediocre middle of the road
1:01
and yeah the recent MCU entries usually
1:04
aren't horrible every once in a while
1:06
you watch something pretty fun and cool
1:08
but most of the time it's just fine it
1:12
might be entertaining in the moment but
1:14
usually it's nothing that will blow you
1:15
away or leave a lasting impression the
1:18
infinity soccer was for the most part
1:20
well planned out and well executed and
1:22
of course there are a lot of mid movies
1:24
in that collection but there are also a
1:27
lot of iconic action-adventure movies in
1:29
there too and if you want to follow that
1:31
up you kind of have to go bigger and
1:34
Bolder and Marvel is not doing that
1:36
right now I will never understand why
1:38
they followed up the most popular movie
1:41
of the 21st century with this
1:52
dude I'm not gonna lie the Multiverse
1:54
Saga has potential to be more
1:56
interesting than the previous
1:57
overarching plot an evil time traveling
2:00
Mastermind with multiple incarnations of
2:03
himself trying to stop the Avengers is a
2:06
much more interesting story than purple
2:08
man wants to kill everybody but that's
2:11
only if you execute it well you kinda
2:14
have to plant seeds for this story at
2:16
every opportunity and so far about half
2:19
of the movies and shows from phase four
2:22
have had nothing to do with Kang or the
2:24
Multiverse or anything like that and
2:26
honestly Marvel's running out of time
2:27
because everybody's got a
2:29
Multiverse now and Marvel started this
2:31
train with spider-verse which wasn't an
2:33
MCU thing but still it led to no way
2:35
home which kind of got us in this
2:37
position but after that movie they've
2:40
kind of just been around
2:41
normally I'm okay with having a
2:43
standalone story every once in a while
2:45
that doesn't affect the overarching plot
2:47
but so far the last like dozen entries
2:50
really haven't had any connecting tissue
2:52
everything has been a standalone story
2:55
kind of feels like Marvel was just kind
2:57
of throwing at the wall and seeing
2:59
what sticks and obviously the thing that
3:01
everybody thought would be popular was
3:04
the most popular entry people very
3:06
clearly react well to the Multiverse
3:08
stuff but I don't think that's going to
3:10
last much longer especially since
3:13
everybody's copying it now instead of
3:15
following up phase 3 with stories people
3:17
want to see we have a movie about a dead
3:19
lady we have a movie where Thor kind of
3:21
around for a few hours we have a
3:23
movie that tries to introduce a bunch of
3:25
characters that have no personality a
3:27
movie so bad that Marvel's basically
3:29
pretending it never happened gee that's
3:32
a nice Multiverse Saga but tell me how
3:35
does Cersei the Eternal fit into all of
3:37
this
3:39
that movie ends in probably the funniest
3:41
way with her getting kidnapped by a
3:44
giant space god and it's just crazy that
3:48
not a single character in the Marvel
3:50
Cinematic Universe has brought up the
3:53"""

sample_text = clean_transcript(sample_text)

reference = dataset_samsum["test"][0]["summary"]

pipe = pipeline("summarization", model=model_ckpt)

## 
#print("Dialogue:")
#print(sample_text)


##print("\nReference Summary:")
##print(reference)


print("\nModel Summary:")
print(pipe(sample_text, **gen_kwargs)[0]["summary_text"])


Model Summary:
The recent MCU entries usually  aren't horrible every once in a while  you watch something pretty fun and cool  but most of the time it's just fine. I will never understand why  they followed up the most popular movie  of the 21st century with this dude. The Multiverse Saga has potential to be more interesting than the previous overarching plot an evil time traveling Mastermind with multiple incarnations of himself trying to stop the Avengers.
