In [None]:
# Mount Google Drive into this Colab runtime at /content/drive.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


# The command below displays the current status of the NVIDIA GPU(s) on the system

- If the GPU is not being utilized, configure the machine learning framework to use the GPU.
- If the code is running slowly, optimize the code to better utilize the GPU.
- This command is a useful tool for debugging and optimizing machine learning projects that utilize NVIDIA GPUs.

In [None]:
# Print the NVIDIA GPU status report for this runtime to confirm a GPU is attached
# and to see how much of it is currently in use.
!nvidia-smi

In [None]:
# Install necessary packages for the project
!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

# The above command installs the following packages:
# - transformers (with the sentencepiece extra): models and tokenizers for NLP tasks; used here for summarization
# - datasets: Hugging Face library for loading and processing datasets
# - sacrebleu: a library for computing BLEU scores, a metric for evaluating machine-translated text
# - rouge_score: a library for computing ROUGE scores, the n-gram overlap metric commonly used to evaluate summaries
# - py7zr: a library for working with 7z archives, a type of compressed file format

In [None]:
# Install and upgrade necessary packages for the project
!pip install --upgrade accelerate
!pip uninstall -y transformers accelerate
!pip install transformers accelerate

# The above commands install and upgrade the following packages:
# - accelerate: a library for optimizing PyTorch and TensorFlow code for CPU and GPU performance
# - transformers: a popular library for natural language processing (NLP) tasks such as text classification and language translation

# The second command uninstalls the previously installed versions of transformers and accelerate to ensure that the latest versions are installed.

# These packages are necessary to optimize their code for CPU and GPU performance and perform NLP tasks such as text classification and language translation.

In [None]:
# Import necessary packages for the project
from transformers import pipeline, set_seed
from datasets import load_dataset, load_from_disk
import matplotlib.pyplot as plt
import pandas as pd
from datasets import load_metric
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import nltk
from nltk.tokenize import sent_tokenize
from tqdm import tqdm
import torch

# The packages imported above provide the NLP models, dataset handling, plotting and progress reporting used in the rest of this notebook.
# - transformers: a popular library for NLP tasks such as text classification and language translation
# - datasets: a collection of datasets for NLP tasks, including the popular Hugging Face datasets
# - matplotlib: a library for creating visualizations in Python
# - pandas: a library for data manipulation and analysis
# - nltk: a library for natural language processing tasks such as tokenization and stemming
# - tqdm: a library for adding progress bars to Python loops
# - torch: a library for machine learning tasks such as neural network training and inference

In [None]:
# Download the "punkt" tokenizer data from the nltk library, which is used for tokenizing text into sentences.
nltk.download("punkt")

In [None]:
# Load the pre-trained BigBird-Pegasus checkpoint (tokenizer + seq2seq model)
# and place the model on the GPU when one is available, otherwise on the CPU.
model_name = "google/bigbird-pegasus-large-arxiv"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model_bigbird = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model_bigbird = model_bigbird.to(device)

# Names used above:
# - model_name: the checkpoint name of the pre-trained BigBird-Pegasus model
# - device: the device the model runs on (either "cuda" or "cpu")
# - AutoTokenizer: picks the tokenizer class that matches the checkpoint name
# - AutoModelForSeq2SeqLM: picks the sequence-to-sequence model class that matches the checkpoint name

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator that batches the tokenized examples for the seq2seq model during training.
seq2seq_data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model_bigbird,
)

In [None]:
# Download and extract the summarizer data
!wget https://github.com/InsiderCloud/summarizer-datasets/raw/main/dataset_samsum.zip
!wget https://github.com/InsiderCloud/summarizer-datasets/raw/main/dataset_scrolls_books.zip
!wget https://github.com/InsiderCloud/summarizer-datasets/raw/main/dataset_scrolls_dialogue.zip
!wget https://github.com/InsiderCloud/summarizer-datasets/raw/main/meeting_bank.zip


# The above commands download and extract the summarizer data from a GitHub repository.
# - wget: a command-line utility for downloading files from the web
# - unzip: a command-line utility for extracting files from a zip archive

In [None]:
!unzip dataset_samsum.zip
!unzip dataset_scrolls_books.zip
!unzip dataset_scrolls_dialogue.zip
!unzip meeting_bank.zip

# List datasets available on the Hugging Face Hub

In [None]:
# huggingface_hub provides list_datasets, which the next cell imports.
!pip install huggingface_hub

In [None]:
from huggingface_hub import list_datasets

# Listing every dataset on the Hub is slow and floods the cell output, so only
# a bounded sample of dataset ids is printed. Raise `limit` (or drop it) to see more.
print([dataset.id for dataset in list_datasets(limit=50)])

## Load the datasets

The datasets extracted above (meeting bank, samsum, scrolls books and scrolls dialogue) are loaded from disk using the `load_from_disk` function from the `datasets` package. They provide the data used to train and test the summarization model.

The code to load each dataset is shown below:

In [None]:
# Load the meeting bank dataset from disk and display its structure
dataset_meeting_bank = load_from_disk("dataset_meeting_bank")
dataset_meeting_bank

In [None]:
# @title Load the samsum dataset
# Load the samsum dataset from disk and display its structure
dataset_samsum = load_from_disk("dataset_samsum")
dataset_samsum

In [None]:
# Load the scrolls books dataset from disk and display its structure
dataset_scrolls_books = load_from_disk("dataset_scrolls_books")
dataset_scrolls_books

In [None]:
# Load the scrolls dialogue dataset from disk and display its structure
dataset_scrolls_dialogue = load_from_disk("dataset_scrolls_dialogue")
dataset_scrolls_dialogue

## Print information about the samsum dataset

The following code prints information about the samsum dataset, including the length of each split, the column names, and an example dialogue and summary.

In [None]:
# Summarize the samsum dataset: size of every split, column names, and one test example.
split_lengths = list(map(len, dataset_samsum.values()))
print(f"Split lengths: {split_lengths}")
print(f"Features: {dataset_samsum['train'].column_names}")

# Show the test record at index 1 as a dialogue / summary pair.
print("\nDialogue:", dataset_samsum["test"][1]["dialogue"], sep="\n")
print("\nSummary:", dataset_samsum["test"][1]["summary"], sep="\n")

## Convert examples to features for training the summarization model

The following code defines a function for converting a batch of examples to features for training the summarization model. The function tokenizes the input dialogue and target summary using the tokenizer and returns the input IDs, attention mask, and target labels as a dictionary.

In [13]:
# Convert a batch of examples to features for training the summarization model
def convert_examples_to_features(example_batch,text_field,summary_field,
                                 max_input_length=4096,max_target_length=512):
  """Tokenize a batch of source texts and their target summaries.

  Uses the notebook-level `tokenizer`.

  Args:
    example_batch: mapping from column name to the batch of values for that
      column (presumably lists of strings, as passed by a batched
      `datasets` map call -- confirm against the caller).
    text_field: name of the column holding the text to summarize.
    summary_field: name of the column holding the reference summary.
    max_input_length: inputs are truncated to this many tokens.
    max_target_length: summaries are truncated to this many tokens.

  Returns:
    A dict with 'input_ids' and 'attention_mask' for the inputs and 'labels'
    holding the token ids of the tokenized summaries.
  """
  input_encode = tokenizer(example_batch[text_field], max_length=max_input_length, truncation=True)
  # `text_target=` is the supported replacement for the deprecated
  # `tokenizer.as_target_tokenizer()` context manager: with only `text_target`
  # given, the call returns the encodings of the target text.
  target_encode = tokenizer(text_target=example_batch[summary_field], max_length=max_target_length, truncation=True)
  return {
      'input_ids': input_encode['input_ids'],
      'attention_mask': input_encode['attention_mask'],
      'labels': target_encode['input_ids']
  }

In [None]:
# Tokenize every samsum split: 'dialogue' is the model input, 'summary' the target.
dataset_samsum_pt = dataset_samsum.map(
    convert_examples_to_features,
    batched=True,
    fn_kwargs={"text_field": 'dialogue', "summary_field": 'summary'},
)

In [None]:
# Tokenize every meeting bank split: 'source' is the model input, 'reference' the target.
dataset_meeting_bank_pt = dataset_meeting_bank.map(
    convert_examples_to_features,
    batched=True,
    fn_kwargs={"text_field": 'source', "summary_field": 'reference'},
)

In [None]:
# Tokenize every scrolls books split: 'input' is the model input, 'output' the target.
dataset_scrolls_books_pt = dataset_scrolls_books.map(
    convert_examples_to_features,
    batched=True,
    fn_kwargs={"text_field": 'input', "summary_field": 'output'},
)

In [None]:
# Tokenize every scrolls dialogue split: 'input' is the model input, 'output' the target.
dataset_scrolls_dialogue_pt = dataset_scrolls_dialogue.map(
    convert_examples_to_features,
    batched=True,
    fn_kwargs={"text_field": 'input', "summary_field": 'output'},
)

**Training**

In [16]:
from transformers import TrainingArguments, Trainer

# Training configuration shared by every Trainer created in this notebook.
trainer_args = TrainingArguments(
    output_dir='bigbird-samsum',
    num_train_epochs=2,
    # NOTE(review): the logged samsum run only shows evaluations up to step
    # 1500, so a 5000-step warmup likely never completes and the learning rate
    # never reaches its peak -- consider a smaller value or warmup_ratio.
    warmup_steps=5000,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    logging_steps=10,
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy`; switch if the installed version rejects this name.
    evaluation_strategy='steps',
    eval_steps=500,
    # Integer step count (previously the float 1e6). Presumably chosen so large
    # that no intermediate checkpoint is written during these short runs.
    save_steps=1_000_000,
    # With a per-device batch size of 1 this gives 16 examples per optimizer step per device.
    gradient_accumulation_steps=16
)

Train on samsum dataset

In [17]:
# Trainer for fine-tuning on samsum: trains on the 'train' split and
# evaluates on the 'validation' split using the shared training arguments.
trainer = Trainer(
    model=model_bigbird,
    args=trainer_args,
    tokenizer=tokenizer,
    data_collator=seq2seq_data_collator,
    train_dataset=dataset_samsum_pt["train"],
    eval_dataset=dataset_samsum_pt["validation"],
)

In [None]:
# Fine-tune model_bigbird on the samsum training split; validation loss is
# reported every `eval_steps` steps as configured in trainer_args.
trainer.train()

You're using a PegasusTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Attention type 'block_sparse' is not possible if sequence_length: 203 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


Step,Training Loss,Validation Loss
500,4.8844,4.660931
1000,4.227,4.031272
1500,3.6863,3.697585


Train on the meeting bank dataset

In [None]:
# Trainer for the meeting bank dataset. It reuses the same model_bigbird
# object, so training continues from the weights left by the previous run.
trainer = Trainer(
    model=model_bigbird,
    args=trainer_args,
    tokenizer=tokenizer,
    data_collator=seq2seq_data_collator,
    train_dataset=dataset_meeting_bank_pt["train"],
    eval_dataset=dataset_meeting_bank_pt["validation"],
)

In [None]:
# Fine-tune model_bigbird on the meeting bank training split; validation loss
# is reported every `eval_steps` steps as configured in trainer_args.
trainer.train()

Evaluation

In [None]:
from sqlalchemy import column
def generate_batch_sized_chunks(list_of_elements,batch_size):
  """Yield consecutive slices of `list_of_elements`, each at most `batch_size` long.

  The final slice is shorter when the length is not a multiple of `batch_size`.
  """
  total = len(list_of_elements)
  yield from (
      list_of_elements[start:start + batch_size]
      for start in range(0, total, batch_size)
  )

def calculate_metric_on_test_ds(dataset,metric,model,tokenizer,
                                batch_size=16,device=device,
                                column_text="transcribe",
                                column_summary="highlights"):
  """Generate summaries for a dataset in batches and score them with `metric`.

  Args:
    dataset: object indexable by column name; `dataset[column_text]` and
      `dataset[column_summary]` must be sliceable sequences of equal length.
    metric: object exposing `add_batch(predictions=..., references=...)` and
      `compute()`.
    model: model exposing `generate(...)`.
    tokenizer: callable tokenizer that also provides `decode(...)`.
    batch_size: number of examples processed per generation call.
    device: device the input tensors are moved to before generation
      (defaults to the notebook-level `device` at definition time).
    column_text: name of the column with the texts to summarize.
    column_summary: name of the column with the reference summaries.

  Returns:
    Whatever `metric.compute()` returns after all batches have been added.
  """
  transcribe_batches = list(generate_batch_sized_chunks(dataset[column_text],batch_size))
  target_batches = list(generate_batch_sized_chunks(dataset[column_summary],batch_size))

  for transcribe_batch, target_batch in tqdm( zip(transcribe_batches,target_batches),total=len(transcribe_batches)):
    # Every batch is truncated/padded to exactly 4096 tokens.
    inputs = tokenizer(transcribe_batch,max_length=4096,truncation= True,padding="max_length",return_tensors="pt")

    summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                               attention_mask=inputs["attention_mask"].to(device),
                               length_penalty=0.8,num_beams=8,max_length=512)

    decoded_summaries = [tokenizer.decode(s,skip_special_tokens=True,clean_up_tokenization_spaces=True) for s in summaries]

    # Pegasus-style models emit the literal marker "<n>" for a line break;
    # turn it into a space. The previous code replaced the empty string "",
    # which inserts a space between every character and ruins the scores.
    decoded_summaries = [d.replace("<n>"," ") for d in decoded_summaries]

    metric.add_batch(predictions=decoded_summaries,references=target_batch)


  score = metric.compute()
  return score


In [None]:
# ROUGE metric object and the score names pulled out of its result later on.
rouge_metric = load_metric("rouge")
rouge_names = [
    "rouge1",
    "rouge2",
    "rougeL",
    "rougeLsum",
]

In [None]:
import torch, gc
import os

# Pick the device from the hardware actually present instead of forcing
# 'cuda', so code that moves tensors to `device` also works on a CPU-only runtime.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Free unreferenced Python objects, then release cached GPU memory (if any)
# before running evaluation.
gc.collect()
if torch.cuda.is_available():
  torch.cuda.empty_cache()

In [None]:
# Score the fine-tuned model on the samsum test split with ROUGE.
score = calculate_metric_on_test_ds(
    dataset_samsum["test"],
    rouge_metric,
    trainer.model,
    tokenizer,
    batch_size=2,
    column_text="dialogue",
    column_summary="summary",
)

# Keep the mid F-measure of each ROUGE variant and show them as a one-row table.
rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}

pd.DataFrame(rouge_dict, index=["bigbird"])

In [None]:
# Write the fine-tuned model to the local directory "pt_bigbird-samsum-model".
model_bigbird.save_pretrained("pt_bigbird-samsum-model")

In [None]:
# Write the tokenizer files to the local directory "pt_bigbird-samsum-tokenizer".
tokenizer.save_pretrained("pt_bigbird-samsum-tokenizer")

In [None]:
# Reload the tokenizer from the directory written by tokenizer.save_pretrained
# in the previous cell. The former path "/content/tokenizer" is never created
# anywhere in this notebook, so loading from it fails on a fresh runtime.
tokenizer = AutoTokenizer.from_pretrained("pt_bigbird-samsum-tokenizer")

Prediction

In [None]:
sample_text = """
Steve: Welcome back,
Brad. This is the second part of our interview, and I want to get through these as quickly as possible because I know you have somewhere to be, so …
Brad: Thank you. It's my pleasure.
Steve: It says on the website you worked doing graphic design for a local branding agency before branching out and starting your own business. Was … was that
Brad: Yes.
Steve: So was that a conscious choice, or ...?
Brad: Yes. Well, ac
Steve: Sorry, uh, one sec. Can you, um ... We can hear the fan.
Charlene:
Steve: N-no, no, that's perfect. Thanks, Charlene. Good. So, uh, was starting your own company intentional, or did you just sort of fall into it?
Brad: Actually, sort of both. I actually started out doing it as a favor for a friend. He … I didn't really know what I was doing at the time, but, uh, at some point I found out I was having some success with that, and, um, so I actually started doing it for local businesses and restaurants, and, uh, so then it kind of took off from there, and then I figured, well, if I'm going to be taking on all these new clients I mi-might as well get a website, you know, going, and make something out of this, you know?
Steve: Sure. Sure. So what kind of challenges did you experience when you were starting out, that you weren't expecting?
Brad: Hmm, challenges I wasn't expecting. Uh …
Steve: [laughs] I keep putting you on the spot. I don't think I put that one in the questions either. We're … we're just ad-libbing here.
Brad: N-no, it's fine. So, um, challenges …
Steve: Yeah. It could be, like, funding, or anything. So I forgot to ask you earlier, did you bootstrap?
Brad: Oh, yeah. Oh, yeah. I had, like, literally, like, $200 in the bank when I started out. And that's Canadian cur… Canadian dollars, so, that's like, what, uh, two cents US? [laughs] Nah, just kidding. Pretty sad though.
Steve: Oh, really? Was that
Brad: Yeah. Well, and an AdWords credit, maybe $50 or $100 or something like that, but I had no idea what I was doing at the time.
Steve: Right. So where did you get ... I guess, how did you get the word out about your business and what you were doing?
Brad: Uh ... phew. Well, besides the referrals I was getting, I, um … I actually enlisted my first few clients by this post on Facebook and asking my friend, um ... My one friend, he's got, like, 8,000 Facebook friends and I have, like, two, and one of them's my mom.
Steve: [laughing]
Brad: No, serious,
Steve. So, I actually asked this buddy to help me out and put this, like, thing up for me, and he actually went and … He's [coughs] and he has a lot of friends who do that kind of thing, and so I actually got a few leads out of the deal that I still do work for sometimes, and one of them is actually my biggest client, so yeah.
Steve: Nice. Nice.
Brad: Yeah. Th-then the other thing I did, actually, was sort of local outreach, uh, with local magazines and newspapers and whatnot, and I got a couple of mentions that way that led to my first "real" clients. You know, like ... [clears throat] Not that they weren't real before, but sometimes when they're your friends it kind of feels like they're doing a favor, you know? And yeah, and I did actually have some luck also with networking and meetup groups and stuff like that – you know, local – but, uh, I'm actually a bit of an introvert, so it took me a while to get around to it. [laughs]
Steve: No way. I don't believe it. Do you have a Twitter?
Brad: Yeah, I have a Twitter. [laughs] "A Twitter." But, uh, I don't think I've ever posted a tweet. I can't even remember my password. I'm too old for that stuff,
Steve. I'm pushing 50 next month.
Steve: [laughs]
Brad: Well,
Steve, you know what they say. Nothing beats good old-fashioned PR for a local business. I don't know who "they" are.


"""

In [None]:
# Generation settings passed to the summarization pipeline in the next cell.
gen_kwargs = {'length_penalty':0.9,'num_beams':16,'max_length':512}

sample_text1 = dataset_samsum['test'][0]["dialogue"]

reference = dataset_samsum['test'][0]["summary"]

# Load the model from the directory written by model_bigbird.save_pretrained
# ("pt_bigbird-samsum-model"); the former name "bigbird-samsum-model" is not
# produced anywhere in this notebook.
pipe = pipeline("summarization",model="pt_bigbird-samsum-model",tokenizer=tokenizer)

# NOTE(review): the dialogue printed here is `sample_text`, while `reference`
# is the summary of samsum test example 0 (`sample_text1`), so the printed
# reference does not describe the printed dialogue -- confirm which is intended.
print("Dialoge:")
print(sample_text)

print("\n reference Sumamry:")
print(reference)

print("\nModel Summary:")

In [None]:
# Summarize sample_text with the pipeline and print the text of the first result.
print(
    pipe(sample_text, **gen_kwargs)[0]["summary_text"]
)

this is the first part of an interview with the owner of a local graphic design business.
