
AUTHOR: Lok Yee Joey Cheung

This notebook tests the fine-tuned BART summarization model that was uploaded to the Hugging Face Hub.

### Installation and data preparation

In [None]:
# Shell out to nvidia-smi to show the GPU attached to this runtime.
# NOTE(review): the recorded output is "command not found", i.e. this run had no NVIDIA tooling on PATH.
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found


In [None]:
%%capture
# %%capture has to remain the first line of this cell; it swallows pip's install logs.
# Packages are installed unpinned, so every fresh runtime gets whatever release is current.
%pip install transformers
%pip install datasets
%pip install evaluate
# presumably the scoring backend needed by evaluate.load('rouge') further down — TODO confirm
%pip install rouge-score
# NOTE(review): py7zr is not imported anywhere in the visible notebook — confirm it is still required
%pip install py7zr
%pip install --upgrade pyarrow

In [None]:
# import transformers libraries
from transformers import BartTokenizer, BartForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments,GenerationConfig
from transformers import pipeline, DataCollatorForSeq2Seq
import torch
import evaluate
from datasets import Dataset
import numpy as np
#from textblob import TextBlob
#from sklearn.feature_extraction.text import TfidfVectorizer
import re
import nltk
nltk.download('punkt')
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Let pandas render up to 1000 characters per cell instead of its shorter default.
pd.options.display.max_colwidth = 1000

# Seed value kept for reproducibility.
# NOTE(review): no visible cell passes `seed` to a library — confirm it is applied where needed.
seed = 42

In [None]:
# Report the runtime environment (platform, Python and library versions) and
# choose the device name that inference should target.
import sys
import platform
import torch
import pandas as pd
import sklearn as sk

has_gpu = torch.cuda.is_available()
# is_built() only reports that this PyTorch binary was compiled with MPS
# support; is_available() also checks that the current machine can use it,
# so `device` is never set to "mps" on hardware that cannot run it.
has_mps = torch.backends.mps.is_available()
# Reuse has_gpu rather than querying CUDA a second time.
device = "mps" if has_mps else "cuda" if has_gpu else "cpu"

print(f"Python Platform: {platform.platform()}")
print(f"PyTorch Version: {torch.__version__}")
print()
print(f"Python {sys.version}")
print(f"Pandas {pd.__version__}")
print(f"Scikit-Learn {sk.__version__}")
print("NVIDIA/CUDA GPU is", "available" if has_gpu else "NOT AVAILABLE")
print("MPS (Apple Metal) is", "AVAILABLE" if has_mps else "NOT AVAILABLE")
print(f"Target device is {device}")

Python Platform: Linux-6.1.85+-x86_64-with-glibc2.35
PyTorch Version: 2.4.0+cu121

Python 3.10.12 (main, Jul 29 2024, 16:56:48) [GCC 11.4.0]
Pandas 2.1.4
Scikit-Learn 1.3.2
NVIDIA/CUDA GPU is available
MPS (Apple Metal) is NOT AVAILABLE
Target device is cuda


In [None]:
# Mount Google Drive at /content/drive; the test CSV is read from that mount in the next cell.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime — confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read samsum-test.csv from the mounted Drive folder into a DataFrame.
test = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/data7901,7903/data/samsum-test.csv'
)

### Data Cleaning

Remove null values

In [None]:
# Drop every row that has a missing value in any column.
test = test.dropna(axis=0, how='any')

### Load Vanilla BART-LARGE-XSUM

In [None]:
#pip install transformers -U
#pip install accelerate -U
#pip install --upgrade transformers
pip install accelerate==0.27.2



In [None]:
# Baseline summariser: the off-the-shelf facebook/bart-large-xsum checkpoint,
# used further down for side-by-side comparison with the fine-tuned model.
summarizer = pipeline(task='summarization', model='facebook/bart-large-xsum')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/309 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



### Load my model for testing


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

# Hub repository that holds the fine-tuned checkpoint under test.
model_repo_id = "joeycly/fine-tuned-bart"

# Tokenizer and weights are both pulled from the same repository.
tokenizer = AutoTokenizer.from_pretrained(model_repo_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_repo_id)

# Summarisation pipeline used for the examples below.
nlp = pipeline(
    task="summarization",
    model=model,
    tokenizer=tokenizer,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.59k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/274 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/999k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

In [None]:
import evaluate

# ROUGE metric object from `evaluate`, loaded once and reused by test_rouge.
rouge = evaluate.load(path='rouge')
def test_rouge(generated_summary, reference_summary):
    """Print ROUGE-1, ROUGE-2 and ROUGE-L for a single summary pair.

    Args:
        generated_summary: Candidate summary text to score.
        reference_summary: Reference summary text to score against.

    Returns:
        None. Each score is printed on its own line as ``NAME: 0.1234``.
    """
    metric_names = ["rouge1", "rouge2", "rougeL"]
    # `rouge` is the module-level metric object; each summary is wrapped in a
    # one-element list because compute() is called with lists of texts.
    scores = rouge.compute(
        predictions=[generated_summary],
        references=[reference_summary],
        rouge_types=metric_names,
        use_stemmer=True,
    )
    print("\n".join(f"{name.upper()}: {scores[name]:.4f}" for name in metric_names))

Example 1

In [None]:
# Example 1 input: a chat dialogue, one "Speaker: utterance" turn per line.
text1 = """Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye"""

In [None]:
# Get the candidate summary from the fine-tuned model (`nlp`). The raw pipeline
# result printed here is a one-element list of {'summary_text': ...} dicts.
summary1 = nlp(text1)
print(summary1)

[{'summary_text': "Hannah is looking for Betty's number. Amanda tells her to ask Larry. Hannah doesn't know him but Amanda thinks she should text him."}]


In [None]:
# Unwrap the summary string from the pipeline's [{'summary_text': ...}] result.
# The guard keeps this cell safe to re-run: on a second execution summary1 is
# already a str, and indexing it with 'summary_text' would raise TypeError.
if not isinstance(summary1, str):
    summary1 = summary1[0]['summary_text']

In [None]:
# Reference summary for Example 1.
reference1 = "Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry."

# Score the fine-tuned model's summary against the reference.
test_rouge(generated_summary=summary1, reference_summary=reference1)

ROUGE1: 0.5366
ROUGE2: 0.2051
ROUGEL: 0.3902


Example 2

In [None]:
# Example 2 input: a chat dialogue, one "Speaker: utterance" turn per line.
text4 = """Rita: I'm so bloody tired. Falling asleep at work. :-(
Tina: I know what you mean.
Tina: I keep on nodding off at my keyboard hoping that the boss doesn't notice..
Rita: The time just keeps on dragging on and on and on....
Rita: I keep on looking at the clock and there's still 4 hours of this drudgery to go.
Tina: Times like these I really hate my work.
Rita: I'm really not cut out for this level of boredom.
Tina: Neither am I."""

In [None]:
# Baseline summary from the vanilla facebook/bart-large-xsum pipeline (`summarizer`),
# displayed for comparison with the fine-tuned model's summary in the next cell.
summarizer(text4)

[{'summary_text': 'Tina and Rita have been talking about the boredom of working at the same job.'}]

In [None]:
# Fine-tuned model's summary: take 'summary_text' from the first pipeline result.
summary4 = nlp(text4)[0]['summary_text']
print(summary4)

Rita and Tina hate the boredom of their jobs.


In [None]:
# Reference summary for Example 2.
reference4 = "Rita and Tina are bored at work and have still 4 hours left."

# Score the fine-tuned model's summary against the reference.
test_rouge(generated_summary=summary4, reference_summary=reference4)

ROUGE1: 0.2727
ROUGE2: 0.2000
ROUGEL: 0.2727


Example 3

In [None]:
# Example 3 input: a chat dialogue, one "Speaker: utterance" turn per line.
text5 = """Ivan: hey eric
Eric: yeah man
Ivan: so youre coming to the wedding
Eric: your brother's
Ivan: yea
Eric: i dont know mannn
Ivan: YOU DONT KNOW??
Eric: i just have a lot to do at home, plus i dont know if my parents would let me
Ivan: ill take care of your parents
Eric: youre telling me you have the guts to talk to them XD
Ivan: thats my problem
Eric: okay man, if you say so
Ivan: yea just be there
Eric: alright"""

In [None]:
# Baseline summary from the vanilla facebook/bart-large-xsum pipeline (`summarizer`),
# displayed for comparison with the fine-tuned model's summary in the next cell.
summarizer(text5)

[{'summary_text': 'A friend of mine is getting married this weekend, and he has been talking to his brother about it.'}]

In [None]:
# Fine-tuned model's summary: take 'summary_text' from the first pipeline result.
summary5 = nlp(text5)[0]['summary_text']
print(summary5)

Eric is not sure whether to go to Ivan's brother's wedding as he has lots to do at home. Ivan will talk to Eric's parents.


In [None]:
# Reference summary for Example 3.
reference5 = "Eric doesn't know if his parents let him go to Ivan's brother's wedding. Ivan will talk to them."

# Score the fine-tuned model's summary against the reference.
test_rouge(generated_summary=summary5, reference_summary=reference5)

ROUGE1: 0.5306
ROUGE2: 0.3830
ROUGEL: 0.4898
