In [1]:
!nvidia-smi

Sat Mar  9 19:41:15 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
import torch

torch.__version__

'2.1.0+cu121'

In [3]:
# Installing necessary libraries

!pip install transformers datasets evaluate scikit-learn
!pip install accelerate -U
!pip install pandas polars pickle5 tabulate
!pip install wandb tqdm

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: dill, responses, multiproc

In [4]:
# Checking if system is running on cuda

if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")

print(device)

cuda:0


Task1: Implementation of Decoding Algorithms (Jaeeun Lee)

In [5]:
# Importing libraries
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM
)
from transformers import logging
import torch
import pandas as pd

logging.set_verbosity_error()# Suppress warning

In [6]:
# Choosing tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
model.generation_config.pad_token_id = model.generation_config.eos_token_id

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [7]:
# Prompts and decoding algorithms
prompts = [
    "Today I believe we can finally",
    "In a world full of chaos",
    "The future of technology will bring",
    "When the sun sets",
    "Once upon a time"
]
decoding_algorithms = [
    ("Greedy Search", {"do_sample":False, "max_length": 30}),
    ("Beam Search", {"num_beams": 5, "max_length": 30, "early_stopping": True}),
    ("Top-K Sampling", {"do_sample": True, "max_length": 30, "top_k": 50}),
    ("Top-P Sampling", {"do_sample": True, "max_length": 30, "top_p": 0.75})]


In [8]:
# Function to calculate perplexity
def calculate_perplexity(max_length, stride, input_ids):
    seq_len = input_ids.size(1)

    # max_length = 2 #max length of context
    # stride = 1

    nlls = []
    prev_end_loc = 0
    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)

            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    ppl = torch.exp(torch.stack(nlls).mean())
    return ppl

In [9]:
# Output text and perplexity for each method
results = []

for prompt in prompts:
    row = {"Prompt": prompt}
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    dict_parameters = {}
    dict_perplexity = {}

    # Iterate through all the decoding algorithms
    for algorithm_name, params in decoding_algorithms:
        outputs = model.generate(input_ids, **params)
        generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
        output_ids = tokenizer(generated_text, return_tensors="pt").input_ids
        perplexity = calculate_perplexity(2,1,output_ids)
        row[algorithm_name] = generated_text

        # Add the parameters and perplexity of each decoding algorithm to dict_parameters and dict_perplexity, respectively
        dict_parameters[algorithm_name] = params
        dict_perplexity[algorithm_name] = perplexity.item()

    row["Parameters"] = dict_parameters
    row["Perplexity"] = dict_perplexity
    results.append(row)

df = pd.DataFrame(results)

In [10]:
# Saving dataframe as tab1 (task_1) to an excel file
df.to_excel('./TeamNLPitch_HW3.xlsx', sheet_name='task_1', engine='openpyxl')

Task2: Decoding for downstream generation tasks (Jiyoon Pyo)

In [11]:
# Loading the XSUM dataset from Huggingface and converting it into a dataframe
# Since we are using a pretrained model, will only load the test dataset and filter out the first 50
import polars as pl
from datasets import load_dataset

dataset = load_dataset("xsum")
test_data = dataset['test']

pl_xsum_dataset = pl.DataFrame(
    {
        'input_text':test_data['document'],
        'reference': test_data['summary']
    }
).head(50)

print(pl_xsum_dataset)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/5.76k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.24k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/204045 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11332 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11334 [00:00<?, ? examples/s]

shape: (50, 2)
┌───────────────────────────────────┬───────────────────────────────────┐
│ input_text                        ┆ reference                         │
│ ---                               ┆ ---                               │
│ str                               ┆ str                               │
╞═══════════════════════════════════╪═══════════════════════════════════╡
│ Prison Link Cymru had 1,099 refe… ┆ There is a "chronic" need for mo… │
│ Officers searched properties in … ┆ A man has appeared in court afte… │
│ Jordan Hill, Brittany Covington … ┆ Four people accused of kidnappin… │
│ The 48-year-old former Arsenal g… ┆ West Brom have appointed Nicky H… │
│ …                                 ┆ …                                 │
│ Kremlin spokesman Dmitry Peskov … ┆ Russia has said it will carry on… │
│ Hooker Hughes, 20, featured for … ┆ Wales Under-20 Grand Slam winner… │
│ Russia is fuming, in the words o… ┆ The expulsion of 35 Russian dipl… │
│ Instead of fees risin

In [12]:
# Loading T% (small) model pretrained on the XSUM Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("adasnew/t5-small-xsum")
model = AutoModelForSeq2SeqLM.from_pretrained("adasnew/t5-small-xsum")

tokenizer_config.json:   0%|          | 0.00/1.95k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [13]:
def generating_summary(input_text:dict) -> dict:
    """
    Generates a set of sumamries  based on the input text.
    Uses the same parameters for Greedy Search, Beam Search, Top-K Sampling, Top-P Sampling as done previously in Task 1

    : param: input_text | type:dict = dictionary type object with the raw input text
    : return: dict_generated_summary | type:dict = dictionary type object with four summaries generated with the various decoding algorithms
    """
    raw_text = input_text['input_text']
    input_ids = tokenizer(raw_text, return_tensors='pt').input_ids

    dict_generated_summary = {}

    for algorithm_name, params in decoding_algorithms:
        outputs = model.generate(input_ids, **params)
        generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

        dict_generated_summary[algorithm_name] = generated_text

    return dict_generated_summary

In [14]:
# Adding four columns (Greedy Search, Beam Search, Top-K Sampling, Top-P Sampling) of summaries to the original dataframe

import time
start_time = time.time()

pl_xsum_output = pl_xsum_dataset.with_columns(
    struct_output = pl.struct(pl.col('input_text')).map_elements(generating_summary)
).unnest(
    'struct_output'
)

print(pl_xsum_output)
print(f"Time_elapsed on generating 50 summary samples based on four decoding algorithms: {time.time() - start_time}s")

shape: (50, 6)
┌────────────────┬────────────────┬────────────────┬───────────────┬───────────────┬───────────────┐
│ input_text     ┆ reference      ┆ Greedy Search  ┆ Beam Search   ┆ Top-K         ┆ Top-P         │
│ ---            ┆ ---            ┆ ---            ┆ ---           ┆ Sampling      ┆ Sampling      │
│ str            ┆ str            ┆ str            ┆ str           ┆ ---           ┆ ---           │
│                ┆                ┆                ┆               ┆ str           ┆ str           │
╞════════════════╪════════════════╪════════════════╪═══════════════╪═══════════════╪═══════════════╡
│ Prison Link    ┆ There is a     ┆ A homeless     ┆ One-bedroom   ┆ A homeless    ┆ An            │
│ Cymru had      ┆ "chronic" need ┆ charity has    ┆ flats in      ┆ charity is    ┆ ex-offenders  │
│ 1,099 refe…    ┆ for mo…        ┆ said it i…     ┆ Wales could…  ┆ offering s…   ┆ has been      │
│                ┆                ┆                ┆               ┆        

In [15]:
# Convert polars dataframe to pandas
df_xsum_output = pl_xsum_output.to_pandas()
del pl_xsum_output

# Saving dataframe with sheet_name task_2
with pd.ExcelWriter('./TeamNLPitch_HW3.xlsx', engine='openpyxl', mode='a') as writer:
    df_xsum_output.to_excel(writer, sheet_name='task_2')

In [16]:
!pwd

/content


Task 3.1 Automatic evaluation (Yongtian Ou)

In [17]:
!pip install rouge_score
!pip install bert_score
import evaluate
import numpy as np

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=38574d678eab686bb353f7d55b8955d45740253e3ea7ab8ee664718b3cf3ddda
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: bert_score
Successfully installed bert_score-0.3.13


In [18]:
# Load in evaluation metrics
rouge = evaluate.load('rouge')
bert = evaluate.load("bertscore")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

In [19]:
# inherit previous output
df_metrics = df_xsum_output

#df_metrics = df_metrics.drop(columns=['Greedy Search rougeL','Greedy Search bertf1',	'Beam Search rougeL',	'Top-K Sampling rougeL',	'Top-P Sampling rougeL'])


In [20]:
# Calculate metrics for each algorithm and each sample

algorithm_names = ['Greedy Search','Beam Search', 'Top-K Sampling','Top-P Sampling']

for algo in algorithm_names:
  predictions_list =df_metrics[algo].tolist()
  rouge_name = algo+' rougeL'
  bert_name = algo+' bertf1'
  df_metrics[rouge_name] = np.zeros([50,1])
  df_metrics[bert_name] = np.zeros([50,1])
  for idx in range(len(df_metrics)):
    rouge_score = rouge.compute(predictions=[predictions_list[idx]], references=[df_xsum_output['reference'][idx]])
    bert_score = bert.compute(predictions=[predictions_list[idx]], references=[df_xsum_output['reference'][idx]],lang='en')
    df_metrics[rouge_name][idx] = rouge_score['rougeL']
    df_metrics[bert_name][idx] = bert_score['f1'][0]


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_metrics[rouge_name][idx] = rouge_score['rougeL']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_metrics[bert_name][idx] = bert_score['f1'][0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_metrics[rouge_name][idx] = rouge_score['rougeL']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_metrics[bert

In [21]:
# Calculate average scores
avg_scores = df_metrics.mean(numeric_only=True)
print(avg_scores)

Greedy Search rougeL     0.243809
Greedy Search bertf1     0.880895
Beam Search rougeL       0.228699
Beam Search bertf1       0.881913
Top-K Sampling rougeL    0.180165
Top-K Sampling bertf1    0.868944
Top-P Sampling rougeL    0.198815
Top-P Sampling bertf1    0.873077
dtype: float64


In [22]:
df_metrics.head()

Unnamed: 0,input_text,reference,Greedy Search,Beam Search,Top-K Sampling,Top-P Sampling,Greedy Search rougeL,Greedy Search bertf1,Beam Search rougeL,Beam Search bertf1,Top-K Sampling rougeL,Top-K Sampling bertf1,Top-P Sampling rougeL,Top-P Sampling bertf1
0,"Prison Link Cymru had 1,099 referrals in 2015-...","There is a ""chronic"" need for more housing for...","A homeless charity has said it is ""desperate"" ...",One-bedroom flats in Wales could save the publ...,"A homeless charity is offering support for ""cr...",An ex-offenders has been released from prison ...,0.235294,0.90872,0.235294,0.890671,0.3,0.89008,0.27027,0.886038
1,Officers searched properties in the Waterfront...,"A man has appeared in court after firearms, am...",A man has appeared in court charged with firea...,A man has appeared in court charged with firea...,A man has appeared in court charged with murde...,"Police in Edinburgh have recovered a £25,000 f...",0.62069,0.94419,0.62069,0.94419,0.625,0.943164,0.181818,0.885192
2,"Jordan Hill, Brittany Covington and Tesfaye Co...",Four people accused of kidnapping and torturin...,Two men have appeared in court charged with ag...,Two men have appeared in court charged with ag...,A couple have appeared in court charged with t...,A man accused of beating a white victim with a...,0.2,0.885391,0.2,0.885391,0.181818,0.891356,0.222222,0.89402
3,The 48-year-old former Arsenal goalkeeper play...,West Brom have appointed Nicky Hammond as tech...,West Brom have appointed defender John Ayrda a...,West Brom have appointed West Brom's former go...,West Brom defender David Burnley has been appo...,West Brom have appointed former West Brom keep...,0.413793,0.907166,0.352941,0.91093,0.266667,0.900518,0.294118,0.881147
4,Restoring the function of the organ - which he...,The pancreas can be triggered to regenerate it...,The diet of mice that eat fasting has reversed...,People with type 1 and type 2 diabetes could b...,It is an incredibly fasting diet that regenera...,A study has shown that a diet that mimics eati...,0.166667,0.883899,0.102564,0.865332,0.108108,0.877159,0.097561,0.869659


In [23]:

# Saving dataframe with sheet_name task_3.1
with pd.ExcelWriter('./TeamNLPitch_HW3.xlsx', engine='openpyxl', mode='a') as writer:
    df_metrics.to_excel(writer, sheet_name='task_3.1')
