In [1]:
# Show the GPU, driver version and CUDA version visible to this session
!nvidia-smi

Fri Mar  8 12:04:29 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.103.01   Driver Version: 470.103.01   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A40          On   | 00000000:48:00.0 Off |                    0 |
|  0%   27C    P8    22W / 300W |      0MiB / 45634MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import torch

# Display the installed PyTorch version as the cell output
torch.__version__

'2.1.2+cu118'

In [3]:
# Installing necessary libraries

!pip install transformers datasets evaluate scikit-learn
!pip install accelerate -U
!pip install pandas polars pickle5 tabulate
!pip install wandb tqdm

Defaulting to user installation because normal site-packages is not writeable
[0mDefaulting to user installation because normal site-packages is not writeable


[0mDefaulting to user installation because normal site-packages is not writeable
[0mDefaulting to user installation because normal site-packages is not writeable


[0m

In [4]:
# Select the compute device: the first CUDA GPU when one is available, otherwise the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

print(device)

cuda:0


Task1: Implementation of Decoding Algorithms (Jaeeun Lee)

In [5]:
# Importing libraries
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM
)
from transformers import logging
import torch
import pandas as pd

# Raise the transformers log threshold to errors only, which hides its warnings
logging.set_verbosity_error()

In [6]:
# Load the tokenizer and causal language model from the same "gpt2" checkpoint
GPT2_CHECKPOINT = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(GPT2_CHECKPOINT)
model = AutoModelForCausalLM.from_pretrained(GPT2_CHECKPOINT)

# Reuse the end-of-sequence token id as the padding token id during generation
model.generation_config.pad_token_id = model.generation_config.eos_token_id

In [7]:
# Prompts to continue, and the decoding strategies to compare on each of them
prompts = [
    "Today I believe we can finally",
    "In a world full of chaos",
    "The future of technology will bring",
    "When the sun sets",
    "Once upon a time",
]

# Each entry is (display name, keyword arguments forwarded to `model.generate`)
decoding_algorithms = [
    ("Greedy Search", dict(do_sample=False, max_length=30)),
    ("Beam Search", dict(num_beams=5, max_length=30, early_stopping=True)),
    ("Top-K Sampling", dict(do_sample=True, max_length=30, top_k=50)),
    ("Top-P Sampling", dict(do_sample=True, max_length=30, top_p=0.75)),
]
     

In [8]:
# Function to calculate perplexity
def calculate_perplexity(max_length, stride, input_ids, lm=None):
    """
    Computes the sliding-window perplexity of a token sequence under a causal language model.

    The sequence is scored window by window: each window holds at most `max_length`
    tokens, consecutive windows start `stride` tokens apart, and in every window only
    the tokens that were not part of an earlier window contribute to the loss.

    : param: max_length | type:int = maximum number of tokens fed to the model per window
    : param: stride | type:int = step between the start positions of consecutive windows
    : param: input_ids | type:torch.Tensor = token ids, indexed as (batch, sequence position)
    : param: lm | type:callable = optional model to score with; it is called as
             `lm(ids, labels=...)` and must return an object with a `.loss` tensor.
             Defaults to the notebook-level `model`.
    : return: ppl | type:torch.Tensor = exp of the mean of the per-window losses
    : raises: ValueError if `input_ids` contains no tokens
    """
    if lm is None:
        # Resolved at call time, so this is whatever `model` is bound to right now
        lm = model

    seq_len = input_ids.size(1)
    if seq_len == 0:
        raise ValueError("input_ids must contain at least one token")

    nlls = []
    prev_end_loc = 0
    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop

        # Feed only the current window to the model. Passing the whole sequence here
        # would make every iteration score the same trailing tokens of the full text.
        window_ids = input_ids[:, begin_loc:end_loc]

        # Keep labels only for the last `trg_len` tokens of the window; -100 marks
        # positions the loss is expected to skip (Hugging Face label convention).
        target_ids = window_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            neg_log_likelihood = lm(window_ids, labels=target_ids).loss

        nlls.append(neg_log_likelihood)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    ppl = torch.exp(torch.stack(nlls).mean())
    return ppl

In [9]:
# Output text and perplexity for each method

# Seed PyTorch's random number generator so the Top-K / Top-P samples, and the
# perplexities computed from them, are reproducible when the notebook is re-run.
SEED = 42
torch.manual_seed(SEED)

results = []

for prompt in prompts:
    row = {"Prompt": prompt}
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    dict_parameters = {}
    dict_perplexity = {}

    # Iterate through all the decoding algorithms
    for algorithm_name, params in decoding_algorithms:
        outputs = model.generate(input_ids, **params)
        # Keep only the first returned sequence, decoded without special tokens
        generated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

        # Perplexity is scored on the re-tokenized decoded text, using a window of
        # 2 tokens moved 1 token at a time.
        output_ids = tokenizer(generated_text, return_tensors="pt").input_ids
        perplexity = calculate_perplexity(2, 1, output_ids)
        row[algorithm_name] = generated_text

        # Add the parameters and perplexity of each decoding algorithm to dict_parameters and dict_perplexity, respectively
        dict_parameters[algorithm_name] = params
        dict_perplexity[algorithm_name] = perplexity.item()

    row["Parameters"] = dict_parameters
    row["Perplexity"] = dict_perplexity
    results.append(row)

# One row per prompt: the prompt, one generated text per algorithm, then the
# parameter and perplexity dictionaries
df = pd.DataFrame(results)



In [10]:
# Write the Task 1 results to the workbook on a sheet named 'task_1'
with pd.ExcelWriter('./TeamNLPitch_HW3.xlsx', engine='openpyxl') as writer:
    df.to_excel(writer, sheet_name='task_1')

Task2: Decoding for downstream generation tasks (Jiyoon Pyo)

In [11]:
# Loading the XSUM dataset from Hugging Face and converting it into a dataframe
# Since we are using a pretrained model, we only use the test split and keep its first 50 examples
import polars as pl
from datasets import load_dataset

# Number of test examples to summarize
N_SAMPLES = 50

dataset = load_dataset("xsum")
test_data = dataset['test']

# Slice the split before touching its columns: the slice is a dict of
# column name -> list holding only the first N_SAMPLES rows, so the full test
# split is never copied into Python lists just to be truncated afterwards.
xsum_head = test_data[:N_SAMPLES]

pl_xsum_dataset = pl.DataFrame(
    {
        'input_text': xsum_head['document'],
        'reference': xsum_head['summary']
    }
)

print(pl_xsum_dataset)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


shape: (50, 2)
┌───────────────────────────────────┬───────────────────────────────────┐
│ input_text                        ┆ reference                         │
│ ---                               ┆ ---                               │
│ str                               ┆ str                               │
╞═══════════════════════════════════╪═══════════════════════════════════╡
│ Prison Link Cymru had 1,099 refe… ┆ There is a "chronic" need for mo… │
│ Officers searched properties in … ┆ A man has appeared in court afte… │
│ Jordan Hill, Brittany Covington … ┆ Four people accused of kidnappin… │
│ The 48-year-old former Arsenal g… ┆ West Brom have appointed Nicky H… │
│ …                                 ┆ …                                 │
│ Kremlin spokesman Dmitry Peskov … ┆ Russia has said it will carry on… │
│ Hooker Hughes, 20, featured for … ┆ Wales Under-20 Grand Slam winner… │
│ Russia is fuming, in the words o… ┆ The expulsion of 35 Russian dipl… │
│ Instead of fees risin

In [12]:
# Loading T5 (small) model pretrained on the XSUM Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# NOTE: this rebinds the notebook-level `tokenizer` and `model` names that the
# Task 1 cells bound to GPT-2; code that reads those globals after this cell
# gets the seq2seq summarization checkpoint instead.
T5_XSUM_CHECKPOINT = "adasnew/t5-small-xsum"
tokenizer = AutoTokenizer.from_pretrained(T5_XSUM_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(T5_XSUM_CHECKPOINT)

In [13]:
def generating_summary(input_text:dict) -> dict:
    """
    Summarizes one input document once per configured decoding algorithm.
    Every entry of the notebook-level `decoding_algorithms` list is applied, i.e. the same
    Greedy Search, Beam Search, Top-K Sampling and Top-P Sampling settings used in Task 1.

    : param: input_text | type:dict = dictionary whose 'input_text' entry holds the raw document text
    : return: type:dict = dictionary mapping each decoding algorithm name to the summary it generated
    """
    encoded_ids = tokenizer(input_text['input_text'], return_tensors='pt').input_ids

    def _summarize(generate_kwargs: dict) -> str:
        # Generate with the given settings and decode the first returned sequence
        generated = model.generate(encoded_ids, **generate_kwargs)
        return tokenizer.batch_decode(generated, skip_special_tokens=True)[0]

    return {name: _summarize(generate_kwargs) for name, generate_kwargs in decoding_algorithms}

In [14]:
# Adding four columns (Greedy Search, Beam Search, Top-K Sampling, Top-P Sampling) of summaries to the original dataframe

import time
start_time = time.time()

# Apply `generating_summary` to every row's `input_text` (wrapped in a struct, so
# the function can index it by column name), then expand the returned struct into
# one column per decoding algorithm.
summary_struct = pl.struct(pl.col('input_text')).map_elements(generating_summary)
pl_xsum_output = (
    pl_xsum_dataset
    .with_columns(struct_output=summary_struct)
    .unnest('struct_output')
)

print(pl_xsum_output)

# Measured after the frame is printed, so the printing time is included
elapsed_seconds = time.time() - start_time
print(f"Time_elapsed on generating 50 summary samples based on four decoding algorithms: {elapsed_seconds}s")

shape: (50, 6)
┌────────────────┬────────────────┬────────────────┬───────────────┬───────────────┬───────────────┐
│ input_text     ┆ reference      ┆ Greedy Search  ┆ Beam Search   ┆ Top-K         ┆ Top-P         │
│ ---            ┆ ---            ┆ ---            ┆ ---           ┆ Sampling      ┆ Sampling      │
│ str            ┆ str            ┆ str            ┆ str           ┆ ---           ┆ ---           │
│                ┆                ┆                ┆               ┆ str           ┆ str           │
╞════════════════╪════════════════╪════════════════╪═══════════════╪═══════════════╪═══════════════╡
│ Prison Link    ┆ There is a     ┆ A homeless     ┆ One-bedroom   ┆ A homeless    ┆ A homeless    │
│ Cymru had      ┆ "chronic" need ┆ charity has    ┆ flats in      ┆ charity has   ┆ charity has   │
│ 1,099 refe…    ┆ for mo…        ┆ said it i…     ┆ Wales could…  ┆ said the …    ┆ said a la…    │
│ Officers       ┆ A man has      ┆ A man has      ┆ A man has     ┆ A man h

In [15]:
# Convert polars dataframe to pandas.
# The polars frame is intentionally kept (no `del`), so this cell can be re-run
# on its own without a NameError.
df_xsum_output = pl_xsum_output.to_pandas()

# Saving dataframe with sheet_name task_2.
# mode='a' appends to the existing workbook; if_sheet_exists='replace' overwrites
# a 'task_2' sheet left by an earlier run instead of raising an error.
with pd.ExcelWriter(
    './TeamNLPitch_HW3.xlsx',
    engine='openpyxl',
    mode='a',
    if_sheet_exists='replace',
) as writer:
    df_xsum_output.to_excel(writer, sheet_name='task_2')

In [16]:
# NOTE(review): leftover working-directory check; its saved output shows an absolute user path, so consider clearing it before sharing
!pwd

/panfs/jay/groups/32/csci5541/pyo00005/NLP_A3
