# How to Run HellaSwag with Generative Models (Hugging Face Transformers)
Based on: 
- https://github.com/FranxYao/chain-of-thought-hub/blob/main/MMLU/run_mmlu_open_source.py
- https://github.com/shinwookim/research/blob/eabdf92eb534596a5cb65d106aca18a7221d9e93/notebooks/mmlu.ipynb

In [1]:
import json
import os
import time
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
import gc

In [2]:
choices = ["A", "B", "C", "D"]

In [3]:
!git clone https://github.com/rowanz/hellaswag

Cloning into 'hellaswag'...
remote: Enumerating objects: 45, done.[K
remote: Counting objects: 100% (4/4), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 45 (delta 0), reused 2 (delta 0), pack-reused 41[K
Unpacking objects: 100% (45/45), 17.69 MiB | 8.88 MiB/s, done.


In [4]:
DATA_DIR = "/kaggle/working/hellaswag/data/hellaswag"

In [5]:
# Helper Functions: Largely unchanged from https://github.com/FranxYao/chain-of-thought-hub/blob/main/MMLU/run_mmlu_open_source.py

def compute_metric(output_filename):
    df = pd.read_csv(output_filename)

  # Extract pred_answers and gold_answers columns
    pred_answers = df['pred_answers'].tolist()
    gold_answers = df['gold_answers'].tolist()
    total_acc = 0
    total_num = 0
    acc = 0

    for pred, gold in zip(pred_answers, gold_answers):
        if pred == gold: acc += 1
    print("Accuracy for HellaSwag: %.4f" % ( acc/len(gold_answers)))

def prepare_input(tokenizer, prompts):
    input_tokens = tokenizer.batch_encode_plus(prompts, return_tensors="pt", padding=True)
    input_tokens = {k:input_tokens[k] for k in input_tokens if k in ["input_ids", "attention_mask"]}
    for t in input_tokens:
        if torch.is_tensor(input_tokens[t]):
            input_tokens[t] = input_tokens[t].to('cuda')

    return input_tokens

def batch_split(prompts, batch_num):
    batch_prompts = []
    mini_batch = []
    for prompt in prompts:
        mini_batch.append(prompt)
        if len(mini_batch) == batch_num:
            batch_prompts.append(mini_batch)
            mini_batch = []
    if len(mini_batch) != 0:
        batch_prompts.append(mini_batch)
    return batch_prompts

def batch_infer(model, tokenizer, prompts):
    batch_size = 8
    answers = []
    for batch_input in tqdm(batch_split(prompts, batch_size)):
        encode_inputs = prepare_input(tokenizer, batch_input)
        with torch.no_grad():
            outputs = model.generate(**encode_inputs, max_new_tokens=1, pad_token_id=tokenizer.pad_token_id)
            answers.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))
    answers = [answer[-1] for answer in answers]
    return answers


In [6]:
def format_example(df, idx, include_answer=True):
    """
    Formats a single example from the DataFrame into a prompt string.

    Args:
      df: The pandas DataFrame containing data.
      idx: The index of the example to format.
      include_answer: Whether to include the answer in the prompt (default: True).

    Returns:
      A string containing the formatted prompt.
    """
    prompt = df.loc[idx, 'ctx']  # Use loc for cleaner indexing
    k = len(df.loc[idx, "endings"])  # Calculate number of ending options
    for j in range(k):
        prompt += "\n{}. {}".format(choices[j], df["endings"][idx][j])
    prompt += "\nAnswer:"
    if include_answer:
        prompt += " {}".format(chr(ord('A') + df.loc[idx, 'label']))  # Access label using loc
    prompt += "\n\n"
    return prompt

def gen_prompt(train_df, subject, k=-1):
    """
    Generates a prompt string containing multiple choice questions from the DataFrame.

    Args:
      train_df: The pandas DataFrame containing training data.
      subject: The subject of the prompt (optional).
      k: The number of examples to include (default: all).

    Returns:
      A string containing the formatted prompt.
    """
    prompt = "The following are multiple choice questions (with answers).\nThe answer can only be A, B, C or D.\n\n"
    if k == -1:
        k = train_df.shape[0]
    for i in range(k):
        prompt += format_example(train_df, i)
    return prompt


# import pandas as pd

# data = {
#   "ctx": "Then, the man writes over the snow covering the window of a car, and a woman wearing winter clothes smiles.",
#   "endings": [[", the man adds wax to the windshield and cuts it.", ", a person boards a ski lift, while two men supporting the head of the person wearing winter clothes snow as the we girls sled.", ", the man puts on a christmas coat, knitted with netting.", ", the man puts on a christmas coat, knitted with netting."]],
#   "label": 3
# }

# df = pd.DataFrame(data)

# # Test format_example function
# example_prompt = format_example(df, 0)
# print(example_prompt)

# # Test gen_prompt function
# full_prompt = gen_prompt(df, "Sample Subject")
# print(full_prompt)

In [7]:
def load(checkpoint, model_type):
    n_gpus = torch.cuda.device_count()
    
    model = AutoModelForCausalLM.from_pretrained(checkpoint).cuda()
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, padding_side='left')      
    model.eval()
    return model, tokenizer

In [8]:
import pandas as pd

def jsonl_to_dataframe(filepath):
    """
    Reads a JSONL file and converts it to a pandas DataFrame.

    Args:
      filepath (str): Path to the JSONL file.

    Returns:
      pandas.DataFrame: DataFrame containing the extracted data.
    """
    data = []
    with open(filepath, 'r') as f:
        for line in f:
            json_data = json.loads(line.strip())  # Load JSON data inside the loop
            data.append({"ctx": json_data["ctx"], "endings": json_data["endings"], "label": json_data["label"]})
    return pd.DataFrame(data)

# filepath = "/kaggle/working/hellaswag/data/hellaswag_train.jsonl"  # Replace with your actual file path
# df = jsonl_to_dataframe(filepath)

# print(df.head(1))  # Print the first few rows of the DataFrame


In [9]:
import pandas as pd


def select_5_samples(data):
    """
    Selects 5 examples from a dataframe, ensuring 1 from each label (0-3) and an extra random one

    Args:
      data: A pandas dataframe containing 'text' and 'label' columns

    Returns:
      A pandas dataframe containing 5 rows, 1 from each label and an extra random one
    """

    label_counts = data['label'].value_counts()
#     print(label_counts)
    min_samples_per_label = 1
    samples_to_select = min_samples_per_label * len(label_counts) + 1

    # Check if enough data is available
    if samples_to_select > len(data):
        raise ValueError("Dataframe has less samples than required number to select")

    # Efficiently select one sample from each label using sample
    selected_data = pd.concat([data[data["label"] == label].sample(1, random_state=42) for label in label_counts.index])
#     print(selected_data)
    # Select remaining samples randomly from the entire DataFrame
    remaining_samples = samples_to_select - len(selected_data)
#     print(remaining_samples)
    selected_data = pd.concat([selected_data, data.sample(remaining_samples, random_state=42)])

    # Return the first 5 rows (or all if less than 5 were selected)
    return selected_data.reset_index(drop=True)


# Example usage
# if __name__ == "__main__":
#   # Assuming you have a function to create your DataFrame from JSONL (not shown here)
#     data = jsonl_to_dataframe(DATA_DIR + "_train.jsonl")
#     result = select_5_samples(data.copy())
#     print(result)


In [10]:
def main(ckpt_dir: str, param_size: str, model_type: str):
    
    run_results = {}
    output_filename = 'run_results_%s_%sb.csv' % (model_type, param_size)
    evaluation_filename = 'evaluation_%s_%sb.csv' % (model_type, param_size)  # New filename

    model, tokenizer = load(ckpt_dir, model_type)
    start_time = time.time()

    print('Evaluating ...')
    records = []
    
    train_all_df = jsonl_to_dataframe(DATA_DIR + "_train.jsonl")
    train_df = select_5_samples(train_all_df)
#     print(train_df)
    val_df = jsonl_to_dataframe(DATA_DIR + "_val.jsonl")
    for i in range(val_df.shape[0]):
        # get prompt and make sure it fits
        k = 5
#         print(val_df)
        prompt_end = format_example(val_df, i, include_answer=False)
        train_prompt = gen_prompt(train_df, k)
        prompt = train_prompt + prompt_end
        while len(tokenizer.tokenize(prompt)) + 1> 2048: # bos token
            prompt_split = prompt.split("\n\n")
            prompt_split.pop(1)
            prompt = '\n\n'.join(prompt_split)
        label = val_df.iloc[i, val_df.shape[1]-1]
        records.append({
          'prompt': prompt,
          'answer': chr(ord('A') + label),  # Convert label to answer char (A, B, C, ...)
          'predicted_answer': None  # Add field for predicted answer
        })
    pred_answers = batch_infer(model, tokenizer, [record['prompt'] for record in records])
    for i, record in enumerate(records):
        record['predicted_answer'] = pred_answers[i]  # Assign predicted answer
    gold_answers = [record['answer'] for record in records]
    run_results = {'pred_answers': pred_answers, 'gold_answers': gold_answers}
    
    # Create DataFrame from dictionary
    df = pd.DataFrame(run_results)
    # Save DataFrame to CSV file
    df.to_csv(output_filename, index=False)    
    # Create record df
    record_df = pd.DataFrame(records)
    record_df.to_csv(evaluation_filename, index=False)
    compute_metric(output_filename)
    end_time = time.time()
    print("total run time %.2f" % (end_time - start_time))
    del model
    del tokenizer
    return (end_time - start_time)

In [11]:
model = "facebook/opt-350m"
rslt = dict()


rslt[model] =  main(model, model.replace("facebook/opt-", ""), "OPT")



config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/663M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

Evaluating ...


100%|██████████| 1256/1256 [32:30<00:00,  1.55s/it]


Accuracy for HellaSwag: 0.2104
total run time 1990.98
