In [1]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from SimilarityHelpers import get_cos_similarity_score, get_rouge_scores
from sentence_transformers import SentenceTransformer
from enum import Enum
from typing import List, Dict, Optional
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import time
import json
import pickle
from detect_pretraining import detect_pretraining_batch

In [2]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE

device(type='cpu')

In [3]:
embedder = SentenceTransformer('all-MiniLM-L6-v2')

In [4]:
random_seed = 42

## Load Data

Steps:
- Load the dataset(s)
- (Optional) Do any splitting if needed
- (Optional) Filter any low quality input/output pairs
- Rename all article/text/content/body columns to "input"
- Rename all summary/label/output columns to "summary" 


Notes:
- Renaming is done so its easier during inference time to make model calls
- Some datasets are too large to load reliably so we split and use only a portion of the data

In [5]:
LOAD_WEBIS = False
LOAD_SAMSUM = True
LOAD_CNNDM = True
LOAD_XSUM = True

In [31]:
if LOAD_SAMSUM:
    samsum = load_dataset("Samsung/samsum", trust_remote_code=True)['test']
    samsum = samsum.rename_columns({"dialogue": "input"})

samsum

Dataset({
    features: ['id', 'input', 'summary'],
    num_rows: 819
})

In [6]:
xsum = load_dataset("EdinburghNLP/xsum", trust_remote_code=True)['test']
xsum_split = xsum.shuffle(seed=random_seed).train_test_split(test_size=0.15)
xsum_test = xsum_split['test']
xsum_test = xsum_test.rename_columns({"document": "input"})
xsum_test

Dataset({
    features: ['input', 'summary', 'id'],
    num_rows: 1701
})

In [32]:
if LOAD_CNNDM:
    cnn = load_dataset("ccdv/cnn_dailymail", "3.0.0", trust_remote_code=True)
    cnn_split = cnn['test'].shuffle(seed=random_seed).train_test_split(test_size=0.12)
    cnn_test = cnn_split['test']

    cnn_test = cnn_test.rename_columns({"article": "input"})
    cnn_test = cnn_test.rename_columns({"highlights": "summary"})

cnn_test

Dataset({
    features: ['input', 'summary', 'id'],
    num_rows: 1379
})

In [33]:

if LOAD_WEBIS:
    webis = load_dataset("webis/tldr-17", trust_remote_code=True)

    # Take a random 10% of Webis
    webis_sel = webis['train'].shuffle(seed=random_seed).train_test_split(test_size=0.01)
    webis_test = webis_sel['test']


    # Normalize the dataset input/output columns so it is input for the model input and summary for the gold label
    webis_test = webis_test.rename_columns({"content": "input"})

    # Trigger garbage collection
    webis = None
    webis_sel = None

    webis_test

## Model Inference Helpers

In [7]:
class TaskPrefix(Enum):
    """
    """
    INSTRUCTION_AT_TOP = "Summarize the following text: "

    def __str__(self):
        return self.value

In [8]:
class Model(Enum):
    """
    Models supported by HuggingFace we can use for testing.
    We picked models that come in multiple sizes.
    - HuggingFaceTB/SmolLM2-135M
    """
    SMOL_LM2_135M = "HuggingFaceTB/SmolLM2-135M"
    SMOL_LM2_135M_INSTRUCT = "HuggingFaceTB/SmolLM2-135M-Instruct"
    SMOL_LM2_360M_INSTRUCT = "HuggingFaceTB/SmolLM2-360M-Instruct"
    SMOL_LM2_1p7B_INSTRUCT = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
    
    VICUNA_7B = "lmsys/vicuna-7b-v1.5"
    VICUNA_13B = "lmsys/vicuna-13b-v1.3"

    PHI_3p5_MINI_INSTRUCT = "microsoft/Phi-3.5-mini-instruct"
    PHI_3_SMALL_128K_INSTRUCT = "microsoft/Phi-3-small-128k-instruct"
    PHI_3_MEDIUM_128K_INSTRUCT = "microsoft/Phi-3-medium-128k-instruct"

    QWEN2p5_0p5B_INSTRUCT = "Qwen/Qwen2.5-0.5B-Instruct"
    QWEN2p5_1p5B_INSTRUCT = "Qwen/Qwen2.5-1.5B-Instruct"
    QWEN2p5_3B_INSTRUCT = "Qwen/Qwen2.5-3B-Instruct"
    QWEN2p5_7B_INSTRUCT = "Qwen/Qwen2.5-7B-Instruct"
    QWEN2p5_14B_INSTRUCT = "Qwen/Qwen2.5-14B-Instruct"
    QWEN2p5_32B_INSTRUCT = "Qwen/Qwen2.5-32B-Instruct"

    LLAMA2_7B_CHAT = "meta-llama/Llama-2-7b-chat"
    LLAMA2_13B_CHAT = "meta-llama/Llama-2-13b-chat"

    LLAMA3p2_1B_INSTRUCT = "meta-llama/Llama-3.2-1B-Instruct"
    LLAMA3p2_3B_INSTRUCT = "meta-llama/Llama-3.2-3B-Instruct"
    LLAMA3p1_8B_INSTRUCT = "meta-llama/Llama-3.1-8B-Instruct"
    LLAMA3p1_70B_INSTRUCT = "meta-llama/Llama-3.1-70B-Instruct"

    DEEPSEEK_R1_DISTILL_QWEN_1p5B = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
    DEEPSEEK_R1_DISTILL_QWEN_7B   = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
    DEEPSEEK_R1_DISTILL_QWEN_14B  = "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B"
    DEEPSEEK_R1_DISTILL_QWEN_32B  = "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"

    # OPT Series
    OPT_125M = "facebook/opt-125m"
    OPT_350M = "facebook/opt-350m" 
    OPT_1B = "facebook/opt-1.3b"
    OPT_2B = "facebook/opt-2.7b"
    OPT_6B = "facebook/opt-6.7b"
    OPT_13B = "facebook/opt-13b"

    # FLAN-T5 Progression
    FLAN_T5_SMALL = "google/flan-t5-small"
    FLAN_T5_BASE = "google/flan-t5-base"
    FLAN_T5_LARGE = "google/flan-t5-large" 
    FLAN_T5_XL = "google/flan-t5-xl"
    FLAN_T5_XXL = "google/flan-t5-xxl"

    # MobileLLM Variants
    MOBILE_LLM_125M = "facebook/mobilellm-125m"
    MOBILE_LLM_350M = "facebook/mobilellm-350m"

    # Cerebras Models
    BTLM_3B = "cerebras/btlm-3b-8k-base"

    # GPT-Neo Series
    GPT_NEO_125M = "EleutherAI/gpt-neo-125m"
    GPT_NEO_1B = "EleutherAI/gpt-neo-1.3B"
    GPT_NEO_2B = "EleutherAI/gpt-neo-2.7B"

    def __str__(self):
        return self.value


In [9]:
class HuggingFaceInference:
    def __init__(self, model: Model):
        self.tokenizer = AutoTokenizer.from_pretrained(model.value)
        self.model = AutoModelForCausalLM.from_pretrained(model.value).to(DEVICE)

        # Add a padding token to the tokenizer if it doesn't exist
        if self.tokenizer.pad_token is None:
            self.tokenizer.add_special_tokens({"pad_token": "<pad>"})

    def predict(self,  prompt: str, options: Optional[Dict] = None) -> str:
        """
        Predicts the output of a model given a prompt using HuggingFace transformers.
        """
        if options is None:
            options = {}
        try:
            messages = [{"role": "user", "content": prompt}]

            # Tokenize and generate
            if not self.tokenizer.chat_template:
                inputs = self.tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(DEVICE)
            else:
                input_text = self.tokenizer.apply_chat_template(messages,  tokenize=False)

                inputs = self.tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).to(DEVICE)

            outputs = self.model.generate(
                **inputs,
                temperature=options.get("temperature", 0.001),
                top_p=options.get("top_p", 0.9),
                do_sample=True,
            )

            # Extract just the new generated text, not the original prompt
            prompt_length = len(inputs["input_ids"][0])
            generated_ids = outputs[0][prompt_length:]

            return self.tokenizer.decode(generated_ids, skip_special_tokens=True)

        except Exception as e:
            print(f"Error during prediction: {e}")
            return "None"

    def predict_batch(self, prompts: List[str], options: Optional[Dict] = {}) -> List[str]:
        """
        Batch prediction using HuggingFace transformers.
        """
        return [self.predict(prompt, options) for prompt in prompts]

In [11]:
model_obj = HuggingFaceInference(Model.SMOL_LM2_135M_INSTRUCT)
model_obj.predict("Capital of France?", options={})

'assistant\nThe capital of France is Paris.'

In [10]:
def create_model_results_row():
    return {
        "Model": None,
        "Task_Prefix": None,
        "Dataset_Name": None,
        "Model_Responses": None,
        "Gold_Labels": None,
        "Min_K_Responses": [],
        "cos_sim_scores": [],
        "rouge_sim_scores": []
    }


## Model Evaluation Helpers

In [11]:
def custom_encoder(obj):
    """
    Custom JSON encoder to handle types that the default JSON encoder
    cannot handle, like NumPy arrays and PyTorch tensors.
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, torch.Tensor):
        return obj.tolist()
    # If an object has a 'tolist' method, use it
    if hasattr(obj, 'tolist'):
        return obj.tolist()
    # If all else fails, raise a TypeError
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

In [12]:
def save_dict(data, filename="output"):
    # Attempt to save as CSV using pandas DataFrame
    try:
        # Wrap the dict in a list to create a single-row DataFrame.
        df = pd.DataFrame(data)
        df.to_csv(f"{filename}.csv", index=False)
        print(f"Data saved as CSV: {filename}.csv")
        return
    except Exception as e_csv:
        print("Saving as CSV failed:", e_csv)

    # Attempt to save as JSON
    try:
        with open(f"{filename}.json", "w") as f:
            json.dump(data, f, default=custom_encoder)
        print(f"Data saved as JSON: {filename}.json")
        return
    except Exception as e_json:
        print("Saving as JSON failed:", e_json)

    # Attempt to save as Pickle
    try:
        with open(f"{filename}.pkl", "wb") as f:
            pickle.dump(data, f)
        print(f"Data saved as Pickle: {filename}.pkl")
    except Exception as e_pickle:
        print("Saving as Pickle failed:", e_pickle)

## Collect LLM Responses

In [13]:
MODEL_EVALS = [
    {
        "Model": Model.SMOL_LM2_360M_INSTRUCT,
        "Options": {}
    },
    {
        "Model": Model.GPT_NEO_125M,
        "Options": {}
    },
    {
        "Model": Model.OPT_125M,
        "Options": {}
    },
    {
        "Model": Model.OPT_350M,
        "Options": {}
    },
]

In [14]:
DATASET_EVALS = [
#    {
#        "Name": "Samsum",
#        "Dataset": samsum,
#    }
#    {
#        "Name": "Webis",
#        "Dataset": webis_test,
#    }
#    {
#        "Name": "CNN-DailyMail",
#        "Dataset": cnn_test,
#    }
        {
        "Name": "Xsum",
        "Dataset": xsum_test,
    }
]


In [15]:
last_run = {} # Used for debugging

In [None]:
for dataset in DATASET_EVALS:
    for model in MODEL_EVALS:
        run_key = f"{model['Model']}_{dataset['Name']}"

        # Start a timer
        start_time = time.perf_counter()

        print("-----------------------------------------------------------------")
        print(f"Evaluating {model['Model']} on {dataset['Name']}")

        # Load the model
        model_obj = HuggingFaceInference(model['Model'])

        # Get the prompts
        inputs = dataset['Dataset']['input']

        # TODO: We should maybe vary this somehow
        # Map the inputs with the task prefix
        prompts = [TaskPrefix.INSTRUCTION_AT_TOP.value + "\n" + input for input in inputs]

        # Get the gold labels
        summaries = dataset['Dataset']['summary']

        # Grab the min-k% responses and calculations
        print("Generating Min-K metrics.....")
        min_k_responses = detect_pretraining_batch(summaries, model_obj)

        # Get the model responses
        print("Running inference over inputs.....")
        model_responses = model_obj.predict_batch(prompts, model['Options'] if model['Options'] else {})

        # Save the results
        model_results_row = create_model_results_row()
        model_results_row['Model'] = model['Model']
        model_results_row['Task_Prefix'] = TaskPrefix.INSTRUCTION_AT_TOP.value
        model_results_row['Dataset_Name'] = dataset['Name']
        model_results_row['Model_Responses'] = model_responses
        model_results_row['Min_K_Responses'] = min_k_responses
        
        model_results_row['Gold_Labels'] = summaries

        print(f"Calculating cos/rouge scores for run {run_key}")
        for (prediction, gold_label) in zip(model_results_row['Model_Responses'], model_results_row['Gold_Labels']):
            model_results_row['cos_sim_scores'].append(get_cos_similarity_score(embedder.encode(prediction), embedder.encode(gold_label)))
            model_results_row['rouge_sim_scores'].append(get_rouge_scores(prediction, gold_label))

        print(f"Attempting to save file for run {run_key}....")
        file_path_prefix = "./output/"
        file_name = run_key.replace("//", "_").replace("/", "_")
        full_file_path = f"{file_path_prefix}/{file_name}"
        save_dict(model_results_row, full_file_path)
    
        # Print how long it took
        end_time = time.perf_counter()
        execution_time = end_time - start_time
        print(f"Execution time: {execution_time} seconds for run {run_key}")

        last_run = model_results_row

-----------------------------------------------------------------
Evaluating HuggingFaceTB/SmolLM2-360M-Instruct on Xsum
Generating Min-K metrics.....


In [32]:
save_dict(last_run, full_file_path)

Data saved as CSV: ./output//HuggingFaceTB_SmolLM2-135M-Instruct_Samsum.csv
