In [1]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from SimilarityHelpers import get_cos_similarity_score, get_rouge_scores
from sentence_transformers import SentenceTransformer
from enum import Enum
from typing import List, Dict, Optional
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import time
from detect_pretraining import detect_pretraining_batch

In [2]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE

device(type='cpu')

In [3]:
embedder = SentenceTransformer('all-MiniLM-L6-v2')

In [4]:
random_seed = 42

## Load Data

Steps:
- Load the dataset(s)
- (Optional) Do any splitting if needed
- (Optional) Filter any low quality input/output pairs
- Rename all article/text/content/body columns to "input"
- Rename all summary/label/output columns to "summary" 


Notes:
- Renaming is done so its easier during inference time to make model calls
- Some datasets are too large to load reliably so we split and use only a portion of the data

In [5]:
samsum = load_dataset("Samsung/samsum", trust_remote_code=True)['test']
samsum

Dataset({
    features: ['id', 'dialogue', 'summary'],
    num_rows: 819
})

In [None]:
webis = load_dataset("webis/tldr-17", trust_remote_code=True)
webis

In [None]:
# Take a random 10% of Webis
webis_sel = webis['train'].shuffle(seed=random_seed).train_test_split(test_size=0.01)
webis_test = webis_sel['test']
webis_test

In [None]:
# Normalize the dataset input/output columns so it is input for the model input and summary for the gold label
webis_test = webis_test.rename_columns({"content": "input"})
webis_test

In [6]:
samsum = samsum.rename_columns({"dialogue": "input"})
samsum

Dataset({
    features: ['id', 'input', 'summary'],
    num_rows: 819
})

## Model Inference Helpers

In [40]:
class TaskPrefix(Enum):
    """
    """
    INSTRUCTION_AT_TOP = "Summarize the following text: "

    def __str__(self):
        return self.value

In [41]:
class Model(Enum):
    """
    Models supported by HuggingFace we can use for testing.
    We picked models that come in multiple sizes.
    - HuggingFaceTB/SmolLM2-135M
    """
    SMOL_LM2_135M = "HuggingFaceTB/SmolLM2-135M"
    SMOL_LM2_135M_Instruct = "HuggingFaceTB/SmolLM2-135M-Instruct"
    SMOL_LM2_360M_Instruct = "HuggingFaceTB/SmolLM2-360M-Instruct"
    SMOL_LM2_1p7B = "HuggingFaceTB/SmolLM2-1.7B"
    SMOL_LM2_1p7B_Instruct = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
    PHI_3p5_Mini_Instruct = "microsoft/Phi-3.5-mini-instruct"

    def __str__(self):
        return self.value

In [42]:
class HuggingFaceInference:
    def __init__(self, model: Model):
        self.tokenizer = AutoTokenizer.from_pretrained(model.value)
        self.model = AutoModelForCausalLM.from_pretrained(model.value).to(DEVICE)

        # Add a padding token to the tokenizer if it doesn't exist
        if self.tokenizer.pad_token is None:
            self.tokenizer.add_special_tokens({"pad_token": "<pad>"})

    def predict(self,  prompt: str, options: Optional[Dict] = None) -> str:
        """
        Predicts the output of a model given a prompt using HuggingFace transformers.
        """
        if options is None:
            options = {}
        try:
            messages = [{"role": "user", "content": prompt}]

            # Tokenize and generate
            input_text = self.tokenizer.apply_chat_template(messages,  tokenize=False)

            inputs = self.tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).to(DEVICE)

            outputs = self.model.generate(
                **inputs,
                temperature=options.get("temperature", 0.001),
                top_p=options.get("top_p", 0.9),
                do_sample=True,
                max_new_tokens = 1000
            )

            print(f"Decoded Output: {self.tokenizer.decode(outputs[0])}")
            
            # Extract just the new generated text, not the original prompt
            prompt_length = len(inputs["input_ids"][0])
            generated_ids = outputs[0][prompt_length:]
            
            return self.tokenizer.decode(generated_ids, skip_special_tokens=True)

        except Exception as e:
            print(f"Error during prediction: {e}")
            return "None"

    def predict_batch(self, prompts: List[str], options: Optional[Dict] = {}) -> List[str]:
        """
        Batch prediction using HuggingFace transformers.
        """
        return [self.predict(prompt, options) for prompt in prompts]

In [43]:
model_obj = HuggingFaceInference(Model.SMOL_LM2_135M_Instruct)
model_obj.predict("Capital of France?", options={})

Decoded Output: <|im_start|>system
You are a helpful AI assistant named SmolLM, trained by Hugging Face<|im_end|>
<|im_start|>user
Capital of France?<|im_end|>
<|im_start|>assistant
The capital of France is Paris.<|im_end|>


'assistant\nThe capital of France is Paris.'

In [44]:
def create_model_results_row():
    return {
        "Model": None,
        "Task_Prefix": None,
        "Dataset_Name": None,
        "Model_Responses": None,
        "Gold_Labels": None,
        "Min_K_Responses": None,
    }


## Model Evaluation Helpers

## Collect LLM Responses

In [45]:
MODEL_EVALS = [
    {
        "Model": Model.SMOL_LM2_135M_Instruct,
        "Options": {}
    },
    {
        "Model": Model.SMOL_LM2_360M_Instruct,
        "Options": {}
    }
]

In [46]:
DATASET_EVALS = [
    {
        "Name": "Samsum",
        "Dataset": samsum,
    }
#    {
#        "Name": "Webis",
#        "Dataset": webis_test,
#    }
]


In [47]:
model_results = {}

In [None]:
for dataset in DATASET_EVALS:
    for model in MODEL_EVALS:
        # Start a timer
        start_time = time.perf_counter()

        print(f"Evaluating {model['Model']} on {dataset['Name']}")

        # Load the model
        model_obj = HuggingFaceInference(model['Model'])

        # Get the prompts
        inputs = dataset['Dataset']['input']

        # TODO: We should maybe vary this somehow
        # Map the inputs with the task prefix
        prompts = [TaskPrefix.INSTRUCTION_AT_TOP.value + "\n" + input for input in inputs]

        # Get the gold labels
        summaries = dataset['Dataset']['summary']

        # Get the model responses
        model_responses = model_obj.predict_batch(prompts, model['Options'] if model['Options'] else {})

        # Grab the min-k% responses and calculations 
        min_k_responses = detect_pretraining_batch(prompts, model['Model'])

        # Save the results
        model_results_row = create_model_results_row()
        model_results_row['Model'] = model['Model']
        model_results_row['Task_Prefix'] = TaskPrefix.INSTRUCTION_AT_TOP.value
        model_results_row['Dataset_Name'] = dataset['Name']
        model_results_row['Model_Responses'] = model_responses
        model_results_row['Min_K_Responses'] = min_k_responses
        
        model_results_row['Gold_Labels'] = summaries

        model_results[f"{model['Model']}_{dataset['Name']}"] = model_results_row
    
        # Print how long it took
        end_time = time.perf_counter()
        execution_time = end_time - start_time
        print(f"Execution time: {execution_time} seconds")


Evaluating HuggingFaceTB/SmolLM2-135M-Instruct on Samsum
Decoded Output: <|im_start|>system
You are a helpful AI assistant named SmolLM, trained by Hugging Face<|im_end|>
<|im_start|>user
Summarize the following text: 
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye<|im_end|>
<|im_start|>assistant
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so...
Hannah: I'd rather you texted him
Amanda:

In [None]:
model_results

{'HuggingFaceTB/SmolLM2-135M-Instruct_Samsum': {'Model': <Model.SMOL_LM2_135M_Instruct: 'HuggingFaceTB/SmolLM2-135M-Instruct'>,
  'Task_Prefix': 'Summarize the following text: ',
  'Dataset_Name': 'Samsum',
  'Model_Responses': ["Summarize the following text: \nHannah: Hey, do you have Betty's number?\nAmanda: Lemme check\nHannah: <file_gif>\nAmanda: Sorry, can't find it.\nAmanda: Ask Larry\nAmanda: He called her last time we were at the park together\nHannah: I don't know him well\nHannah: <file_gif>\nAmanda: Don't be shy, he's very nice\nHannah: If you say so..\nHannah: I'd rather you texted him\nAmanda: Just text him 🙂\nHannah: Urgh.. Alright\nHannah: Bye\nAmanda: Bye bye\nAmanda: <file_gif>\nAmanda: Thanks\nAmanda: <file",
   "Summarize the following text: \nEric: MACHINE!\r\nRob: That's so gr8!\r\nEric: I know! And shows how Americans see Russian ;)\r\nRob: And it's really funny!\r\nEric: I know! I especially like the train part!\r\nRob: Hahaha! No one talks to the machine like th

## Evaluate LLM Responses for contamination

In [None]:
for model_result_key in model_results:
    model_result = model_results[model_result_key]
    model_result['cos_sim_scores'] = []
    model_result['rouge_sim_scores'] = []
    for (prediction, gold_label) in zip(model_result['Model_Responses'], model_result['Gold_Labels']):
        model_result['cos_sim_scores'].append(get_cos_similarity_score(embedder.encode(prediction), embedder.encode(gold_label)))
        model_result['rouge_sim_scores'].append(get_rouge_scores(prediction, gold_label))

In [None]:
for model_result_key in model_results:
    model_result = model_results[model_result_key]
    df = pd.DataFrame(model_result)
    file_name = model_result_key.replace("//", "_").replace("/", "_")
    df.to_csv(f"./output/{file_name}")