In [1]:
import time
from enum import Enum
from pathlib import Path
from typing import List, Dict, Optional

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer

from detect_pretraining import detect_pretraining_batch
from SimilarityHelpers import get_cos_similarity_score, get_rouge_scores

In [2]:
# Run on the GPU when torch can see one; otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
DEVICE

device(type='cuda')

In [3]:
# Sentence-embedding model; its encodings are later compared with cosine similarity.
embedder = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')

In [4]:
# Single seed shared by the stochastic steps in this notebook (dataset shuffling/sampling).
random_seed = 42

# Text generation later samples (do_sample=True), so seed torch's RNG as well;
# the trailing ';' keeps the returned Generator object out of the cell output.
torch.manual_seed(random_seed);

## Load Data

Steps:
- Load the dataset(s)
- (Optional) Do any splitting if needed
- (Optional) Filter any low quality input/output pairs
- Rename all article/text/content/body columns to "input"
- Rename all summary/label/output columns to "summary" 


Notes:
- Renaming is done so that it's easier to make model calls at inference time
- Some datasets are too large to load reliably so we split and use only a portion of the data

In [5]:
# Only the test split of SAMSum is used for evaluation.
samsum = load_dataset("Samsung/samsum", split="test", trust_remote_code=True)
samsum

README.md:   0%|          | 0.00/7.04k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


samsum.py:   0%|          | 0.00/3.36k [00:00<?, ?B/s]

corpus.7z:   0%|          | 0.00/2.94M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'dialogue', 'summary'],
    num_rows: 819
})

In [6]:
# Load every available split of Webis-TLDR-17 (returned as a DatasetDict).
webis = load_dataset(path="webis/tldr-17", trust_remote_code=True)
webis

README.md:   0%|          | 0.00/9.14k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tldr-17.py:   0%|          | 0.00/4.33k [00:00<?, ?B/s]

dataset_infos.json:   0%|          | 0.00/2.79k [00:00<?, ?B/s]

corpus-webis-tldr-17.zip:   0%|          | 0.00/3.14G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3848330 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/38 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['author', 'body', 'normalizedBody', 'subreddit', 'subreddit_id', 'id', 'content', 'summary'],
        num_rows: 3848330
    })
})

In [7]:
# Take a random 1% of Webis (test_size=0.01); the full train split is too large to use as-is.
# train_test_split shuffles on its own, so it is seeded too — otherwise the
# sampled rows would differ on every run.
webis_sel = webis['train'].shuffle(seed=random_seed).train_test_split(test_size=0.01, seed=random_seed)
webis_test = webis_sel['test']
webis_test

Dataset({
    features: ['author', 'body', 'normalizedBody', 'subreddit', 'subreddit_id', 'id', 'content', 'summary'],
    num_rows: 38484
})

In [8]:
# Normalize the column names: "input" is the text fed to the model and
# "summary" is the gold label.
# Guarded so that re-running this cell after the rename is a no-op instead of
# failing on the now-missing "content" column.
if "content" in webis_test.column_names:
    webis_test = webis_test.rename_columns({"content": "input"})
webis_test

Dataset({
    features: ['author', 'body', 'normalizedBody', 'subreddit', 'subreddit_id', 'id', 'input', 'summary'],
    num_rows: 38484
})

In [9]:
# Rename the dialogue column to the shared "input" name used at inference time.
# Guarded so that re-running this cell after the rename is a no-op instead of
# failing on the now-missing "dialogue" column.
if "dialogue" in samsum.column_names:
    samsum = samsum.rename_columns({"dialogue": "input"})
samsum

Dataset({
    features: ['id', 'input', 'summary'],
    num_rows: 819
})

## Model Inference Helpers

In [10]:
class TaskPrefix(Enum):
    """
    Instruction strings that are placed in front of an input text to build a prompt.

    ``str(member)`` yields the raw instruction text rather than the
    ``TaskPrefix.<NAME>`` form.
    """
    INSTRUCTION_AT_TOP = "Summarize the following text: "

    def __str__(self):
        # Render as the bare instruction text.
        prefix_text = self.value
        return prefix_text

In [12]:
class Model(Enum):
    """
    HuggingFace hub identifiers of the models available for testing.

    Each member's value is the repository id handed to ``from_pretrained``.
    The set covers several sizes of SmolLM2 (base and instruct variants)
    plus Phi-3.5-mini-instruct. ``str(member)`` yields the repository id.
    """
    # SmolLM2 family
    SMOL_LM2_135M = "HuggingFaceTB/SmolLM2-135M"
    SMOL_LM2_135M_Instruct = "HuggingFaceTB/SmolLM2-135M-Instruct"
    SMOL_LM2_360M_Instruct = "HuggingFaceTB/SmolLM2-360M-Instruct"
    SMOL_LM2_1p7B = "HuggingFaceTB/SmolLM2-1.7B"
    SMOL_LM2_1p7B_Instruct = "HuggingFaceTB/SmolLM2-1.7B-Instruct"
    # Phi family
    PHI_3p5_Mini_Instruct = "microsoft/Phi-3.5-mini-instruct"

    def __str__(self):
        # Render as the bare repository id.
        repo_id = self.value
        return repo_id

In [13]:
class HuggingFaceInference:
    """
    Thin wrapper around a HuggingFace causal language model and its tokenizer.

    The tokenizer and the model are loaded from the hub id stored in the given
    ``Model`` member, and the model is moved to ``DEVICE``.
    """

    def __init__(self, model: Model):
        """
        Load the tokenizer and model for ``model`` and make sure a padding token exists.

        Args:
            model: Member of ``Model`` whose value is the hub id to load.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model.value)
        self.model = AutoModelForCausalLM.from_pretrained(model.value).to(DEVICE)

        # Add a padding token to the tokenizer if it doesn't exist
        if self.tokenizer.pad_token is None:
            self.tokenizer.add_special_tokens({"pad_token": "<pad>"})

            # If the tokenizer now has more entries than the model has embedding
            # rows, the pad id would index past the embedding table. Grow the
            # table in that case only; never shrink a model whose embedding
            # matrix is larger than the tokenizer's vocabulary.
            embedding_rows = self.model.get_input_embeddings().weight.shape[0]
            if len(self.tokenizer) > embedding_rows:
                self.model.resize_token_embeddings(len(self.tokenizer))

    def predict(self, prompt: str, options: Optional[Dict] = None) -> str:
        """
        Predicts the output of a model given a prompt using HuggingFace transformers.

        Args:
            prompt: Text to tokenize and pass to ``generate``.
            options: Optional generation overrides. Recognized keys are
                ``"temperature"`` (default 0.001) and ``"top_p"`` (default 0.9).

        Returns:
            The decoded first generated sequence with special tokens removed,
            or the literal string ``"None"`` if any exception was raised (the
            error is printed rather than propagated).

        NOTE(review): for decoder-only models ``generate`` returns the prompt
        tokens followed by the continuation, so the returned text presumably
        starts with the prompt — confirm this is what downstream scoring expects.
        """
        if options is None:
            options = {}

        try:
            # Tokenize and generate
            inputs = self.tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(DEVICE)
            outputs = self.model.generate(
                **inputs,
                temperature=options.get("temperature", 0.001),
                top_p=options.get("top_p", 0.9),
                do_sample=True
            )

            return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        except Exception as e:
            # Best-effort: report the failure and keep the batch going.
            print(f"Error during prediction: {e}")
            return "None"

    def predict_batch(self, prompts: List[str], options: Optional[Dict] = None) -> List[str]:
        """
        Batch prediction using HuggingFace transformers.

        Prompts are processed one at a time through ``predict``.

        Args:
            prompts: Prompts to run, in order.
            options: Optional generation overrides forwarded to ``predict``.
                Defaults to ``None`` (not a shared mutable ``{}``).

        Returns:
            One result string per prompt, in the same order as ``prompts``.
        """
        return [self.predict(prompt, options) for prompt in prompts]

In [14]:
def create_model_results_row():
    """
    Build an empty results record for one (model, dataset) evaluation.

    Returns:
        A new dict holding every expected result field, each set to ``None``.
    """
    field_names = (
        "Model",
        "Task_Prefix",
        "Dataset_Name",
        "Model_Responses",
        "Gold_Labels",
        "Min_K_Responses",
    )
    # dict.fromkeys fills every key with None and keeps the order above.
    return dict.fromkeys(field_names)


## Model Evaluation Helpers

## Collect LLM Responses

In [15]:
# One entry per model to evaluate. "Options" holds per-model generation
# overrides that are passed to predict_batch; an empty dict means defaults.
MODEL_EVALS = [
    {"Model": candidate, "Options": {}}
    for candidate in (
        Model.SMOL_LM2_135M_Instruct,
        Model.SMOL_LM2_360M_Instruct,
        Model.SMOL_LM2_1p7B,
        Model.SMOL_LM2_1p7B_Instruct,
        Model.PHI_3p5_Mini_Instruct,
    )
]

In [16]:
# Datasets every model is evaluated on. Only Samsum is active; the Webis
# sample can be enabled by uncommenting its entry.
DATASET_EVALS = [
    dict(Name="Samsum", Dataset=samsum),
    # dict(Name="Webis", Dataset=webis_test),
]


In [17]:
# Accumulates one results row per "<model>_<dataset>" key.
model_results = dict()

In [None]:
for dataset in DATASET_EVALS:
    # Source texts, prompts and gold labels depend only on the dataset, so
    # build them once here instead of once per model.
    source_texts = dataset['Dataset']['input']

    # TODO: We should maybe vary this somehow
    # Map the inputs with the task prefix
    prompts = [TaskPrefix.INSTRUCTION_AT_TOP.value + "\n" + text for text in source_texts]

    # Get the gold labels
    summaries = dataset['Dataset']['summary']

    for model in MODEL_EVALS:
        # Start a timer
        start_time = time.perf_counter()

        print(f"Evaluating {model['Model']} on {dataset['Name']}")

        # Load the model
        model_obj = HuggingFaceInference(model['Model'])

        # Get the model responses
        model_responses = model_obj.predict_batch(prompts, model['Options'] if model['Options'] else {})

        # Grab the min-k% responses and calculations
        min_k_responses = detect_pretraining_batch(prompts, model['Model'])

        # Save the results
        model_results_row = create_model_results_row()
        model_results_row['Model'] = model['Model']
        model_results_row['Task_Prefix'] = TaskPrefix.INSTRUCTION_AT_TOP.value
        model_results_row['Dataset_Name'] = dataset['Name']
        model_results_row['Model_Responses'] = model_responses
        model_results_row['Min_K_Responses'] = min_k_responses
        model_results_row['Gold_Labels'] = summaries

        model_results[f"{model['Model']}_{dataset['Name']}"] = model_results_row

        # Drop this model before the next (possibly larger) one is loaded so
        # the weights of all evaluated models do not pile up on the GPU.
        del model_obj
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # Print how long it took
        end_time = time.perf_counter()
        execution_time = end_time - start_time
        print(f"Execution time: {execution_time} seconds")


Evaluating HuggingFaceTB/SmolLM2-135M-Instruct on Samsum


tokenizer_config.json:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/861 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [None]:
# Show a compact per-run summary (response count plus the first response)
# instead of echoing every prompt/response string into the cell output.
{
    key: {
        "n_responses": len(row['Model_Responses']),
        "first_response": row['Model_Responses'][:1],
    }
    for key, row in model_results.items()
}

## Evaluate LLM Responses for contamination

In [None]:
# For every (model, dataset) run, score each response against its gold label
# with embedding cosine similarity and with ROUGE.
for model_result_key in model_results:
    model_result = model_results[model_result_key]
    predictions = list(model_result['Model_Responses'])
    gold_labels = list(model_result['Gold_Labels'])

    # Encode each side in a single batched call instead of one sentence at a
    # time; encode() returns one embedding row per input, in input order.
    prediction_embeddings = embedder.encode(predictions)
    gold_label_embeddings = embedder.encode(gold_labels)

    model_result['cos_sim_scores'] = [
        get_cos_similarity_score(prediction_embedding, gold_label_embedding)
        for prediction_embedding, gold_label_embedding in zip(prediction_embeddings, gold_label_embeddings)
    ]
    model_result['rouge_sim_scores'] = [
        get_rouge_scores(prediction, gold_label)
        for prediction, gold_label in zip(predictions, gold_labels)
    ]

In [None]:
# Write one CSV per (model, dataset) run into ./output.
output_dir = Path("./output")
# to_csv does not create missing directories, so make sure the target exists.
output_dir.mkdir(parents=True, exist_ok=True)

for model_result_key, model_result in model_results.items():
    df = pd.DataFrame(model_result)
    # Hub ids contain "/", which would otherwise be read as a sub-directory.
    file_name = model_result_key.replace("//", "_").replace("/", "_")
    # NOTE(review): files are written without a ".csv" extension; kept as-is in
    # case other code reads these exact paths.
    df.to_csv(output_dir / file_name)