In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import pandas as pd
from scipy.special import kl_div
import numpy as np
import torch
import torch.nn.functional as F
from typing import Any
from utils import clear_cuda, load_benchmark_prompts_and_answers
from sklearn.metrics import accuracy_score, cohen_kappa_score, balanced_accuracy_score
from data.Benchmark.benchmark import benchmark_and_evaluate_models

In [3]:
# Checkpoints under comparison, keyed by the model id handed to `from_pretrained`
# and mapped to the precision label recorded with every benchmark row.
precision_map = {
    'meta-llama/Llama-3.1-8B-Instruct': 'bfloat16',
    'unsloth/Llama-3.1-8B-Instruct-bnb-4bit': 'int4',
    'hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4': 'int4',
}
# Parallel list views of the mapping; later cells zip these two together.
llama_models = list(precision_map)
precisions = list(precision_map.values())
# The first entry is the unquantized checkpoint the quantized ones are compared against.
unquantized_model = llama_models[0]

In [6]:
# Load the benchmark prompts and their expected answers through the project helper imported from `utils`.
prompts, answers = load_benchmark_prompts_and_answers()

In [7]:
# Run the packaged pipeline (imported from data.Benchmark.benchmark) over every checkpoint in `llama_models`.
# NOTE(review): the structure of `results` is defined in that module; the saved display further down
# shows at least a 'benchmark_df' entry — confirm the remaining keys there.
results = benchmark_and_evaluate_models(llama_models,prompts,answers, precision_map, unquantized_model)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

  return isinstance(obj, torch.Tensor)
  @custom_fwd
  @custom_bwd
  @custom_fwd(cast_inputs=torch.float16)
CUDA extension not installed.
CUDA extension not installed.
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of the model checkpoint at hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4 were not used when initializing LlamaForCausalLM: ['model.layers.0.mlp.down_proj.bias', 'model.layers.0.mlp.gate_proj.bias', 'model.layers.0.mlp.up_proj.bias', 'model.layers.0.self_attn.k_proj.bias', 'model.layers.0.self_attn.o_proj.bias', 'model.layers.0.self_attn.q_proj.bias', 'model.layers.0.self_attn.v_proj.bias', 'model.layers.1.mlp.down_proj.bias', 'model.layers.1.mlp.gate_proj.bias', 'model.layers.1.mlp.up_proj.bias', 'model.layers.1.self_attn.k_proj.bias', 'model.layers.1.self_attn.o_proj.bias', 'model.layers.1.self_attn.q_proj.bias', 'model.layers.1.self_attn.v_proj.bias', 'model.layers.10.mlp.down_proj.bias', 'model.layers.10.mlp.gate_proj.bias', 'model.layers.10.mlp.up_proj.bias', 'model.layers.10.self_attn.k_proj.bias', 'model.layers.10.self_attn.o_proj.bias', 'model.layers.10.self_attn.q_proj.bias', 'model.layers.10.self_attn.v_proj.bias', 'model.layers.11.mlp.down_proj.bias', 'mode

In [8]:
# Display the raw return value of the packaged benchmark run for inspection.
results

{'benchmark_df':                                                   model  \
 0                      meta-llama/Llama-3.1-8B-Instruct   
 1                      meta-llama/Llama-3.1-8B-Instruct   
 2                      meta-llama/Llama-3.1-8B-Instruct   
 3                      meta-llama/Llama-3.1-8B-Instruct   
 4                      meta-llama/Llama-3.1-8B-Instruct   
 ...                                                 ...   
 2035  hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ...   
 2036  hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ...   
 2037  hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ...   
 2038  hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ...   
 2039  hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ...   
 
                                           probabilities answer  \
 0     [[tensor(6.1118e-10, device='cuda:0', dtype=to...     Ja   
 1     [[tensor(1.0277e-10, device='cuda:0', dtype=to...     Ne   
 2     [[tensor(9.8225e-10, device='cuda:0', dtype=to...     

In [8]:
def benchmark_model(data_dict:dict[str,list[Any]],used_model:Any, used_tokenizer:Any, benchmark_prompts:list[str], benchmark_answers:list[str], model_precision:str, model_name:str) -> dict[str,Any]:
    """
    Benchmarks a language model's top-1 and top-5 next-token predictions across prompt-answer pairs.

    For each prompt in `benchmark_prompts`, the chat template is applied, the model is run once and
    the distribution over the next token (softmax of the logits at the last position) is taken.
    The five most probable tokens and their probabilities are collected; the most probable token is
    recorded as the model's answer and its probability as the answer confidence.

    The results are appended in place to `data_dict`, which must already contain list values under
    the keys:
        - 'model', 'probabilities', 'answer', 'answer_confidence', 'top_5_probabilities',
          'question', 'correct_answer', 'model_precision'

    The full probability tensor of every prompt is stored on the CPU so that the collected results
    do not keep GPU memory allocated after the model has been deleted.

    :param data_dict: Dictionary used to collect benchmark results (mutated and returned).
    :param used_model: The preloaded language model (must support `.eval()` and return an object
                       with `.logits`). If it exposes a `.device` attribute, inputs are moved to
                       that device; otherwise "cuda" is used.
    :param used_tokenizer: The tokenizer corresponding to the model (must support
                           `apply_chat_template` and `convert_ids_to_tokens`).
    :param benchmark_prompts: The prompts/questions to pass to the model, each in the form accepted
                              by the tokenizer's chat template.
    :param benchmark_answers: The correct answers, matched by position to `benchmark_prompts`.
    :param model_precision: A string identifier for the model's precision type (e.g., "bfloat16", "int4").
    :param model_name: The name of the benchmarked language model.
    :return: The same `data_dict`, extended by one entry per evaluated prompt under every key.
    """
    # eval() switches off training-only behavior such as dropout.
    used_model.eval()

    # Follow the model's own placement when it is known instead of assuming a GPU is present.
    target_device = getattr(used_model, "device", None)
    if target_device is None:
        target_device = "cuda"

    # inference_mode() disables gradient tracking for everything computed inside the block.
    with torch.inference_mode():
        for prompt, expected_answer in zip(benchmark_prompts, benchmark_answers):
            inputs = used_tokenizer.apply_chat_template(
                prompt,
                tokenize=True,
                add_generation_prompt=True,
                return_tensors="pt",
                return_dict=True,
            ).to(target_device)

            # Only the logits at the final position describe the next (answer) token.
            next_token_logits = used_model(**inputs).logits[:, -1, :]
            probabilities = F.softmax(next_token_logits, dim=-1)
            topk_probs, topk_indices = torch.topk(probabilities, k=5)

            # One prompt is processed at a time, so row 0 is the only row of the batch.
            tokens = used_tokenizer.convert_ids_to_tokens(topk_indices[0].tolist())
            top_probs = topk_probs[0].tolist()
            top_5_token_probabilities = dict(zip(tokens, top_probs))

            data_dict['model'].append(model_name)
            # Keep the vocabulary-sized tensor off the GPU; downstream code converts to CPU anyway.
            data_dict['probabilities'].append(probabilities.cpu())
            data_dict['answer'].append(tokens[0])
            # topk returns values in descending order, so position 0 is the top-1 probability.
            data_dict['answer_confidence'].append(top_probs[0])
            data_dict['top_5_probabilities'].append(top_5_token_probabilities)
            data_dict['question'].append(prompt)
            data_dict['correct_answer'].append(expected_answer)
            data_dict['model_precision'].append(model_precision)
    return data_dict

In [None]:
# Column-wise accumulator; these are the keys `benchmark_model` appends to.
data = {'model': [], 'probabilities': [], 'answer': [], 'top_5_probabilities': [], 'question': [], 'correct_answer': [], 'answer_confidence': [], 'model_precision': []}
for model_name, precision in zip(llama_models, precisions):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map='cuda')
    try:
        data = benchmark_model(data,model, tokenizer, prompts, answers, precision, model_name)
    finally:
        # Drop the checkpoint even when the benchmark raises, so a failed run does not leave
        # the model referenced in the kernel before the next checkpoint is loaded.
        # NOTE(review): `clear_cuda` comes from `utils`; presumably it frees cached GPU memory — confirm.
        del model, tokenizer
        clear_cuda()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [11]:
# One row per (model, prompt) pair; every key of `data` becomes a column.
df_quantization_benchmark = pd.DataFrame(data)

In [None]:
# Optional: persist the raw benchmark frame to disk (left disabled).
#df_quantization_benchmark.to_feather('quantization_benchmark_raw_data.feather')

In [14]:
def convert_answer_to_bool(answer:str) -> bool|float:
    """
    Converts the answer to a boolean value or NaN.

    :param answer: String of model answer to a yes/no question.
    :return: True if answer starts with 'j' or 'J',
             False if it starts with 'n' or 'N',
             np.nan otherwise.
    """
    if answer.lower().startswith('j'):
        return True
    elif answer.lower().startswith('n'):
        return False
    else:
        return np.nan


In [25]:
# Reduce the model's top-1 token and the reference answer to booleans (NaN when neither yes nor no).
df_quantization_benchmark['bool_answer'] = df_quantization_benchmark['answer'].map(convert_answer_to_bool)
df_quantization_benchmark['correct_bool_answer'] = df_quantization_benchmark['correct_answer'].map(convert_answer_to_bool)

In [18]:
# Sanity check of the columns available for evaluation.
# NOTE(review): the saved output below lists 'expected_bool_answer', while the preceding cell creates
# 'correct_bool_answer' (the name `evaluate_models` reads) — the output is stale; re-run to refresh it.
df_quantization_benchmark.columns

Index(['model', 'probabilities', 'answer', 'top_5_probabilities', 'question',
       'correct_answer', 'answer_confidence', 'model_precision', 'bool_answer',
       'expected_bool_answer'],
      dtype='object')

In [68]:
def _classification_scores(y_true, y_pred) -> dict:
    """Compute accuracy, balanced accuracy and Cohen's kappa for one set of predictions."""
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "balanced_accuracy": balanced_accuracy_score(y_true, y_pred),
        "cohen_kappa": cohen_kappa_score(y_true, y_pred)
    }


def evaluate_models(df:pd.DataFrame, model_list:list[str], include_baseline:bool=True):
    """
    Evaluate classification performance metrics for a list of models using a DataFrame of predictions.

    For each model in the provided list, this function computes:
      - Accuracy
      - Balanced accuracy
      - Cohen's kappa score

    Optionally, it also evaluates a baseline that always predicts `False`, scored against the
    ground-truth labels of the rows belonging to the first model in `model_list`. The baseline
    depends only on the arguments passed in, not on any notebook-level variable, and is skipped
    when `model_list` is empty because there is then no model to take the labels from.

    Rows are passed to the metric functions as they are; NaN entries in the boolean columns are
    not filtered here.

    :param df: A pandas DataFrame containing model predictions. Must include the columns:
               - 'model': model identifier
               - 'correct_bool_answer': ground truth boolean labels
               - 'bool_answer': model's predicted boolean labels
    :param model_list: A list of model names (as strings) to evaluate.
    :param include_baseline: If True, evaluates a baseline model that always predicts False.
    :return: A dictionary where keys are model names (plus 'always_false_baseline' when the
             baseline is evaluated) and values are dictionaries containing:
             - 'accuracy': standard accuracy score
             - 'balanced_accuracy': balanced accuracy score
             - 'cohen_kappa': Cohen's kappa score
    """
    results = {}

    for model_name in model_list:
        model_df = df[df['model'] == model_name]
        results[model_name] = _classification_scores(
            model_df['correct_bool_answer'], model_df['bool_answer']
        )

    if include_baseline and model_list:
        # Use the caller's first model, not a global list, to pick the ground-truth rows.
        baseline_df = df[df['model'] == model_list[0]].copy()
        baseline_df['bool_answer'] = False
        results["always_false_baseline"] = _classification_scores(
            baseline_df['correct_bool_answer'], baseline_df['bool_answer']
        )

    return results

Classification report for meta-llama/Llama-3.1-8B-Instruct:
              precision    recall  f1-score   support

       False       0.87      0.66      0.75       544
        True       0.31      0.62      0.42       136

    accuracy                           0.65       680
   macro avg       0.59      0.64      0.58       680
weighted avg       0.76      0.65      0.69       680

Balanced Accuracy: 0.6397
Cohen's Kappa: 0.2049
------------------------------------------------------------
Classification report for unsloth/Llama-3.1-8B-Instruct-bnb-4bit:
              precision    recall  f1-score   support

       False       0.85      0.72      0.78       544
        True       0.31      0.49      0.38       136

    accuracy                           0.68       680
   macro avg       0.58      0.61      0.58       680
weighted avg       0.74      0.68      0.70       680

Balanced Accuracy: 0.6075
Cohen's Kappa: 0.1754
------------------------------------------------------------
Cl

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [None]:
# Score every checkpoint; the always-False baseline is included because `include_baseline` defaults to True.
eval_results = evaluate_models(df_quantization_benchmark, llama_models)

In [None]:
def compute_distribution_divergences(
    df: pd.DataFrame,
    llama_models: list[str],
    unquantized_model: str
) -> tuple[dict[str, float], dict[str, float]]:
    """
    Measures how far each model's next-token distributions are from those of a reference model.

    For every model other than the reference, the per-row 'probabilities' tensors are stacked
    along a new last axis and squeezed, then compared with the reference array element-wise:

      - KL divergence: `scipy.special.kl_div(model, reference)` summed over axis 0, i.e. the
        divergence of the compared model's distribution from the reference one (in that order).
      - Hellinger distance: the Euclidean norm over axis 0 of the difference of the square-rooted
        probabilities, divided by sqrt(2).

    Both quantities are then averaged over the remaining axis. Models whose stacked array does
    not have the same shape as the reference are reported with a printed message and skipped.

    NOTE(review): assumes each 'probabilities' entry is a (1, vocab) tensor, so that axis 0 after
    the squeeze is the vocabulary axis and the mean runs over prompts — confirm against the
    producer of `df`.

    :param df: DataFrame containing a 'model' column and a 'probabilities' column (each entry is a tensor).
    :param llama_models: List of model names to compare.
    :param unquantized_model: Name of the reference (unquantized) model.
    :return: Tuple of two dictionaries:
             - KL divergence results: {model_name: mean_kl_divergence}
             - Hellinger distance results: {model_name: mean_hellinger_distance}
    """
    def _stacked_probabilities(name: str) -> np.ndarray:
        # Gather one model's tensors and turn them into a single float32 NumPy array.
        per_row = df.loc[df['model'] == name, 'probabilities'].to_list()
        return torch.stack(per_row, dim=-1).detach().cpu().float().squeeze().numpy()

    reference = _stacked_probabilities(unquantized_model)
    kl_results, hellinger_results = {}, {}

    # The reference model is never compared with itself.
    for model_name in (name for name in llama_models if name != unquantized_model):
        candidate = _stacked_probabilities(model_name)

        if candidate.shape != reference.shape:
            print(f"Shape mismatch between {unquantized_model} and {model_name}")
            continue

        per_row_kl = kl_div(candidate, reference).sum(axis=0)
        root_gap = np.sqrt(candidate) - np.sqrt(reference)
        per_row_hellinger = np.sqrt((root_gap ** 2).sum(axis=0)) / np.sqrt(2)

        kl_results[model_name] = np.mean(per_row_kl)
        hellinger_results[model_name] = np.mean(per_row_hellinger)

    return kl_results, hellinger_results

In [65]:
# Compare each quantized checkpoint's next-token distributions with those of the unquantized reference.
kl_results, hellinger_results = compute_distribution_divergences(df_quantization_benchmark, llama_models, unquantized_model)


In [54]:
# Mean KL divergence per quantized checkpoint, relative to the unquantized reference.
kl_results

{'unsloth/Llama-3.1-8B-Instruct-bnb-4bit': np.float32(0.07237416),
 'hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4': np.float32(0.3089346)}

In [62]:
# Mean Hellinger distance per quantized checkpoint, relative to the unquantized reference.
hellinger_results

{'unsloth/Llama-3.1-8B-Instruct-bnb-4bit': np.float64(0.0995063487363991),
 'hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4': np.float64(0.2423799285647464)}