To do:
- benchmark llama
- benchmark Sonnet

In [123]:
import pandas as pd
import numpy as np
from transformers import pipeline
from sklearn.metrics import matthews_corrcoef, accuracy_score, f1_score
from sklearn.utils import resample
from datasets import load_dataset, DatasetDict, Dataset
from tqdm import tqdm
import random

In [223]:
def recode_labels(df, label_col = 'label'):
    # Recode to binary: 1 for entailment, 0 for neutral/contradiction
    example[label_col] = 1 if example[label_col] == 0 else 0
    return df

def get_metrics(y_true, y_pred):
    return {
        'MCC': matthews_corrcoef(y_true, y_pred),
        'Accuracy': accuracy_score(y_true, y_pred),
        'F1': f1_score(y_true, y_pred, average='weighted')
    }

def bootstrapped_errors(results_df, n_bootstrap=1000):
    """
    Calculate bootstrapped standard errors for MCC, Accuracy, and F1.
    
    Args:
        results_df (pd.DataFrame): DataFrame containing 'entailment' (true labels) and 'label' (predictions).
        n_bootstrap (int): Number of bootstrap samples to generate.
    
    Returns:
        dict: Standard errors for MCC, Accuracy, and F1.
    """
    mcc_scores = []
    accuracy_scores = []
    f1_scores = []
    
    y_true = results_df['entailment']
    y_pred = results_df['label']
    
    for _ in range(n_bootstrap):
        # Resample with replacement
        y_true_resampled, y_pred_resampled = resample(y_true, y_pred)
        
        # Calculate metrics for the resampled data
        mcc_scores.append(matthews_corrcoef(y_true_resampled, y_pred_resampled))
        accuracy_scores.append(accuracy_score(y_true_resampled, y_pred_resampled))
        f1_scores.append(f1_score(y_true_resampled, y_pred_resampled, average='weighted'))
    
    # Calculate standard errors
    return {
        'MCC_SE': np.std(mcc_scores),
        'Accuracy_SE': np.std(accuracy_scores),
        'F1_SE': np.std(f1_scores)
    }

def benchmark(model, data, device = 'mps', textcol = 'premise', hypcol = 'hypothesis', labelcol = 'label'):
    
    pipe = pipeline("zero-shot-classification", model = model, device = device)
    
    # Initialize results storage
    predictions = []
    actual_labels = []
    scores = []
    
    # Process one example at a time
    for i in range(len(data)):
        premise = data[i][textcol]
        hypothesis = data[i][hypcol]
        true_label = data[i][labelcol]
        
        # Run classifier
        result = pipe(
            premise,
            candidate_labels=[hypothesis],
            multi_label=False
        )
        
        # Store results
        prediction = result['scores'][0]  # Get confidence score
        predictions.append(0 if prediction >= 0.5 else 1)  # Convert score to binary prediction
        actual_labels.append(true_label)
        scores.append(prediction)
    
    # Calculate metrics
    results_df = pd.DataFrame({
        'label': predictions,
        'entailment': actual_labels,
        'score': scores
    })
    
    return results_df, get_metrics(results_df['entailment'], results_df['label'])

def benchmark_datasets(dataset_dict, model, device):
    results = []
    
    # Add progress bar for datasets
    for dataset_name, dataset in tqdm(dataset_dict.items(), desc="Datasets Complete"):
        # Recode labels
        #dataset = dataset.map(recode_labels, fn_kwargs={'label_col': 'labels'})
        
        # Benchmark the dataset
        results_df, metrics = benchmark(model, dataset, device, textcol='text', labelcol='labels')
        
        # Calculate bootstrapped standard errors
        errors = bootstrapped_errors(results_df)
        
        # Save results
        results.append({
            'Dataset': dataset_name,
            'Observations': len(dataset),
            'MCC': metrics['MCC'],
            'Accuracy': metrics['Accuracy'],
            'F1': metrics['F1'],
            'MCC_SE': errors['MCC_SE'],
            'Accuracy_SE': errors['Accuracy_SE'],
            'F1_SE': errors['F1_SE'],
            'Model': model
        })
    
    # Convert results to DataFrame
    return pd.DataFrame(results)

# OOD validation

In [215]:
ood = pd.read_csv('../data/out_domain_bench.csv')

## DEBATE Base

In [None]:
model = "mlburnham/Political_DEBATE_base_v1.0"
res, metrics = benchmark(model = model, data = Dataset.from_pandas(ood), textcol = 'text', hypcol = 'hypothesis', labelcol = 'labels')
ood['debate_base'] = res['label']

## DeBERTa Base

In [229]:
model = 'MoritzLaurer/deberta-v3-base-zeroshot-v2.0'
res, metrics = benchmark(model = model, data = Dataset.from_pandas(ood), textcol = 'text', hypcol = 'hypothesis', labelcol = 'labels')
ood['deberta_base'] = res['label']

{'MCC': 0.8428277147812402,
 'Accuracy': 0.927020921324035,
 'F1': 0.9265993784470345}

## Table

In [252]:
df = ood

results = []
for column in ['debate_base', 'deberta_base']:
    metrics = get_metrics(df['labels'], df[column])
    metrics_se = bootstrapped_errors(pd.DataFrame({
        'entailment': df['labels'],
        'label': df[column]
    }))
    combined_results = {**metrics, **metrics_se, 'Model': column}
    results.append(combined_results)

# Calculate metrics by task
tasks = df['task_name'].unique()
for task in tasks:
    task_df = df[df['task_name'] == task]
    for column in ['debate_base', 'deberta_base']:
        metrics = get_metrics(task_df['labels'], task_df[column])
        metrics_se = bootstrapped_errors(pd.DataFrame({
            'entailment': task_df['labels'],
            'label': task_df[column]
        }))
        combined_results = {**metrics, **metrics_se, 'Model': column, 'Task': task}
        results.append(combined_results)

# Convert results to a DataFrame
results_df = pd.DataFrame(results)
results_df.loc[results_df['Task'].isna(), 'Task'] = 'Overall'

In [253]:
# Create a pivot table for F1 and F1_SE
f1_table = results_df.pivot(index='Model', columns='Task', values=['F1', 'F1_SE'])

# Identify the largest F1 scores per task
max_f1_per_task = f1_table['F1'].idxmax()

# Format the F1 scores with standard errors and highlight the max values
def format_f1(row, max_index):
    return row['F1'].combine(row['F1_SE'], 
        lambda f1, se, task=row.name: f"$\\mathbf{{{f1:.3f}}} \\pm {se:.3f}$" if task in max_index.values else f"${f1:.3f} \\pm {se:.3f}$")

formatted_f1_table = f1_table.apply(
    lambda row: format_f1(row, max_f1_per_task), axis=1
)

# Reset index for better formatting
formatted_f1_table = formatted_f1_table.reset_index()

# Convert to LaTeX
latex_table = formatted_f1_table.to_latex(index=False, escape=False, multicolumn=True, float_format="%.3f")

print(latex_table)

\begin{tabular}{llllllllll}
\toprule
Model & Overall & agnews & amazonpolarity & emotiondair & go_emotions & rottentomatoes & sst2 & tweet_topic & yahootopics \\
\midrule
debate_base & $\mathbf{0.916} \pm 0.003$ & $\mathbf{0.963} \pm 0.005$ & $\mathbf{0.939} \pm 0.006$ & $\mathbf{0.942} \pm 0.005$ & $\mathbf{0.846} \pm 0.011$ & $\mathbf{0.874} \pm 0.009$ & $\mathbf{0.903} \pm 0.008$ & $\mathbf{0.964} \pm 0.004$ & $\mathbf{0.935} \pm 0.006$ \\
deberta_base & $\mathbf{0.927} \pm 0.003$ & $\mathbf{0.966} \pm 0.005$ & $\mathbf{0.974} \pm 0.004$ & $\mathbf{0.966} \pm 0.004$ & $\mathbf{0.887} \pm 0.010$ & $\mathbf{0.951} \pm 0.006$ & $\mathbf{0.965} \pm 0.005$ & $\mathbf{0.819} \pm 0.010$ & $\mathbf{0.964} \pm 0.005$ \\
\bottomrule
\end{tabular}



# NLI

In [237]:
nli = pd.read_csv('../data/nli_bench.csv')

In [239]:
model = "mlburnham/Political_DEBATE_base_v1.0"
res, metrics = benchmark(model = model, data = Dataset.from_pandas(nli), textcol = 'text', hypcol = 'hypothesis', labelcol = 'labels')
nli['debate_base'] = res['label']

In [240]:
model = 'MoritzLaurer/deberta-v3-base-zeroshot-v2.0'
res, metrics = benchmark(model = model, data = Dataset.from_pandas(nli), textcol = 'text', hypcol = 'hypothesis', labelcol = 'labels')
nli['deberta_base'] = res['label']

## Results Table

In [250]:
df = nli

results = []
for column in ['debate_base', 'deberta_base']:
    metrics = get_metrics(df['labels'], df[column])
    metrics_se = bootstrapped_errors(pd.DataFrame({
        'entailment': df['labels'],
        'label': df[column]
    }))
    combined_results = {**metrics, **metrics_se, 'Model': column}
    results.append(combined_results)

# Calculate metrics by task
tasks = df['task_name'].unique()
for task in tasks:
    task_df = df[df['task_name'] == task]
    for column in ['debate_base', 'deberta_base']:
        metrics = get_metrics(task_df['labels'], task_df[column])
        metrics_se = bootstrapped_errors(pd.DataFrame({
            'entailment': task_df['labels'],
            'label': task_df[column]
        }))
        combined_results = {**metrics, **metrics_se, 'Model': column, 'Task': task}
        results.append(combined_results)

# Convert results to a DataFrame
results_df = pd.DataFrame(results)
results_df.loc[results_df['Task'].isna(), 'Task'] = 'Overall'

In [251]:
# Create a pivot table for F1 and F1_SE
f1_table = results_df.pivot(index='Model', columns='Task', values=['F1', 'F1_SE'])

# Identify the largest F1 scores per task
max_f1_per_task = f1_table['F1'].idxmax()

# Format the F1 scores with standard errors and highlight the max values
def format_f1(row, max_index):
    return row['F1'].combine(row['F1_SE'], 
        lambda f1, se, task=row.name: f"$\\mathbf{{{f1:.3f}}} \\pm {se:.3f}$" if task in max_index.values else f"${f1:.3f} \\pm {se:.3f}$")

formatted_f1_table = f1_table.apply(
    lambda row: format_f1(row, max_f1_per_task), axis=1
)

# Reset index for better formatting
formatted_f1_table = formatted_f1_table.reset_index()

# Convert to LaTeX
latex_table = formatted_f1_table.to_latex(index=False, escape=False, multicolumn=True, float_format="%.3f")

print(latex_table)

\begin{tabular}{lllll}
\toprule
Model & Overall & anli_r1 & mnli_m & wanli \\
\midrule
debate_base & $0.660 \pm 0.009$ & $0.556 \pm 0.016$ & $0.771 \pm 0.013$ & $0.649 \pm 0.015$ \\
deberta_base & $\mathbf{0.857} \pm 0.006$ & $\mathbf{0.822} \pm 0.012$ & $\mathbf{0.945} \pm 0.007$ & $\mathbf{0.804} \pm 0.012$ \\
\bottomrule
\end{tabular}

