To do:
- benchmark llama
- benchmark Sonnet

In [3]:
import pandas as pd
import numpy as np
import torch
from transformers import pipeline
from sklearn.metrics import matthews_corrcoef, accuracy_score, f1_score
from sklearn.utils import resample
from datasets import load_dataset, DatasetDict, Dataset
from tqdm import tqdm
import random

  Referenced from: <EB3FF92A-5EB1-3EE8-AF8B-5923C1265422> /Users/mb7336/miniforge3/envs/sandbox/lib/python3.11/site-packages/torchvision/image.so
  warn(


In [4]:
def metrics(df, preds, group_by=None):
    """
    Calculate MCC, Accuracy, F1 for predictions.

    Args:
        df (pd.DataFrame): The input DataFrame containing true and predicted labels.
        preds (list): List of column names containing model predictions.
        group_by (str, optional): Column name to group by ('dataset' or 'task'). Defaults to None.

    Returns:
        pd.DataFrame: DataFrame with calculated metrics, optionally grouped by `group_by`.
    """
    true_col = 'entailment'
    
    def get_metrics(y_true, y_pred):
        return {
            'MCC': matthews_corrcoef(y_true, y_pred),
            'Accuracy': accuracy_score(y_true, y_pred),
            'F1': f1_score(y_true, y_pred, average='weighted')
        }
    
    results = []
    
    if group_by not in ['dataset', 'task']:
        for col in preds:
            metrics = get_metrics(df[true_col], df[col])
            metrics['Column'] = col
            results.append(metrics)
    else:
        for col in preds:
            for group_name, group in df.groupby(group_by):
                metrics = get_metrics(group[true_col], group[col])
                metrics['Column'] = col
                metrics[group_by.capitalize()] = group_name
                results.append(metrics)
    
    results_df = pd.DataFrame(results)
    
    if group_by in ['dataset', 'task']:
        return results_df.set_index(['Column', group_by.capitalize()])
    else:
        return results_df.set_index('Column')

def bootstrapped_errors(y_true, y_pred, n_bootstrap=1000):
    """
    Calculate bootstrapped standard errors for MCC, Accuracy, and F1.

    Args:
        y_true (array-like): True labels.
        y_pred (array-like): Predicted labels.
        n_bootstrap (int, optional): Number of bootstrap samples. Defaults to 1000.

    Returns:
        dict: Standard errors for MCC, Accuracy, and F1.
    """
    mcc_scores = []
    accuracy_scores = []
    f1_scores = []
    
    for _ in range(n_bootstrap):
        # Resample with replacement
        y_true_resampled, y_pred_resampled = resample(y_true, y_pred)
        
        # Calculate metrics for the resampled data
        mcc_scores.append(matthews_corrcoef(y_true_resampled, y_pred_resampled))
        accuracy_scores.append(accuracy_score(y_true_resampled, y_pred_resampled))
        f1_scores.append(f1_score(y_true_resampled, y_pred_resampled, average='weighted'))
    
    # Calculate standard errors
    return {
        'MCC_SE': np.std(mcc_scores),
        'Accuracy_SE': np.std(accuracy_scores),
        'F1_SE': np.std(f1_scores)
    }

def metrics_with_errors(df, preds, n_bootstrap=1000, group_by=None):
    """
    Calculate metrics and bootstrapped standard errors for predictions, optionally grouped.

    Args:
        df (pd.DataFrame): The input DataFrame containing true and predicted labels.
        preds (list): List of column names containing model predictions.
        n_bootstrap (int, optional): Number of bootstrap samples. Defaults to 1000.
        group_by (str, optional): Column name to group by ('dataset' or 'task'). Defaults to None.

    Returns:
        pd.DataFrame: Combined DataFrame of metrics, standard errors, and confidence intervals.
    """
    # Step 1: Calculate metrics for each model
    metrics_df = metrics(df, preds, group_by=group_by)

    # Step 2: Calculate bootstrapped errors for each model or group
    errors = []
    if group_by not in ['dataset', 'task']:
        for col in preds:
            y_true = df['entailment']
            y_pred = df[col]
            errors_dict = bootstrapped_errors(y_true, y_pred, n_bootstrap=n_bootstrap)
            errors_dict['Column'] = col
            errors.append(errors_dict)
    else:
        for col in preds:
            for group_name, group in df.groupby(group_by):
                y_true = group['entailment']
                y_pred = group[col]
                errors_dict = bootstrapped_errors(y_true, y_pred, n_bootstrap=n_bootstrap)
                errors_dict['Column'] = col
                errors_dict[group_by.capitalize()] = group_name
                errors.append(errors_dict)

    errors_df = pd.DataFrame(errors)

    if group_by in ['dataset', 'task']:
        errors_df = errors_df.set_index(['Column', group_by.capitalize()])
    else:
        errors_df = errors_df.set_index('Column')

    # Step 3: Merge metrics and errors DataFrames
    combined_df = metrics_df.merge(errors_df, left_index=True, right_index=True)

    # Step 4: Calculate confidence intervals (upper and lower bounds)
    combined_df['MCC_Lower'] = combined_df['MCC'] - combined_df['MCC_SE']
    combined_df['MCC_Upper'] = combined_df['MCC'] + combined_df['MCC_SE']

    combined_df['Accuracy_Lower'] = combined_df['Accuracy'] - combined_df['Accuracy_SE']
    combined_df['Accuracy_Upper'] = combined_df['Accuracy'] + combined_df['Accuracy_SE']

    combined_df['F1_Lower'] = combined_df['F1'] - combined_df['F1_SE']
    combined_df['F1_Upper'] = combined_df['F1'] + combined_df['F1_SE']

    return combined_df

def label_docs(model, docs_dict, batch_size = 8, device = 'cuda'):
    """
    Passes documents through the pipeline. Returns a list of entail, not_entail labels
    """
    pipe = pipeline(task = 'text-classification', model = model, 
                    batch_size = batch_size, device = device, 
                    max_length = 512, truncation = True, 
                    torch_dtype = torch.bfloat16)
    res = pipe(docs_dict)
    res = [result['label'] for result in res]
    return res

In [223]:
def recode_labels(df, label_col = 'label'):
    # Recode to binary: 1 for entailment, 0 for neutral/contradiction
    example[label_col] = 1 if example[label_col] == 0 else 0
    return df

def get_metrics(y_true, y_pred):
    return {
        'MCC': matthews_corrcoef(y_true, y_pred),
        'Accuracy': accuracy_score(y_true, y_pred),
        'F1': f1_score(y_true, y_pred, average='weighted')
    }

def bootstrapped_errors(y_true, y_pred, n_bootstrap=1000):
    """
    Calculate bootstrapped standard errors for MCC, Accuracy, and F1.
    
    Args:
        results_df (pd.DataFrame): DataFrame containing 'entailment' (true labels) and 'label' (predictions).
        n_bootstrap (int): Number of bootstrap samples to generate.
    
    Returns:
        dict: Standard errors for MCC, Accuracy, and F1.
    """
    mcc_scores = []
    accuracy_scores = []
    f1_scores = []
    
    for _ in range(n_bootstrap):
        # Resample with replacement
        y_true_resampled, y_pred_resampled = resample(y_true, y_pred)
        
        # Calculate metrics for the resampled data
        mcc_scores.append(matthews_corrcoef(y_true_resampled, y_pred_resampled))
        accuracy_scores.append(accuracy_score(y_true_resampled, y_pred_resampled))
        f1_scores.append(f1_score(y_true_resampled, y_pred_resampled, average='weighted'))
    
    # Calculate standard errors
    return {
        'MCC_SE': np.std(mcc_scores),
        'Accuracy_SE': np.std(accuracy_scores),
        'F1_SE': np.std(f1_scores)
    }

def label_docs(model, docs_dict, batch_size = 32, device = 'cuda'):
    """
    Passes documents through the pipeline. Returns a list of entail, not_entail labels
    """
    pipe = pipeline(task = 'text-classification', model = model, 
                    batch_size = batch_size, device = device, 
                    max_length = 512, truncation = True, 
                    torch_dtype = torch.float16)
    res = pipe(docs_dict)
    res = [result['label'] for result in res]
    return res

def benchmark(model, data, device = 'mps', textcol = 'premise', hypcol = 'hypothesis', labelcol = 'label'):
    
    pipe = pipeline("zero-shot-classification", model = model, device = device)
    
    # Initialize results storage
    predictions = []
    actual_labels = []
    scores = []
    
    # Process one example at a time
    for i in range(len(data)):
        premise = data[i][textcol]
        hypothesis = data[i][hypcol]
        true_label = data[i][labelcol]
        
        # Run classifier
        result = pipe(
            premise,
            candidate_labels=[hypothesis],
            multi_label=False
        )
        
        # Store results
        prediction = result['scores'][0]  # Get confidence score
        predictions.append(0 if prediction >= 0.5 else 1)  # Convert score to binary prediction
        actual_labels.append(true_label)
        scores.append(prediction)
    
    # Calculate metrics
    results_df = pd.DataFrame({
        'label': predictions,
        'entailment': actual_labels,
        'score': scores
    })
    
    return results_df, get_metrics(results_df['entailment'], results_df['label'])

def benchmark_datasets(dataset_dict, model, device):
    results = []
    
    # Add progress bar for datasets
    for dataset_name, dataset in tqdm(dataset_dict.items(), desc="Datasets Complete"):
        # Recode labels
        #dataset = dataset.map(recode_labels, fn_kwargs={'label_col': 'labels'})
        
        # Benchmark the dataset
        results_df, metrics = benchmark(model, dataset, device, textcol='text', labelcol='labels')
        
        # Calculate bootstrapped standard errors
        errors = bootstrapped_errors(results_df)
        
        # Save results
        results.append({
            'Dataset': dataset_name,
            'Observations': len(dataset),
            'MCC': metrics['MCC'],
            'Accuracy': metrics['Accuracy'],
            'F1': metrics['F1'],
            'MCC_SE': errors['MCC_SE'],
            'Accuracy_SE': errors['Accuracy_SE'],
            'F1_SE': errors['F1_SE'],
            'Model': model
        })
    
    # Convert results to DataFrame
    return pd.DataFrame(results)

# OOD validation

In [9]:
ood = pd.read_csv('../data/out_domain_bench.csv')
ood.rename({'task_name':'task', 'labels':'entailment'}, axis = 1, inplace = True)
docs_dict = [{'text':ood.loc[i, 'text'], 'text_pair':ood.loc[i, 'hypothesis']} for i in ood.index]

# models that will be tested
models = ["MoritzLaurer/deberta-v3-base-zeroshot-v2.0", 
          "MoritzLaurer/deberta-v3-large-zeroshot-v2.0",
          "mlburnham/Political_DEBATE_DeBERTa_base_v1.1",
          "mlburnham/Political_DEBATE_large_v1.0",
          "mlburnham/Political_DEBATE_ModernBERT_base_v1.0",
          "mlburnham/Political_DEBATE_ModernBERT_large_v1.0"]

# column names that will hold results
columns = ['base_nli',
           'large_nli',
           'base_debate',
           'large_debate',
           'base_modern',
           'large_modern']

In [11]:
%%time
# for each model, classify documents and return labels to the test dataframe
for modname, col in zip(models, columns):
    res = label_docs(modname, docs_dict, batch_size = 8, device = 'mps')
    ood[col] = res
    ood[col] = ood[col].replace({'entailment': 0, 'not_entailment': 1})
    print(modname + ' complete.')

Device set to use mps


MoritzLaurer/deberta-v3-base-zeroshot-v2.0 complete.


Device set to use mps


MoritzLaurer/deberta-v3-large-zeroshot-v2.0 complete.


Device set to use mps


mlburnham/Political_DEBATE_DeBERTa_base_v1.1 complete.


Device set to use mps


mlburnham/Political_DEBATE_large_v1.0 complete.


Device set to use mps
Compiling the model with `torch.compile` and using a `torch.mps` device is not supported. Falling back to non-compiled mode.


mlburnham/Political_DEBATE_ModernBERT_base_v1.0 complete.


Device set to use mps


mlburnham/Political_DEBATE_ModernBERT_large_v1.0 complete.
CPU times: user 9min 43s, sys: 1min 26s, total: 11min 10s
Wall time: 15min 30s




In [14]:
%%time
# Calculate performance metrics with bootstrapped standard errors. n_bootstrap == 1000
overall = metrics_with_errors(ood, columns, group_by = None)
task = metrics_with_errors(ood, columns, group_by = 'task')

CPU times: user 1min 31s, sys: 884 ms, total: 1min 32s
Wall time: 1min 32s


In [15]:
overall.reset_index(inplace = True)
overall['Task'] = 'overall'
task.reset_index(inplace = True)

combined = pd.concat([overall, task])
combined.rename({'Column':'Model'}, axis = 1, inplace = True)
combined

Unnamed: 0,Model,MCC,Accuracy,F1,MCC_SE,Accuracy_SE,F1_SE,MCC_Lower,MCC_Upper,Accuracy_Lower,Accuracy_Upper,F1_Lower,F1_Upper,Task
0,base_nli,0.837733,0.924664,0.924041,0.005686,0.002667,0.002708,0.832046,0.843419,0.921997,0.92733,0.921334,0.926749,overall
1,large_nli,0.815963,0.914448,0.913286,0.005966,0.002807,0.002878,0.809997,0.821929,0.911642,0.917255,0.910408,0.916164,overall
2,base_debate,0.812598,0.91268,0.912558,0.005875,0.002738,0.002749,0.806722,0.818473,0.909942,0.915419,0.909809,0.915307,overall
3,large_debate,0.845425,0.9282,0.927611,0.005562,0.00261,0.002651,0.839863,0.850986,0.92559,0.930809,0.924961,0.930262,overall
4,base_modern,0.777288,0.89667,0.89618,0.006534,0.003025,0.003055,0.770755,0.783822,0.893645,0.899696,0.893125,0.899235,overall
5,large_modern,0.827164,0.919556,0.919379,0.005966,0.002772,0.002781,0.821197,0.83313,0.916784,0.922328,0.916597,0.92216,overall
0,base_nli,0.900427,0.964126,0.964741,0.014383,0.005234,0.005059,0.886045,0.91481,0.958892,0.969359,0.959682,0.9698,agnews
1,base_nli,0.947675,0.973852,0.973859,0.008174,0.004094,0.004092,0.939501,0.955849,0.969758,0.977946,0.969768,0.977951,amazonpolarity
2,base_nli,0.73574,0.960876,0.96564,0.032763,0.005433,0.00431,0.702977,0.768502,0.955443,0.96631,0.96133,0.96995,emotiondair
3,base_nli,0.766265,0.894636,0.894847,0.020706,0.009403,0.009359,0.745559,0.786971,0.885233,0.904039,0.885488,0.904206,go_emotions


## Table

In [88]:
combined.replace({'base_nli': 'NLI Base', 
                'large_nli':'NLI Large', 
                'base_debate':'DEBATE Base',
                'large_debate':'DEBATE Large',
                'base_modern': 'DEBATE Base (MB)',
                'large_modern': 'DEBATE Large (MB)',
                'tweet_topic': 'Tweet Topics',
                'yahootopics': 'Yahoo Topics',
                'amazonpolarity': 'Amazon Polarity',
                'rottentomatoes': 'Rotten Tomatoes',
                'agnews': 'AG News',
                'emotiondair': 'DAIR AI Emotions',
                'sst2': 'Stanford Sentiment 2',
                'go_emotions': 'Google Emotions',
                'overall': 'Overall'
           }, inplace = True)

In [119]:
combined['F1'] = combined['F1'] * 100
combined['F1_SE'] = combined['F1_SE'] * 100

combined['Accuracy'] = combined['Accuracy'] * 100
combined['Accuracy_SE'] = combined['Accuracy_SE'] * 100

In [126]:
# Create a pivot table for F1 and F1_SE
f1_table = combined.pivot(index='Model', columns='Task', values=['F1', 'F1_SE'])
f1_table = f1_table.reindex(['NLI Base', 'NLI Large', 'DEBATE Base (MB)', 'DEBATE Large (MB)', 'DEBATE Base', 'DEBATE Large'])

cols = [(   'F1', 'AG News'),
        (   'F1', 'Tweet Topics'),
        (   'F1', 'Yahoo Topics'),
        (   'F1', 'Amazon Polarity'),
        (   'F1', 'Rotten Tomatoes'),
        (   'F1', 'Stanford Sentiment 2'),
        (   'F1', 'DAIR AI Emotions'),
        (   'F1', 'Google Emotions'),
        (   'F1', 'Overall'),
        ('F1_SE', 'AG News'),
        ('F1_SE', 'Tweet Topics'),
        ('F1_SE', 'Yahoo Topics'),
        ('F1_SE', 'Amazon Polarity'),
        ('F1_SE', 'Rotten Tomatoes'),
        ('F1_SE', 'Stanford Sentiment 2'),
        ('F1_SE', 'DAIR AI Emotions'),
        ('F1_SE', 'Google Emotions'),
        ('F1_SE', 'Overall')
]

f1_table = f1_table.reindex(cols, axis = 1)

# Identify the largest F1 scores per task
max_f1_per_task = f1_table['F1'].idxmax()

# Format the F1 scores with standard errors and highlight the max values
def format_f1_table(f1_table, max_index):
    # Initialize the formatted table with string type
    formatted_table = f1_table['F1'].copy().astype(str)
    for col in f1_table['F1'].columns:  # Iterate over each task (column in F1)
        max_model = max_index[col]  # Get the model with the maximum F1 score
        for idx in f1_table.index:
            f1 = f1_table.at[idx, ('F1', col)]
            se = f1_table.at[idx, ('F1_SE', col)]
            if idx == max_model:
                # Bold the maximum F1 score
                formatted_table.at[idx, col] = f"$\\mathbf{{{f1:.1f}}} \ ({se:.1f})$"
            else:
                # Format other scores without bold
                formatted_table.at[idx, col] = f"${f1:.1f} \ ({se:.1f})$"
    return formatted_table

# Apply the formatting function
formatted_f1_table = format_f1_table(f1_table, max_f1_per_task)

# Reset index for better formatting
formatted_f1_table = formatted_f1_table.reset_index()

# Convert to LaTeX
latex_table = formatted_f1_table.to_latex(index=False, escape=False, multicolumn=True, float_format="%.1f")

print(latex_table)

\begin{tabular}{llllllllll}
\toprule
Model & AG News & Tweet Topics & Yahoo Topics & Amazon Polarity & Rotten Tomatoes & Stanford Sentiment 2 & DAIR AI Emotions & Google Emotions & Overall \\
\midrule
NLI Base & $96.5 \ (0.5)$ & $79.4 \ (1.0)$ & $96.4 \ (0.5)$ & $97.4 \ (0.4)$ & $95.5 \ (0.6)$ & $\mathbf{96.7} \ (0.5)$ & $\mathbf{96.6} \ (0.4)$ & $89.5 \ (0.9)$ & $92.4 \ (0.3)$ \\
NLI Large & $\mathbf{97.4} \ (0.4)$ & $69.2 \ (1.4)$ & $\mathbf{97.3} \ (0.5)$ & $\mathbf{98.3} \ (0.3)$ & $\mathbf{96.0} \ (0.5)$ & $96.6 \ (0.5)$ & $95.7 \ (0.5)$ & $\mathbf{90.0} \ (0.9)$ & $91.3 \ (0.3)$ \\
DEBATE Base (MB) & $95.8 \ (0.5)$ & $85.5 \ (0.9)$ & $94.2 \ (0.6)$ & $95.3 \ (0.6)$ & $90.6 \ (0.8)$ & $89.2 \ (0.8)$ & $93.5 \ (0.5)$ & $80.9 \ (1.2)$ & $89.6 \ (0.3)$ \\
DEBATE Large (MB) & $95.4 \ (0.6)$ & $90.9 \ (0.7)$ & $95.3 \ (0.6)$ & $97.8 \ (0.4)$ & $93.1 \ (0.7)$ & $91.8 \ (0.7)$ & $92.1 \ (0.6)$ & $86.0 \ (1.1)$ & $91.9 \ (0.3)$ \\
DEBATE Base & $97.4 \ (0.4)$ & $\mathbf{93.9} \ (0.5)$ & $

Difference plot for models?

In [127]:
formatted_f1_table

Task,Model,AG News,Tweet Topics,Yahoo Topics,Amazon Polarity,Rotten Tomatoes,Stanford Sentiment 2,DAIR AI Emotions,Google Emotions,Overall
0,NLI Base,$96.5 \ (0.5)$,$79.4 \ (1.0)$,$96.4 \ (0.5)$,$97.4 \ (0.4)$,$95.5 \ (0.6)$,$\mathbf{96.7} \ (0.5)$,$\mathbf{96.6} \ (0.4)$,$89.5 \ (0.9)$,$92.4 \ (0.3)$
1,NLI Large,$\mathbf{97.4} \ (0.4)$,$69.2 \ (1.4)$,$\mathbf{97.3} \ (0.5)$,$\mathbf{98.3} \ (0.3)$,$\mathbf{96.0} \ (0.5)$,$96.6 \ (0.5)$,$95.7 \ (0.5)$,$\mathbf{90.0} \ (0.9)$,$91.3 \ (0.3)$
2,DEBATE Base (MB),$95.8 \ (0.5)$,$85.5 \ (0.9)$,$94.2 \ (0.6)$,$95.3 \ (0.6)$,$90.6 \ (0.8)$,$89.2 \ (0.8)$,$93.5 \ (0.5)$,$80.9 \ (1.2)$,$89.6 \ (0.3)$
3,DEBATE Large (MB),$95.4 \ (0.6)$,$90.9 \ (0.7)$,$95.3 \ (0.6)$,$97.8 \ (0.4)$,$93.1 \ (0.7)$,$91.8 \ (0.7)$,$92.1 \ (0.6)$,$86.0 \ (1.1)$,$91.9 \ (0.3)$
4,DEBATE Base,$97.4 \ (0.4)$,$\mathbf{93.9} \ (0.5)$,$94.6 \ (0.6)$,$94.8 \ (0.6)$,$88.3 \ (0.9)$,$89.5 \ (0.8)$,$92.0 \ (0.6)$,$85.5 \ (1.1)$,$91.3 \ (0.3)$
5,DEBATE Large,$97.4 \ (0.4)$,$87.6 \ (0.8)$,$96.5 \ (0.5)$,$97.7 \ (0.4)$,$93.0 \ (0.7)$,$93.4 \ (0.7)$,$95.3 \ (0.5)$,$87.9 \ (1.0)$,$\mathbf{92.8} \ (0.3)$


# Scrap

# NLI

In [237]:
nli = pd.read_csv('../data/nli_bench.csv')

In [239]:
model = "mlburnham/Political_DEBATE_base_v1.0"
res, metrics = benchmark(model = model, data = Dataset.from_pandas(nli), textcol = 'text', hypcol = 'hypothesis', labelcol = 'labels')
nli['debate_base'] = res['label']

In [240]:
model = 'MoritzLaurer/deberta-v3-base-zeroshot-v2.0'
res, metrics = benchmark(model = model, data = Dataset.from_pandas(nli), textcol = 'text', hypcol = 'hypothesis', labelcol = 'labels')
nli['deberta_base'] = res['label']

## Results Table

In [250]:
df = nli

results = []
for column in ['debate_base', 'deberta_base']:
    metrics = get_metrics(df['labels'], df[column])
    metrics_se = bootstrapped_errors(pd.DataFrame({
        'entailment': df['labels'],
        'label': df[column]
    }))
    combined_results = {**metrics, **metrics_se, 'Model': column}
    results.append(combined_results)

# Calculate metrics by task
tasks = df['task_name'].unique()
for task in tasks:
    task_df = df[df['task_name'] == task]
    for column in ['debate_base', 'deberta_base']:
        metrics = get_metrics(task_df['labels'], task_df[column])
        metrics_se = bootstrapped_errors(pd.DataFrame({
            'entailment': task_df['labels'],
            'label': task_df[column]
        }))
        combined_results = {**metrics, **metrics_se, 'Model': column, 'Task': task}
        results.append(combined_results)

# Convert results to a DataFrame
results_df = pd.DataFrame(results)
results_df.loc[results_df['Task'].isna(), 'Task'] = 'Overall'

In [251]:
# Create a pivot table for F1 and F1_SE
f1_table = results_df.pivot(index='Model', columns='Task', values=['F1', 'F1_SE'])

# Identify the largest F1 scores per task
max_f1_per_task = f1_table['F1'].idxmax()

# Format the F1 scores with standard errors and highlight the max values
def format_f1(row, max_index):
    return row['F1'].combine(row['F1_SE'], 
        lambda f1, se, task=row.name: f"$\\mathbf{{{f1:.3f}}} \\pm {se:.3f}$" if task in max_index.values else f"${f1:.3f} \\pm {se:.3f}$")

formatted_f1_table = f1_table.apply(
    lambda row: format_f1(row, max_f1_per_task), axis=1
)

# Reset index for better formatting
formatted_f1_table = formatted_f1_table.reset_index()

# Convert to LaTeX
latex_table = formatted_f1_table.to_latex(index=False, escape=False, multicolumn=True, float_format="%.3f")

print(latex_table)

\begin{tabular}{lllll}
\toprule
Model & Overall & anli_r1 & mnli_m & wanli \\
\midrule
debate_base & $0.660 \pm 0.009$ & $0.556 \pm 0.016$ & $0.771 \pm 0.013$ & $0.649 \pm 0.015$ \\
deberta_base & $\mathbf{0.857} \pm 0.006$ & $\mathbf{0.822} \pm 0.012$ & $\mathbf{0.945} \pm 0.007$ & $\mathbf{0.804} \pm 0.012$ \\
\bottomrule
\end{tabular}

