In [19]:
import pandas as pd
import numpy as np
from transformers import pipeline
from sklearn.metrics import matthews_corrcoef, accuracy_score, f1_score
from sklearn.utils import resample
from datasets import load_dataset
import torch

In [6]:
# load data: PolNLI from the Hugging Face hub plus the locally stored test-set results
ds = load_dataset('mlburnham/PolNLI')
train = ds['train'].to_pandas()
test = pd.read_csv('../data/polnli_test_results.csv')

# Build one {'text', 'text_pair'} record per test row (in row order); this is
# the input that gets passed through the classification pipeline.
docs_dict = [
    {'text': premise, 'text_pair': hypothesis}
    for premise, hypothesis in zip(test['premise'], test['augmented_hypothesis'])
]

Using the latest cached version of the dataset since mlburnham/PolNLI couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /Users/mb7336/.cache/huggingface/datasets/mlburnham___pol_nli/default/0.0.0/062724c0a0e601ffc5fba98a5546cc2237a766fa (last modified on Wed Oct  2 14:23:31 2024).


In [22]:
def metrics(df, preds, group_by=None):
    """
    Score one or more prediction columns against the 'entailment' column.

    For every column in `preds` this computes the Matthews correlation
    coefficient, accuracy, and weighted F1, either over the whole frame or
    once per group.

    Args:
        df (pd.DataFrame): Frame holding the true labels in 'entailment' and
            one column per entry in `preds`.
        preds (list): Names of the columns containing model predictions.
        group_by (str, optional): 'dataset' or 'task' to score each group
            separately. Any other value (including None) is treated as
            "no grouping". Defaults to None.

    Returns:
        pd.DataFrame: Columns 'MCC', 'Accuracy', 'F1'. Indexed by 'Column'
        when ungrouped, or by ('Column', group_by.capitalize()) when grouped.
    """
    label_col = 'entailment'
    grouped = group_by in ['dataset', 'task']

    def score(y_true, y_pred):
        return {
            'MCC': matthews_corrcoef(y_true, y_pred),
            'Accuracy': accuracy_score(y_true, y_pred),
            'F1': f1_score(y_true, y_pred, average='weighted')
        }

    rows = []
    for pred_col in preds:
        if grouped:
            # One row per (prediction column, group) pair.
            for key, subset in df.groupby(group_by):
                row = score(subset[label_col], subset[pred_col])
                row['Column'] = pred_col
                row[group_by.capitalize()] = key
                rows.append(row)
        else:
            # One row per prediction column over the full frame.
            row = score(df[label_col], df[pred_col])
            row['Column'] = pred_col
            rows.append(row)

    index_cols = ['Column', group_by.capitalize()] if grouped else 'Column'
    return pd.DataFrame(rows).set_index(index_cols)

def bootstrapped_errors(y_true, y_pred, n_bootstrap=1000, random_state=None):
    """
    Calculate bootstrapped standard errors for MCC, Accuracy, and F1.

    The label/prediction pairs are resampled with replacement `n_bootstrap`
    times; each metric is recomputed on every resample and the standard
    deviation (np.std, i.e. ddof=0) of those values is reported as its
    standard error.

    Args:
        y_true (array-like): True labels.
        y_pred (array-like): Predicted labels.
        n_bootstrap (int, optional): Number of bootstrap samples. Defaults to 1000.
        random_state (int, np.random.RandomState or None, optional): Seed or
            generator for the resampling. None (the default) leaves the
            choice to `resample`, which matches the previous behaviour.

    Returns:
        dict: Standard errors under the keys 'MCC_SE', 'Accuracy_SE', 'F1_SE'.
    """
    # Use ONE generator for the whole loop. Passing an int seed straight to
    # `resample` would recreate the same generator every iteration and yield
    # the identical resample n_bootstrap times (SE of exactly zero).
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    mcc_scores = []
    accuracy_scores = []
    f1_scores = []

    for _ in range(n_bootstrap):
        # Resample with replacement, keeping true/predicted labels paired
        y_true_resampled, y_pred_resampled = resample(y_true, y_pred, random_state=rng)

        # Calculate metrics for the resampled data
        mcc_scores.append(matthews_corrcoef(y_true_resampled, y_pred_resampled))
        accuracy_scores.append(accuracy_score(y_true_resampled, y_pred_resampled))
        f1_scores.append(f1_score(y_true_resampled, y_pred_resampled, average='weighted'))

    # Spread of each metric across resamples is its bootstrap standard error
    return {
        'MCC_SE': np.std(mcc_scores),
        'Accuracy_SE': np.std(accuracy_scores),
        'F1_SE': np.std(f1_scores)
    }

def metrics_with_errors(df, preds, n_bootstrap=1000, group_by=None):
    """
    Combine point metrics with bootstrapped standard errors and error bounds.

    Point estimates come from `metrics`, standard errors from
    `bootstrapped_errors`; the two tables are joined on their shared index.

    Args:
        df (pd.DataFrame): Frame holding the true labels in 'entailment' and
            one column per entry in `preds`.
        preds (list): Names of the columns containing model predictions.
        n_bootstrap (int, optional): Number of bootstrap samples. Defaults to 1000.
        group_by (str, optional): 'dataset' or 'task' to compute everything
            per group. Any other value (including None) means no grouping.
            Defaults to None.

    Returns:
        pd.DataFrame: For each of MCC, Accuracy and F1: the metric, its
        '<metric>_SE', and '<metric>_Lower' / '<metric>_Upper'. The bounds
        are the metric minus/plus ONE standard error.
    """
    grouped = group_by in ['dataset', 'task']

    # Point estimates for every prediction column (and group, if requested).
    point_estimates = metrics(df, preds, group_by=group_by)

    # Bootstrapped standard errors, one row per prediction column / group.
    se_rows = []
    for pred_col in preds:
        if grouped:
            for key, subset in df.groupby(group_by):
                row = bootstrapped_errors(subset['entailment'], subset[pred_col],
                                          n_bootstrap=n_bootstrap)
                row['Column'] = pred_col
                row[group_by.capitalize()] = key
                se_rows.append(row)
        else:
            row = bootstrapped_errors(df['entailment'], df[pred_col],
                                      n_bootstrap=n_bootstrap)
            row['Column'] = pred_col
            se_rows.append(row)

    # Index the errors the same way `metrics` indexes its output so they join.
    index_cols = ['Column', group_by.capitalize()] if grouped else 'Column'
    se_frame = pd.DataFrame(se_rows).set_index(index_cols)

    combined_df = point_estimates.merge(se_frame, left_index=True, right_index=True)

    # Lower/upper bounds: metric -/+ one bootstrapped standard error.
    for metric_name in ('MCC', 'Accuracy', 'F1'):
        se_col = f'{metric_name}_SE'
        combined_df[f'{metric_name}_Lower'] = combined_df[metric_name] - combined_df[se_col]
        combined_df[f'{metric_name}_Upper'] = combined_df[metric_name] + combined_df[se_col]

    return combined_df

def label_docs(model, docs_dict, batch_size = 32, device = 'cuda'):
    """
    Classify documents with a text-classification pipeline.

    Args:
        model: Model identifier handed to `pipeline` as `model`.
        docs_dict: Documents to classify; passed to the pipeline unchanged.
        batch_size (int, optional): Pipeline batch size. Defaults to 32.
        device (str, optional): Device handed to the pipeline. Defaults to 'cuda'.

    Returns:
        list: The 'label' value of each pipeline result, in pipeline output order.
    """
    # Inputs are truncated to 512 tokens and the model is loaded in bfloat16.
    classifier = pipeline(
        task = 'text-classification',
        model = model,
        batch_size = batch_size,
        device = device,
        max_length = 512,
        truncation = True,
        torch_dtype = torch.bfloat16,
    )
    return [output['label'] for output in classifier(docs_dict)]

# Benchmark

In [13]:
# Each model that will be tested, written next to the column name that will
# hold its results. The pairs are unzipped into the two parallel lists
# `models` and `columns`.
models, columns = map(list, zip(
    ("MoritzLaurer/deberta-v3-base-zeroshot-v2.0", 'base_nli'),
    ("MoritzLaurer/deberta-v3-large-zeroshot-v2.0", 'large_nli'),
    ("mlburnham/Political_DEBATE_DeBERTa_base_v1.1", 'base_debate'),
    ("mlburnham/Political_DEBATE_large_v1.0", 'large_debate'),
    ("mlburnham/Political_DEBATE_ModernBERT_base_v1.0", 'base_modern'),
    ("mlburnham/Political_DEBATE_ModernBERT_large_v1.0", 'large_modern'),
))

In [7]:
%%time
# for each model, classify documents and return labels to the test dataframe
for modname, col in zip(models, columns):
    res = label_docs(modname, docs_dict)
    test[col] = res
    test[col] = test[col].replace({'entailment': 0, 'not_entailment': 1})
    print(modname + ' complete.')

Device set to use cuda


MoritzLaurer/deberta-v3-base-zeroshot-v2.0 complete.


Device set to use cuda
Device set to use cuda


MoritzLaurer/deberta-v3-large-zeroshot-v2.0 complete.




training_base/checkpoint-96354 complete.


Device set to use cuda


mlburnham/Political_DEBATE_large_v1.0 complete.


You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Device set to use cuda
Device set to use cuda


training_ModernBase/checkpoint-96354 complete.
training_ModernLarge/checkpoint-74935 complete.
CPU times: user 3min 10s, sys: 5.22 s, total: 3min 16s
Wall time: 3min 11s




In [40]:
# NOTE(review): displays whatever `res` is currently bound to in the kernel.
# The inference loop binds `res` to the list of labels returned by label_docs,
# but the saved output of this cell is a per-task metrics table, so it appears
# to come from out-of-order execution / hidden state. Presumably this was meant
# to show one of the metrics_with_errors results — confirm, then replace or
# delete this cell.
res

Unnamed: 0_level_0,Unnamed: 1_level_0,MCC,Accuracy,F1,MCC_SE,Accuracy_SE,F1_SE,MCC_Lower,MCC_Upper,Accuracy_Lower,Accuracy_Upper,F1_Lower,F1_Upper
Column,Task,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
base_nli,event extraction,0.538591,0.753841,0.753169,0.014605,0.007998,0.008023,0.523987,0.553196,0.745843,0.761838,0.745146,0.761192
base_nli,hatespeech and toxicity,0.550569,0.858095,0.845361,0.018456,0.006428,0.007405,0.532113,0.569024,0.851666,0.864523,0.837956,0.852766
base_nli,stance detection,0.530711,0.775285,0.770202,0.011904,0.005824,0.00608,0.518807,0.542616,0.769461,0.781109,0.764122,0.776282
base_nli,topic classification,0.871001,0.935212,0.93455,0.007078,0.003681,0.003756,0.863922,0.878079,0.931531,0.938893,0.930794,0.938306
large_nli,event extraction,0.723042,0.852304,0.852599,0.012123,0.0068,0.006793,0.710919,0.735165,0.845505,0.859104,0.845806,0.859391
large_nli,hatespeech and toxicity,0.553551,0.85443,0.848143,0.018858,0.006269,0.00682,0.534693,0.572409,0.848161,0.860699,0.841323,0.854962
large_nli,stance detection,0.585007,0.797717,0.789429,0.011278,0.005741,0.006135,0.573729,0.596285,0.791976,0.803458,0.783293,0.795564
large_nli,topic classification,0.8964,0.948081,0.947663,0.006109,0.003171,0.003219,0.890291,0.902509,0.94491,0.951251,0.944444,0.950882
base_debate,event extraction,0.765923,0.878492,0.878934,0.01113,0.005935,0.005896,0.754793,0.777052,0.872557,0.884426,0.873038,0.884831
base_debate,hatespeech and toxicity,0.856644,0.9507,0.950405,0.011426,0.003972,0.004018,0.845218,0.868071,0.946728,0.954671,0.946387,0.954423


# Results

In [15]:
# add llama and sonnet to the list of prediction columns to score
# (presumably these two columns are already present in the loaded test CSV — confirm).
# Append only when missing so re-running this cell does not duplicate entries:
# duplicated names would produce duplicated index keys in the metrics and
# error tables, and the index merge would then multiply those rows.
for extra_col in ['llama', 'sonnet']:
    if extra_col not in columns:
        columns.append(extra_col)

In [25]:
%%time
# Calculate performance metrics with bootstrapped standard errors. n_bootstrap == 1000
overall = metrics_with_errors(test, columns, group_by = None)
task = metrics_with_errors(test, columns, group_by = 'task')
dataset = metrics_with_errors(test, columns, group_by = 'dataset')

CPU times: user 4min 17s, sys: 1.33 s, total: 4min 18s
Wall time: 4min 18s


In [None]:
# rename columns and concat results
# Move the metric index ('Column' and, when grouped, 'Task'/'Dataset') into
# regular columns. Guarded so the cell can be re-run safely: resetting an
# already-flat frame again would insert a spurious 'index' column that would
# end up in the combined table.
for frame in (overall, task, dataset):
    if 'Column' not in frame.columns:
        frame.reset_index(inplace = True)

# Give all three tables a common 'Task' column so they stack cleanly.
overall['Task'] = 'overall'
dataset.rename({'Dataset':'Task'}, axis = 1, inplace = True)

combined = pd.concat([overall, task, dataset])
combined.rename({'Column':'Model'}, axis = 1, inplace = True)

# clean up dataset names (literal prefix removal, not a regex)
combined['Task'] = combined['Task'].str.replace('mlburnham/', '', regex = False)

In [38]:
# Write the combined results table to disk without the row index.
combined.to_csv(path_or_buf = '../data/results_matrix.csv', index = False)