In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import pipeline
from sklearn.metrics import matthews_corrcoef, accuracy_score, f1_score
from sklearn.utils import resample
from datasets import load_dataset, DatasetDict, Dataset
from tqdm import tqdm
import random

In [2]:
def metrics(df, preds, group_by=None):
    """
    Calculate MCC, Accuracy, F1 for predictions.

    Args:
        df (pd.DataFrame): The input DataFrame containing true and predicted labels.
        preds (list): List of column names containing model predictions.
        group_by (str, optional): Column name to group by ('dataset' or 'task'). Defaults to None.

    Returns:
        pd.DataFrame: DataFrame with calculated metrics, optionally grouped by `group_by`.
    """
    true_col = 'entailment'
    
    def get_metrics(y_true, y_pred):
        return {
            'MCC': matthews_corrcoef(y_true, y_pred),
            'Accuracy': accuracy_score(y_true, y_pred),
            'F1': f1_score(y_true, y_pred, average='macro')
        }
    
    results = []
    
    if group_by not in ['dataset', 'task']:
        for col in preds:
            metrics = get_metrics(df[true_col], df[col])
            metrics['Column'] = col
            results.append(metrics)
    else:
        for col in preds:
            for group_name, group in df.groupby(group_by):
                metrics = get_metrics(group[true_col], group[col])
                metrics['Column'] = col
                metrics[group_by.capitalize()] = group_name
                results.append(metrics)
    
    results_df = pd.DataFrame(results)
    
    if group_by in ['dataset', 'task']:
        return results_df.set_index(['Column', group_by.capitalize()])
    else:
        return results_df.set_index('Column')

def bootstrapped_errors(y_true, y_pred, n_bootstrap=1000):
    """
    Calculate bootstrapped standard errors for MCC, Accuracy, and F1.

    Args:
        y_true (array-like): True labels.
        y_pred (array-like): Predicted labels.
        n_bootstrap (int, optional): Number of bootstrap samples. Defaults to 1000.

    Returns:
        dict: Standard errors for MCC, Accuracy, and F1.
    """
    mcc_scores = []
    accuracy_scores = []
    f1_scores = []
    
    for _ in range(n_bootstrap):
        # Resample with replacement
        y_true_resampled, y_pred_resampled = resample(y_true, y_pred)
        
        # Calculate metrics for the resampled data
        mcc_scores.append(matthews_corrcoef(y_true_resampled, y_pred_resampled))
        accuracy_scores.append(accuracy_score(y_true_resampled, y_pred_resampled))
        f1_scores.append(f1_score(y_true_resampled, y_pred_resampled, average='weighted'))
    
    # Calculate standard errors
    return {
        'MCC_SE': np.std(mcc_scores),
        'Accuracy_SE': np.std(accuracy_scores),
        'F1_SE': np.std(f1_scores)
    }

def metrics_with_errors(df, preds, n_bootstrap=1000, group_by=None):
    """
    Calculate metrics and bootstrapped standard errors for predictions, optionally grouped.

    Args:
        df (pd.DataFrame): The input DataFrame containing true and predicted labels.
        preds (list): List of column names containing model predictions.
        n_bootstrap (int, optional): Number of bootstrap samples. Defaults to 1000.
        group_by (str, optional): Column name to group by ('dataset' or 'task'). Defaults to None.

    Returns:
        pd.DataFrame: Combined DataFrame of metrics, standard errors, and confidence intervals.
    """
    # Step 1: Calculate metrics for each model
    metrics_df = metrics(df, preds, group_by=group_by)

    # Step 2: Calculate bootstrapped errors for each model or group
    errors = []
    if group_by not in ['dataset', 'task']:
        for col in preds:
            y_true = df['entailment']
            y_pred = df[col]
            errors_dict = bootstrapped_errors(y_true, y_pred, n_bootstrap=n_bootstrap)
            errors_dict['Column'] = col
            errors.append(errors_dict)
    else:
        for col in preds:
            for group_name, group in df.groupby(group_by):
                y_true = group['entailment']
                y_pred = group[col]
                errors_dict = bootstrapped_errors(y_true, y_pred, n_bootstrap=n_bootstrap)
                errors_dict['Column'] = col
                errors_dict[group_by.capitalize()] = group_name
                errors.append(errors_dict)

    errors_df = pd.DataFrame(errors)

    if group_by in ['dataset', 'task']:
        errors_df = errors_df.set_index(['Column', group_by.capitalize()])
    else:
        errors_df = errors_df.set_index('Column')

    # Step 3: Merge metrics and errors DataFrames
    combined_df = metrics_df.merge(errors_df, left_index=True, right_index=True)

    # Step 4: Calculate confidence intervals (upper and lower bounds)
    combined_df['MCC_Lower'] = combined_df['MCC'] - combined_df['MCC_SE']
    combined_df['MCC_Upper'] = combined_df['MCC'] + combined_df['MCC_SE']

    combined_df['Accuracy_Lower'] = combined_df['Accuracy'] - combined_df['Accuracy_SE']
    combined_df['Accuracy_Upper'] = combined_df['Accuracy'] + combined_df['Accuracy_SE']

    combined_df['F1_Lower'] = combined_df['F1'] - combined_df['F1_SE']
    combined_df['F1_Upper'] = combined_df['F1'] + combined_df['F1_SE']

    return combined_df

def label_docs(model, docs_dict, batch_size = 8, device = 'cuda'):
    """
    Passes documents through the pipeline. Returns a list of entail, not_entail labels
    """
    pipe = pipeline(task = 'text-classification', model = model, 
                    batch_size = batch_size, device = device, 
                    max_length = 512, truncation = True, 
                    torch_dtype = torch.bfloat16)
    res = pipe(docs_dict)
    res = [result['label'] for result in res]
    return res

# UKP Stance

In [6]:
ukp = pd.read_csv('../data/ukp_stance.csv')

In [8]:
docs_dict = [{'text':ukp.loc[i, 'text'], 'text_pair':ukp.loc[i, 'hypothesis']} for i in ukp.index]

# models that will be tested
models = ["MoritzLaurer/deberta-v3-base-zeroshot-v2.0", 
          "MoritzLaurer/deberta-v3-large-zeroshot-v2.0",
          "mlburnham/Political_DEBATE_DeBERTa_base_v1.1",
          "mlburnham/Political_DEBATE_DeBERTa_large_v1.1",
          "mlburnham/Political_DEBATE_ModernBERT_base_v1.0",
          "mlburnham/Political_DEBATE_ModernBERT_large_v1.0"]

# column names that will hold results
columns = ['base_nli',
           'large_nli',
           'base_debate',
           'large_debate',
           'base_modern',
           'large_modern']

In [11]:
%%time
# for each model, classify documents and return labels to the test dataframe
for modname, col in zip(models, columns):
    res = label_docs(modname, docs_dict, batch_size = 8, device = 'cuda')
    ukp[col] = res
    ukp[col] = ukp[col].replace({'entailment': 0, 'not_entailment': 1})
    print(modname + ' complete.')

Device set to use cuda


MoritzLaurer/deberta-v3-base-zeroshot-v2.0 complete.


Device set to use cuda


MoritzLaurer/deberta-v3-large-zeroshot-v2.0 complete.


Device set to use cuda


mlburnham/Political_DEBATE_DeBERTa_base_v1.1 complete.


Device set to use cuda


mlburnham/Political_DEBATE_DeBERTa_large_v1.1 complete.


You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Device set to use cuda


mlburnham/Political_DEBATE_ModernBERT_base_v1.0 complete.


Device set to use cuda


mlburnham/Political_DEBATE_ModernBERT_large_v1.0 complete.
CPU times: user 52.8 s, sys: 10.3 s, total: 1min 3s
Wall time: 55.6 s




In [12]:
%%time
# Calculate performance metrics with bootstrapped standard errors. n_bootstrap == 1000
overall = metrics_with_errors(ukp, columns, group_by = None)
overall

CPU times: user 9.66 s, sys: 0 ns, total: 9.66 s
Wall time: 7.9 s


Unnamed: 0_level_0,MCC,Accuracy,F1,MCC_SE,Accuracy_SE,F1_SE,MCC_Lower,MCC_Upper,Accuracy_Lower,Accuracy_Upper,F1_Lower,F1_Upper
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
base_nli,0.598784,0.828659,0.798812,0.012397,0.005373,0.005465,0.586387,0.611181,0.823286,0.834032,0.793347,0.804277
large_nli,0.720828,0.880488,0.850004,0.010343,0.004645,0.00508,0.710485,0.731171,0.875842,0.885133,0.844924,0.855083
base_debate,0.771101,0.896951,0.884403,0.009471,0.004323,0.004237,0.76163,0.780572,0.892628,0.901274,0.880166,0.88864
large_debate,0.854828,0.934146,0.926161,0.007594,0.003516,0.003433,0.847234,0.862422,0.930631,0.937662,0.922728,0.929593
base_modern,0.682965,0.858537,0.840708,0.010909,0.004877,0.004791,0.672056,0.693874,0.85366,0.863414,0.835916,0.845499
large_modern,0.826447,0.922561,0.912572,0.008298,0.003701,0.003638,0.818149,0.834745,0.91886,0.926262,0.908934,0.916211


In [14]:
ukp.to_csv('../data/ukp_stance.csv', index = False)

# UKP Topic

In [15]:
topic = pd.read_csv('../data/ukp_topic.csv')

In [17]:
docs_dict = [{'text':topic.loc[i, 'text'], 'text_pair':topic.loc[i, 'hypothesis']} for i in topic.index]

In [18]:
# models that will be tested
models = ["MoritzLaurer/deberta-v3-base-zeroshot-v2.0", 
          "MoritzLaurer/deberta-v3-large-zeroshot-v2.0",
          "mlburnham/Political_DEBATE_DeBERTa_base_v1.1",
          "mlburnham/Political_DEBATE_DeBERTa_large_v1.1",
          "mlburnham/Political_DEBATE_ModernBERT_base_v1.0",
          "mlburnham/Political_DEBATE_ModernBERT_large_v1.0"]

# column names that will hold results
columns = ['base_nli',
           'large_nli',
           'base_debate',
           'large_debate',
           'base_modern',
           'large_modern']

In [19]:
%%time
# for each model, classify documents and return labels to the test dataframe
for modname, col in zip(models, columns):
    res = label_docs(modname, docs_dict, batch_size = 8, device = 'cuda')
    topic[col] = res
    topic[col] = topic[col].replace({'entailment': 0, 'not_entailment': 1})
    print(modname + ' complete.')

Device set to use cuda


MoritzLaurer/deberta-v3-base-zeroshot-v2.0 complete.


Device set to use cuda


MoritzLaurer/deberta-v3-large-zeroshot-v2.0 complete.


Device set to use cuda


mlburnham/Political_DEBATE_DeBERTa_base_v1.1 complete.


Device set to use cuda
Device set to use cuda


mlburnham/Political_DEBATE_DeBERTa_large_v1.1 complete.




mlburnham/Political_DEBATE_ModernBERT_base_v1.0 complete.


Device set to use cuda


mlburnham/Political_DEBATE_ModernBERT_large_v1.0 complete.
CPU times: user 1min 21s, sys: 9.61 s, total: 1min 31s
Wall time: 1min 24s




In [20]:
%%time
# Calculate performance metrics with bootstrapped standard errors.
overall = metrics_with_errors(topic, columns, group_by = None)
overall

CPU times: user 11.8 s, sys: 0 ns, total: 11.8 s
Wall time: 10.4 s


Unnamed: 0_level_0,MCC,Accuracy,F1,MCC_SE,Accuracy_SE,F1_SE,MCC_Lower,MCC_Upper,Accuracy_Lower,Accuracy_Upper,F1_Lower,F1_Upper
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
base_nli,0.834676,0.911077,0.91041,0.004835,0.002816,0.002857,0.829841,0.83951,0.908261,0.913893,0.907554,0.913267
large_nli,0.838045,0.912602,0.911932,0.004617,0.002718,0.002754,0.833428,0.842662,0.909884,0.915319,0.909178,0.914686
base_debate,0.903384,0.951626,0.951622,0.004298,0.002154,0.002154,0.899085,0.907682,0.949473,0.95378,0.949468,0.953777
large_debate,0.933844,0.966463,0.966447,0.003466,0.00178,0.001782,0.930378,0.93731,0.964683,0.968243,0.964665,0.968229
base_modern,0.894691,0.946646,0.946605,0.004396,0.002254,0.002258,0.890295,0.899087,0.944392,0.948901,0.944347,0.948862
large_modern,0.922943,0.960976,0.960955,0.003856,0.001976,0.001977,0.919087,0.926799,0.959,0.962951,0.958978,0.962932


In [22]:
topic.to_csv('../data/ukp_topic.csv', index = False)

# RAND Terrorism

In [24]:
rand = pd.read_csv('../data/rand_terror.csv')

# models that will be tested
models = ["MoritzLaurer/deberta-v3-base-zeroshot-v2.0", 
          "MoritzLaurer/deberta-v3-large-zeroshot-v2.0",
          "mlburnham/Political_DEBATE_DeBERTa_base_v1.1",
          "mlburnham/Political_DEBATE_DeBERTa_large_v1.1",
          "mlburnham/Political_DEBATE_ModernBERT_base_v1.0",
          "mlburnham/Political_DEBATE_ModernBERT_large_v1.0"]

# column names that will hold results
columns = ['base_nli',
           'large_nli',
           'base_debate',
           'large_debate',
           'base_modern',
           'large_modern']

In [27]:
for modname, col in zip(models, columns):
    pipe = pipeline(task = 'zero-shot-classification', model = modname, 
                        batch_size = 16, device = 'cuda', 
                        max_length = 512, truncation = True)
    
    labels = list(rand['hypothesis'].unique())
    
    res = pipe(list(rand['premise']), candidate_labels = labels, template = {})
    res = [result['labels'][0] for result in res]
    rand[col] = res
    print(modname + ' complete.')

Device set to use cuda


MoritzLaurer/deberta-v3-base-zeroshot-v2.0 complete.


Device set to use cuda
Device set to use cuda


MoritzLaurer/deberta-v3-large-zeroshot-v2.0 complete.


Device set to use cuda


mlburnham/Political_DEBATE_DeBERTa_base_v1.1 complete.


You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in ModernBertForSequenceClassification is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
Device set to use cuda


mlburnham/Political_DEBATE_DeBERTa_large_v1.1 complete.


Device set to use cuda


mlburnham/Political_DEBATE_ModernBERT_base_v1.0 complete.
mlburnham/Political_DEBATE_ModernBERT_large_v1.0 complete.


  rand[columns].replace({'This text describes an explosives attack.': 1,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rand[columns].replace({'This text describes an explosives attack.': 1,


In [32]:
for col in columns:
    rand[col].replace({'This text describes an explosives attack.': 1,
       'This text describes a firearms attack.': 2,
       'This text describes an arson attack.': 3,
       'This text describes a knife or sharp object attack.': 4,
       'This text describes a biological agent attack.': 5,
       'This text describes a chemical agent attack.': 6}, inplace = True)

  rand[col].replace({'This text describes an explosives attack.': 1,
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  rand[col].replace({'This text describes an explosives attack.': 1,


In [35]:
overall = metrics_with_errors(rand, columns, group_by = None)
overall

Unnamed: 0_level_0,MCC,Accuracy,F1,MCC_SE,Accuracy_SE,F1_SE,MCC_Lower,MCC_Upper,Accuracy_Lower,Accuracy_Upper,F1_Lower,F1_Upper
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
base_nli,0.842392,0.915979,0.858181,0.011531,0.006337,0.006656,0.83086,0.853923,0.909642,0.922316,0.851524,0.864837
large_nli,0.936551,0.96701,0.915826,0.007486,0.003923,0.004038,0.929066,0.944037,0.963087,0.970933,0.911789,0.919864
base_debate,0.919739,0.958247,0.905951,0.008338,0.004496,0.005017,0.911401,0.928076,0.953752,0.962743,0.900935,0.910968
large_debate,0.94004,0.968041,0.939939,0.007238,0.003913,0.003881,0.932802,0.947277,0.964128,0.971954,0.936059,0.94382
base_modern,0.727122,0.847938,0.634377,0.012973,0.007935,0.007155,0.714149,0.740095,0.840003,0.855873,0.627222,0.641532
large_modern,0.866921,0.929897,0.833755,0.010945,0.005879,0.005995,0.855975,0.877866,0.924017,0.935776,0.82776,0.83975


In [36]:
rand.to_csv('../data/rand_terror.csv', index = False)