In [1]:
import pandas as pd
from transformers import pipeline
from sklearn.metrics import matthews_corrcoef, accuracy_score, f1_score
from datasets import load_dataset
import torch

In [2]:
# Load the PolNLI dataset and keep its training split as a DataFrame.
ds = load_dataset('mlburnham/PolNLI')
train = ds['train'].to_pandas()
# The evaluation frame is read from a local CSV rather than taken from `ds`.
test = pd.read_csv('./data/polnli_test_results.csv')

# Build one {'text', 'text_pair'} record per test row to pass through the pipeline.
# Zipping the two columns walks the rows in order without a scalar `.loc`
# lookup per cell, and does not depend on the index labels being unique.
docs_dict = [{'text': premise, 'text_pair': hypothesis}
             for premise, hypothesis in zip(test['premise'], test['augmented_hypothesis'])]

Using the latest cached version of the dataset since mlburnham/PolNLI couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/mike/.cache/huggingface/datasets/mlburnham___pol_nli/default/0.0.0/062724c0a0e601ffc5fba98a5546cc2237a766fa (last modified on Tue Jan  7 00:25:02 2025).


In [3]:
def metrics(df, preds, group_by=None):
    """
    Score prediction columns against the 'entailment' column of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'entailment' column holding the true labels, every
        column named in `preds`, and the `group_by` column when grouping is
        requested.
    preds : iterable of str
        Names of the prediction columns to score.
    group_by : {'dataset', 'task'} or None
        Column to break the scores down by. Any other value (including None)
        produces one overall row per prediction column.

    Returns
    -------
    pandas.DataFrame
        Columns 'MCC', 'Accuracy' and 'F1' (weighted average). Indexed by
        'Column', or by ('Column', 'Dataset') / ('Column', 'Task') when
        grouped. An empty `preds` yields an empty frame with the same columns
        and index names instead of raising.
    """
    true_col = 'entailment'
    grouped = group_by in ('dataset', 'task')

    def get_metrics(y_true, y_pred):
        return {
            'MCC': matthews_corrcoef(y_true, y_pred),
            'Accuracy': accuracy_score(y_true, y_pred),
            'F1': f1_score(y_true, y_pred, average='weighted')
        }

    index_cols = ['Column', group_by.capitalize()] if grouped else ['Column']

    # Named `rows`/`row` so the locals do not shadow this function's own name.
    rows = []

    if grouped:
        # Split the frame once; the groups are the same for every prediction column.
        groups = list(df.groupby(group_by))
        for col in preds:
            for group_name, group in groups:
                row = get_metrics(group[true_col], group[col])
                row['Column'] = col
                row[index_cols[1]] = group_name
                rows.append(row)
    else:
        for col in preds:
            row = get_metrics(df[true_col], df[col])
            row['Column'] = col
            rows.append(row)

    # Declaring the columns up front keeps `set_index` valid when `rows` is empty.
    results_df = pd.DataFrame(rows, columns=['MCC', 'Accuracy', 'F1'] + index_cols)

    return results_df.set_index(index_cols if grouped else 'Column')

def label_docs(model, docs_dict, batch_size = 32, device = 'cuda'):
    """
    Classify documents with a text-classification pipeline built for `model`.

    The pipeline is created with `torch_dtype = torch.bfloat16`,
    `max_length = 512` and truncation enabled. `docs_dict` is handed to the
    pipeline unchanged. Returns a list holding the 'label' entry of each
    pipeline result.
    """
    classifier = pipeline(
        task = 'text-classification',
        model = model,
        batch_size = batch_size,
        device = device,
        max_length = 512,
        truncation = True,
        torch_dtype = torch.bfloat16,
    )
    labels = []
    for prediction in classifier(docs_dict):
        labels.append(prediction['label'])
    return labels

# Benchmark

In [4]:
# Each pair is (model that will be tested, column name that will hold its results).
# Unzipping the pairs yields the two parallel lists `models` and `columns`.
models, columns = map(list, zip(*[
    ("MoritzLaurer/deberta-v3-base-zeroshot-v2.0", 'base_nli'),
    ("MoritzLaurer/deberta-v3-large-zeroshot-v2.0", 'large_nli'),
    ("training_base/checkpoint-96354", 'base_debate'),
    ("mlburnham/Political_DEBATE_large_v1.0", 'large_debate'),
    ("training_ModernBase/checkpoint-96354", 'base_modern'),
    ("training_ModernLarge/checkpoint-74935", 'large_modern'),
]))

In [7]:
%%time
# For each model, classify the documents and store integer labels in the test dataframe.
# Encoding of the prediction columns: 0 = entailment, 1 = not_entailment.
label_to_id = {'entailment': 0, 'not_entailment': 1}

for modname, col in zip(models, columns):
    res = label_docs(modname, docs_dict)
    # `Series.map` yields numeric values directly. `Series.replace` on a string
    # column would rely on pandas' deprecated implicit downcasting to get integers.
    encoded = pd.Series(res, index=test.index).map(label_to_id)
    # A label outside the mapping becomes NaN; stop here rather than let a
    # partially encoded column reach the metrics.
    if encoded.isna().any():
        unknown = {label for label in res if label not in label_to_id}
        raise ValueError(f"{modname} returned labels outside {list(label_to_id)}: {unknown}")
    test[col] = encoded.astype('int64')
    print(modname + ' complete.')

Device set to use cuda


MoritzLaurer/deberta-v3-base-zeroshot-v2.0 complete.


Device set to use cuda
Device set to use cuda


MoritzLaurer/deberta-v3-large-zeroshot-v2.0 complete.




training_base/checkpoint-96354 complete.


Device set to use cuda


mlburnham/Political_DEBATE_large_v1.0 complete.


You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Device set to use cuda
Device set to use cuda


training_ModernBase/checkpoint-96354 complete.
training_ModernLarge/checkpoint-74935 complete.
CPU times: user 3min 10s, sys: 5.22 s, total: 3min 16s
Wall time: 3min 11s




# Results

In [8]:
# Overall scores: one row per prediction column, no grouping.
metrics(test, columns)

Unnamed: 0_level_0,MCC,Accuracy,F1
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
base_nli,0.657027,0.834375,0.830335
large_nli,0.7188,0.863074,0.859911
base_debate,0.892088,0.947872,0.94767
large_debate,0.915911,0.959326,0.95918
base_modern,0.877708,0.940843,0.940859
large_modern,0.913069,0.958024,0.957987


In [9]:
# Scores for each prediction column, broken down by the 'task' column.
metrics(test, columns, group_by='task')

Unnamed: 0_level_0,Unnamed: 1_level_0,MCC,Accuracy,F1
Column,Task,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
base_nli,event extraction,0.538591,0.753841,0.753169
base_nli,hatespeech and toxicity,0.550569,0.858095,0.845361
base_nli,stance detection,0.530711,0.775285,0.770202
base_nli,topic classification,0.871001,0.935212,0.93455
large_nli,event extraction,0.723042,0.852304,0.852599
large_nli,hatespeech and toxicity,0.553551,0.85443,0.848143
large_nli,stance detection,0.585007,0.797717,0.789429
large_nli,topic classification,0.8964,0.948081,0.947663
base_debate,event extraction,0.765923,0.878492,0.878934
base_debate,hatespeech and toxicity,0.856644,0.9507,0.950405


In [10]:
# Scores for each prediction column, broken down by the 'dataset' column.
metrics(test, columns, group_by='dataset')

Unnamed: 0_level_0,Unnamed: 1_level_0,MCC,Accuracy,F1
Column,Dataset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
base_nli,mlburnham/PoliStance_Affect,0.496475,0.763473,0.759936
base_nli,mlburnham/PoliStance_Affect_QT,0.043096,0.563050,0.555894
base_nli,mlburnham/acled_event_entailment,0.503271,0.707371,0.689879
base_nli,mlburnham/argument_quality_ranking_entailment,0.760144,0.872139,0.869139
base_nli,mlburnham/bill_summary_entailment,0.877433,0.936158,0.936137
...,...,...,...,...
large_modern,mlburnham/ibm_claimstance_topic_entailment,0.966931,0.985258,0.985285
large_modern,mlburnham/polistance_issue_tweets,0.697486,0.973684,0.977890
large_modern,mlburnham/scad_event_entailment,0.784786,0.896750,0.897376
large_modern,mlburnham/targeted_hatespeech_entailment,0.697502,0.962751,0.962563


In [13]:
# Write the test frame, including the prediction columns, to CSV without the index.
test.to_csv(path_or_buf='data/polnli_test_results.csv', index=False)