In [1]:
import pandas as pd
from scipy.stats import bootstrap
from sklearn.metrics import matthews_corrcoef, accuracy_score

def mcc_function(sample1, sample2):
    """Return the Matthews correlation coefficient between two label samples.

    Used as the statistic callable for ``scipy.stats.bootstrap``: ``sample1``
    is forwarded as the first argument of ``matthews_corrcoef`` and
    ``sample2`` as the second.
    """
    return matthews_corrcoef(sample1, sample2)

def acc_function(sample1, sample2):
    """Return the accuracy score between two label samples.

    ``sample1`` is forwarded as the first argument of ``accuracy_score`` and
    ``sample2`` as the second, so the function can be handed to
    ``scipy.stats.bootstrap`` as a two-sample statistic.
    """
    return accuracy_score(sample1, sample2)

# Seed handed to every scipy.stats.bootstrap call in this notebook
# (via its `random_state` argument) so the resampled confidence
# intervals can be reproduced from run to run.
random_state = 1

In [2]:
# Load the test-set results. The cells below use the `entailment` column as the
# reference label, `task` to split the data, and the columns from `base_nli`
# onward as per-model predictions.
df = pd.read_csv("polnli_test_results.csv")

In [4]:
# Display the loaded frame to check its columns and row count.
df

Unnamed: 0,premise,hypothesis,entailment,dataset,task,augmented_hypothesis,base_nli,large_nli,base_polnli,llama,large_polnli,sonnet
0,Transport workers strike to protest rising fue...,The event described in this text is a strike.,0,mlburnham/scad_event_entailment,event extraction,the incident mentioned in this text is a strike.,0,0,0,0,0,0
1,Municipal workers strike over pay.,The event described in this text is a strike.,0,mlburnham/scad_event_entailment,event extraction,the occurrence detailed in this passage is a s...,0,0,0,0,0,0
2,Niger's mining sector strikes.,The event described in this text is a strike.,0,mlburnham/scad_event_entailment,event extraction,the event described in this text is a strike.,0,0,0,0,0,0
3,Separatist movement protests detention of lead...,The event described in this text is a strike.,0,mlburnham/scad_event_entailment,event extraction,the event described in this text is a strike.,0,0,0,0,0,0
4,Janitors and hospital support staff staged a s...,The event described in this text is a strike.,0,mlburnham/scad_event_entailment,event extraction,the occurrence detailed in this passage is a s...,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
15361,The limited trickle of items into Gaza force t...,This text is about the use of affirmative action.,1,mlburnham/ibm_claimstance_topic_entailment,topic classification,this text is about the use of affirmative action.,1,1,1,1,1,1
15362,The Israeli blockade is legal,This text is about the use of affirmative action.,1,mlburnham/ibm_claimstance_topic_entailment,topic classification,this text is about the use of affirmative action.,1,1,1,1,1,1
15363,The naval blockade's implementation complied w...,This text is about the use of affirmative action.,1,mlburnham/ibm_claimstance_topic_entailment,topic classification,this text discusses the use of affirmative act...,1,1,1,1,1,1
15364,an open seaport in the Gaza Strip endangered t...,This text is about the use of affirmative action.,1,mlburnham/ibm_claimstance_topic_entailment,topic classification,this text is about the use of affirmative action.,1,1,1,1,1,1


# MCC

In [None]:
# Overall MCC for every model, with a 95% BCa bootstrap confidence interval.
#
# Each prediction column is paired explicitly with its display name. Keeping
# the pairing in one mapping means labels and columns cannot drift apart,
# which a positional slice such as df.columns[6:] matched against a separate
# list of names cannot guarantee (an extra or reordered column would silently
# mislabel the results).
model_labels = {
    'base_nli': 'DeBERTa Base',
    'large_nli': 'DeBERTa Large',
    'base_polnli': 'PolNLI Base',
    'llama': 'Llama 3.1 8B',
    'large_polnli': 'PolNLI Large',
    'sonnet': 'Claude 3.5 Sonnet',
}

# Fail early, with a clear message, if the CSV schema is not what we expect.
missing_columns = [column for column in model_labels if column not in df.columns]
if missing_columns:
    raise KeyError(f'prediction columns missing from df: {missing_columns}')

# These names are kept because the next cell builds `res` from them.
model_columns = pd.Index(list(model_labels))
models = list(model_labels.values())
mcc = []
lower = []
upper = []

labels = df['entailment']

for model in model_columns:
    preds = df[model]
    # paired=True resamples label/prediction pairs together, row by row.
    data = (labels, preds)
    bootstrap_ci = bootstrap(data, mcc_function,
                             confidence_level=0.95,
                             vectorized=False,
                             paired=True,
                             random_state=random_state,
                             method='BCa')

    # Point estimate on the full sample, plus the interval bounds.
    mcc.append(mcc_function(labels, preds))
    lower.append(bootstrap_ci.confidence_interval[0])
    upper.append(bootstrap_ci.confidence_interval[1])
    print('Bootstrapped ' + model)

In [24]:
# Assemble the all-task results table from the lists filled by the loop above
# and tag every row with task == 'overall'.
res = pd.DataFrame(
    {'model': models, 'mcc': mcc, 'lower': lower, 'upper': upper}
).assign(task='overall')

In [28]:
# Write the results table to disk without the index column.
# NOTE(review): the execution counts show this cell was run (28) after the
# per-task cell below (26). On a top-to-bottom run `res` holds only the
# 'overall' rows at this point, so the file written here does not contain
# per-task results unless it is written again after the per-task cell.
res.to_csv('mcc_matrix.csv', index = False)

In [26]:
# Per-task MCC for every model, with a 95% BCa bootstrap confidence interval.
#
# Each prediction column is paired explicitly with its display name so labels
# and columns cannot drift apart (a positional slice such as df.columns[6:]
# matched against a separate list of names would silently mislabel results if
# a column were added or reordered).
model_labels = {
    'base_nli': 'DeBERTa Base',
    'large_nli': 'DeBERTa Large',
    'base_polnli': 'PolNLI Base',
    'llama': 'Llama 3.1 8B',
    'large_polnli': 'PolNLI Large',
    'sonnet': 'Claude 3.5 Sonnet',
}

# Fail early, with a clear message, if the CSV schema is not what we expect.
missing_columns = [column for column in model_labels if column not in df.columns]
if missing_columns:
    raise KeyError(f'prediction columns missing from df: {missing_columns}')

model_columns = pd.Index(list(model_labels))
models = list(model_labels.values())

# One results frame per task; they are concatenated once after the loop rather
# than growing `res` inside it.
task_frames = []

for task in df['task'].unique():
    # Select the task's rows once instead of rebuilding the mask per model.
    task_rows = df.loc[df['task'] == task]
    labs = task_rows['entailment']

    mcc = []
    lower = []
    upper = []
    for model in model_columns:
        preds = task_rows[model]
        # paired=True resamples label/prediction pairs together, row by row.
        data = (labs, preds)
        bootstrap_ci = bootstrap(data, mcc_function,
                                 confidence_level=0.95,
                                 vectorized=False,
                                 paired=True,
                                 random_state=random_state,
                                 method='BCa')

        mcc.append(mcc_function(labs, preds))
        lower.append(bootstrap_ci.confidence_interval[0])
        upper.append(bootstrap_ci.confidence_interval[1])
        print('Bootstrapped ' + model)

    task_res = pd.DataFrame({'model': models, 'mcc': mcc, 'lower': lower, 'upper': upper})
    task_res['task'] = task
    task_frames.append(task_res)
    print('Finished ' + task)

# Rebuild `res` from its 'overall' rows plus the fresh per-task rows. Starting
# from the 'overall' subset makes this cell safe to re-run: a second execution
# replaces the per-task rows instead of appending duplicates.
res = pd.concat([res.loc[res['task'] == 'overall'], *task_frames])

# Save the complete table (overall + per-task) so a top-to-bottom run leaves
# mcc_matrix.csv with every row, regardless of where the earlier save cell sits.
res.to_csv('mcc_matrix.csv', index=False)

Bootstrapped base_nli
Bootstrapped large_nli
Bootstrapped base_polnli
Bootstrapped llama
Bootstrapped large_polnli
Bootstrapped sonnet
Finished event extraction
Bootstrapped base_nli
Bootstrapped large_nli
Bootstrapped base_polnli
Bootstrapped llama
Bootstrapped large_polnli
Bootstrapped sonnet
Finished hatespeech and toxicity
Bootstrapped base_nli
Bootstrapped large_nli
Bootstrapped base_polnli
Bootstrapped llama
Bootstrapped large_polnli
Bootstrapped sonnet
Finished stance detection
Bootstrapped base_nli
Bootstrapped large_nli
Bootstrapped base_polnli
Bootstrapped llama
Bootstrapped large_polnli
Bootstrapped sonnet
Finished topic classification


In [27]:
# Display the combined results table (overall rows followed by per-task rows).
res

Unnamed: 0,model,mcc,lower,upper,task
0,NLI Base,0.658454,0.646708,0.670529,overall
1,NLI Large,0.731021,0.720458,0.741759,overall
2,PolNLI Base,0.894269,0.886919,0.901347,overall
3,Llama 3.1 8B,0.730997,0.720419,0.741267,overall
4,PolNLI Large,0.915911,0.909458,0.922172,overall
5,Claude 3.5 Sonnet,0.815902,0.806819,0.824822,overall
0,DeBERTa Base,0.528918,0.499752,0.55723,event extraction
1,DeBERTa Large,0.718805,0.694819,0.741574,event extraction
2,PolNLI Base,0.813742,0.79203,0.833727,event extraction
3,Llama 3.1 8B,0.808244,0.785527,0.828796,event extraction


# Balanced Accuracy