In [24]:
import pandas as pd
from transformers import pipeline
from sklearn.metrics import matthews_corrcoef, accuracy_score, f1_score
from datasets import load_dataset

In [25]:
ds = load_dataset('mlburnham/PolNLI')
train = ds['train'].to_pandas()
test = pd.read_csv('../data/polnli_test_results.csv')

Using the latest cached version of the dataset since mlburnham/PolNLI couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at C:\Users\mikeb\.cache\huggingface\datasets\mlburnham___pol_nli\default\0.0.0\3d563e282efafcfed852d99d41e6a3e578a77550 (last modified on Tue Aug  6 23:43:00 2024).


In [11]:
def metrics(df, preds, group_by=None):
    true_col = 'entailment'
    
    def get_metrics(y_true, y_pred):
        return {
            'MCC': matthews_corrcoef(y_true, y_pred),
            'Accuracy': accuracy_score(y_true, y_pred),
            'F1': f1_score(y_true, y_pred, average='weighted')
        }
    
    results = []
    
    if group_by not in ['dataset', 'task']:
        for col in preds:
            metrics = get_metrics(df[true_col], df[col])
            metrics['Column'] = col
            results.append(metrics)
    else:
        for col in preds:
            for group_name, group in df.groupby(group_by):
                metrics = get_metrics(group[true_col], group[col])
                metrics['Column'] = col
                metrics[group_by.capitalize()] = group_name
                results.append(metrics)
    
    results_df = pd.DataFrame(results)
    
    if group_by in ['dataset', 'task']:
        return results_df.set_index(['Column', group_by.capitalize()])
    else:
        return results_df.set_index('Column')

# Base Generic NLI

In [4]:
model = "MoritzLaurer/deberta-v3-base-zeroshot-v2.0"
pipe = pipeline("zero-shot-classification", model = model, device = 0, batch_size = 32)

In [5]:
%%time
colname = 'base_nli'
test[colname] = 0

for i in test.index:
    hypothesis = test.loc[i, 'augmented_hypothesis']
    sample = test.loc[i, 'premise']
    res = pipe(sample, hypothesis, hypothesis_template = '{}', multi_label = False)
    test.loc[i, colname] = round(res['scores'][0])
test[colname].replace({0:1, 1:0}, inplace = True)

test[colname] = test[colname].astype(int)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


CPU times: total: 4min 9s
Wall time: 4min 37s


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




# Large Generic NLI

In [6]:
model = "MoritzLaurer/deberta-v3-large-zeroshot-v2.0"
pipe = pipeline("zero-shot-classification", model = model, device = 0, batch_size = 32)

In [7]:
%%time
colname = 'large_nli'
test[colname] = 0

for i in test.index:
    hypothesis = test.loc[i, 'augmented_hypothesis']
    sample = test.loc[i, 'premise']
    res = pipe(sample, hypothesis, hypothesis_template = '{}', multi_label = False)
    test.loc[i, colname] = round(res['scores'][0])
test[colname].replace({0:1, 1:0}, inplace = True)

test[colname] = test[colname].astype(int)

CPU times: total: 7min 42s
Wall time: 8min 24s


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




# Base PolNLI

In [8]:
model = "training_base/best_checkpoint_augmented"
pipe = pipeline("zero-shot-classification", model = model, device = 0, batch_size = 32)

In [9]:
%%time
colname = 'base_polnli'
test[colname] = 0

for i in test.index:
    hypothesis = test.loc[i, 'augmented_hypothesis']
    sample = test.loc[i, 'premise']
    res = pipe(sample, hypothesis, hypothesis_template = '{}', multi_label = False)
    test.loc[i, colname] = round(res['scores'][0])
test[colname].replace({0:1, 1:0}, inplace = True)

test[colname] = test[colname].astype(int)

CPU times: total: 4min 5s
Wall time: 4min 28s


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




# Large Pol NLI

In [4]:
model = "training_large/checkpoint-128462"
pipe = pipeline("zero-shot-classification", model = model, device = 0, batch_size = 64)

In [5]:
%%time
colname = 'large_polnli2'
test[colname] = 0

for i in test.index:
    hypothesis = test.loc[i, 'augmented_hypothesis']
    sample = test.loc[i, 'premise']
    res = pipe(sample, hypothesis, hypothesis_template = '{}', multi_label = False)
    test.loc[i, colname] = round(res['scores'][0])
test[colname].replace({0:1, 1:0}, inplace = True)

test[colname] = test[colname].astype(int)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


CPU times: total: 7min 43s
Wall time: 8min 23s


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




In [6]:
models = ['base_nli', 'large_nli', 'base_polnli', 'large_polnli', 'large_polnli2', 'llama']
metrics(test, preds = models, group_by = None)

Unnamed: 0_level_0,MCC,Accuracy,F1
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
base_nli,0.658454,0.834635,0.830205
large_nli,0.731021,0.869062,0.866256
base_polnli,0.894269,0.948978,0.948852
large_polnli,0.915911,0.959326,0.95918
large_polnli2,0.907246,0.955226,0.955123
llama,0.730997,0.862358,0.863467


In [7]:
metrics(test, preds = models, group_by = 'task')

Unnamed: 0_level_0,Unnamed: 1_level_0,MCC,Accuracy,F1
Column,Task,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
base_nli,event extraction,0.528918,0.746858,0.74571
base_nli,hatespeech and toxicity,0.494319,0.844437,0.822392
base_nli,stance detection,0.553824,0.786101,0.781703
base_nli,topic classification,0.875762,0.937653,0.937047
large_nli,event extraction,0.718805,0.850209,0.850508
large_nli,hatespeech and toxicity,0.571824,0.861426,0.854152
large_nli,stance detection,0.612922,0.811736,0.80547
large_nli,topic classification,0.899292,0.949634,0.949257
base_polnli,event extraction,0.813742,0.906774,0.907042
base_polnli,hatespeech and toxicity,0.84141,0.946036,0.945248


In [86]:
metrics(test, preds = models, group_by = 'dataset').tail(50)

Unnamed: 0_level_0,Unnamed: 1_level_0,MCC,Accuracy,F1
Column,Dataset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
base_polnli,mlburnham/acled_event_entailment,0.867357,0.932893,0.932942
base_polnli,mlburnham/argument_quality_ranking_entailment,0.9603,0.980268,0.980265
base_polnli,mlburnham/bill_summary_entailment,0.907231,0.953569,0.953587
base_polnli,mlburnham/dehumanizing_hatespeech_entailment,0.816036,0.909804,0.910597
base_polnli,mlburnham/dem_rep_party_platform_topics,0.938602,0.970972,0.970993
base_polnli,mlburnham/ibm_claimstance_entailment,0.974825,0.98827,0.98827
base_polnli,mlburnham/ibm_claimstance_topic_entailment,0.923211,0.965602,0.965726
base_polnli,mlburnham/polistance_issue_tweets,1.0,1.0,1.0
base_polnli,mlburnham/scad_event_entailment,0.712124,0.861377,0.862392
base_polnli,mlburnham/targeted_hatespeech_entailment,0.65727,0.961318,0.958885


In [95]:
test.to_csv('../data/polnli_test_results.csv', index = False)