In [6]:
import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score, f1_score, accuracy_score

In [2]:
train1 = pd.read_csv('../Input_Data/e-SNLI/dataset/esnli_train_1.csv')
train2 = pd.read_csv('../Input_Data/e-SNLI/dataset/esnli_train_2.csv')
train = pd.concat([train1, train2])
train = train[train.notnull().apply(all, axis=1)]
dev = pd.read_csv('../Input_Data/e-SNLI/dataset/esnli_dev.csv')
dev = dev[dev.notnull().apply(all, axis=1)]
test = pd.read_csv('../Input_Data/e-SNLI/dataset/esnli_test.csv')
test = test[test.notnull().apply(all, axis=1)]

dev_prepared = pd.read_csv('../02_Extract_Subphrases/prepared_data/subphrase_vectors_dev.csv', sep=';')
dev_prepared = dev_prepared.drop(columns='Unnamed: 0')
dev = dev.set_index('pairID')
rel_pairIDs = dev_prepared.iloc[:,0]
y_hat = dev.loc[rel_pairIDs].gold_label
dev_prepared = dev_prepared.iloc[:,1:].to_numpy()

In [3]:
BN_dev_small = pd.read_csv("../04_BN_Explanations/BN_explanations_small_model.csv", sep=";", index_col=0)
BN_dev_large = pd.read_csv("../04_BN_Explanations/BN_explanations_large_model.csv", sep=";", index_col=0)

In [4]:
dev = dev.loc[rel_pairIDs]
dev = dev.iloc[BN_dev_large.i]

In [46]:
temp = [None, ] * 10
for i in range(1, 11):
    temp[i - 1] = pd.read_csv("../01_GPT3_Explanations/prepared_data/GPT3_explanations" + str(i) + ".csv", sep=";")
gpt3_dev = pd.concat(temp).set_index("pairID")
gpt3_dev = gpt3_dev.loc[rel_pairIDs]
gpt3_dev = gpt3_dev.iloc[BN_dev_large.i].reset_index()

In [51]:
index_correct_small_SSM_preds = np.where(BN_dev_small.y_hat_BN == gpt3_dev.gold_standard_label)[0]
index_incorrect_small_SSM_preds = np.where(BN_dev_small.y_hat_BN != gpt3_dev.gold_standard_label)[0]
index_correct_large_SSM_preds = np.where(BN_dev_large.y_hat_BN == gpt3_dev.gold_standard_label)[0]
index_incorrect_large_SSM_preds = np.where(BN_dev_large.y_hat_BN != gpt3_dev.gold_standard_label)[0]

In [58]:
print("Accuracy values of GPT-3 labels (when SSM pred. is correct / wrong) with respect to")
print(f"Gold Label: {round(accuracy_score(gpt3_dev.pred_label, gpt3_dev.gold_standard_label), 3)}")
print(f"Small SSM : {round(accuracy_score(gpt3_dev.pred_label, BN_dev_small.y_hat_BN), 3)} ({round(accuracy_score(gpt3_dev.pred_label[index_correct_small_SSM_preds], BN_dev_small.y_hat_BN[index_correct_small_SSM_preds]), 3)} / {round(accuracy_score(gpt3_dev.pred_label[index_incorrect_small_SSM_preds], BN_dev_small.y_hat_BN[index_incorrect_small_SSM_preds]), 3)})")
print(f"Large SSM : {round(accuracy_score(gpt3_dev.pred_label, BN_dev_large.y_hat_BN), 3)} ({round(accuracy_score(gpt3_dev.pred_label[index_correct_large_SSM_preds], BN_dev_large.y_hat_BN[index_correct_large_SSM_preds]), 3)} / {round(accuracy_score(gpt3_dev.pred_label[index_incorrect_small_SSM_preds], BN_dev_large.y_hat_BN[index_incorrect_small_SSM_preds]), 3)})")

print("==================================================")
print("F1-Score values of GPT-3 labels (when SSM pred. is correct / wrong) with respect to")
print(f"Gold Label: {round(f1_score(gpt3_dev.pred_label, gpt3_dev.gold_standard_label, average='macro'), 3)}")
print(f"Small SSM : {round(f1_score(gpt3_dev.pred_label, BN_dev_small.y_hat_BN, average='macro'), 3)} ({round(f1_score(gpt3_dev.pred_label[index_correct_small_SSM_preds], BN_dev_small.y_hat_BN[index_correct_small_SSM_preds], average='macro'), 3)} / {round(f1_score(gpt3_dev.pred_label[index_incorrect_small_SSM_preds], BN_dev_small.y_hat_BN[index_incorrect_small_SSM_preds], average='macro'), 3)})")
print(f"Large SSM : {round(f1_score(gpt3_dev.pred_label, BN_dev_large.y_hat_BN, average='macro'), 3)} ({round(f1_score(gpt3_dev.pred_label[index_correct_large_SSM_preds], BN_dev_large.y_hat_BN[index_correct_large_SSM_preds], average='macro'), 3)} / {round(f1_score(gpt3_dev.pred_label[index_incorrect_small_SSM_preds], BN_dev_large.y_hat_BN[index_incorrect_small_SSM_preds], average='macro'), 3)})")

print("==================================================")
print("Accuracy values of GPT-3 labels (when SSM pred. is correct / wrong) with respect to")
print(f"Gold Label: {round(cohen_kappa_score(gpt3_dev.pred_label, gpt3_dev.gold_standard_label), 3)}")
print(f"Small SSM : {round(cohen_kappa_score(gpt3_dev.pred_label, BN_dev_small.y_hat_BN), 3)} ({round(cohen_kappa_score(gpt3_dev.pred_label[index_correct_small_SSM_preds], BN_dev_small.y_hat_BN[index_correct_small_SSM_preds]), 3)} / {round(cohen_kappa_score(gpt3_dev.pred_label[index_incorrect_small_SSM_preds], BN_dev_small.y_hat_BN[index_incorrect_small_SSM_preds]), 3)})")
print(f"Large SSM : {round(cohen_kappa_score(gpt3_dev.pred_label, BN_dev_large.y_hat_BN), 3)} ({round(cohen_kappa_score(gpt3_dev.pred_label[index_correct_large_SSM_preds], BN_dev_large.y_hat_BN[index_correct_large_SSM_preds]), 3)} / {round(cohen_kappa_score(gpt3_dev.pred_label[index_incorrect_small_SSM_preds], BN_dev_large.y_hat_BN[index_incorrect_small_SSM_preds]), 3)})")

Accuracy values of GPT-3 labels (when SSM pred. is correct / wrong) with respect to
Gold Label: 0.807
Small SSM : 0.566 (0.837 / 0.142)
Large SSM : 0.468 (0.786 / 0.324)
F1-Score values of GPT-3 labels (when SSM pred. is correct / wrong) with respect to
Gold Label: 0.788
Small SSM : 0.545 (0.812 / 0.122)
Large SSM : 0.449 (0.786 / 0.29)
Accuracy values of GPT-3 labels (when SSM pred. is correct / wrong) with respect to
Gold Label: 0.709
Small SSM : 0.339 (0.75 / -0.291)
Large SSM : 0.219 (0.667 / 0.017)
