In [2]:
import pandas as pd
from bert_score import score
import numpy as np
from scipy import spatial
def cosine_sim(v1, v2):
    """Return the cosine similarity of two vectors.

    SciPy exposes the cosine *distance*; similarity is its complement to 1.
    """
    distance = spatial.distance.cosine(v1, v2)
    return 1 - distance
from sklearn.metrics import jaccard_score
from sklearn.feature_extraction.text import CountVectorizer
import spacy
# Load the spaCy pipeline named "en_core_web_lg"; later cells call it on each
# explanation string and read the resulting `.vector` for cosine similarity.
nlp = spacy.load("en_core_web_lg")

In [3]:
# Load the e-SNLI splits and drop every row that contains a missing value.
# `dropna()` is the vectorized equivalent of filtering with
# `notnull().apply(all, axis=1)`, which calls Python's `all` once per row.
train1 = pd.read_csv('../Input_Data/e-SNLI/dataset/esnli_train_1.csv')
train2 = pd.read_csv('../Input_Data/e-SNLI/dataset/esnli_train_2.csv')
train = pd.concat([train1, train2]).dropna()
dev = pd.read_csv('../Input_Data/e-SNLI/dataset/esnli_dev.csv').dropna()
test = pd.read_csv('../Input_Data/e-SNLI/dataset/esnli_test.csv').dropna()

# Pre-computed subphrase vectors for the dev split: after removing the saved
# index column, the first column holds the pairIDs and the rest is the matrix.
dev_prepared = pd.read_csv('../02_Extract_Subphrases/prepared_data/subphrase_vectors_dev.csv', sep=';')
dev_prepared = dev_prepared.drop(columns='Unnamed: 0')
dev = dev.set_index('pairID')
rel_pairIDs = dev_prepared.iloc[:, 0]
# NOTE: despite its name, `y_hat` holds the gold labels of the selected pairs.
y_hat = dev.loc[rel_pairIDs].gold_label
dev_prepared = dev_prepared.iloc[:, 1:].to_numpy()

In [4]:
# Explanations produced by the small and the large model; both files are
# ';'-separated and carry their row index in the first column.
BN_dev_small, BN_dev_large = (
    pd.read_csv(csv_path, sep=";", index_col=0)
    for csv_path in (
        "../04_BN_Explanations/BN_explanations_small_model.csv",
        "../04_BN_Explanations/BN_explanations_large_model.csv",
    )
)

In [5]:
# Restrict `dev` to the pairs that have subphrase vectors (label lookup), then
# to the row positions listed in the `i` column of the large-model table.
# NOTE: this overwrites `dev`, so the cell is not safe to run twice in a row.
dev = dev.loc[rel_pairIDs].iloc[BN_dev_large.i]

In [6]:
# Read the ten GPT-3 explanation files (numbered 1..10) and stack them.
temp = []
for i in range(1, 11):
    csv_path = "../01_GPT3_Explanations/prepared_data/GPT3_explanations" + str(i) + ".csv"
    temp.append(pd.read_csv(csv_path, sep=";"))

# Apply the same pairID / row-position selection used for `dev`, then turn
# pairID back into a regular column.
gpt3_dev = (
    pd.concat(temp)
    .set_index("pairID")
    .loc[rel_pairIDs]
    .iloc[BN_dev_large.i]
    .reset_index()
)

In [7]:
# Fit one shared binary bag-of-words vocabulary over all four explanation
# sources so that their rows are directly comparable.
vectorizer = CountVectorizer(binary=True)
all_explanations = [
    *gpt3_dev.pred_explanation.to_list(),
    *gpt3_dev.gold_standard_explanation.to_list(),
    *BN_dev_large.BN_expl.to_list(),
    *BN_dev_small.BN_expl.to_list(),
]
binary_counts = vectorizer.fit_transform(all_explanations)
all_models_binary = binary_counts.toarray()

# Slice the stacked matrix back into its sources, in concatenation order:
# GPT-3 predictions, gold explanations, large model, small model.
n_gpt3_rows = gpt3_dev.shape[0]
n_large_rows = BN_dev_large.shape[0]
n_small_rows = BN_dev_small.shape[0]
gpt3_binary = all_models_binary[:n_gpt3_rows]
gold_binary = all_models_binary[n_gpt3_rows:2 * n_gpt3_rows]
ssm_large_binary = all_models_binary[2 * n_gpt3_rows:2 * n_gpt3_rows + n_large_rows]
ssm_small_binary = all_models_binary[-n_small_rows:]

In [8]:
# Row-wise Jaccard score of each explanation source against the GPT-3
# explanation at the same position (pairs are formed positionally).
jaccard_scores_ssm_small = list(map(jaccard_score, ssm_small_binary, gpt3_binary))
jaccard_scores_ssm_large = list(map(jaccard_score, ssm_large_binary, gpt3_binary))
jaccard_scores_gold = list(map(jaccard_score, gold_binary, gpt3_binary))

In [9]:
# Document vectors for every explanation. `nlp.pipe` streams the texts through
# the pipeline in batches instead of invoking `nlp(...)` once per string; the
# resulting `Doc.vector` values are the same, it is just faster.
embedding_vecs_gpt3 = [doc.vector for doc in nlp.pipe(gpt3_dev.pred_explanation)]
embedding_vecs_gold = [doc.vector for doc in nlp.pipe(gpt3_dev.gold_standard_explanation)]
embedding_vecs_SSM_small = [doc.vector for doc in nlp.pipe(BN_dev_small.BN_expl)]
embedding_vecs_SSM_large = [doc.vector for doc in nlp.pipe(BN_dev_large.BN_expl)]

In [10]:
# Cosine similarity between each source's document vector and the GPT-3
# document vector at the same position.
cosine_scores_ssm_small = [
    cosine_sim(ssm_vec, gpt3_vec)
    for ssm_vec, gpt3_vec in zip(embedding_vecs_SSM_small, embedding_vecs_gpt3)
]
cosine_scores_ssm_large = [
    cosine_sim(ssm_vec, gpt3_vec)
    for ssm_vec, gpt3_vec in zip(embedding_vecs_SSM_large, embedding_vecs_gpt3)
]
cosine_scores_gold = [
    cosine_sim(gold_vec, gpt3_vec)
    for gold_vec, gpt3_vec in zip(embedding_vecs_gold, embedding_vecs_gpt3)
]

In [93]:
# BERTScore (precision, recall, F1) of each explanation source against the
# GPT-3 explanations, which are passed as the second argument in every call.
(
    bert_scores_precision_ssm_small,
    bert_scores_recall_ssm_small,
    bert_scores_f1_ssm_small,
) = score(
    BN_dev_small.BN_expl.to_list(),
    gpt3_dev.pred_explanation.to_list(),
    lang="en",
    model_type="bert-base-uncased",
)
(
    bert_scores_precision_ssm_large,
    bert_scores_recall_ssm_large,
    bert_scores_f1_ssm_large,
) = score(
    BN_dev_large.BN_expl.to_list(),
    gpt3_dev.pred_explanation.to_list(),
    lang="en",
    model_type="bert-base-uncased",
)
(
    bert_scores_precision_gold,
    bert_scores_recall_gold,
    bert_scores_f1_gold,
) = score(
    gpt3_dev.gold_standard_explanation.to_list(),
    gpt3_dev.pred_explanation.to_list(),
    lang="en",
    model_type="bert-base-uncased",
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder

In [139]:
# Convert the per-pair score lists to NumPy arrays so they can be indexed with
# integer index arrays when the report is printed.
jaccard_scores_ssm_small, jaccard_scores_ssm_large, jaccard_scores_gold = map(
    np.array,
    (jaccard_scores_ssm_small, jaccard_scores_ssm_large, jaccard_scores_gold),
)

cosine_scores_ssm_small, cosine_scores_ssm_large, cosine_scores_gold = map(
    np.array,
    (cosine_scores_ssm_small, cosine_scores_ssm_large, cosine_scores_gold),
)

In [152]:
# Positional indices of pairs where labels agree / disagree.
# For the two SSM models, "correct" means agreeing with GPT-3's *predicted*
# label; for GPT-3, it means agreeing with the gold-standard label.
small_ssm_matches = (BN_dev_small.y_hat_BN == gpt3_dev.pred_label).to_numpy()
large_ssm_matches = (BN_dev_large.y_hat_BN == gpt3_dev.pred_label).to_numpy()
gpt3_matches = (gpt3_dev.pred_label == gpt3_dev.gold_standard_label).to_numpy()

index_correct_small_SSM_preds = np.flatnonzero(small_ssm_matches)
index_incorrect_small_SSM_preds = np.flatnonzero(~small_ssm_matches)
index_correct_large_SSM_preds = np.flatnonzero(large_ssm_matches)
index_incorrect_large_SSM_preds = np.flatnonzero(~large_ssm_matches)
index_correct_gpt3_preds = np.flatnonzero(gpt3_matches)
index_incorrect_gpt3_preds = np.flatnonzero(~gpt3_matches)

In [156]:
def _mean_std(values):
    """Return ``"<mean>, <std>"`` for ``values``, each rounded to 3 decimals.

    The statistics are cast to built-in ``float`` before rounding. Rounding a
    single-precision NumPy scalar and then formatting it widens the value to
    double precision, which is why the BERTScore lines used to print values
    such as ``0.46299999952316284`` instead of ``0.463``.
    """
    values = np.asarray(values)
    mean = round(float(np.mean(values)), 3)
    std = round(float(np.std(values)), 3)
    return f"{mean}, {std}"


def _similarity_line(metric, counterpart, scores, idx_correct, idx_incorrect):
    """Build one report line: overall stats, then (correct / incorrect) subsets.

    Parameters
    ----------
    metric : str
        Leading label of the line, e.g. ``"Jaccard similarity"``.
    counterpart : str
        What the GPT-3 explanations are compared with, e.g. ``"small SSM"``.
    scores : array-like
        One score per pair.
    idx_correct, idx_incorrect : integer index arrays
        Positions of the pairs counted as correct / incorrect predictions.
    """
    scores = np.asarray(scores)
    return (
        f"{metric} between GPT-3 explanations and {counterpart}: "
        f"{_mean_std(scores)} "
        f"({_mean_std(scores[idx_correct])} / {_mean_std(scores[idx_incorrect])})"
    )


_GROUP_SEPARATOR = "=============================================================================="

# One entry per metric: (label, small-SSM scores, large-SSM scores, gold scores).
_similarity_groups = (
    ("Jaccard similarity", jaccard_scores_ssm_small, jaccard_scores_ssm_large, jaccard_scores_gold),
    ("Cosine similarity", cosine_scores_ssm_small, cosine_scores_ssm_large, cosine_scores_gold),
    (
        "BERTScore",
        bert_scores_f1_ssm_small.numpy(),
        bert_scores_f1_ssm_large.numpy(),
        bert_scores_f1_gold.numpy(),
    ),
)

for _group_no, (_metric, _small, _large, _gold) in enumerate(_similarity_groups):
    # A separator line precedes every metric group except the first.
    if _group_no > 0:
        print(_GROUP_SEPARATOR)
    print(_similarity_line(_metric, "small SSM", _small,
                           index_correct_small_SSM_preds, index_incorrect_small_SSM_preds))
    print(_similarity_line(_metric, "large SSM", _large,
                           index_correct_large_SSM_preds, index_incorrect_large_SSM_preds))
    print(_similarity_line(_metric, "gold explanations", _gold,
                           index_correct_gpt3_preds, index_incorrect_gpt3_preds))

Jaccard similarity between GPT-3 explanations and small SSM: 0.196, 0.107 (0.218, 0.115 / 0.168, 0.087)
Jaccard similarity between GPT-3 explanations and large SSM: 0.182, 0.095 (0.198, 0.102 / 0.168, 0.087)
Jaccard similarity between GPT-3 explanations and gold explanations: 0.277, 0.158 (0.288, 0.162 / 0.233, 0.134)
Cosine similarity between GPT-3 explanations and small SSM: 0.779, 0.089 (0.793, 0.087 / 0.76, 0.088)
Cosine similarity between GPT-3 explanations and large SSM: 0.771, 0.09 (0.787, 0.087 / 0.756, 0.089)
Cosine similarity between GPT-3 explanations and gold explanations: 0.808, 0.116 (0.811, 0.117 / 0.793, 0.11)
BERTScore between GPT-3 explanations and small SSM: 0.46299999952316284, 0.0689999982714653 (0.4790000021457672, 0.07000000029802322 / 0.4440000057220459, 0.06199999898672104)
BERTScore between GPT-3 explanations and large SSM: 0.45500001311302185, 0.06300000101327896 (0.4690000116825104, 0.06199999898672104 / 0.4429999887943268, 0.061000000685453415)
BERTScore be

# Evaluate factually correct explanations

In [11]:
# --- Human evaluation sheets (h1, h2, h3; presumably one per annotator) ------
# Rows are kept only where sheet 1 has `subphrase_correctness_SSM_small`
# filled in; the same row mask is applied to all three sheets.
h1 = pd.read_excel("evaluation_template_h1.xlsx")
use_indices = ~h1.subphrase_correctness_SSM_small.isnull()
h1 = h1[use_indices]
# Drop columns 11..20 and treat empty cells as 0.
h1 = h1.drop(columns=h1.columns[11:21]).fillna(0)

h2 = pd.read_excel("evaluation_template_h2.xlsx")
h2 = h2[use_indices]
h2 = h2.drop(columns=h2.columns[11:21]).fillna(0)

h3 = pd.read_excel("evaluation_template_h3.xlsx")
h3 = h3[use_indices]
h3 = h3.drop(columns=h3.columns[11:21]).fillna(0)

# --- Reload the e-SNLI splits, dropping rows with any missing value ----------
# `dropna()` is the vectorized equivalent of `notnull().apply(all, axis=1)`.
train1 = pd.read_csv('../Input_Data/e-SNLI/dataset/esnli_train_1.csv')
train2 = pd.read_csv('../Input_Data/e-SNLI/dataset/esnli_train_2.csv')
train = pd.concat([train1, train2]).dropna()
dev = pd.read_csv('../Input_Data/e-SNLI/dataset/esnli_dev.csv').dropna()
test = pd.read_csv('../Input_Data/e-SNLI/dataset/esnli_test.csv').dropna()

# --- Subphrase vectors: first remaining column is the pairID -----------------
dev_prepared = pd.read_csv('../02_Extract_Subphrases/prepared_data/subphrase_vectors_dev.csv', sep=';')
dev_prepared = dev_prepared.drop(columns='Unnamed: 0')
dev = dev.set_index('pairID')
rel_pairIDs = dev_prepared.iloc[:, 0]
y_hat = dev.loc[rel_pairIDs].gold_label
dev_prepared = dev_prepared.iloc[:, 1:].to_numpy()
# NOTE(review): this selection uses `h1.i` while every other table below uses
# `h3.i`; they coincide only if both sheets list the same rows -- confirm.
dev_prepared = dev_prepared[h1.i]

# --- Model explanations, restricted to the rated rows ------------------------
BN_dev_small = pd.read_csv("../04_BN_Explanations/BN_explanations_small_model.csv", sep=";", index_col=0)
BN_dev_small = BN_dev_small.iloc[h3.i].reset_index()
BN_dev_large = pd.read_csv("../04_BN_Explanations/BN_explanations_large_model.csv", sep=";", index_col=0)
BN_dev_large = BN_dev_large.iloc[h3.i].reset_index()
dev = dev.loc[rel_pairIDs]
dev = dev.iloc[BN_dev_large.i]

# --- GPT-3 explanations (ten numbered files), restricted the same way --------
temp = [None, ] * 10
for i in range(1, 11):
    temp[i - 1] = pd.read_csv("../01_GPT3_Explanations/prepared_data/GPT3_explanations" + str(i) + ".csv", sep=";")
gpt3_dev = pd.concat(temp).set_index("pairID")
gpt3_dev = gpt3_dev.loc[rel_pairIDs]
gpt3_dev = gpt3_dev.iloc[h3.i].reset_index()

# --- Extracted subphrases, restricted the same way ---------------------------
dev_subphrases = pd.read_csv('../02_Extract_Subphrases/prepared_data/subphrases_dev.csv', sep=',')
dev_subphrases = dev_subphrases.set_index('pairID')
dev_subphrases = dev_subphrases.loc[rel_pairIDs]
dev_subphrases = dev_subphrases.drop("Unnamed: 0", axis=1)
dev_subphrases = dev_subphrases.iloc[h3.i]

# --- Aggregate the three sheets ----------------------------------------------
# Sum the rating columns (from column 11 on) across the sheets and mark a cell
# 1 when the sum is at least 2, else 0. The vectorized comparison replaces the
# element-wise `applymap(lambda ...)`, which is deprecated in recent pandas.
h = h1[h1.columns[11:]] + h2[h2.columns[11:]] + h3[h3.columns[11:]]
h = (h >= 2).astype(int)

In [15]:
# Row positions whose aggregated `full_correctness_SSM_small` flag is non-zero.
h["full_correctness_SSM_small"].to_numpy().nonzero()

(array([ 0,  7, 11, 12, 17, 18, 22, 27, 29, 37, 44, 47, 49, 54, 57, 62, 63,
        67, 73, 77, 80, 84, 85, 90, 91, 94]),)

In [22]:
# Gold labels of the pairs flagged in `full_correctness_SSM_small`.
dev["gold_label"].iloc[np.flatnonzero(h["full_correctness_SSM_small"].to_numpy())]

pairID
5971287030.jpg#0r1c    contradiction
3670918456.jpg#3r1n          neutral
4873970424.jpg#0r1e       entailment
4687453573.jpg#3r1e          neutral
4727548713.jpg#2r1c    contradiction
560278886.jpg#1r1e        entailment
3826467863.jpg#3r1c    contradiction
2676764246.jpg#0r1e       entailment
482642539.jpg#2r1e        entailment
3735771637.jpg#2r1c    contradiction
3757332635.jpg#2r1c    contradiction
3394070357.jpg#2r1c    contradiction
4633788691.jpg#0r1e       entailment
5075833333.jpg#3r1e       entailment
1536597926.jpg#0r1n          neutral
2634085089.jpg#0r1c    contradiction
5558170783.jpg#1r1n          neutral
466956209.jpg#1r1c     contradiction
86800579.jpg#1r1c      contradiction
3038760935.jpg#3r1n          neutral
7162943359.jpg#4r1n          neutral
2309327462.jpg#4r1e       entailment
2696129516.jpg#3r1c    contradiction
2766630484.jpg#3r1c    contradiction
3440104178.jpg#0r1n          neutral
8217001488.jpg#3r1c    contradiction
Name: gold_label, dtype: object

In [24]:
# Share of flagged pairs for which the small model's `y_hat_BN` equals `y`.
fully_correct_small_rows = BN_dev_small.iloc[np.flatnonzero(h["full_correctness_SSM_small"].to_numpy())]
np.mean(fully_correct_small_rows.y == fully_correct_small_rows.y_hat_BN)

0.6538461538461539