In [72]:
import pandas as pd
import numpy as np
from sklearn.metrics import jaccard_score
from statsmodels.stats.inter_rater import fleiss_kappa, aggregate_raters
import pickle

In [41]:
def predict_y_from_z(z, sentence2_indices=None):
    """Derive a sentence-level label from sub-phrase level labels.

    For a single row the rules are applied in this order:

    1. If any entry equals 'contradiction' the result is 'contradiction'.
    2. If, for every index group, at least one entry is 'entailment' or all
       entries are missing (the string 'nan' or a null value), the result is
       'entailment'.
    3. Otherwise the result is 'neutral'.

    Parameters
    ----------
    z : numpy array or pandas Series (1-D) / numpy array or DataFrame (2-D)
        Sub-phrase level labels. A 2-D input is processed row by row.
    sentence2_indices : list of lists of int, optional
        Groups of positions in ``z`` that are evaluated together in rule 2.
        When omitted, the notebook-level global ``Sentence2_indices`` is used,
        which keeps existing one-argument calls working.

    Returns
    -------
    str for a 1-D input, numpy array of str (one label per row) for a 2-D input.
    """
    if sentence2_indices is None:
        # Backward-compatible fallback to the global set up by the classifier loop.
        sentence2_indices = Sentence2_indices

    # Matrix input: label every row independently with the same index groups.
    if len(z.shape) > 1:
        rows = pd.DataFrame(z)
        labels = rows.apply(lambda row: predict_y_from_z(row, sentence2_indices), axis=1)
        return labels.to_numpy()

    # Rule 1: a single contradiction decides the whole pair.
    if any(z == 'contradiction'):
        return 'contradiction'

    def _is_missing(value):
        # Missing values arrive either as the string 'nan' or as a real null.
        return value == 'nan' or pd.isnull(value)

    def _group_satisfied(group):
        return (any(z[i] == 'entailment' for i in group)
                or all(_is_missing(z[i]) for i in group))

    # Rule 2: every group must be entailed or entirely missing.
    if all(_group_satisfied(group) for group in sentence2_indices):
        return 'entailment'

    # Rule 3: everything else.
    return 'neutral'

In [42]:
def _keep_rated_rows(ratings, rated_mask):
    """Apply the shared row filter, drop columns 11..20 and zero-fill blanks."""
    kept = ratings[rated_mask]
    return kept.drop(columns=kept.columns[11:21]).fillna(0)


# The row filter is derived from rater 1's sheet only (rows where
# subphrase_correctness_SSM_small is filled in) and reused for all three raters.
h1 = pd.read_excel("evaluation_template_h1.xlsx")
use_indices = h1.subphrase_correctness_SSM_small.notnull()
h1 = _keep_rated_rows(h1, use_indices)

h2 = _keep_rated_rows(pd.read_excel("evaluation_template_h2.xlsx"), use_indices)

h3 = _keep_rated_rows(pd.read_excel("evaluation_template_h3.xlsx"), use_indices)

In [43]:
# Load the e-SNLI splits and keep only rows without any missing value.
# `.all(axis=1)` is the vectorised equivalent of applying Python's `all` to
# every row, which is slow on the full training set.
train1 = pd.read_csv('../Input_Data/e-SNLI/dataset/esnli_train_1.csv')
train2 = pd.read_csv('../Input_Data/e-SNLI/dataset/esnli_train_2.csv')
train = pd.concat([train1, train2])
train = train[train.notnull().all(axis=1)]
dev = pd.read_csv('../Input_Data/e-SNLI/dataset/esnli_dev.csv')
dev = dev[dev.notnull().all(axis=1)]
test = pd.read_csv('../Input_Data/e-SNLI/dataset/esnli_test.csv')
test = test[test.notnull().all(axis=1)]

# Sub-phrase vectors for the dev split: after dropping the saved index column,
# the first column is used as pairID lookup key and the rest as features.
dev_prepared = pd.read_csv('../02_Extract_Subphrases/prepared_data/subphrase_vectors_dev.csv', sep=';')
dev_prepared = dev_prepared.drop(columns='Unnamed: 0')
dev = dev.set_index('pairID')
rel_pairIDs = dev_prepared.iloc[:,0]
y_hat = dev.loc[rel_pairIDs].gold_label
dev_prepared = dev_prepared.iloc[:,1:].to_numpy()
# Keep only the rows referenced by the rater sheet's `i` column.
# NOTE(review): other cells select with h3.i; presumably h1.i and h3.i are
# identical - confirm.
dev_prepared = dev_prepared[h1.i]

In [44]:
# BN explanation tables of both models, restricted to the rows listed in the
# rater sheet's `i` column; reset_index() turns the file's index column back
# into a regular column.
BN_dev_small = (
    pd.read_csv("../04_BN_Explanations/BN_explanations_small_model.csv", sep=";", index_col=0)
    .iloc[h3.i]
    .reset_index()
)
BN_dev_large = (
    pd.read_csv("../04_BN_Explanations/BN_explanations_large_model.csv", sep=";", index_col=0)
    .iloc[h3.i]
    .reset_index()
)

In [45]:
# Align dev with the prepared vectors (by pairID), then pick the evaluated rows
# by position. This rebinds `dev`, so the cell is not safe to run twice.
dev = dev.loc[rel_pairIDs].iloc[BN_dev_large.i]

In [46]:
# The GPT-3 explanations are stored in ten numbered files (1..10).
temp = [
    pd.read_csv("../01_GPT3_Explanations/prepared_data/GPT3_explanations" + str(part) + ".csv", sep=";")
    for part in range(1, 11)
]
# Stack the parts, align them with the prepared dev rows by pairID, then keep
# the evaluated rows by position.
gpt3_dev = (
    pd.concat(temp)
    .set_index("pairID")
    .loc[rel_pairIDs]
    .iloc[h3.i]
    .reset_index()
)

In [47]:
# Extracted sub-phrases of the dev split, aligned by pairID and reduced to the
# evaluated rows.
dev_subphrases = (
    pd.read_csv('../02_Extract_Subphrases/prepared_data/subphrases_dev.csv', sep=',')
    .set_index('pairID')
    .loc[rel_pairIDs]
    .drop("Unnamed: 0", axis=1)
    .iloc[h3.i]
)

In [48]:
# Majority vote over the three raters: sum the rating columns (everything from
# column 11 on) and mark a cell 1 when the sum is at least 2.
# A vectorised comparison replaces DataFrame.applymap, which is deprecated in
# recent pandas; the explicit int64 cast keeps the 0/1 integer result.
rating_sum = h1[h1.columns[11:]] + h2[h2.columns[11:]] + h3[h3.columns[11:]]
h = (rating_sum >= 2).astype("int64")

In [49]:
def transform_z(Z):
    """Turn sub-phrase pair labels into a 0/1 matrix of highlighted slots.

    Each row of ``Z`` is first reduced to a sentence-level label with
    ``predict_y_from_z``. Depending on that label, slots are switched on:

    * 'neutral': for every group in ``Sentence2_indices`` whose entries are all
      'neutral' or missing, with at least one 'neutral', column ``group + 5``
      is set.
    * 'contradiction' / 'entailment': for every entry carrying that same label,
      the two slots of its pair are set (first slot in columns 0-4, second
      slot in columns 5-9).

    Parameters
    ----------
    Z : 2-D numpy array of str
        One row per example, one column per entry of the global
        ``use_z_values``; missing entries are the string 'nan' or null.

    Returns
    -------
    numpy int array of shape (Z.shape[0], 10).

    Notes
    -----
    Reads the notebook globals ``use_z_values`` and ``Sentence2_indices`` and
    calls ``predict_y_from_z``. The output is sized from ``Z`` itself rather
    than from the global ``dev_prepared``.
    """
    # Pair id (0..24) -> [slot in sentence 1, slot in sentence 2], row-major 5x5.
    # Loop-invariant, so it is built once.
    pairs = [[first, second] for first in range(5) for second in range(5)]

    res = np.zeros((Z.shape[0], 10), dtype="int")
    for j in range(Z.shape[0]):
        row = Z[j, :]
        y_hat_pred = predict_y_from_z(row)
        if y_hat_pred == 'neutral':
            for i, subphrase_indices in enumerate(Sentence2_indices):
                values = [row[k] for k in subphrase_indices]
                only_neutral_or_missing = all(
                    v == 'neutral' or v == 'nan' or pd.isnull(v) for v in values
                )
                if only_neutral_or_missing and any(v == 'neutral' for v in values):
                    res[j, i + 5] = 1
        elif y_hat_pred in ('contradiction', 'entailment'):
            # Both labels are handled identically: highlight every pair that
            # carries the sentence-level label.
            for i, label in enumerate(row):
                if label == y_hat_pred:
                    first, second = pairs[use_z_values[i]]
                    res[j, first] = 1
                    res[j, second + 5] = 1
    return res

# Run the per-pair classifiers for two model configurations and convert their
# predictions into highlight matrices: k == 0 fills z_small, k == 1 fills z_large.
# The loop (re)binds the globals use_z_values and Sentence2_indices, which
# predict_y_from_z and transform_z read, so statement order matters here.
for k in range(2):
    if k == 0:
        directory = 'vers33/MLP_Classifiers_480k_training_15_iter_NN_size_200_50_30'
        # 19 of the 25 pair ids are used in this configuration.
        use_z_values = (0, 3, 4, 6, 8, 9, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24)
    elif k == 1:
        directory = 'vers31/MLP_Classifiers_480k_training_15_iter_NN_size_200_50_30'
        # All 25 pair ids are used in this configuration.
        use_z_values = tuple(range(25))

    # Indices in terms of z for all hidden variables that are not mixed, e.g. Subject1-Subject2, Verb1-Verb2, etc.
    # NOTE(review): not referenced anywhere else in the visible cells.
    non_mixed_pairs_indices = [i for i in range(len(use_z_values)) if use_z_values[i] in (0, 6, 12, 18, 24)]
    # Indices for all z variables influenced by Subject2 (Verb2, Object2 etc. respectively) (e.g. Subject1-Subject2)
    Subj2_indices = [i for i in range(len(use_z_values)) if use_z_values[i] in (0, 5, 10, 15, 20)]
    Verb2_indices = [i for i in range(len(use_z_values)) if use_z_values[i] in (1, 6, 11, 16, 21)]
    Obj2_indices = [i for i in range(len(use_z_values)) if use_z_values[i] in (2, 7, 12, 17, 22)]
    Loc2_indices = [i for i in range(len(use_z_values)) if use_z_values[i] in (3, 8, 13, 18, 23)]
    Clo2_indices = [i for i in range(len(use_z_values)) if use_z_values[i] in (4, 9, 14, 19, 24)]
    Sentence2_indices = [Subj2_indices, Verb2_indices, Obj2_indices, Loc2_indices, Clo2_indices]

    # One pickled classifier per used pair; files are numbered by position in
    # use_z_values, not by pair id.
    # SECURITY: pickle.load executes arbitrary code - only load trusted files.
    clf = list()
    for i in range(len(use_z_values)):
        with open("../03_Bayesian_Network/" + directory + "/MLP_Classifier" + str(i) + ".pkl", "rb") as f:
            clf += [pickle.load(f), ]

    # Prepare column indices: one [start, start] pair of column offsets into
    # dev_prepared per pair id; each start opens a 300-column block (see below).
    # NOTE(review): presumably offsets 0..1200 belong to sentence 1 and
    # 1500..2700 to sentence 2 - confirm against the vector extraction step.
    indices = np.array([[0,1500], [0,1800], [0,2100], [0,2400], [0,2700],
                        [300,1500], [300,1800], [300,2100], [300,2400], [300,2700],
                        [600,1500], [600,1800], [600,2100], [600,2400], [600,2700],
                        [900,1500], [900,1800], [900,2100], [900,2400], [900,2700],
                        [1200,1500], [1200,1800], [1200,2100], [1200,2400], [1200,2700]])
    indices = indices[use_z_values,:].tolist()

    # Initialise column indices and "nan" values if information (e.g. location of sentence) is not detected
    not_nan = [None, ] * len(use_z_values)
    cols = [None, ] * len(use_z_values)
    for i in range(len(use_z_values)):
        # 300 columns from each of the two offsets -> 600 feature columns per pair.
        cols[i] = list(range(indices[i][0], indices[i][0]+300)) + list(range(indices[i][1],indices[i][1]+300))
        # True for rows where none of these feature columns is NaN.
        not_nan[i] = pd.Series([not x for x in pd.DataFrame(np.isnan(dev_prepared[:,cols[i]])).apply(any, axis=1)])
    # Transpose to (rows, pairs) so column i is the row mask of pair i.
    not_nan = np.array(not_nan).T

    # z is a fixed-width unicode array, so assigning np.nan stores the string
    # 'nan'; the 'nan' comparisons in predict_y_from_z / transform_z rely on it.
    z = np.empty((dev_prepared.shape[0], len(use_z_values)), dtype=np.dtype('U25'))
    z[:,:] = np.nan

    # Predict only for rows with complete features; the rest stay 'nan'.
    for i in range(len(use_z_values)):
        z[not_nan[:,i], i] = clf[i].predict(dev_prepared[not_nan[:,i],:][:, cols[i]])
    if k == 0:
        z_small = transform_z(z)
    elif k == 1:
        z_large = transform_z(z)

In [50]:
# Row-wise Jaccard score between the model highlight matrices and the first ten
# (majority-voted) rating columns. The rating matrix is loop-invariant, so it
# is materialised once instead of twice per row.
human_marks = h[h.columns[:10]].to_numpy()
n_rows = z_small.shape[0]
jaccard_sim_small = np.array(
    [jaccard_score(z_small[row, :], human_marks[row, :]) for row in range(n_rows)]
)
jaccard_sim_large = np.array(
    [jaccard_score(z_large[row, :], human_marks[row, :]) for row in range(n_rows)]
)

In [51]:
# Row positions where each model's y_hat_BN equals / differs from
# gpt3_dev.pred_label.
small_same = (BN_dev_small.y_hat_BN == gpt3_dev.pred_label).to_numpy()
small_diff = (BN_dev_small.y_hat_BN != gpt3_dev.pred_label).to_numpy()
large_same = (BN_dev_large.y_hat_BN == gpt3_dev.pred_label).to_numpy()
large_diff = (BN_dev_large.y_hat_BN != gpt3_dev.pred_label).to_numpy()

index_correct_small_SSM_preds = np.flatnonzero(small_same)
index_incorrect_small_SSM_preds = np.flatnonzero(small_diff)
index_correct_large_SSM_preds = np.flatnonzero(large_same)
index_incorrect_large_SSM_preds = np.flatnonzero(large_diff)

In [52]:
# Per model (small first, then large): overall mean Jaccard score, followed by
# the means over the matching / non-matching prediction rows.
for sims, same_rows, diff_rows in (
    (jaccard_sim_small, index_correct_small_SSM_preds, index_incorrect_small_SSM_preds),
    (jaccard_sim_large, index_correct_large_SSM_preds, index_incorrect_large_SSM_preds),
):
    print(f"{sims.mean()}, ({sims[same_rows].mean()} / {sims[diff_rows].mean()})")

0.4456140350877192, (0.4716216216216216 / 0.4290229885057471)
0.35918546365914794, (0.3392857142857143 / 0.367501776830135)


In [53]:
# Share of examples marked 1 by the rater majority, per rating column.
h.mean()

subj1_present                      0.526316
verb1_present                      0.536842
obj1_present                       0.210526
loc1_present                       0.147368
clo1_present                       0.115789
subj2_present                      0.547368
verb2_present                      0.663158
obj2_present                       0.294737
loc2_present                       0.242105
clo2_present                       0.084211
structure                          0.842105
support                            0.968421
correctness_GPT3                   0.863158
full_correctness_SSM_large         0.210526
subphrase_correctness_SSM_large    0.273684
full_correctness_SSM_small         0.273684
subphrase_correctness_SSM_small    0.368421
dtype: float64

In [54]:
# Majority-voted correctness of the small model on rows where its prediction
# differs from gpt3_dev.pred_label.
h[['full_correctness_SSM_small', 'subphrase_correctness_SSM_small']].iloc[index_incorrect_small_SSM_preds].mean()

full_correctness_SSM_small         0.293103
subphrase_correctness_SSM_small    0.379310
dtype: float64

In [55]:
# Majority-voted correctness of the small model on rows where its prediction
# equals gpt3_dev.pred_label.
h[['full_correctness_SSM_small', 'subphrase_correctness_SSM_small']].iloc[index_correct_small_SSM_preds].mean()

full_correctness_SSM_small         0.243243
subphrase_correctness_SSM_small    0.351351
dtype: float64

In [56]:
# Majority-voted correctness of the large model on rows where its prediction
# differs from gpt3_dev.pred_label.
h[['full_correctness_SSM_large', 'subphrase_correctness_SSM_large']].iloc[index_incorrect_large_SSM_preds].mean()

full_correctness_SSM_large         0.194030
subphrase_correctness_SSM_large    0.268657
dtype: float64

In [57]:
# Majority-voted correctness of the large model on rows where its prediction
# equals gpt3_dev.pred_label.
h[['full_correctness_SSM_large', 'subphrase_correctness_SSM_large']].iloc[index_correct_large_SSM_preds].mean()

full_correctness_SSM_large         0.250000
subphrase_correctness_SSM_large    0.285714
dtype: float64

In [58]:
# Share of rows with verb1_present among the rows where obj1_present is 1.
h.loc[h.obj1_present == 1, "verb1_present"].mean()

0.85

In [59]:
# Majority vote on subphrase correctness (small model) for the rows where its
# prediction equals gpt3_dev.pred_label.
h.subphrase_correctness_SSM_small.iloc[index_correct_small_SSM_preds]

0     1
1     0
2     1
3     0
7     0
11    0
13    1
14    1
16    0
20    0
23    1
24    0
28    1
31    0
32    0
33    0
39    0
41    0
43    0
46    0
48    1
51    1
52    0
54    0
57    1
62    0
68    0
71    1
75    0
79    0
80    0
82    0
87    0
89    1
91    0
92    1
94    1
Name: subphrase_correctness_SSM_small, dtype: int64

In [60]:
# Distribution of the small model's predicted labels over the rows whose
# subphrase explanation was voted correct.
voted_correct_rows = np.where(h.subphrase_correctness_SSM_small == 1)[0]
BN_dev_small.y_hat_BN.iloc[voted_correct_rows].value_counts()

entailment       15
contradiction    10
neutral          10
Name: y_hat_BN, dtype: int64

In [75]:
# NOTE(review): leftover inspection cell. In top-to-bottom order `temp` was last
# bound to the list of GPT-3 explanation frames, but the recorded output below
# is a tuple of two arrays - it looks like an aggregate_raters() result from an
# out-of-order run. Consider removing this cell.
temp

(array([[5, 0],
        [5, 0],
        [2, 3],
        [2, 3],
        [2, 3],
        [3, 2],
        [3, 2],
        [3, 2],
        [3, 2],
        [3, 2]]),
 array([0, 1]))

In [82]:
# Names of the rating columns kept in the majority-vote frame.
h.columns

Index(['subj1_present', 'verb1_present', 'obj1_present', 'loc1_present',
       'clo1_present', 'subj2_present', 'verb2_present', 'obj2_present',
       'loc2_present', 'clo2_present', 'structure', 'support',
       'correctness_GPT3', 'full_correctness_SSM_large',
       'subphrase_correctness_SSM_large', 'full_correctness_SSM_small',
       'subphrase_correctness_SSM_small'],
      dtype='object')

In [92]:
# Inter-rater agreement (Fleiss' kappa) per rating column over all rated rows.
for col in h.columns:
    # Shape (rows, 3): one column per rater.
    per_rater = np.array([sheet[col] for sheet in (h1, h2, h3)]).T
    # aggregate_raters returns a pair; only its first element is needed.
    category_counts, _categories = aggregate_raters(per_rater)
    print(col, fleiss_kappa(category_counts, method='fleiss'))

subj1_present 0.7330671398994378
verb1_present 0.8442158616577218
obj1_present 0.6037031615925053
loc1_present 0.48048002430502745
clo1_present 0.7227626459143964
subj2_present 0.6609497372856148
verb2_present 0.6302702702702698
obj2_present 0.5725657591139821
loc2_present 0.5155502392344495
clo2_present 0.6773584905660378
structure 0.31603419829008506
support 0.08584905660377305
correctness_GPT3 0.28705440900562823
full_correctness_SSM_large 0.5219404186795485
subphrase_correctness_SSM_large 0.541035428844089
full_correctness_SSM_small 0.5532915360501564
subphrase_correctness_SSM_small 0.5944450300411086


In [94]:
# Fleiss' kappa for the correctness columns, restricted to the rows where the
# respective model's prediction equals gpt3_dev.pred_label.
matching_subsets = [
    ('full_correctness_SSM_large', index_correct_large_SSM_preds),
    ('subphrase_correctness_SSM_large', index_correct_large_SSM_preds),
    ('full_correctness_SSM_small', index_correct_small_SSM_preds),
    ('subphrase_correctness_SSM_small', index_correct_small_SSM_preds),
]
for col, rows in matching_subsets:
    # Shape (selected rows, 3): one column per rater.
    per_rater = np.array([sheet[col].iloc[rows] for sheet in (h1, h2, h3)]).T
    category_counts, _categories = aggregate_raters(per_rater)
    print(col, fleiss_kappa(category_counts, method='fleiss'))

full_correctness_SSM_large 0.40128296507483946
subphrase_correctness_SSM_large 0.5714285714285712
full_correctness_SSM_small 0.6926424050632908
subphrase_correctness_SSM_small 0.726408450704225


In [95]:
# Fleiss' kappa for the correctness columns, restricted to the rows where the
# respective model's prediction differs from gpt3_dev.pred_label.
nonmatching_subsets = [
    ('full_correctness_SSM_large', index_incorrect_large_SSM_preds),
    ('subphrase_correctness_SSM_large', index_incorrect_large_SSM_preds),
    ('full_correctness_SSM_small', index_incorrect_small_SSM_preds),
    ('subphrase_correctness_SSM_small', index_incorrect_small_SSM_preds),
]
for col, rows in nonmatching_subsets:
    # Shape (selected rows, 3): one column per rater.
    per_rater = np.array([sheet[col].iloc[rows] for sheet in (h1, h2, h3)]).T
    category_counts, _categories = aggregate_raters(per_rater)
    print(col, fleiss_kappa(category_counts, method='fleiss'))

full_correctness_SSM_large 0.5771388499298736
subphrase_correctness_SSM_large 0.5244084682440849
full_correctness_SSM_small 0.4682964094728799
subphrase_correctness_SSM_small 0.5117845117845117
