In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import jaccard_score
import pickle

In [2]:
def predict_y_from_z(z):

    # Iterate through each row if z is a matrix
    if len(z.shape) > 1:
        z = pd.DataFrame(z)
        res = z.apply(predict_y_from_z, axis=1)
        return res.to_numpy()

    # For each single line perform the following:
    else:
        # If any z is 'contradiction' -> output class 'contradiction'
        if any(z == 'contradiction'):
            return 'contradiction'
        # Else if all subphrases of sentence 2 are entailed by any subphrase of sentence 1 -> output class 'entailment'
        elif all([any([z[i] == 'entailment' for i in subphrase_indices]) or all([z[i] == 'nan' or pd.isnull(z[i]) for i in subphrase_indices]) for subphrase_indices in Sentence2_indices]):
            return 'entailment'
        # Else output class 'neutral'
        else:
            return 'neutral'

In [3]:
h1 = pd.read_excel("evaluation_template_h1.xlsx")
use_indices = ~h1.subphrase_correctness_SSM_small.isnull()
h1 = h1[use_indices]
h1 = h1.drop(columns=h1.columns[11:21]).fillna(0)

h2 = pd.read_excel("evaluation_template_h2.xlsx")
h2 = h2[use_indices]
h2 = h2.drop(columns=h2.columns[11:21]).fillna(0)

h3 = pd.read_excel("evaluation_template_h3.xlsx")
h3 = h3[use_indices]
h3 = h3.drop(columns=h3.columns[11:21]).fillna(0)

In [4]:
train1 = pd.read_csv('../Input_Data/e-SNLI/dataset/esnli_train_1.csv')
train2 = pd.read_csv('../Input_Data/e-SNLI/dataset/esnli_train_2.csv')
train = pd.concat([train1, train2])
train = train[train.notnull().apply(all, axis=1)]
dev = pd.read_csv('../Input_Data/e-SNLI/dataset/esnli_dev.csv')
dev = dev[dev.notnull().apply(all, axis=1)]
test = pd.read_csv('../Input_Data/e-SNLI/dataset/esnli_test.csv')
test = test[test.notnull().apply(all, axis=1)]

dev_prepared = pd.read_csv('../02_Extract_Subphrases/prepared_data/subphrase_vectors_dev.csv', sep=';')
dev_prepared = dev_prepared.drop(columns='Unnamed: 0')
dev = dev.set_index('pairID')
rel_pairIDs = dev_prepared.iloc[:,0]
y_hat = dev.loc[rel_pairIDs].gold_label
dev_prepared = dev_prepared.iloc[:,1:].to_numpy()
dev_prepared = dev_prepared[h1.i]

In [5]:
BN_dev_small = pd.read_csv("../04_BN_Explanations/BN_explanations_small_model.csv", sep=";", index_col=0)
BN_dev_small = BN_dev_small.iloc[h3.i].reset_index()
BN_dev_large = pd.read_csv("../04_BN_Explanations/BN_explanations_large_model.csv", sep=";", index_col=0)
BN_dev_large = BN_dev_large.iloc[h3.i].reset_index()

In [6]:
dev = dev.loc[rel_pairIDs]
dev = dev.iloc[BN_dev_large.i]

In [7]:
temp = [None, ] * 10
for i in range(1, 11):
    temp[i - 1] = pd.read_csv("../01_GPT3_Explanations/prepared_data/GPT3_explanations" + str(i) + ".csv", sep=";")
gpt3_dev = pd.concat(temp).set_index("pairID")
gpt3_dev = gpt3_dev.loc[rel_pairIDs]
gpt3_dev = gpt3_dev.iloc[h3.i].reset_index()

In [8]:
dev_subphrases = pd.read_csv('../02_Extract_Subphrases/prepared_data/subphrases_dev.csv', sep=',')
dev_subphrases = dev_subphrases.set_index('pairID')
dev_subphrases = dev_subphrases.loc[rel_pairIDs]
dev_subphrases = dev_subphrases.drop("Unnamed: 0", axis=1)
dev_subphrases = dev_subphrases.iloc[h3.i]

In [9]:
h = h1[h1.columns[11:]] + h2[h2.columns[11:]] + h3[h3.columns[11:]]
h = h.applymap(lambda x: 1 if x >= 2 else 0)

In [11]:
def transform_z(Z):
    res = np.zeros((dev_prepared.shape[0], 10), dtype="int")
    for j in range(Z.shape[0]):
        y_hat_pred = predict_y_from_z(Z[j,:])
        pairs = [[0, 0], [0, 1], [0, 2], [0, 3], [0, 4],
                 [1, 0], [1, 1], [1, 2], [1, 3], [1, 4],
                 [2, 0], [2, 1], [2, 2], [2, 3], [2, 4],
                 [3, 0], [3, 1], [3, 2], [3, 3], [3, 4],
                 [4, 0], [4, 1], [4, 2], [4, 3], [4, 4]]
        if y_hat_pred == 'neutral':
            for i, subphrase_indices in enumerate(Sentence2_indices):
                if all([Z[j,k] == 'neutral' or Z[j,k] == 'nan' or pd.isnull(Z[j,k]) for k in subphrase_indices]) and any([Z[j,k] == 'neutral' for k in subphrase_indices]):
                    res[j,i+5] = 1
        elif y_hat_pred == 'contradiction':
            for i,z in enumerate(Z[j,:]):
                if z == 'contradiction':
                    res[j,pairs[use_z_values[i]][0]] = 1
                    res[j,pairs[use_z_values[i]][1]+5] = 1
        elif y_hat_pred == 'entailment':
            for i,z in enumerate(Z[j,:]):
                if z == 'entailment':
                    res[j,pairs[use_z_values[i]][0]] = 1
                    res[j,pairs[use_z_values[i]][1]+5] = 1
    return res

for k in range(2):
    if k == 0:
        directory = 'vers33/MLP_Classifiers_480k_training_15_iter_NN_size_200_50_30'
        use_z_values = (0, 3, 4, 6, 8, 9, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24)
    elif k == 1:
        directory = 'vers31/MLP_Classifiers_480k_training_15_iter_NN_size_200_50_30'
        use_z_values = tuple(range(25))

    # Indices in terms of z for all hidden variables that are not mixed, e.g. Subject1-Subject2, Verb1-Verb2, etc.
    non_mixed_pairs_indices = [i for i in range(len(use_z_values)) if use_z_values[i] in (0, 6, 12, 18, 24)]
    # Indices for all z variables influenced by Subject2 (Verb2, Object2 etc. respectively) (e.g. Subject1-Subject2)
    Subj2_indices = [i for i in range(len(use_z_values)) if use_z_values[i] in (0, 5, 10, 15, 20)]
    Verb2_indices = [i for i in range(len(use_z_values)) if use_z_values[i] in (1, 6, 11, 16, 21)]
    Obj2_indices = [i for i in range(len(use_z_values)) if use_z_values[i] in (2, 7, 12, 17, 22)]
    Loc2_indices = [i for i in range(len(use_z_values)) if use_z_values[i] in (3, 8, 13, 18, 23)]
    Clo2_indices = [i for i in range(len(use_z_values)) if use_z_values[i] in (4, 9, 14, 19, 24)]
    Sentence2_indices = [Subj2_indices, Verb2_indices, Obj2_indices, Loc2_indices, Clo2_indices]

    clf = list()
    for i in range(len(use_z_values)):
        with open("../03_Bayesian_Network/" + directory + "/MLP_Classifier" + str(i) + ".pkl", "rb") as f:
            clf += [pickle.load(f), ]

    # Prepare colum indices
    indices = np.array([[0,1500], [0,1800], [0,2100], [0,2400], [0,2700],
                        [300,1500], [300,1800], [300,2100], [300,2400], [300,2700],
                        [600,1500], [600,1800], [600,2100], [600,2400], [600,2700],
                        [900,1500], [900,1800], [900,2100], [900,2400], [900,2700],
                        [1200,1500], [1200,1800], [1200,2100], [1200,2400], [1200,2700]])
    indices = indices[use_z_values,:].tolist()

    # Initialise colulmn indices and "nan" values if information (e.g. location of sentence) is not detected
    not_nan = [None, ] * len(use_z_values)
    cols = [None, ] * len(use_z_values)
    for i in range(len(use_z_values)):
        cols[i] = list(range(indices[i][0], indices[i][0]+300)) + list(range(indices[i][1],indices[i][1]+300))
        not_nan[i] = pd.Series([not x for x in pd.DataFrame(np.isnan(dev_prepared[:,cols[i]])).apply(any, axis=1)])
    not_nan = np.array(not_nan).T

    z = np.empty((dev_prepared.shape[0], len(use_z_values)), dtype=np.dtype('U25'))
    z[:,:] = np.nan

    for i in range(len(use_z_values)):
        z[not_nan[:,i], i] = clf[i].predict(dev_prepared[not_nan[:,i],:][:, cols[i]])
    if k == 0:
        z_small = transform_z(z)
    elif k == 1:
        z_large = transform_z(z)

In [12]:
jaccard_sim_small = list()
jaccard_sim_large = list()
for i in range(z_small.shape[0]):
    jaccard_sim_small += [jaccard_score(z_small[i,:], h[h.columns[:10]].to_numpy()[i,:])]
    jaccard_sim_large += [jaccard_score(z_large[i,:], h[h.columns[:10]].to_numpy()[i,:])]
jaccard_sim_small = np.array(jaccard_sim_small)
jaccard_sim_large = np.array(jaccard_sim_large)

In [13]:
index_correct_small_SSM_preds = np.where(BN_dev_small.y_hat_BN == gpt3_dev.pred_label)[0]
index_incorrect_small_SSM_preds = np.where(BN_dev_small.y_hat_BN != gpt3_dev.pred_label)[0]
index_correct_large_SSM_preds = np.where(BN_dev_large.y_hat_BN == gpt3_dev.pred_label)[0]
index_incorrect_large_SSM_preds = np.where(BN_dev_large.y_hat_BN != gpt3_dev.pred_label)[0]

In [14]:
print(f"{jaccard_sim_small.mean()}, ({jaccard_sim_small[index_correct_small_SSM_preds].mean()} / {jaccard_sim_small[index_incorrect_small_SSM_preds].mean()})")
print(f"{jaccard_sim_large.mean()}, ({jaccard_sim_small[index_correct_large_SSM_preds].mean()} / {jaccard_sim_small[index_incorrect_large_SSM_preds].mean()})")

0.4234837092731829, (0.44510939510939507 / 0.409688013136289)
0.3840601503759399, (0.42049319727891155 / 0.42473347547974416)


In [15]:
h.mean()

subj1_present                      0.526316
verb1_present                      0.536842
obj1_present                       0.210526
loc1_present                       0.147368
clo1_present                       0.115789
subj2_present                      0.547368
verb2_present                      0.663158
obj2_present                       0.294737
loc2_present                       0.242105
clo2_present                       0.084211
structure                          0.842105
support                            0.968421
correctness_GPT3                   0.863158
full_correctness_SSM_large         0.210526
subphrase_correctness_SSM_large    0.273684
full_correctness_SSM_small         0.273684
subphrase_correctness_SSM_small    0.368421
dtype: float64

In [21]:
h.iloc[index_correct_small_SSM_preds][['full_correctness_SSM_small', 'subphrase_correctness_SSM_small']].mean()

full_correctness_SSM_small         0.243243
subphrase_correctness_SSM_small    0.351351
dtype: float64

In [22]:
h.iloc[index_correct_large_SSM_preds][['full_correctness_SSM_large', 'subphrase_correctness_SSM_large']].mean()

full_correctness_SSM_large         0.250000
subphrase_correctness_SSM_large    0.285714
dtype: float64