In [1]:
import pandas as pd
import datetime
import numpy as np
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
import pickle

In [96]:
# Name of the folder (under ../03_Bayesian_Network/) from which the pickled MLP classifiers are loaded.
directory = 'MLP_Classifiers_200k_training_10_iter_NN_size_200_200_50_30'

In [57]:
# Load the raw e-SNLI splits and drop every row that has a missing value in any column.
# dropna() does this in one vectorised pass instead of evaluating a Python call per row.
train1 = pd.read_csv('../Input_Data/e-SNLI/dataset/esnli_train_1.csv')
train2 = pd.read_csv('../Input_Data/e-SNLI/dataset/esnli_train_2.csv')
train = pd.concat([train1, train2]).dropna()
dev = pd.read_csv('../Input_Data/e-SNLI/dataset/esnli_dev.csv').dropna()
test = pd.read_csv('../Input_Data/e-SNLI/dataset/esnli_test.csv').dropna()

# Pre-computed subphrase vectors of the dev split. Once the 'Unnamed: 0' column is
# removed, the first column is used as the pairID and the remaining ones as features.
dev_prepared = pd.read_csv('../02_Extract_Subphrases/prepared_data/subphrase_vectors_dev.csv', sep=';')
dev_prepared = dev_prepared.drop(columns='Unnamed: 0')
dev = dev.set_index('pairID')
rel_pairIDs = dev_prepared.iloc[:,0]
# NOTE: despite its name, y_hat holds the gold labels, ordered like the rows of dev_prepared.
y_hat = dev.loc[rel_pairIDs].gold_label
# From here on dev_prepared is a plain feature matrix (pairID column stripped).
dev_prepared = dev_prepared.iloc[:,1:].to_numpy()

In [58]:
# Subphrase strings of the dev split, indexed by pairID and re-ordered to follow rel_pairIDs.
dev_subphrases = (
    pd.read_csv('../02_Extract_Subphrases/prepared_data/subphrases_dev.csv', sep=',')
    .set_index('pairID')
    .loc[rel_pairIDs]
)

In [36]:
# Sanity check: the 'string_subj_s1' entry of the first row of dev_subphrases.
dev_subphrases.iloc[0]['string_subj_s1']

'Two women'

In [3]:
def evaluate_performance(preds, actual):
    """Print confusion matrix, macro F1-score and accuracy for a set of predictions.

    Parameters
    ----------
    preds : predicted labels.
    actual : true labels (passed to the metrics as the ground truth).

    The confusion matrix and the F1-score use the label order
    contradiction, entailment, neutral. Nothing is returned.
    """
    label_order = ['contradiction', 'entailment', 'neutral']

    print('Confusion Matrix')
    matrix = confusion_matrix(actual, preds, labels=label_order)
    print(matrix)

    macro_f1 = f1_score(actual, preds, labels=label_order, average="macro")
    print(f'F1-Score: {macro_f1}')

    accuracy = accuracy_score(actual, preds)
    print(f'Accuracy: {accuracy}')

In [99]:
def predict_y_from_z(z):
    """Aggregate per-classifier labels into one sentence-pair label.

    Parameters
    ----------
    z : array-like of str
        Either a single row of labels (1-D) or a matrix with one row per
        sentence pair (2-D). Each entry is expected to be 'contradiction',
        'entailment', 'neutral' or the string 'nan' (no prediction).
        Rows may have any length; nothing is tied to a fixed number of
        classifiers.

    Returns
    -------
    str for 1-D input, or a 1-D object array of str (one label per row)
    for 2-D input.

    Rule per row: 'neutral' if every entry is 'nan' or 'neutral';
    otherwise 'contradiction' if any entry is 'contradiction';
    otherwise 'entailment'.
    """
    z = np.asarray(z)
    if z.ndim > 1:
        # Object dtype keeps the labels as plain Python strings, one per row.
        return np.array([predict_y_from_z(row) for row in z], dtype=object)

    labels = z.tolist()
    if all(label in ('nan', 'neutral') for label in labels):
        return 'neutral'
    if 'contradiction' in labels:
        return 'contradiction'
    return 'entailment'

In [97]:
# Load the 25 pickled classifiers MLP_Classifier0.pkl ... MLP_Classifier24.pkl, in index order.
# NOTE(review): pickle.load executes arbitrary code from the file - only load trusted model files.
clf = []
for i in range(25):
    model_path = f"../03_Bayesian_Network/{directory}/MLP_Classifier{i}.pkl"
    with open(model_path, "rb") as f:
        clf.append(pickle.load(f))

In [98]:
# Prepare column indices.
# Classifier i reads two 300-column slices of dev_prepared, starting at
# indices[i][0] and indices[i][1]. The first offsets are 0, 300, ..., 1200 and the
# second ones 1500, 1800, ..., 2700; all 5 x 5 combinations are enumerated with the
# first offset varying slowest.
# NOTE(review): presumably the first five slices are the sentence-1 subphrase vectors
# and the last five the sentence-2 ones - confirm against the feature-extraction step.
indices = [[start_s1, start_s2]
           for start_s1 in range(0, 1500, 300)
           for start_s2 in range(1500, 3000, 300)]

# Initialise column indices and "nan" flags if information (e.g. location of sentence) is not detected
print(datetime.datetime.now())
print("Initialise column indices and 'nan' values")
not_nan = [None, ] * 25
cols = [None, ] * 25
for i in range(25):
    cols[i] = list(range(indices[i][0], indices[i][0]+300)) + list(range(indices[i][1],indices[i][1]+300))
    # True for the rows in which none of the 600 selected features is NaN
    # (computed in one vectorised pass instead of a Python call per row).
    not_nan[i] = ~np.isnan(dev_prepared[:,cols[i]]).any(axis=1)
# Boolean matrix of shape (number of rows, 25): not_nan[j, i] tells whether
# classifier i can be applied to row j.
not_nan = np.array(not_nan).T

2023-02-27 16:26:52.628504
Initialise column indices and 'nan' values


In [100]:
# One row per sentence pair, one column per classifier.
# np.zeros guarantees that every cell starts as the empty string; np.empty would
# return uninitialised memory, so an "== ''" check on it is not reliable.
z = np.zeros((y_hat.shape[0], 25), dtype=np.dtype('U100'))

for i in range(25):
    rows = not_nan[:,i]
    # Classifier i is only applied to the rows whose input features are all present.
    # Skip it when no such row exists rather than calling predict() on an empty matrix.
    if rows.any():
        z[rows, i] = clf[i].predict(dev_prepared[rows,:][:, cols[i]])

# Cells that received no prediction are marked with the string 'nan', the value
# predict_y_from_z checks for when aggregating.
z[z == ''] = 'nan'

y_hat_pred = predict_y_from_z(z)

In [106]:
def generate_explanation(Z, subphrases):
    """Turn the 25 per-pair labels of one sentence pair into a textual explanation.

    Parameters
    ----------
    Z : sequence of 25 str
        Label for each (sentence-1 part, sentence-2 part) combination, in
        row-major order over the parts subj, verb, obj, loc, clo.
    subphrases : mapping
        Must provide the keys 'string_<part>_s1' and 'string_<part>_s2' for the
        parts that appear in the explanation.

    Returns
    -------
    str
        'The sentences are neutral' if the first 25 labels are all 'neutral' or
        'nan'. Otherwise one clause per 'contradiction' label; if there is none,
        one clause per 'entailment' label. Clauses are joined with ' and '.
    """
    roles = ['subj', 'verb', 'obj', 'loc', 'clo']
    # Index i combines roles[i // 5] of sentence 1 with roles[i % 5] of sentence 2.
    pairs = [[first + '_s1', second + '_s2'] for first in roles for second in roles]
    pairs_map = {
        'subj_s1': 'the subject of sentence 1',
        'verb_s1': 'the verb of sentence 1',
        'obj_s1': 'the object of sentence 1',
        'loc_s1': 'the location of sentence 1',
        'clo_s1': 'the clothing described in sentence 1',
        'subj_s2': 'the subject of sentence 2',
        'verb_s2': 'the verb of sentence 2',
        'obj_s2': 'the object of sentence 2',
        'loc_s2': 'the location of sentence 2',
        'clo_s2': 'the clothing described in sentence 2',
    }

    if all([Z[i] in ['neutral', 'nan'] for i in range(25)]):
        return 'The sentences are neutral'

    def reason(idx, label):
        """Build the clause for position idx, worded for the given label."""
        key_1, key_2 = pairs[idx]
        phrase_1 = subphrases["string_" + key_1]
        phrase_2 = subphrases["string_" + key_2]
        # Positions 0, 6, 12, 18, 24 compare the same part of both sentences.
        same_part = idx % 6 == 0
        if label == 'contradiction':
            if same_part:
                return f'{phrase_1} is not the same as {phrase_2}'
            return f'if {pairs_map[key_1]} is {phrase_1}, {pairs_map[key_2]} cannot be {phrase_2}'
        if same_part:
            return f'{phrase_1} is the same as {phrase_2}'
        return f'if {pairs_map[key_1]} is {phrase_1}, then {pairs_map[key_2]} has to be {phrase_2}'

    # Contradictions take precedence; entailments are only reported when there are none.
    reasons = [reason(idx, 'contradiction') for idx, label in enumerate(Z) if label == 'contradiction']
    if len(reasons) == 0:
        reasons = [reason(idx, 'entailment') for idx, label in enumerate(Z) if label == 'entailment']
    return " and ".join(reasons)

In [107]:
# Print the generated explanation for the first 40 rows (or fewer if less are
# available), prefixed with whether the aggregated prediction matches the gold label.
for i in range(min(40, len(y_hat_pred))):
    # y_hat is a Series indexed by pairID, so positional access has to go through
    # .iloc; y_hat[i] relies on a deprecated integer-key fallback.
    if y_hat.iloc[i] != y_hat_pred[i]:
        prefix = 'WRONG: '
    else:
        prefix = 'TRUE: '
    print(prefix + generate_explanation(z[i,:], dev_subphrases.iloc[i]))

TRUE: The sentences are neutral
TRUE: embracing is the same as holding and if the verb of sentence 1 is embracing, then the object of sentence 2 has to be packages
WRONG: The sentences are neutral
TRUE: if the verb of sentence 1 is standing, then the subject of sentence 2 has to be Two kids in numbered jerseys and if the verb of sentence 1 is standing, then the object of sentence 2 has to be their hands
WRONG: if the verb of sentence 1 is standing, then the object of sentence 2 has to be their hands
TRUE: in blue jerseys is not the same as in jackets
TRUE: A man is not the same as A woman and if the subject of sentence 1 is A man, the verb of sentence 2 cannot be drinks and if the subject of sentence 1 is A man, the object of sentence 2 cannot be her coffee and if the subject of sentence 1 is A man, the location of sentence 2 cannot be in a small cafe and if the object of sentence 1 is donuts, the location of sentence 2 cannot be in a small cafe and if the location of sentence 1 is in 

In [105]:
# Inspect the 25 per-classifier labels stored in row 39 of z.
z[39,:]

array(['entailment', 'entailment', 'neutral', 'nan', 'nan', 'neutral',
       'neutral', 'neutral', 'nan', 'nan', 'neutral', 'neutral',
       'neutral', 'nan', 'nan', 'neutral', 'neutral', 'neutral', 'nan',
       'nan', 'nan', 'nan', 'nan', 'nan', 'nan'], dtype='<U100')

In [104]:
# Inspect the subphrase strings stored in row 39 of dev_subphrases.
dev_subphrases.iloc[39]

Unnamed: 0                        43
string_subj_s1           Two doctors
string_verb_s1               perform
string_obj_s1     surgery on patient
string_loc_s1             on patient
string_clo_s1                    NaN
string_subj_s2               Doctors
string_verb_s2            performing
string_obj_s2                surgery
string_loc_s2                    NaN
string_clo_s2                    NaN
Name: 5436250638.jpg#4r1e, dtype: object