In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import reduce_sum
from keras.models import Sequential, Model, load_model
from keras.layers import LSTM, Dense, Dropout, Input, Concatenate, Subtract

from sklearn.preprocessing import LabelEncoder

In [2]:
train = pd.read_csv('data/contradiction dataset/training.txt', delimiter = '\t')
train = train.drop('S.NO.', axis = 1)

test = pd.read_csv('data/contradiction dataset/testing.txt', delimiter = '\t')
test = test.drop('S.NO.', axis = 1)

train.head()

Unnamed: 0,SENTENCE A,SENTENCE B,SENTENCE TYPE,IOOU,NEGATION,ANTONYM,OVERLAP COEFFICIENT
0,the dough used for pancakes is thin,the dough used for pancakes is thick,contradiction,0.75,False,1,0.171429
1,she showed him my ugly picture,she showed him my handsome picture,contradiction,0.714286,False,1,0.166667
2,i only need the healthy half,i only need the bad half,contradiction,0.714286,False,1,0.208333
3,i cant confidently tell you yet,i cant diffidently tell you yet,contradiction,0.714286,False,1,0.16129
4,i need to be sure,i do not need to be sure,contradiction,0.714286,True,0,0.294118


In [3]:
train[train['SENTENCE TYPE'] == 'neutral']

Unnamed: 0,SENTENCE A,SENTENCE B,SENTENCE TYPE,IOOU,NEGATION,ANTONYM,OVERLAP COEFFICIENT
100,a group of kids is playing in a yard and an ol...,a group of boys in a yard is playing and a man...,neutral,0.750000,False,0,0.157895
101,a group of children is playing in the house an...,a group of kids is playing in a yard and an ol...,neutral,0.578947,True,0,0.135802
102,the kids are playing outdoors near a man with ...,a group of kids is playing in a yard and an ol...,neutral,0.250000,False,0,0.094340
103,the young boys are playing outdoors and the ma...,a group of kids is playing in a yard and an ol...,neutral,0.238095,False,1,0.076923
104,two dogs are fighting,two dogs are wrestling and hugging,neutral,0.428571,False,0,0.142857
...,...,...,...,...,...,...,...
3317,a door is being opened by a man,a bald man in a band is playing guitar in the ...,neutral,0.214286,False,0,0.096774
3318,someone is boiling okra in a pot,the man is not playing the drums,neutral,0.083333,True,0,0.031250
3319,the man is singing heartily and playing the gu...,a bicyclist is holding a bike over his head in...,neutral,0.052632,False,0,0.020000
3320,a man in blue has a yellow ball in the mitt,a man is jumping rope outside,neutral,0.153846,False,0,0.068966


In [4]:
print("Total number of samples in training dataset: ", len(train))
print("Total number of samples in test dataset: ", len(test))

print("Output prediction categories: ", set(train['SENTENCE TYPE']))

Total number of samples in training dataset:  3822
Total number of samples in test dataset:  3513
Output prediction categories:  {'contradiction', 'neutral'}


In [5]:
def read_glove_vecs(glove_file):
    with open(glove_file, 'r', encoding="utf8") as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
        
        i = 1
        words_to_index = {}
        index_to_words = {}
        for w in sorted(words):
            words_to_index[w] = i
            index_to_words[i] = w
            i = i + 1
    return words_to_index, index_to_words, word_to_vec_map

In [6]:
def get_max_length(X):
    max_words = 0
    for sentence in X:
        sent_indices = [word.lower() for word in sentence.split()]
        if len(sent_indices)> max_words:
            max_words = len(sent_indices)
            
    return max_words

In [7]:
Xtraina = train['SENTENCE A'].values
Xtrainb = train['SENTENCE B'].values

Xtesta = test['SENTENCE A'].values
Xtestb = test['SENTENCE B'].values

print("Maximum length of sentence in training dataset is :", max(get_max_length(Xtraina), get_max_length(Xtrainb)))
print("Maximum length of sentence in training dataset is :", max(get_max_length(Xtesta), get_max_length(Xtestb)))

Maximum length of sentence in training dataset is : 32
Maximum length of sentence in training dataset is : 28


In [8]:
word_to_index, index_to_word, word_to_vec = read_glove_vecs('data/glove.6B.200d.txt')# dictionaries mapping corresponding elements

In [9]:
SENT_SIZE = 40

def convert_sent_to_emb(X, word_to_vec):
    data = np.zeros((len(X), SENT_SIZE, len(word_to_vec['word'])))
    
    for i, sent in enumerate(X):
        for j, word in enumerate(sent.split()):
            try:
                data[i, j, :] = word_to_vec[word]
            except:
                pass
            
    return data

In [10]:
XtrainA_vec = convert_sent_to_emb(Xtraina, word_to_vec)
XtrainB_vec = convert_sent_to_emb(Xtrainb, word_to_vec)
XtestA_vec = convert_sent_to_emb(Xtesta, word_to_vec)
XtestB_vec = convert_sent_to_emb(Xtestb, word_to_vec)

trainFeatures = np.array(train[['IOOU', 'NEGATION', 'ANTONYM']].values, dtype = float)
testFeatures = np.array(test[['IOOU', 'NEGATION', 'ANTONYM']].values, dtype = float)

Ytrain = train['SENTENCE TYPE'].values
Ytest = test['SENTENCE TYPE'].values

Ytrain = np.array(Ytrain == 'neutral', dtype = int).reshape(len(Ytrain), 1)
Ytest = np.array(Ytest == 'neutral', dtype = int).reshape(len(Ytest), 1)

In [11]:
inp11 = Input(shape = (SENT_SIZE, len(word_to_vec['word'])))
inp12 = Input(shape = (SENT_SIZE, len(word_to_vec['word'])))
inp2 = Input(shape = (3, ))

X11 = LSTM(128)(inp11)
X12 = LSTM(128)(inp12)

subtracted = Subtract()([X11, X12])
X1 = Dense(256)(subtracted)

X = Concatenate(axis = 1)([X1, inp2])

X = Dense(64, activation = 'relu')(X)
out = Dense(1, activation = 'sigmoid')(X)

model = Model(inputs = [inp11, inp12, inp2], outputs = out)

In [12]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 40, 200)]    0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 40, 200)]    0           []                               
                                                                                                  
 lstm (LSTM)                    (None, 128)          168448      ['input_1[0][0]']                
                                                                                                  
 lstm_1 (LSTM)                  (None, 128)          168448      ['input_2[0][0]']                
                                                                                              

In [13]:
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [14]:
model.fit(x = [XtrainA_vec, XtrainB_vec, trainFeatures], y = Ytrain, validation_data=([XtestA_vec, XtestB_vec, testFeatures], Ytest), epochs = 5, batch_size = 32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x21206b4fac0>

In [15]:
Ypred = model.predict([XtestA_vec, XtestB_vec, testFeatures])
Ypred = np.array(Ypred > 0.5, dtype = int)

n = 0
c = 0
for i in range(len(Ytest)):
    if(Ytest[i] == Ypred[i] and Ytest[i] == 1):
        n = n + 1
    if(Ytest[i] == Ypred[i] and Ytest[i] == 0):
        c = c + 1
    
print(n, c)

2678 577


In [26]:
print(Ytest[Ytest == 1].sum())
Ytest[Ytest == 0].sum()

2793


0

In [17]:
n = 0
c = 0
for i, j in zip(Ytest, Ypred):
    if(i[0] == 1):
        n = n+1
    if(i[0] == 0):
        c = c+1
        
print(n, c)

2793 720


In [18]:
n = 0
c = 0
for i, j in zip(Ytest, Ypred):
    if(i[0] == 1 and j[0] == 0):
        n = n+1
    if(i[0] == 0 and j[0] == 1):
        c = c+1
        
print(n, c)

115 143


In [19]:
def jaccard_similarity(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(set(list1)) + len(set(list2))) - intersection
    return float(intersection) / union

def sent_to_list(sent1, sent2):
    l1 = sent1.lower().split()
    l2 = sent2.lower().split()
    return l1, l2

In [33]:
dum1 = 'two dogs are fighting'
dum2 = 'two dogs are loving'

dum3 = 'the dough used for pancakes is thick'
dum4 = 'the dough used for pancakes is thin'

dum5 = 'The response time of the system should be high'
dum6 = 'The capacity of the system should be high'

l1, l2 = sent_to_list(dum5, dum6)
iou = jaccard_similarity(l1, l2)

s1 = convert_sent_to_emb([dum1.lower()], word_to_vec)
s2 = convert_sent_to_emb([dum2.lower()], word_to_vec)

In [34]:

s1.shape

(1, 40, 200)

In [35]:
model.predict([s1, s2, np.array([iou, 0, 0]).reshape(1, 3)])

array([[0.83693063]], dtype=float32)

In [23]:
model.save('contradiction_detection_model.h5')

  layer_config = serialize_layer_fn(layer)


In [24]:
cd_model = load_model('contradiction_detection_model.h5')

In [25]:
cd_model.predict([s1, s2, np.array([0.75, 0, 1.0]).reshape(1, 3)])

array([[0.02787796]], dtype=float32)