In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/glove840b300dtxt/glove.840B.300d.txt
/kaggle/input/snli-data/SNLI/test.txt
/kaggle/input/snli-data/SNLI/ReadMe.txt
/kaggle/input/snli-data/SNLI/train.txt
/kaggle/input/snli-data/SNLI/dev.txt


In [2]:
import pickle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import keras.backend as K
from keras.layers import LSTM, Input, Dot, Softmax, Multiply, Concatenate, Subtract, Dense, Lambda, Embedding, Dropout
from keras.layers.wrappers import Bidirectional
from keras.models import Model, load_model

In [3]:
!wget https://nlp.stanford.edu/projects/snli/snli_1.0.zip


--2021-12-07 15:22:54--  https://nlp.stanford.edu/projects/snli/snli_1.0.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 94550081 (90M) [application/zip]
Saving to: ‘snli_1.0.zip’


2021-12-07 15:23:56 (1.47 MB/s) - ‘snli_1.0.zip’ saved [94550081/94550081]



In [4]:
!unzip snli_1.0.zip

Archive:  snli_1.0.zip
   creating: snli_1.0/
  inflating: snli_1.0/.DS_Store      
   creating: __MACOSX/
   creating: __MACOSX/snli_1.0/
  inflating: __MACOSX/snli_1.0/._.DS_Store  
 extracting: snli_1.0/Icon           
  inflating: __MACOSX/snli_1.0/._Icon  
  inflating: snli_1.0/README.txt     
  inflating: __MACOSX/snli_1.0/._README.txt  
  inflating: snli_1.0/snli_1.0_dev.jsonl  
  inflating: snli_1.0/snli_1.0_dev.txt  
  inflating: snli_1.0/snli_1.0_test.jsonl  
  inflating: snli_1.0/snli_1.0_test.txt  
  inflating: snli_1.0/snli_1.0_train.jsonl  
  inflating: snli_1.0/snli_1.0_train.txt  
  inflating: __MACOSX/._snli_1.0     


In [5]:
from os.path import join as pjoin, isfile
import json
import numpy as np

In [6]:
TEXT_DATA_DIR = 'snli_1.0'

def load_data(tier):
    
    premise = []
    hypothseis = []
    label = []
    cnt = 0
    
    with open(pjoin(TEXT_DATA_DIR, 'snli_1.0_' + tier + '.jsonl')) as f:
        for line in f.readlines():
            d = json.loads(line)
            if d['gold_label'] != '-':
                cnt += 1
                premise.append(d['sentence1'])
                hypothseis.append(d['sentence2'])
                label.append(d['gold_label'])

    print('# of', tier, 'samples :', cnt, end=' | ')
    print('Entailment :', label.count('entailment'), '| Neutral :', label.count('neutral'), '| Contradiction :', label.count('contradiction'))
    return (premise, hypothseis, label)

train = load_data('train')
dev = load_data('dev')
test = load_data('test')

# of train samples : 549367 | Entailment : 183416 | Neutral : 182764 | Contradiction : 183187
# of dev samples : 9842 | Entailment : 3329 | Neutral : 3235 | Contradiction : 3278
# of test samples : 9824 | Entailment : 3368 | Neutral : 3219 | Contradiction : 3237


In [7]:
SentenceLen = 100
WordVecLen = 300

if not isfile('tokenizer.pickle'):
    tokenizer = Tokenizer(num_words=SentenceLen)
    tokenizer.fit_on_texts(train[0] + train[1] + dev[0] + dev[1] + test[0] + test[1])
    with open('tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

tokenizer = pickle.load(open('tokenizer.pickle', 'rb'))

def PadSeq(text):
    sequences = tokenizer.texts_to_sequences(text)
    return pad_sequences(sequences, maxlen=SentenceLen)

In [8]:
if not isfile('embeddings.npy'):

    embeddings_index = {}
    f = open('../input/glove840b300dtxt/glove.840B.300d.txt', encoding='utf8')
    for line in f:
        values = line.split()
        word = ' '.join(values[:-300])
        coefs = np.asarray(values[-300:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    
    word_index = tokenizer.word_index

    embedding_matrix = np.zeros((len(word_index) + 1, WordVecLen))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    
    np.save('embeddings.npy', embedding_matrix)

def load_embeddings():
    embedding_matrix = np.load('embeddings.npy')
    embedding_layer = Embedding(len(tokenizer.word_index) + 1,
                                WordVecLen,
                                weights=[embedding_matrix],
                                input_length=SentenceLen,
                                trainable=False)
    return embedding_layer


embedding_layer = load_embeddings()

In [9]:
def labelToVec(labels):
    vec = []
    for label in labels:
        if label == 'entailment':
            vec.append([1.0, 0.0, 0.0])
        elif label == 'contradiction':
            vec.append([0.0, 1.0, 0.0])
        elif label == 'neutral':
            vec.append([0.0, 0.0, 1.0])
        else:
            raise ValueError('Unknown label %s' % (label))
    return np.array(vec)

train_y = labelToVec(train[2])
train_x = [PadSeq(train[0]), PadSeq(train[1])]
dev_y = labelToVec(dev[2])
dev_x = [PadSeq(dev[0]), PadSeq(dev[1])]
test_y = labelToVec(test[2])
test_x = [PadSeq(test[0]), PadSeq(test[1])]

del train
del dev
del test
del tokenizer

In [10]:
bilstm1 = Bidirectional(LSTM(300, return_sequences=True))
bilstm2 = Bidirectional(LSTM(300, return_sequences=True))

i1 = Input(shape=(SentenceLen,), dtype='float32')
i2 = Input(shape=(SentenceLen,), dtype='float32')

x1 = embedding_layer(i1)
x2 = embedding_layer(i2)

x1 = bilstm1(x1)
x2 = bilstm1(x2)

e = Dot(axes=2)([x1, x2])
e1 = Softmax(axis=2)(e)
e2 = Softmax(axis=1)(e)
e1 = Lambda(K.expand_dims, arguments={'axis' : 3})(e1)
e2 = Lambda(K.expand_dims, arguments={'axis' : 3})(e2)

_x1 = Lambda(K.expand_dims, arguments={'axis' : 1})(x2)
_x1 = Multiply()([e1, _x1])
_x1 = Lambda(K.sum, arguments={'axis' : 2})(_x1)
_x2 = Lambda(K.expand_dims, arguments={'axis' : 2})(x1)
_x2 = Multiply()([e2, _x2])
_x2 = Lambda(K.sum, arguments={'axis' : 1})(_x2)

m1 = Concatenate()([x1, _x1, Subtract()([x1, _x1]), Multiply()([x1, _x1])])
m2 = Concatenate()([x2, _x2, Subtract()([x2, _x2]), Multiply()([x2, _x2])])

y1 = bilstm2(m1)
y2 = bilstm2(m2)

mx1 = Lambda(K.max, arguments={'axis' : 1})(y1)
av1 = Lambda(K.mean, arguments={'axis' : 1})(y1)
mx2 = Lambda(K.max, arguments={'axis' : 1})(y2)
av2 = Lambda(K.mean, arguments={'axis' : 1})(y2)

y = Concatenate()([av1, mx1, av2, mx2])
y = Dense(1024, activation='tanh')(y)
y = Dropout(0.5)(y)
y = Dense(1024, activation='tanh')(y)
y = Dropout(0.5)(y)
y = Dense(3, activation='softmax')(y)

model = Model(inputs=[i1, i2], outputs=y)
model.summary()

2021-12-07 15:29:52.039066: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-07 15:29:52.173357: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-07 15:29:52.174543: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-07 15:29:52.178292: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 100)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 100)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 100, 300)     10461900    input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 100, 600)     1442400     embedding[0][0]              

In [11]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [12]:
history = model.fit(train_x, train_y, epochs=10, validation_data=(dev_x, dev_y))

2021-12-07 15:29:58.412124: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 219746800 exceeds 10% of free system memory.
2021-12-07 15:29:58.585609: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 219746800 exceeds 10% of free system memory.
2021-12-07 15:29:58.858609: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/10


2021-12-07 15:30:12.812115: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [13]:
score = model.evaluate(test_x, test_y, verbose=1)
print('Test loss : ', score[0])
print('Test accuracy : ', score[1])

Test loss :  0.8461464643478394
Test accuracy :  0.6169584393501282


In [14]:
model.save_weights('NLI.h5')

2021-12-07 23:30:03.366198: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 41847600 exceeds 10% of free system memory.
