In [1]:
import pandas as pd
import numpy as np
from keras import preprocessing
from keras.preprocessing.text import Tokenizer
from data import load_data

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


First, load the data.

In [2]:
# Load the sentence-pair dataset through the project-local loader.
df = load_data.get_snli()
# Preview only the first rows; displaying the whole frame bloats the notebook output.
df.head(10)

Unnamed: 0,Sentence 1,Sentence 2,isSim
0,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",0
1,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",1
2,Children smiling and waving at camera,There are children present,1
3,Children smiling and waving at camera,The kids are frowning,0
4,A boy is jumping on skateboard in the middle o...,The boy skates down the sidewalk.,0
5,A boy is jumping on skateboard in the middle o...,The boy does a skateboarding trick.,1
6,An older man sits with his orange juice at a s...,A boy flips a burger.,0
7,Two blond women are hugging one another.,The women are sleeping.,0
8,Two blond women are hugging one another.,There are women showing affection.,1
9,"A few people in a restaurant setting, one of t...",The people are sitting at desks in school.,0


Now we split the data into training and testing sets.

In [3]:
# Hold out part of the data for evaluation; the split logic lives in load_data.split_df.
train, test = load_data.split_df(df)

In [4]:
# Vocabulary size cap handed to the Tokenizer (num_words) and to the Embedding layer.
max_words = 10000
# Maximum sentence length in tokens; sequences are padded to this length.
maxlen = 20

Now we must tokenize the sentences

In [5]:
# Build the vocabulary from the distinct training sentences only, so the test
# split does not influence the word index.
# NOTE(review): the frame displayed earlier shows columns 'Sentence 1'/'Sentence 2';
# this assumes split_df yields 'sentence1'/'sentence2' — confirm.
all_train_sentences = np.concatenate(
    [train["sentence1"].values, train["sentence2"].values]
)
sentences = np.unique(all_train_sentences)

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(sentences)

In [6]:
def tokenize_df(df):
    """Turn both sentence columns of ``df`` into padded integer sequences.

    Uses the module-level ``tokenizer`` and ``maxlen``.

    Args:
        df: Frame with ``'sentence1'`` and ``'sentence2'`` text columns.

    Returns:
        A pair ``(x1, x2)`` of padded sequence arrays, one per sentence column.
    """
    def encode(column):
        # Map words to vocabulary indices, then pad to a fixed length.
        sequences = tokenizer.texts_to_sequences(df[column])
        return preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)

    return encode('sentence1'), encode('sentence2')

In [7]:
# Encode each split with the tokenizer that was fitted on the training sentences.
train_x1, train_x2 = tokenize_df(train)
test_x1, test_x2 = tokenize_df(test)

In [8]:
# Sanity check: expect (number of training pairs, maxlen).
train_x1.shape

(293780, 20)

In [9]:
# Sanity check: expect (number of test pairs, maxlen).
test_x1.shape

(73593, 20)

Now let's set up the network. Since we are building a Siamese network, let's first set up the base network that is shared by both inputs.

In [10]:
from keras.layers import Embedding, LSTM, Dense, Input
from keras.models import Sequential, Model
from keras import layers

In [11]:
def create_base_network(embedding_dim=64, lstm_units=64):
    """Build the sentence encoder shared by both branches (feature extraction).

    The network embeds a padded sequence of word indices and summarises it
    with a single LSTM layer. Vocabulary size and sequence length are read
    from the module-level ``max_words`` and ``maxlen``.

    Args:
        embedding_dim: Size of each word-embedding vector. Defaults to 64.
        lstm_units: Dimensionality of the LSTM output vector. Defaults to 64.

    Returns:
        A ``Sequential`` model with an Embedding layer followed by an LSTM.
    """
    model = Sequential()
    model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
    model.add(LSTM(lstm_units))
    return model

In [12]:
# A single encoder instance is created so that both inputs reuse the same weights.
base = create_base_network()
# The input length follows maxlen so it stays in sync with the padding length
# and with the Embedding layer's input_length.
in1 = Input(shape=(maxlen,))
in2 = Input(shape=(maxlen,))

In [13]:
# Print the layers and parameter counts of the shared encoder.
base.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 64)            640000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                33024     
Total params: 673,024
Trainable params: 673,024
Non-trainable params: 0
_________________________________________________________________


Now we share the base network between the two inputs.

In [14]:
# Apply the same encoder (and therefore the same weights) to each input.
encoded_1, encoded_2 = base(in1), base(in2)

In [15]:
# Join the two sentence encodings along the feature axis.
concatenated = layers.Concatenate(axis=-1)([encoded_1, encoded_2])

In [16]:
# A single sigmoid unit on top of the concatenated encodings gives one score in (0, 1) per pair.
predictions = Dense(1, activation='sigmoid')(concatenated)

In [17]:
# Assemble the two-input, one-output model around the shared encoder.
model = Model(inputs=[in1, in2], outputs=predictions)

In [18]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is tracked as a metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [19]:
# Print the full graph: both inputs, the shared encoder, and the layers on top.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 20)           0                                            
__________________________________________________________________________________________________
sequential_1 (Sequential)       (None, 64)           673024      input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 128)          0           sequential_1[1][0]               
          

In [20]:
# Write the model to an HDF5 file in the working directory.
# NOTE(review): this runs before any call to fit (the training cell below is
# commented out), so the saved weights appear to be the untrained initial
# ones — confirm this is intended.
model.save('model.h5')

In [21]:
# model.fit([train_x1, train_x2], train['isSim'], epochs=10)

In [22]:
# model.evaluate(x=[test_x1, test_x2], y=test['isSim'])

In [23]:
# model.metrics_names