In [1]:
import pandas as pd
import numpy as np
from keras import preprocessing
from keras.preprocessing.text import Tokenizer
from data import load_data

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Seed NumPy's global random generator so repeated runs draw the same numbers.
seed = 23
np.random.seed(seed=seed)

In [3]:
# Load the three dataset splits returned by the project-local loader.
# The preview below shows each row carrying a score and a sentence pair.
train, dev, test = load_data.load_sts()

train.head()

Unnamed: 0,score,sentence1,sentence2
0,5.0,A plane is taking off.,An air plane is taking off.
1,3.8,A man is playing a large flute.,A man is playing a flute.
2,3.8,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...
3,2.6,Three men are playing chess.,Two men are playing chess.
4,4.25,A man is playing the cello.,A man seated is playing the cello.


First we need to tokenize all the words.

In [4]:
max_words = 10000  # vocabulary cap: passed to the Tokenizer (num_words) and the Embedding layer
maxlen = 20  # maximum sentence length in tokens: used for pad_sequences and the Embedding input length

In [5]:
# Fit the vocabulary on the distinct sentences found in either column of the training split.
sentences = np.unique(
    np.concatenate((train["sentence1"].values, train["sentence2"].values))
)
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(sentences)

In [6]:
# Report how many distinct words fall outside the max_words vocabulary cap.
# The difference is clamped at zero so that a vocabulary smaller than the cap
# is reported as "skipping 0 words" instead of a negative count.
# NOTE(review): Keras' Tokenizer appears to keep only the num_words - 1 most
# frequent words, so the true number skipped may be one higher - confirm.
num_words_found = len(tokenizer.word_counts)
print('skipping {} words'.format(max(num_words_found - max_words, 0)))

skipping 2064 words


In [7]:
def tokenize_df(df):
    """Convert both sentence columns of ``df`` into padded index sequences.

    Each column is mapped to word indices with the notebook-level
    ``tokenizer`` and then passed through ``pad_sequences`` with
    ``maxlen=maxlen``.

    Returns:
        A ``(x1, x2)`` tuple holding the padded sequences for the
        ``sentence1`` and ``sentence2`` columns, in that order.
    """
    def encode(column):
        # Texts -> lists of word indices -> fixed-length padded sequences.
        sequences = tokenizer.texts_to_sequences(df[column])
        return preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)

    return encode('sentence1'), encode('sentence2')

In [8]:
# Tokenize and pad every split, in train -> dev -> test order.
train_x1, train_x2 = tokenize_df(train)
dev_x1, dev_x2 = tokenize_df(dev)
test_x1, test_x2 = tokenize_df(test)

Now we can build the model

In [9]:
from keras.layers import Embedding, LSTM, Dense, Input
from keras.models import Sequential, Model
from keras import layers
import keras

First we build a base model to be shared between the two input sentences so they are both encoded the same way

In [10]:
def create_base_network(embedding_dim=64, lstm_units=64):
    """Build the base network shared by both sentences (the feature extractor).

    The network is an ``Embedding`` layer sized by the notebook-level
    ``max_words`` and ``maxlen`` settings, followed by a single ``LSTM``.

    Args:
        embedding_dim: Size of each word embedding vector. Defaults to 64.
        lstm_units: Number of units in the LSTM layer. Defaults to 64.

    Returns:
        The assembled ``Sequential`` model (not compiled).
    """
    model = Sequential()
    # input_length ties the embedding to the padded sequence length.
    model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
    model.add(LSTM(lstm_units))
    return model

In [11]:
# One shared encoder plus one integer-sequence input per sentence.
# The input length comes from maxlen (instead of a repeated literal 20) so it
# stays in sync with the padded sequences and the Embedding layer.
base = create_base_network()
in1 = Input(shape=(maxlen,))
in2 = Input(shape=(maxlen,))

Now we share the base layer and combine them

In [12]:
# Run both inputs through the same base network, then join the two encodings
# along the last axis.
encoded_1, encoded_2 = base(in1), base(in2)

concatenated = layers.Concatenate(axis=-1)([encoded_1, encoded_2])

The output has no activation so it can output any number

In [13]:
# A single unit with no activation, so the predicted score is unbounded.
predictions = Dense(units=1, activation=None)(concatenated)
model = Model(inputs=[in1, in2], outputs=predictions)

In [26]:
# Regression setup: optimise mean squared error and also track mean absolute error.
# NOTE(review): the saved outputs further down report `mean_squared_error`
# rather than MAE, so they appear to come from a run with a different metrics
# setting - re-run the notebook top to bottom to refresh them.
model.compile(optimizer='rmsprop',
              loss='mse',
              metrics=['mae'])

Add callbacks to stop training when it doesn't get better

In [33]:
# Training callbacks. Both watch `val_loss`, which is always logged when
# validation data is supplied. The name of a metric key such as
# `val_mean_absolute_error` depends on the Keras version and on the compiled
# metrics, and a missing key leaves early stopping inactive.
callbacks_list = [
    # Stop as soon as the validation loss fails to improve for one epoch.
    keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=1,
    ),
    # Keep only the weights from the epoch with the best validation loss.
    keras.callbacks.ModelCheckpoint(
        filepath='my_model.h5',
        monitor='val_loss',
        save_best_only=True,
    )
]

In [None]:
# Train for at most 10 epochs, validating on the dev split; the callbacks may
# end training early.
model.fit(
    x=[train_x1, train_x2],
    y=train['score'],
    validation_data=([dev_x1, dev_x2], dev['score']),
    batch_size=32,
    epochs=10,
    callbacks=callbacks_list,
)

Train on 5749 samples, validate on 1500 samples
Epoch 1/10
Epoch 2/10
 224/5749 [>.............................] - ETA: 4s - loss: 0.0408 - mean_squared_error: 0.0408





In [29]:
# Score the model on the test split; the recorded output below is a
# two-element list of values.
model.evaluate(x=[test_x1, test_x2], y=test['score'])



[3.055799792692919, 3.055799792692919]

In [30]:
# Show the labels for the values reported by model.evaluate above.
model.metrics_names

['loss', 'mean_squared_error']