In [1]:
import numpy as np
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from gensim.models import Word2Vec

df = pd.read_csv('input/train.csv')

Using TensorFlow backend.


In [2]:
sent1 = list(df['sent1'])
sent2 = list(df['sent2'])
samesies = list(df['same_source'])
del df

In [3]:
def word2vec(sentences, embedding_dim):
    model = Word2Vec(sentences, min_count=1, size=embedding_dim)
    wv = model.wv
    return wv

def create_embedding_matrix(tokenizer, word_vectors, embedding_dim):
    num_words = len(tokenizer.word_index) + 1
    word_index = tokenizer.word_index
    embedding_matrix = np.zeros((num_words, embedding_dim))
    print("Embedding matrix shape: %s" % str(embedding_matrix.shape))
    for word, i in word_index.items():
        try:
            embedding_vector = word_vectors[word]
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
        except KeyError:
            pass
    print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
    return embedding_matrix

In [4]:
both_sent = sent1 + sent2
tokenizer = Tokenizer()
tokenizer.fit_on_texts(both_sent)
both_sent = [x.lower().split() for x in both_sent]
wv = word2vec(both_sent,100)
embedding_matrix = create_embedding_matrix(tokenizer, wv,100)

Embedding matrix shape: (140451, 100)
Null word embeddings: 33552


In [5]:
from keras.layers import Dense, Input, LSTM, Dropout, Bidirectional
from keras.layers.normalization import BatchNormalization
from keras.layers.embeddings import Embedding
from keras.layers.merge import concatenate
from keras.callbacks import TensorBoard
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model
from keras.models import Model
from keras import optimizers
import time
import gc
import os

def create_train(tokenizer, sent1, sent2, is_similar, max_sequence_length, validation_split_ratio):
    sent1 = [x.lower() for x in sent1]
    sent2 = [x.lower() for x in sent2]
    train_sequences_1 = tokenizer.texts_to_sequences(sent1)
    train_sequences_2 = tokenizer.texts_to_sequences(sent2)
    leaks = [[len(set(x1)), len(set(x2)), len(set(x1).intersection(x2))]
             for x1, x2 in zip(train_sequences_1, train_sequences_2)]

    train_padded_data_1 = pad_sequences(train_sequences_1, maxlen=max_sequence_length)
    train_padded_data_2 = pad_sequences(train_sequences_2, maxlen=max_sequence_length)
    train_labels = np.array(is_similar)
    leaks = np.array(leaks)

    shuffle_indices = np.random.permutation(np.arange(len(train_labels)))
    train_data_1_shuffled = train_padded_data_1[shuffle_indices]
    train_data_2_shuffled = train_padded_data_2[shuffle_indices]
    train_labels_shuffled = train_labels[shuffle_indices]
    leaks_shuffled = leaks[shuffle_indices]

    dev_idx = max(1, int(len(train_labels_shuffled) * validation_split_ratio))

    del train_padded_data_1
    del train_padded_data_2
    gc.collect()

    train_data_1, val_data_1 = train_data_1_shuffled[:-dev_idx], train_data_1_shuffled[-dev_idx:]
    train_data_2, val_data_2 = train_data_2_shuffled[:-dev_idx], train_data_2_shuffled[-dev_idx:]
    labels_train, labels_val = train_labels_shuffled[:-dev_idx], train_labels_shuffled[-dev_idx:]
    leaks_train, leaks_val = leaks_shuffled[:-dev_idx], leaks_shuffled[-dev_idx:]

    return train_data_1, train_data_2, labels_train, leaks_train, val_data_1, val_data_2, labels_val, leaks_val


def create_test(tokenizer, sent_pair, max_length):
    test_sent1 = [x[0].lower() for x in sent_pair]
    test_sent2 = [x[1].lower() for x in sent_pair]

    test_sent1 = tokenizer.texts_to_sequences(test_sent1)
    test_sent2 = tokenizer.texts_to_sequences(test_sent2)
    leaks_test = [[len(set(x1)), len(set(x2)), len(set(x1).intersection(x2))]
                  for x1, x2 in zip(test_sent1, test_sent2)]

    leaks_test = np.array(leaks_test)
    test_data_1 = pad_sequences(test_sent1, maxlen=max_length)
    test_data_2 = pad_sequences(test_sent2, maxlen=max_length)

    return test_data_1, test_data_2, leaks_test

In [7]:
train_data_x1, train_data_x2, train_labels, leaks_train, \
val_data_x1, val_data_x2, val_labels, leaks_val = create_train(tokenizer, sent1,sent2,
                                                                       samesies, 30,
                                                                       .2)

In [8]:
sentences_pair = [(x1, x2) for x1, x2 in zip(sent1, sent2)]

EMBEDDING_DIM = 100
MAX_SEQUENCE_LENGTH = 30
VALIDATION_SPLIT = 0.1
LSTM_DROP_RATE = 0.4
DENSE_DROP_RATE = 0.5
LSTM_UNITS = 100
DENSE_UNITS = 100

num_words = len(tokenizer.word_index) + 1

# Creating word embedding layer
embedding_layer = Embedding(num_words, EMBEDDING_DIM, weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH, trainable=False)

# Creating LSTM Encoder
lstm_layer = Bidirectional(LSTM(LSTM_UNITS, dropout=LSTM_DROP_RATE, recurrent_dropout=LSTM_DROP_RATE))

# Creating LSTM Encoder layer for First Sentence
sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequences_1)

# Creating LSTM Encoder layer for Second Sentence
sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
x2 = lstm_layer(embedded_sequences_2)

# Creating leaks input
leaks_input = Input(shape=(leaks_train.shape[1],))
leaks_dense = Dense(int(DENSE_UNITS/2), activation='relu')(leaks_input)

# Merging two LSTM encodes vectors from sentences to
# pass it to dense layer applying dropout and batch normalisation
merged = concatenate([x1, x2, leaks_dense])
merged = BatchNormalization()(merged)
merged = Dropout(DENSE_DROP_RATE)(merged)
merged = Dense(DENSE_UNITS, activation='relu')(merged)
merged = BatchNormalization()(merged)
merged = Dropout(DENSE_DROP_RATE)(merged)
preds = Dense(1, activation='sigmoid')(merged)

model = Model(inputs=[sequence_1_input, sequence_2_input, leaks_input], outputs=preds)
model.compile(loss='binary_crossentropy', optimizer=optimizers.Adam(lr=0.001), metrics=['acc'])
print(model.summary())

early_stopping = EarlyStopping(monitor='val_loss', patience=3)
model_checkpoint = ModelCheckpoint('checkpoints/', save_best_only=True, save_weights_only=False)
tensorboard = TensorBoard(log_dir='model/' + "logs/{}".format(time.time()))

print(train_data_x1[0])

model.fit([train_data_x1, train_data_x2, leaks_train], train_labels,
          validation_data=([val_data_x1, val_data_x2, leaks_val], val_labels),
          epochs=30, batch_size=64, shuffle=True,
          callbacks=[early_stopping, model_checkpoint, tensorboard])

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 30)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 30)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 30, 100)      14045100    input_1[0][0]                    
                                                                 input_2[0][0]                    
______________________________________________________________

KeyboardInterrupt: 

In [None]:
test_df = pd.read_csv('input/test.csv')

In [None]:
test_df['tuple'] = list(zip(test_df.sent1, test_df.sent2))
test_df.head()

In [None]:
list_test = list(test_df.tuple)

In [None]:
from operator import itemgetter
from keras.models import load_model

model = load_model(best_model_path)

test_sentence_pairs = [('What can make Physics easy to learn?','How can you make physics easy to learn?'),('How many times a day do a clocks hands overlap?','What does it mean that every time I look at the clock the numbers are the same?')]

test_data_x1, test_data_x2, leaks_test = create_test_data(tokenizer,list_test, 30)

preds = list(model.predict([test_data_x1, test_data_x2, leaks_test], verbose=1).ravel())
results = [(x, y, z) for (x, y), z in zip(list_test, preds)]

In [None]:
siamese

In [None]:
print(results[0])

In [None]:
test_dfy = pd.DataFrame(results) 
test_dfy.head() 

In [None]:
print(test_df.shape)
print(test_dfy.shape)

In [None]:
result = pd.concat([test_df, test_dfy], axis=1, join='inner')
result

In [None]:
result.drop(['sent1','sent2','tuple',0,1],axis=1,inplace=True)
result.head()

In [None]:
result.columns = ['id', 'same_source']

In [None]:
result.shape

In [None]:
result.to_csv('submission_4.csv')

In [None]:
final_df = pd.DataFrame(results)

In [None]:
final_df.head()