# Load Data

In [1]:
import pandas as pd

In [2]:
# Read the question pairs, discard the label column (labels are loaded from
# train_labels.csv instead) and keep only rows where both questions are present.
train = (
    pd.read_csv("../../../data/train_data.csv")
    .drop(columns="is_duplicate")
    .dropna(subset=["question1", "question2"])
)

In [3]:
# Load the labels and align them with the filtered `train` rows.
# `train` is deliberately the left side of the merge: an inner merge keeps the
# order of the left keys, so the resulting labels follow train's row order.
# The labels are later paired with the question tensors purely by position.
labels_raw = pd.read_csv("../../../data/train_labels.csv")
labels_pd = train[["id"]].merge(labels_raw, on="id", how="inner").is_duplicate

# Every remaining question pair must end up with a label; otherwise the
# positional pairing between questions and labels is broken.
assert len(labels_pd) == len(train), "train rows and labels are misaligned"

print('Shape of train data:', train.shape)
print('Shape of label data:', labels_pd.shape)

Shape of train data: (323160, 3)
Shape of label data: (323160,)


In [None]:
# Give both objects a fresh 0..n-1 index so they can be paired by position.
# drop=True discards the old index instead of inserting it as an extra column
# (for the label Series that would turn it into a two-column DataFrame).
labels_pd = labels_pd.reset_index(drop=True)
train = train.reset_index(drop=True)

## Create corpus of questions

In [4]:
import numpy as np

In [5]:
# Pool both question columns and keep each distinct question text once.
question_columns = [train["question1"], train["question2"]]
text_body = pd.concat(question_columns).unique()

In [6]:
def line_size(x):
    """Return ``len(x)``; for a question string this is its character count."""
    return len(x)


# Element-wise version of line_size. Declaring the output type up front lets
# the vectorised function accept an empty array as well: without `otypes`
# numpy has to call the function on the first element to infer the result
# type and raises on size-0 input.
vfunc = np.vectorize(line_size, otypes=[int])

In [7]:
import seaborn as sns

# Distribution of len() over the unique question texts.
question_lengths = vfunc(text_body)
sns.distplot(question_lengths)

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


<matplotlib.axes._subplots.AxesSubplot at 0x11ed41a58>

# Create Splits and Tokenise Data

In [8]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

# Every question is padded or truncated to this many token ids.
PADDING_SIZE = 10

# Build the vocabulary from the pool of unique question texts.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_body)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Encode each question column as sequences of token ids, then bring both to
# fixed-width matrices of PADDING_SIZE columns.
sequences_q1 = tokenizer.texts_to_sequences(train.question1)
sequences_q2 = tokenizer.texts_to_sequences(train.question2)
q1_data, q2_data = (
    pad_sequences(seqs, maxlen=PADDING_SIZE)
    for seqs in (sequences_q1, sequences_q2)
)

labels = labels_pd
print('Shape of q1 tensor:', q1_data.shape)
print('Shape of q2 tensor:', q2_data.shape)

print('Shape of label tensor:', labels.shape)

Using TensorFlow backend.


Found 86071 unique tokens.
Shape of q1 tensor: (323160, 10)
Shape of q2 tensor: (323160, 10)
Shape of label tensor: (323160,)


In [9]:
# Split the data into a training set and a validation set.
#
# One shared permutation is applied to q1, q2 and the labels, so that row k of
# every array still describes the same question pair after shuffling.
indices = np.arange(q1_data.shape[0])
np.random.shuffle(indices)

q1_data = q1_data[indices]
q2_data = q2_data[indices]
# Index the labels by position on a plain array; integer-array indexing on a
# pandas Series is resolved against index labels rather than positions.
labels = np.asarray(labels)[indices]

VALIDATION_SPLIT = 0.2

nb_validation_samples = int(VALIDATION_SPLIT * q1_data.shape[0])
# Slice with an explicit boundary: `[:-0]` would yield an empty training set
# if the validation share ever rounded down to zero samples.
split_at = q1_data.shape[0] - nb_validation_samples

x1_train = q1_data[:split_at]
x2_train = q2_data[:split_at]
y_train = labels[:split_at]
x1_val = q1_data[split_at:]
x2_val = q2_data[split_at:]
y_val = labels[split_at:]

# Load GloVe 100d into dict

In [10]:
import os
import numpy as np

In [11]:
# Directory (relative path) from which the GloVe vector file is read.
GLOVE_DIR = "../../../data/glove/"

In [12]:
embeddings_index = {}  # word -> float32 coefficient vector, filled by the parsing loop
line_num = 0           # number of GloVe lines parsed so far

# Read the whole file into memory inside a context manager so the handle is
# released even if reading fails. The encoding is passed explicitly because
# the platform default is not UTF-8 everywhere.
# NOTE(review): assumes the GloVe text file is UTF-8 encoded - confirm.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    ff = list(f)

In [13]:
# Make sure the GloVe file handle is closed; its lines are already held in `ff`.
f.close()

In [14]:
# Parse the GloVe lines into embeddings_index: the first whitespace-separated
# field of a line is the word, the remaining fields are its coefficients.
MAX_GLOVE_LINES = 400000  # read at most this many lines

# enumerate() restarts the counter on every run of this cell, so re-running it
# does not depend on `line_num` having been reset in another cell.
for line_num, line in enumerate(ff[:MAX_GLOVE_LINES], start=1):
    values = line.split()
    if not values:
        # Blank line (e.g. a trailing newline at end of file): nothing to parse.
        continue
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

# Create Embedding


In [15]:
EMBEDDING_DIM = 100

# One row per tokenizer index, plus one extra row at position 0; rows start
# out as zeros and are overwritten where a pre-trained vector exists.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        # Words not found in the embedding index stay all-zeros.
        continue
    embedding_matrix[row] = vector

In [16]:
from keras.layers import Embedding

# Sequence length is taken from the second axis of the padded training matrix.
MAX_SEQUENCE_LENGTH = x1_train.shape[1]

# Embedding layer initialised from embedding_matrix and excluded from training.
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

In [17]:
# Sanity check: display the dimensions of the question-1 training matrix.
x1_train.shape

(258528, 10)

# Model

In [18]:
from keras.layers import * 
from keras.models import Model


In [19]:
# Two-input network: both questions pass through the same embedding layer and
# the same LSTM, the two encodings are multiplied element-wise, and two dense
# layers map the product to a single sigmoid output.
input1_tensor = Input(shape=x1_train.shape[1:])
input2_tensor = Input(shape=x2_train.shape[1:])

words_embedding_layer = embedding_layer
seq_embedding_layer = LSTM(256, activation='tanh')


def seq_embedding(tensor):
    """Apply the shared embedding layer and then the shared LSTM to ``tensor``."""
    return seq_embedding_layer(words_embedding_layer(tensor))


encoded_q1 = seq_embedding(input1_tensor)
encoded_q2 = seq_embedding(input2_tensor)
merge_layer = multiply([encoded_q1, encoded_q2])

dense1_layer = Dense(16, activation='sigmoid')(merge_layer)
ouput_layer = Dense(1, activation='sigmoid')(dense1_layer)

model = Model([input1_tensor, input2_tensor], ouput_layer)
model.compile(loss='binary_crossentropy', optimizer='adam')
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 10)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 10)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 10, 100)      8607200     input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 256)          365568      embedding_1[0][0]                
          

In [19]:
import os

# NOTE(review): presumably a workaround for the OpenMP "duplicate library"
# abort seen with some TensorFlow installs - confirm it is still required.
# It is set here after Keras has already been imported in earlier cells.
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [20]:
# Train on the (question1, question2) pairs and evaluate on the held-out
# validation pairs after every epoch.
train_inputs = [x1_train, x2_train]
val_inputs = [x1_val, x2_val]
model.fit(
    train_inputs,
    y_train,
    validation_data=(val_inputs, y_val),
    batch_size=128,
    epochs=6,
)

Train on 258528 samples, validate on 64632 samples
Epoch 1/6
 35968/258528 [===>..........................] - ETA: 19:44 - loss: 0.6603

KeyboardInterrupt: 