# Quick and dirty model

The aim of this notebook is to provide a very simple model that will serve as a baseline for future modeling. The model used is a simple densely connected neural network with shared word embeddings for both input questions.

In [34]:
import pandas as pd

from keras import backend as K
from keras import layers
from keras.models import Model, Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

In [4]:
# Constants
# Vocabulary cap: handed to the Tokenizer as its size limit and used as the
# Embedding layer's input dimension in the models below.
MAX_WORDS = 300
# Fixed question length: used as `maxlen` when padding sequences and as the
# shape of each model input.
MAX_LEN = 100

In [5]:
def prepare_tokenizer(df, max_words):
    """Build a Tokenizer fitted on every question in ``df``.

    Args:
        df: DataFrame with ``question1`` and ``question2`` columns.
        max_words: Vocabulary size limit passed to the Tokenizer.

    Returns:
        The Tokenizer after fitting on both question columns.
    """
    questions = pd.concat([df[column] for column in ('question1', 'question2')])
    fitted = Tokenizer(num_words=max_words)
    # Every entry goes through str() first, so non-string cells are fitted
    # on their string form (a float NaN becomes the text 'nan').
    fitted.fit_on_texts(map(str, questions.to_list()))
    return fitted

In [6]:
def pad_question_sequences(df, max_len):
    """Pad both token-sequence columns of ``df`` to a common length.

    Args:
        df: DataFrame with ``q1_sequences`` and ``q2_sequences`` columns.
        max_len: Value forwarded to ``pad_sequences`` as ``maxlen``.

    Returns:
        A ``(q1_padded, q2_padded)`` tuple, one padded result per column.
    """
    q1_padded, q2_padded = (
        pad_sequences(df[column], maxlen=max_len)
        for column in ('q1_sequences', 'q2_sequences')
    )
    return q1_padded, q2_padded

In [7]:
# Load the labelled question pairs; later cells read the question1,
# question2 and is_duplicate columns from this frame.
train = pd.read_csv('train.csv')

In [8]:
# Preview the first rows of the loaded frame.
train.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [9]:
# Stack both question columns into one Series so the tokenizer sees every question.
all_texts = pd.concat([train[column] for column in ('question1', 'question2')])

In [10]:
# Tokenizer limited to the MAX_WORDS vocabulary cap.
tokenizer = Tokenizer(num_words=MAX_WORDS)

In [11]:
# Fit on the string form of every question (non-string cells are str()-converted).
tokenizer.fit_on_texts(map(str, all_texts.to_list()))

In [12]:
# Encode the first question of each pair as a list of token ids.
train['q1_sequences'] = tokenizer.texts_to_sequences(map(str, train['question1'].to_list()))

In [13]:
# Encode the second question of each pair as a list of token ids.
train['q2_sequences'] = tokenizer.texts_to_sequences(map(str, train['question2'].to_list()))

In [14]:
# Hold out 5% of the pairs, stratified on the label so both parts keep the
# same duplicate ratio; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    train.drop(columns='is_duplicate'),
    train['is_duplicate'],
    test_size=0.05,
    random_state=42,
    stratify=train['is_duplicate'].values,
)

In [15]:
# Pad the token sequences of both questions to MAX_LEN, separately for the
# training and held-out parts. Uses the pad_question_sequences helper defined
# earlier instead of repeating the four pad_sequences calls inline.
q1_train, q2_train = pad_question_sequences(x_train, MAX_LEN)
q1_test, q2_test = pad_question_sequences(x_test, MAX_LEN)

In [29]:
def make_model():
    """Build the baseline question-pair classifier.

    Both questions are fed through one shared Embedding layer (MAX_WORDS
    entries, 300 dimensions). The two embedded sequences are concatenated,
    flattened and passed through ReLU dense layers of 256, 128, 128 and 64
    units, followed by a single sigmoid output unit.

    Returns:
        An uncompiled Model taking ``[question1, question2]`` inputs, each of
        shape ``(MAX_LEN,)``.
    """
    question_inputs = [layers.Input(shape=(MAX_LEN,)) for _ in range(2)]

    # A single layer instance applied to both inputs, so the weights are shared.
    embedding = layers.Embedding(MAX_WORDS, 300, input_length=MAX_LEN)
    embedded = [embedding(question) for question in question_inputs]

    # Alternative merge left disabled: layers.Subtract()(embedded)
    merged = layers.Concatenate()(embedded)
    hidden = layers.Flatten()(merged)

    for units in (256, 128, 128, 64):
        hidden = layers.Dense(units, activation='relu')(hidden)

    probability = layers.Dense(1, activation='sigmoid')(hidden)

    return Model(inputs=question_inputs, outputs=probability)

In [75]:
class ModelMaker:
    """Factory for question-pair classifiers with a configurable dense stack.

    The embedding settings are fixed at construction time; each call to
    ``prepare_model`` builds a fresh network with the requested hidden sizes.
    """

    def __init__(self, max_words, dimensions, input_length):
        """Store the embedding configuration.

        Args:
            max_words: Input dimension of the shared Embedding layer.
            dimensions: Output dimension of the shared Embedding layer.
            input_length: Length of each question input sequence.
        """
        self._input_length = input_length
        self._dimensions = dimensions
        self._max_words = max_words

    def _prepare_embeddings_model(self):
        """Return a Model mapping two question inputs to one flat feature vector.

        Both inputs pass through the same Embedding layer; the results are
        concatenated and flattened.
        """
        question_inputs = [layers.Input(shape=(self._input_length,)) for _ in range(2)]

        # One layer instance applied to both inputs, so the weights are shared.
        embedding = layers.Embedding(
            self._max_words,
            self._dimensions,
            input_length=self._input_length,
        )
        embedded = [embedding(question) for question in question_inputs]

        merged = layers.Concatenate()(embedded)
        features = layers.Flatten()(merged)

        return Model(inputs=question_inputs, outputs=features)

    def prepare_model(self, n_neurons):
        """Build an uncompiled classifier with the given hidden layer sizes.

        Args:
            n_neurons: Iterable of unit counts; one ReLU Dense layer is
                created per entry, in order.

        Returns:
            A Model taking ``[question1, question2]`` inputs and producing a
            single sigmoid output.
        """
        question_inputs = [layers.Input(shape=(self._input_length,)) for _ in range(2)]

        features = self._prepare_embeddings_model()(inputs=question_inputs)

        # Hidden ReLU layers followed by the single-unit sigmoid head.
        dense_stack = [layers.Dense(units, activation='relu') for units in n_neurons]
        dense_stack.append(layers.Dense(1, activation='sigmoid'))
        classifier = Sequential(dense_stack)

        return Model(inputs=question_inputs, outputs=classifier(features))

In [30]:
# Build the fixed-architecture baseline network.
model = make_model()

In [31]:
# Binary target (is_duplicate), so train with binary cross-entropy and track accuracy.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [32]:
# Train for 10 epochs; 5% of the training rows are set aside by Keras for validation.
history = model.fit(
    x=[q1_train, q2_train],
    y=y_train.values,
    epochs=10,
    batch_size=128,
    validation_split=0.05,
)

Train on 364871 samples, validate on 19204 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [69]:
# Factory configured with the same vocabulary cap, embedding width and
# input length as the baseline model.
model_maker = ModelMaker(
    max_words=MAX_WORDS,
    dimensions=300,
    input_length=MAX_LEN,
)

In [76]:
# Smaller dense stack than the baseline: three hidden layers of 64, 32 and 32 units.
model = model_maker.prepare_model(n_neurons=[64, 32, 32])

In [77]:
# Same training configuration as the baseline model.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [78]:
# Train for 10 epochs; 5% of the training rows are set aside by Keras for validation.
history = model.fit(
    x=[q1_train, q2_train],
    y=y_train.values,
    epochs=10,
    batch_size=128,
    validation_split=0.05,
)

Train on 364871 samples, validate on 19204 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [79]:
# Candidate architectures for the sweep below: each entry lists the hidden
# layer widths (in order) that are passed to ModelMaker.prepare_model.
layer_sizes = [
    [64, 32],
    [64, 64, 32],
    [128, 128, 64, 32],
    [128, 128, 64, 64, 32],
    [256, 256, 128, 128, 64, 64, 32, 32, 16, 16, 8]
]

In [80]:
model_maker = ModelMaker(
    max_words=MAX_WORDS,
    dimensions=300,
    input_length=MAX_LEN,
)

# Train one freshly built model per candidate architecture; `results` keeps
# the fit history of each run in the same order as `layer_sizes`.
results = []
for layers_size in layer_sizes:
    model = model_maker.prepare_model(layers_size)

    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )

    history = model.fit(
        x=[q1_train, q2_train],
        y=y_train.values,
        epochs=10,
        batch_size=128,
        validation_split=0.05,
    )

    results.append(history)

Train on 364871 samples, validate on 19204 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 364871 samples, validate on 19204 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 364871 samples, validate on 19204 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 364871 samples, validate on 19204 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 364871 samples, validate on 19204 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
