# Architecture A - Multi Concat Functions with Dropout and Using Word2Vec Embedding

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Architecture-A---Multi-Concat-Functions-with-Dropout-and-Using-Word2Vec-Embedding" data-toc-modified-id="Architecture-A---Multi-Concat-Functions-with-Dropout-and-Using-Word2Vec-Embedding-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Architecture A - Multi Concat Functions with Dropout and Using Word2Vec Embedding</a></span></li><li><span><a href="#Download-libraries-for-Google-Colab-and-Download-Embeddings" data-toc-modified-id="Download-libraries-for-Google-Colab-and-Download-Embeddings-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Download libraries for Google Colab and Download Embeddings</a></span></li><li><span><a href="#Import-Data" data-toc-modified-id="Import-Data-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Import Data</a></span></li><li><span><a href="#Load-Embedding-and-Create-Embedding-Layer" data-toc-modified-id="Load-Embedding-and-Create-Embedding-Layer-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Load Embedding and Create Embedding Layer</a></span></li><li><span><a href="#Define-and-Train-Network" data-toc-modified-id="Define-and-Train-Network-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Define and Train Network</a></span></li><li><span><a href="#Pass-Train-and-Test-Data-Through-Network-and-Save-for-Stacking" data-toc-modified-id="Pass-Train-and-Test-Data-Through-Network-and-Save-for-Stacking-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Pass Train and Test Data Through Network and Save for Stacking</a></span></li></ul></div>

# Download libraries for Google Colab and Download Embeddings

In [0]:
! pip install pandas==0.23.4

Collecting pandas==0.23.4
[?25l  Downloading https://files.pythonhosted.org/packages/e1/d8/feeb346d41f181e83fba45224ab14a8d8af019b48af742e047f3845d8cff/pandas-0.23.4-cp36-cp36m-manylinux1_x86_64.whl (8.9MB)
[K    100% |████████████████████████████████| 8.9MB 3.8MB/s 
[31mcufflinks 0.14.6 has requirement plotly>=3.0.0, but you'll have plotly 1.12.12 which is incompatible.[0m
Installing collected packages: pandas
  Found existing installation: pandas 0.22.0
    Uninstalling pandas-0.22.0:
      Successfully uninstalled pandas-0.22.0
Successfully installed pandas-0.23.4


In [0]:
import pandas as pd
pd.__version__

'0.22.0'

In [0]:
! wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

--2018-12-12 23:02:59--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.232.61
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.232.61|:443... connected.
HTTP request sent, awaiting response... 206 Partial Content
Length: 1647046227 (1.5G), 648801875 (619M) remaining [application/x-gzip]
Saving to: ‘GoogleNews-vectors-negative300.bin.gz’


2018-12-12 23:03:14 (41.3 MB/s) - ‘GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



In [0]:
! gunzip GoogleNews-vectors-negative300.bin.gz

# Import Data

In [0]:
import numpy as np
import pandas as pd
import csv

In [0]:
pd.__version__

'0.23.4'

In [0]:
df_train = pd.read_csv("train_data.csv",doublequote=True,quotechar='"',sep=",").drop("is_duplicate",axis=1)

In [0]:
df_t_labels= pd.read_csv('train_labels.csv', encoding='utf-8')
df_t_labels.head()

Unnamed: 0,id,is_duplicate
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [0]:
df_train = df_train.merge(df_t_labels,on=["id"],how="inner")
df_train.head()
df_train['id'] = df_train['id'].apply(str)
df_train['is_duplicate'] = df_train['is_duplicate'].apply(int)

In [0]:
df_train['question1'].fillna('', inplace=True)
df_train['question2'].fillna('', inplace=True)

In [0]:
pd.__version__

'0.23.4'

In [0]:
df_test = pd.read_csv('test_data.csv',doublequote=True)
df_test['test_id'] = df_test['test_id'].apply(str)

In [0]:
df_all = pd.concat((df_train, df_test))
df_all['question1'].fillna('', inplace=True)
df_all['question2'].fillna('', inplace=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [0]:
from sklearn.feature_extraction.text import CountVectorizer
import itertools

In [0]:
counts_vectorizer = CountVectorizer(max_features=10000-1).fit(
    itertools.chain(df_all['question1'], df_all['question2']))

other_index = len(counts_vectorizer.vocabulary_)

In [0]:
import re
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import keras 
keras.__version__

Using TensorFlow backend.


'2.2.4'

In [0]:
words_tokenizer = re.compile(counts_vectorizer.token_pattern)

In [0]:
def create_padded_seqs(texts, max_len=30):
    seqs = texts.apply(lambda s: 
        [counts_vectorizer.vocabulary_[w] if w in counts_vectorizer.vocabulary_ else other_index
         for w in words_tokenizer.findall(s.lower())])
    return pad_sequences(seqs, maxlen=max_len)

In [0]:
nlp_train_df = pd.read_csv("train_features_31.csv")

In [0]:
print(nlp_train_df.drop("id",axis=1).shape)
print(nlp_train_df.shape)

(323164, 31)
(323164, 32)


In [0]:
X1_train, X1_val, X2_train, X2_val, y_train, y_val, train_id, train_val_id, nlp_train, nlp_val= \
    train_test_split(create_padded_seqs(df_train['question1']),
                     create_padded_seqs(df_train['question2']),
                     df_train['is_duplicate'].values,
                     df_train.id,
                     nlp_train_df.drop("id",axis=1).as_matrix(),
                     stratify=df_train['is_duplicate'].values,
                     test_size=0.3, random_state=2018)

  """


In [0]:
train_id_pd = pd.DataFrame(train_id.values)
train_val_id_pd = pd.DataFrame(train_val_id.values)

In [0]:
train_id_pd.to_csv("train_id_multiple_merge_dropout_word2vec.csv",index=False)
train_val_id_pd.to_csv("train_val_id_multiple_merge_dropout_word2vec.csv", index=False)

# Load Embedding and Create Embedding Layer

In [0]:
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

from gensim.models.keyedvectors import KeyedVectors

word_vectors = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

EMBEDDING_DIM=300
NUM_WORDS= 10000
word_index = counts_vectorizer.vocabulary_

vocabulary_size=min(len(word_index)+1,NUM_WORDS)


embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))

In [0]:
for word, i in word_index.items():
    if i>=NUM_WORDS:
        continue
    try:
        embedding_vector = word_vectors[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        embedding_matrix[i]=np.random.normal(0,np.sqrt(0.25),EMBEDDING_DIM)

del(word_vectors)

In [0]:
embedding_matrix.shape

(10000, 300)

In [0]:
from keras.layers import Embedding

MAX_SEQUENCE_LENGTH = X1_train.shape[1]

from keras.layers import Embedding
embedding_layer = Embedding(vocabulary_size,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

# Define and Train Network

In [0]:
from keras import backend as K
K.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0']

In [0]:
from keras.layers import *
from keras.models import Model

In [0]:
words_embedding_layer = embedding_layer
seq_embedding_layer = LSTM(256, activation='tanh')

input1_tensor = Input(X1_train.shape[1:])
input2_tensor = Input(X2_train.shape[1:])

words_embedding_layer1 = words_embedding_layer(input1_tensor)
words_embedding_layer2 = words_embedding_layer(input2_tensor)

seq_embedding_layer1 = seq_embedding_layer(words_embedding_layer1) 
seq_embedding_layer2 = seq_embedding_layer(words_embedding_layer2)

norm_seq_embedding_layer1 = batch_norm(seq_embedding_layer1)
norm_seq_embedding_layer2 = batch_norm(seq_embedding_layer2)


multiply_merge = multiply([norm_seq_embedding_layer1, norm_seq_embedding_layer2])

sub_merge = subtract([norm_seq_embedding_layer1, norm_seq_embedding_layer2])

add_merge = add([norm_seq_embedding_layer1, norm_seq_embedding_layer2])

max_merge = maximum([norm_seq_embedding_layer1, norm_seq_embedding_layer2])

merge_layer = concatenate([multiply_merge, sub_merge, add_merge,max_merge])

dropout1 = Dropout(0.15)(merge_layer)

dense1_layer = Dense(100, activation='sigmoid')(dropout1)

dropout2 = Dropout(0.15)(dense1_layer)

dense2_layer = Dense(100, activation='sigmoid')(dropout2)


ouput_layer = Dense(1, activation='sigmoid')(dense2_layer)

model = Model([input1_tensor, input2_tensor], ouput_layer)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=["accuracy"])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_15 (InputLayer)           (None, 30)           0                                            
__________________________________________________________________________________________________
input_16 (InputLayer)           (None, 30)           0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 30, 300)      3000000     input_15[0][0]                   
                                                                 input_16[0][0]                   
__________________________________________________________________________________________________
lstm_8 (LSTM)                   (None, 256)          570368      embedding_2[0][0]                
          

In [0]:
# with 300d
#Epoch 00003: val_loss improved from 0.42090 to 0.39988, saving model to weights.best.Siamese.gloverLSTM256
#Epoch 4/5
#226214/226214 [==============================] - 329s 1ms/step - loss: 0.3274 - acc: 0.8537 - val_loss: 0.3956 - val_acc: 0.8191

In [0]:
! mkdir weights.LSTM256.multiple.merge.word2vec

mkdir: cannot create directory ‘weights.LSTM256.multiple.merge.word2vec’: File exists


In [0]:
from keras.callbacks import ModelCheckpoint  


directory = "weights.LSTM256.multiple.merge.word2vec/dropout.{epoch:02d}-{loss:.2f}-{val_loss:.2f}.hdf5"

checkpointer = ModelCheckpoint(filepath=directory, 
                               verbose=1, save_best_only=False,monitor="val_loss")


In [0]:
model.fit([X1_train, X2_train], 
          y_train, 
            batch_size=128, 
            epochs=4,
            callbacks=[checkpointer],
            validation_data=([X1_val, X2_val], y_val))

Train on 226214 samples, validate on 96950 samples
Epoch 1/4

Epoch 00001: saving model to weights.LSTM256.multiple.merge.word2vec/dropout.01-0.47-0.44.hdf5
Epoch 2/4

Epoch 00002: saving model to weights.LSTM256.multiple.merge.word2vec/dropout.02-0.39-0.41.hdf5
Epoch 3/4

Epoch 00003: saving model to weights.LSTM256.multiple.merge.word2vec/dropout.03-0.35-0.39.hdf5
Epoch 4/4

Epoch 00004: saving model to weights.LSTM256.multiple.merge.word2vec/dropout.04-0.32-0.37.hdf5


<keras.callbacks.History at 0x7fd12d504e10>

In [0]:
#Train on 226214 samples, validate on 96950 samples
#Train on 226214 samples, validate on 96950 samples
#Epoch 1/4
#226214/226214 [==============================] - 364s 2ms/step - loss: 0.4673 - acc: 0.7692 - val_loss: 0.4354 - val_acc: 0.7914
#
#Epoch 00001: saving model to weights.LSTM256.multiple.merge.word2vec/dropout.01-0.47-0.44.hdf5
#Epoch 2/4
#226214/226214 [==============================] - 358s 2ms/step - loss: 0.3949 - acc: 0.8124 - val_loss: 0.4078 - val_acc: 0.8048
#
#Epoch 00002: saving model to weights.LSTM256.multiple.merge.word2vec/dropout.02-0.39-0.41.hdf5
#Epoch 3/4
#226214/226214 [==============================] - 356s 2ms/step - loss: 0.3532 - acc: 0.8361 - val_loss: 0.3857 - val_acc: 0.8180
#
#Epoch 00003: saving model to weights.LSTM256.multiple.merge.word2vec/dropout.03-0.35-0.39.hdf5
#Epoch 4/4
#226214/226214 [==============================] - 355s 2ms/step - loss: 0.3185 - acc: 0.8552 - val_loss: 0.3746 - val_acc: 0.8279
#
#Epoch 00004: saving model to weights.LSTM256.multiple.merge.word2vec/dropout.04-0.32-0.37.hdf5
#<keras.callbacks.History at 0x7fd12d504e10>

In [0]:
# Load Best Epoch
model.load_weights("weights.LSTM256.multiple.merge.word2vec/dropout.03-0.35-0.39.hdf5")

# Pass Train and Test Data Through Network and Save for Stacking

In [0]:
#features_model_512 = Model([input1_tensor, input2_tensor], merge_layer)
#features_model_512.compile(loss='mse', optimizer='adam')

In [0]:
features_model_16 = Model([input1_tensor, input2_tensor], dense1_layer)
features_model_16.compile(loss='mse', optimizer='adam')

In [0]:
train_preds = [create_padded_seqs(df_train['question1']),create_padded_seqs(df_train['question2'])]
test_preds = [create_padded_seqs(df_test['question1']),create_padded_seqs(df_test['question2'])]

In [0]:
F_train_16 = features_model_16.predict(train_preds, batch_size=128)
F_test_16 = features_model_16.predict(test_preds, batch_size=128)

In [0]:
F_train_1 = model.predict(train_preds, batch_size=128)
F_test_1 = model.predict(test_preds, batch_size=128)

In [0]:
test_1 = pd.concat([df_test.test_id,pd.DataFrame(F_test_1)],axis=1)
test_16 = pd.concat([df_test.test_id,pd.DataFrame(F_test_16)],axis=1)

In [0]:
print(test_1.shape)
print(df_test.shape)
test_16.head(1)

(81126, 2)
(81126, 3)


Unnamed: 0,test_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,...,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,15,0.193592,0.005574,0.000911,0.993908,0.338313,0.723192,0.795046,0.00223,0.158854,0.988268,2e-06,0.007147,0.451349,0.000704,0.218361,0.002545,9e-06,2.2e-05,4e-06,0.18982,3e-05,2.927827e-08,0.606357,7e-06,1.038797e-07,0.012725,0.942089,0.243378,2e-06,0.005277,0.999998,0.009669,7e-06,0.013546,0.998158,2e-06,0.057761,0.017171,0.002004,...,0.995264,1.028949e-07,0.013369,0.00278,0.101534,0.955251,0.997745,0.451523,0.000364,0.104227,0.005304,2.279523e-07,0.99782,0.990574,0.351793,0.036958,1.4e-05,0.112098,0.998474,0.00014,3e-06,0.998801,0.871266,0.858014,0.8965,3e-06,0.985471,0.162734,2.359971e-07,0.004697,0.033329,4.848347e-07,0.982954,4e-06,0.00162,0.998611,0.963799,0.001922,0.00044,0.99841


In [0]:
train_1 = pd.concat([df_train.id,pd.DataFrame(F_train_1)],axis=1)
train_16 = pd.concat([df_train.id,pd.DataFrame(F_train_16)],axis=1)

In [0]:
print(train_1.shape)
print(df_train.shape)
train_16.head(1)

(323164, 2)
(323164, 4)


Unnamed: 0,id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,...,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,0,0.187019,0.339917,9.9e-05,0.081118,0.408207,0.83827,0.356676,0.003912,0.021298,0.000603,1.1e-05,0.000147,0.000317,0.001656,0.024902,3.7e-05,4.6e-05,0.004094,0.000474,0.005132,2e-06,1.3e-05,0.002496,0.000216,5e-06,0.009388,0.985914,0.000195,2.3e-05,0.002273,0.998812,0.556846,3e-06,0.180587,0.84744,0.002108,0.958627,0.003195,0.000801,...,0.006933,2e-06,0.396937,4.1e-05,2.2e-05,0.548771,0.336816,0.003949,0.005035,0.024591,7.5e-05,8.2e-05,0.005896,0.255509,0.084327,0.000645,2.104622e-07,0.003387,0.018401,0.000776,2.6e-05,0.981612,0.991896,0.001459,0.001782,9.054973e-07,0.005581,0.598334,0.005106,9.6e-05,0.007305,1.8e-05,0.27108,2.3e-05,0.000304,0.289869,0.860059,0.000668,0.000604,0.998583


In [0]:
train_16.to_csv("train_100_multi_merge_dropout_word2vec.csv",index=False)
train_1.to_csv("train_1_merge_dropout_word2vec.csv",index=False)

In [0]:
pd.read_csv("train_16_multi_merge_dropout.csv").head()

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,...,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,0,0.004667,0.07342,0.127493,0.20464,0.040163,0.152333,0.102389,0.036234,0.111532,0.028089,0.150028,0.048816,0.031324,0.052991,0.036564,0.068313,0.05479,0.169379,0.09156,0.04119,0.02022,0.019513,0.026912,0.084623,0.103742,0.060157,0.006674,0.116495,0.020345,0.218703,0.362897,0.023856,0.09903,0.259389,0.175478,0.030123,0.180475,0.034675,0.10202,...,0.044497,0.024875,0.1005,0.133805,0.085602,0.059995,0.051828,0.035246,0.014621,0.365312,0.33334,0.034149,0.009329,0.071517,0.067056,0.0481,0.018455,0.013533,0.05922,0.087057,0.096042,0.227474,0.030402,0.01463,0.117645,0.103173,0.086414,0.194886,0.028696,0.014988,0.018165,0.021553,0.070063,0.010757,0.017155,0.030942,0.177486,0.027851,0.007006,0.022598
1,1,0.000423,0.897088,0.002005,0.338341,0.660417,0.001525,0.853235,0.623497,0.000516,0.001115,0.169963,0.000143,0.445844,0.002176,0.944412,0.623338,0.750383,0.315461,0.520308,0.008011,0.605156,0.000147,0.000265,0.499908,0.008825,0.115254,0.827936,0.954568,0.528,0.008182,0.602005,0.152116,0.009338,0.975245,0.011404,0.683131,0.750451,0.676423,0.923661,...,0.251134,0.212705,0.000242,0.97736,0.518564,0.720124,0.305401,0.675188,0.00053,0.973834,0.795573,0.182144,0.00036,0.287978,0.864915,0.002588,0.473792,0.455723,0.592317,0.652479,0.901628,0.958366,0.003515,0.000186,0.564399,0.467446,0.001201,0.46635,0.766823,0.35634,0.000466,0.528631,0.425397,0.144474,0.797371,0.206964,0.944103,0.154332,0.000864,0.652351
2,2,0.115304,0.132704,0.058829,0.052093,0.030391,0.044588,0.226563,0.021577,0.029912,0.039258,0.226687,0.056501,0.1006,0.160682,0.22153,0.157684,0.060787,0.054259,0.021069,0.155891,0.42732,0.017356,0.047988,0.065651,0.029063,0.582467,0.049546,0.094049,0.084518,0.132829,0.130267,0.195862,0.059304,0.508515,0.058997,0.003851,0.33972,0.719692,0.050266,...,0.059524,0.506619,0.021308,0.137966,0.743919,0.042207,0.044071,0.277817,0.016746,0.241894,0.246933,0.032122,0.007194,0.078324,0.174977,0.067701,0.033444,0.211241,0.237725,0.164458,0.268323,0.569535,0.089587,0.011176,0.088941,0.323738,0.147319,0.12747,0.206096,0.128958,0.021739,0.004592,0.349932,0.497275,0.017509,0.036994,0.706145,0.018159,0.004911,0.016304
3,3,0.006831,0.839499,0.011691,0.67631,0.892242,0.012116,0.578721,0.642392,0.00561,0.021896,0.866786,0.004417,0.831264,0.016619,0.837934,0.54239,0.813424,0.416669,0.648489,0.029095,0.876559,0.016252,0.005036,0.491099,0.057817,0.886672,0.788893,0.816339,0.88708,0.022026,0.828281,0.723763,0.055295,0.958809,0.018945,0.647679,0.945159,0.504467,0.876758,...,0.874309,0.928691,0.009421,0.8811,0.763493,0.620783,0.757361,0.85444,0.005215,0.812121,0.850381,0.930143,0.002642,0.943876,0.847716,0.010996,0.605328,0.811276,0.790742,0.908339,0.849347,0.839488,0.005324,0.001685,0.840654,0.754639,0.008007,0.878216,0.712741,0.637815,0.009659,0.207069,0.674223,0.515196,0.893495,0.444446,0.955914,0.953855,0.007934,0.250554
4,4,0.002555,0.885917,0.096429,0.773589,0.650507,0.022017,0.903419,0.833384,0.004758,0.012168,0.756652,0.002675,0.72017,0.013975,0.670773,0.780261,0.22974,0.898469,0.576911,0.006888,0.068969,0.000873,0.004387,0.730955,0.008274,0.180409,0.42223,0.783457,0.33707,0.006441,0.83615,0.117267,0.017697,0.898733,0.007879,0.906324,0.16436,0.280883,0.569925,...,0.384491,0.143103,0.003422,0.773395,0.897884,0.46267,0.615819,0.163696,0.001693,0.915301,0.686311,0.498028,0.000732,0.722349,0.958903,0.010239,0.187053,0.452648,0.264134,0.43166,0.887274,0.681674,0.001358,0.000496,0.957469,0.738253,0.013733,0.845504,0.734865,0.083917,0.00018,0.030306,0.327885,0.101516,0.465974,0.1603,0.965753,0.339846,0.000933,0.016839


In [0]:
test_16.to_csv("test_100_multi_merge_dropout_word2vec.csv",index=False)
test_1.to_csv("test_1_multi_merge_dropout_word2vec.csv",index=False)

In [0]:
pd.read_csv("train_1.csv").head()

Unnamed: 0,id,0
0,0,0.654116
1,1,0.020497
2,2,0.078235
3,3,0.001608
4,4,0.007873
