In [1]:
import pandas as pd
import numpy as np
import preprocess_text as pt
import keras

Using TensorFlow backend.


In [2]:
train_df = pd.read_csv('data/train.csv')

In [3]:
train_df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
print(train_df.values[0])

[0 1 2 'What is the step by step guide to invest in share market in india?'
 'What is the step by step guide to invest in share market?' 0]


In [5]:
pre = pt.Preprocess_text(word2vec_path="data/GoogleNews-vectors-negative300.bin")

[nltk_data] Downloading package punkt to /home/joakim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
q1_tokenized = pre.tokenize_data(train_df['question1'])

In [7]:
q2_tokenized = pre.tokenize_data(train_df['question2'])

In [8]:
print(q1_tokenized[5])

[1801011, 524970, 1576513, 0, 0, 2157674, 1143634, 2017886, 0, 1143634, 1786104, 2134523, 129564, 1203063, 451206, 447127, 204907]


In [9]:
train_df.question1.values[5]

'Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?'

In [10]:
pre.vectors.shape

(3000001, 300)

In [11]:
def vectorize_tokens(preprocessor, tokenized_array):
    return [preprocessor.idx2vector[word_index] for word_index in tokenized_array]
class SentGenerator(object):
    def __init__(self, tokenized_data1, tokenized_data2, y, preprocessor, batch_size, train=True):
        self.tokenized_data1 = tokenized_data1
        self.tokenized_data2 = tokenized_data2
        self.y = y
        self.preprocessor = preprocessor
        self.batch_size = batch_size
        self.num_values = len(tokenized_data1)
        self.train = train
        
    def generate(self):
        targets1, targets2, labels = [], [], []
        while True:
            for tokenized_array1, tokenized_array2, y_label in zip(self.tokenized_data1, self.tokenized_data2, self.y):
                targets1.append(vectorize_tokens(self.preprocessor, tokenized_array1))
                targets2.append(vectorize_tokens(self.preprocessor, tokenized_array2))
                labels.append(y_label)
                if len(targets1) == self.batch_size:
                    tmp_targets1 = np.array(targets1)
                    tmp_targets2 = np.array(targets2)
                    tmp_labels = np.array(labels)
                    targets1, targets2, labels = [], [], []
                    if self.train:
                        yield [tmp_targets1, tmp_targets2], [tmp_targets1, tmp_targets2], tmp_labels
                    else:
                        yield [tmp_targets1, tmp_targets2], [tmp_targets1, tmp_targets2]

It is quite obvious that better tokenizing and pre-processing would help, as we see a lot of 0 values in our tokenized data. Indicating we have several unknown words present.

In [12]:
import CNNSentence
import RNNSentence; reload(RNNSentence)

<module 'RNNSentence' from 'RNNSentence.pyc'>

In [13]:
batch_size = 50
train_val_split = 0.9
use_sample = False
max_doc_length = 14

In [14]:
cnns = CNNSentence.CNNSentence(num_inputs=2, max_doc_length=max_doc_length, output_name_size={"classes": 1}, 
                               regularization="SomethingElse", num_features=250, kernel_size_start=2, 
                               final_activation="relu")
model = cnns.model

In [7]:
cnns = RNNSentence.RNNSentence(num_inputs=2, max_doc_length=max_doc_length, 
                               output_name_size={"classes": 1}, regularization="SomethingElse", 
                               num_features=500, rate_drop_lstm=0.2, rate_drop_dense=0.2)
model = cnns.model

In [14]:
y = train_df.is_duplicate.values

In [15]:
factor = 1 if not use_sample else 0.1
tind = np.random.rand(int(len(train_df) * factor)) < train_val_split

In [16]:
x1_pad = CNNSentence.get_padded_input(q1_tokenized, max_doc_length)

In [17]:
x2_pad = CNNSentence.get_padded_input(q2_tokenized, max_doc_length)

In [18]:
#y_oh = CNNSentence.one_hot_y(y)

In [19]:
x1_train = x1_pad[tind,:]
x2_train = x2_pad[tind,:]
x1_val = x1_pad[~tind,:]
x2_val = x2_pad[~tind,:]
y_train = y[tind]
y_val = y[~tind]

In [20]:
x1_train.shape

(363926, 14)

In [24]:
#optim = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
#optim = keras.optimizers.Adam(lr=0.001)
optim = keras.optimizers.Adam(lr=0.001, beta_1=0.85, beta_2=0.99, epsilon=1e-07, decay=0.0)
model.compile(loss='binary_crossentropy',
              optimizer=optim,
              metrics=['accuracy'])

In [26]:
trn_sent_gen = SentGenerator(x1_train, x2_train, y_train, pre, batch_size, train=False)

In [26]:
val_sent_gen = SentGenerator(x1_val, x2_val, y_val, pre, batch_size)

In [27]:
model.fit_generator(trn_sent_gen.generate(),
                                 steps_per_epoch=int(trn_sent_gen.num_values / batch_size),
                                 epochs=1,
                                 verbose=1,
                                 validation_data=val_sent_gen.generate(),
                                 validation_steps=int(val_sent_gen.num_values / batch_size),
                                 workers=1)

Epoch 1/1


<keras.callbacks.History at 0x7fdbd67e0110>

In [28]:
#optim = keras.optimizers.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
optim = keras.optimizers.Adam(lr=0.0001, beta_1=0.85, beta_2=0.99, epsilon=1e-07, decay=0.0)
model.compile(loss='binary_crossentropy',
              optimizer=optim,
              metrics=['accuracy'])

In [29]:
model.fit_generator(trn_sent_gen.generate(),
                                 steps_per_epoch=int(trn_sent_gen.num_values / batch_size),
                                 epochs=1,
                                 verbose=1,
                                 validation_data=val_sent_gen.generate(),
                                 validation_steps=int(val_sent_gen.num_values / batch_size),
                                 workers=1)

Epoch 1/1


<keras.callbacks.History at 0x7fdbd60b41d0>

In [49]:
from keras.models import Model, Sequential
from keras.layers import concatenate, Input
from keras.layers import Dense, Dropout
from keras.layers.merge import Concatenate

In [26]:
rnn_model = keras.models.load_model("rnn.hpy5")
cnn_model = keras.models.load_model("cnn.hpy5")

In [34]:
rnn_conc = rnn_model.layers[2]
cnn_conc = cnn_model.layers[2]

In [40]:
inp1 = Input(shape=(max_doc_length, 300, ))
inp2 = Input(shape=(max_doc_length, 300, ))
conc = Concatenate()([inp1, inp2])

In [55]:
rnn_cut = Model([rnn_model.layers[2].get_input_at(i) for i in range(2)], rnn_model.output)

TypeError: unhashable type: 'list'

In [37]:
# generate feature_model for ensamble

rnn_features = rnn.get_layer("feature_layer").output
cnn_features = cnn.get_layer("feature_layer").output

x = concatenate([rnn_features, cnn_features], name="combined_features")
x = Dropout(0.2)(x)

x = Dense(1, activation="relu")(x)

ensamble_model = Model(inputs=[inp1, inp2], outputs=x)

AttributeError: 'Tensor' object has no attribute 'get_layer'

In [None]:
optim = keras.optimizers.Adam(lr=0.001, beta_1=0.85, beta_2=0.99, epsilon=1e-07, decay=0.0)
ensamble_model.compile(loss='binary_crossentropy',
              optimizer=optim,
              metrics=['accuracy'])

In [None]:
ensamble_model.fit_generator(trn_sent_gen.generate(),
                                 steps_per_epoch=int(trn_sent_gen.num_values / batch_size),
                                 epochs=1,
                                 verbose=1,
                                 validation_data=val_sent_gen.generate(),
                                 validation_steps=int(val_sent_gen.num_values / batch_size),
                                 workers=1)

In [24]:
model = keras.models.load_model("cnn.hpy5")

In [30]:
del pre

In [4]:
test_df = pd.read_csv('data/test.csv')

In [5]:
test_df.head()

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?


In [6]:
#test_q1_tokenized = pre.tokenize_data(test_df['question1'])
#test_q2_tokenized = pre.tokenize_data(test_df['question2'])

In [31]:
test_q1_tokenized = np.load("test_q1_tokenized.npy")

In [32]:
test_q2_tokenized = np.load("test_q2_tokenized.npy")

In [33]:
test_pad1, test_pad2 = CNNSentence.get_padded_input(test_q1_tokenized, max_doc_length), CNNSentence.get_padded_input(test_q2_tokenized, max_doc_length)

In [34]:
test_sen_gen = SentGenerator(test_pad1, test_pad2, np.zeros(len(test_pad1)), pre, batch_size=36, train=False)

In [35]:
preds = model.predict_generator(test_sen_gen.generate(), steps=int(test_sen_gen.num_values/36))

In [36]:
preds[:2].reshape(2)

array([ 0.08587371,  0.13448898], dtype=float32)

In [37]:
def do_clip(arr, mx): return np.clip(arr, (1-mx)/2, mx)

In [38]:
clip_preds = do_clip(preds, 0.95)

In [39]:
indexes =[int(i) for i in range(len(preds))]

In [40]:
preds.shape

(2345796, 1)

In [41]:
subm = np.zeros((len(preds), 2)).astype(object)
subm[:,0] = subm[:,0].astype(int)
subm[:,0] = subm[:,0].astype(float)
subm[:,0] = indexes
subm[:, 1] = clip_preds.reshape(len(preds))

In [42]:
subm.shape

(2345796, 2)

In [43]:
submission = pd.DataFrame(subm, columns=['test_id', 'is_duplicate'])
submission.head()

Unnamed: 0,test_id,is_duplicate
0,0,0.0858737
1,1,0.134489
2,2,0.262329
3,3,0.0319645
4,4,0.56186


In [44]:
submission.to_csv('ans13.csv', index=False)

In [None]:
#TODO find unique words in test, find unique words in train, see if we can reduce word2vec somehow?
#TODO