## Load Packages

In [28]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from nltk import word_tokenize
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook as tqdm
import time
import pickle
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, concatenate, Lambda
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.optimizers import Adam
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.models import load_model

## Data Preprocessing

In [2]:
# Vocabulary cap: the tokenizer and the embedding layers are limited to this many words.
max_features = 95000
# Every question is padded / truncated to this many tokens.
maxlen = 70
# Width of each embedding vector used by the Embedding layers.
embed_size = 300

In [109]:
def data_preparation(predict=False):
    """Load the CSV data, tokenize the questions and pad them to fixed-length sequences.

    Reads ``train.csv`` (and ``test.csv`` when ``predict`` is true), holds out 8% of
    the training rows for validation (``random_state=2018``), fits a ``Tokenizer``
    limited to ``max_features`` words on the training split only, then converts
    every split to integer sequences padded/truncated to ``maxlen``.

    Parameters
    ----------
    predict : bool, default False
        When true, the test set is loaded and transformed as well, and the fitted
        tokenizer is included in the return value.

    Returns
    -------
    tuple
        ``(train_X, val_X, train_y, val_y, word_index)`` when ``predict`` is false;
        ``(train_X, val_X, test_X, train_y, val_y, word_index, tokenizer)`` otherwise.
    """
    start_time = time.time()
    train_df = pd.read_csv("train.csv")
    print("Train shape : ",train_df.shape)

    if predict:
        test_df = pd.read_csv("test.csv")
        print("Test shape : ",test_df.shape)

    ## split to train and val
    train_df, val_df = train_test_split(train_df, test_size=0.08, random_state=2018)

    ## fill up the missing values
    # The same placeholder is applied to every split. Previously only the test
    # split was filled, so a missing question in train/val reached the tokenizer
    # as a float NaN instead of a string.
    train_X = train_df["question_text"].fillna("_##_").values
    val_X = val_df["question_text"].fillna("_##_").values
    if predict:
        test_X = test_df["question_text"].fillna("_##_").values

    ## Tokenize the sentences
    # filters='' keeps punctuation characters inside the tokens.
    tokenizer = Tokenizer(num_words=max_features, filters='')
    print('fitting text to tokenizer..')
    check_point1 = time.time()
    # The vocabulary is learned from the training split only.
    tokenizer.fit_on_texts(list(train_X))
    word_index = tokenizer.word_index

    print('Found %s unique tokens.' % len(word_index))
    check_point2 = time.time()

    print('fitting took {:.2f} seconds to finish'.format(check_point2 - check_point1))

    print('transforming text to sequence of word indices..')
    train_X = tokenizer.texts_to_sequences(train_X)
    val_X = tokenizer.texts_to_sequences(val_X)
    check_point3 = time.time()
    print('transforming took {:.2f} seconds to finish'.format(check_point3 - check_point2))
    if predict:
        test_X = tokenizer.texts_to_sequences(test_X)

    ## Pad the sentences
    print('padding sentence to the same length..')
    train_X = pad_sequences(train_X, maxlen=maxlen)
    val_X = pad_sequences(val_X, maxlen=maxlen)
    check_point4 = time.time()
    print('padding took {:.2f} seconds to finish'.format(check_point4 - check_point3))

    if predict:
        test_X = pad_sequences(test_X, maxlen=maxlen)

    print('it took {:.2f} seconds to finish data prepartation'.format(time.time() - start_time))

    ## Get the target values
    train_y = train_df['target'].values
    val_y = val_df['target'].values

    if predict:
        return train_X, val_X, test_X, train_y, val_y, word_index, tokenizer
    return train_X, val_X, train_y, val_y, word_index

In [110]:
# Build the padded train / validation / test matrices once; predict=True also
# returns the test matrix and the fitted tokenizer.
train_X, val_X, test_X, train_y, val_y, word_index, tokenizer = data_preparation(predict=True)

Train shape :  (1306122, 3)
Test shape :  (56370, 2)
fitting text to tokenizer..
Found 427454 unique tokens.
fitting took 17.92 seconds to finish
transforming text to sequence of word indices..
transforming took 20.00 seconds to finish
padding sentence to the same length..
padding took 6.13 seconds to finish
it took 47.77 seconds to finish data prepartation


## Load Embeddings

In [5]:
# Persist the fitted tokenizer to 'tokenizer.pickle' using the highest pickle protocol.
# NOTE(review): only unpickle this file from a trusted location; pickle.load can
# execute arbitrary code.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Glove Embeddings

In [6]:
def load_glove(word_index):
    """Build an embedding matrix for ``word_index`` from the GloVe 840B 300d text file.

    Each line of the file is split on single spaces into a word followed by its
    float32 coefficients. Rows of the result are first drawn from a normal
    distribution with the mean / standard deviation of all loaded vectors, then
    overwritten with the pretrained vector for every word found in the file.

    Parameters
    ----------
    word_index : dict
        Mapping from word to integer index (as produced by the tokenizer).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(min(max_features, len(word_index)), embedding_width)``.
    """
    EMBEDDING_FILE = 'embeddings/glove.840B.300d/glove.840B.300d.txt'

    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    # Context manager so the file handle is closed as soon as parsing is done.
    with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in embedding_file)

    # np.stack expects a real sequence; recent NumPy releases reject a dict view.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    nb_words = min(max_features, len(word_index))
    # Words without a pretrained vector keep this random initialisation.
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in tqdm(word_index.items()):
        # Guard on the matrix's actual row count: when the vocabulary is smaller
        # than max_features, an index equal to nb_words would be out of range.
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

### Wiki News FastText Embeddings

In [7]:
def load_fasttext(word_index):
    """Build an embedding matrix for ``word_index`` from the wiki-news 300d fastText file.

    Lines of 100 characters or fewer are skipped (presumably the short header
    line of the ``.vec`` format - confirm against the file). Remaining lines are
    split on single spaces into a word followed by float32 coefficients. Rows of
    the result are drawn from a normal distribution with the mean / standard
    deviation of all loaded vectors, then overwritten with the pretrained vector
    for every word found in the file.

    Parameters
    ----------
    word_index : dict
        Mapping from word to integer index (as produced by the tokenizer).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(min(max_features, len(word_index)), embedding_width)``.
    """
    EMBEDDING_FILE = 'embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'

    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    # An explicit encoding (matching the other loaders) avoids depending on the
    # platform default; the context manager closes the handle after parsing.
    with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
        embeddings_index = dict(
            get_coefs(*o.split(" ")) for o in embedding_file if len(o)>100
        )

    # np.stack expects a real sequence; recent NumPy releases reject a dict view.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    nb_words = min(max_features, len(word_index))
    # Words without a pretrained vector keep this random initialisation.
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in tqdm(word_index.items()):
        # Guard on the matrix's actual row count: when the vocabulary is smaller
        # than max_features, an index equal to nb_words would be out of range.
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

### Paragram Embeddings

In [8]:
def load_para(word_index):
    """Build an embedding matrix for ``word_index`` from the Paragram 300d SL999 file.

    The file is read as UTF-8 with undecodable bytes ignored; lines of 100
    characters or fewer are skipped. Remaining lines are split on single spaces
    into a word followed by float32 coefficients. Rows of the result are drawn
    from a normal distribution with the mean / standard deviation of all loaded
    vectors, then overwritten with the pretrained vector for every word found in
    the file.

    Parameters
    ----------
    word_index : dict
        Mapping from word to integer index (as produced by the tokenizer).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(min(max_features, len(word_index)), embedding_width)``.
    """
    EMBEDDING_FILE = 'embeddings/paragram_300_sl999/paragram_300_sl999.txt'

    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    # Context manager so the file handle is closed as soon as parsing is done.
    with open(EMBEDDING_FILE, encoding="utf8", errors='ignore') as embedding_file:
        embeddings_index = dict(
            get_coefs(*o.split(" ")) for o in embedding_file if len(o)>100
        )

    # np.stack expects a real sequence; recent NumPy releases reject a dict view.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    nb_words = min(max_features, len(word_index))
    # Words without a pretrained vector keep this random initialisation.
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in tqdm(word_index.items()):
        # Guard on the matrix's actual row count: when the vocabulary is smaller
        # than max_features, an index equal to nb_words would be out of range.
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

In [9]:
# GloVe embedding matrix; also used on its own by the "glove" model variants.
embedding_matrix1 = load_glove(word_index)

A Jupyter Widget




In [10]:
# fastText (wiki-news) embedding matrix, used as one component of the averaged embedding.
embedding_matrix2 = load_fasttext(word_index)

A Jupyter Widget




In [11]:
# Paragram embedding matrix, used as one component of the averaged embedding.
embedding_matrix3 = load_para(word_index)

A Jupyter Widget




In [12]:
# Element-wise average of the GloVe, fastText and Paragram matrices.
embedding_matrix = np.mean(
    [embedding_matrix1, embedding_matrix2, embedding_matrix3],
    axis=0,
)
np.shape(embedding_matrix)

(95000, 300)

## Define Models

### CNN Model

In [13]:
def model_cnn(embedding_matrix):
    """Build and compile a multi-width 2D-convolution text classifier.

    The embedded sequence is reshaped to a single-channel image of size
    ``(maxlen, embed_size)``. For each kernel height in ``[1, 2, 3, 5]`` a
    full-width convolution with 36 filters is max-pooled over all positions;
    the pooled maps are concatenated, flattened, passed through dropout and
    fed to one sigmoid unit.

    Parameters
    ----------
    embedding_matrix : array-like
        Initial weights for the embedding layer.

    Returns
    -------
    Model
        Model compiled with binary cross-entropy, Adam and the accuracy metric.
    """
    kernel_heights = [1, 2, 3, 5]
    n_filters = 36

    tokens = Input(shape=(maxlen,))
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix])(tokens)
    # Add a trailing channel axis so that Conv2D can be applied.
    embedded = Reshape((maxlen, embed_size, 1))(embedded)

    pooled_maps = []
    for height in kernel_heights:
        feature_map = Conv2D(n_filters, kernel_size=(height, embed_size),
                             kernel_initializer='he_normal', activation='elu')(embedded)
        # The pool window spans every valid position, leaving one value per filter.
        pooled_maps.append(MaxPool2D(pool_size=(maxlen - height + 1, 1))(feature_map))

    merged = Concatenate(axis=1)(pooled_maps)
    merged = Flatten()(merged)
    merged = Dropout(0.1)(merged)

    prob = Dense(1, activation="sigmoid")(merged)

    net = Model(inputs=tokens, outputs=prob)
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return net

In [14]:
def model_cnn_flip(embedding_matrix):
    """Build and compile the multi-width CNN classifier on a reversed token sequence.

    Identical in structure to the plain CNN builder, except that the integer
    input is reversed along its last axis before the embedding lookup.

    Parameters
    ----------
    embedding_matrix : array-like
        Initial weights for the embedding layer.

    Returns
    -------
    Model
        Model compiled with binary cross-entropy, Adam and the accuracy metric.
    """
    kernel_heights = [1, 2, 3, 5]
    n_filters = 36

    tokens = Input(shape=(maxlen,))
    # Flip the token order along the sequence axis.
    flipped = Lambda(lambda t: K.reverse(t, axes=-1))(tokens)
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix])(flipped)
    # Add a trailing channel axis so that Conv2D can be applied.
    embedded = Reshape((maxlen, embed_size, 1))(embedded)

    pooled_maps = []
    for height in kernel_heights:
        feature_map = Conv2D(n_filters, kernel_size=(height, embed_size),
                             kernel_initializer='he_normal', activation='elu')(embedded)
        # The pool window spans every valid position, leaving one value per filter.
        pooled_maps.append(MaxPool2D(pool_size=(maxlen - height + 1, 1))(feature_map))

    merged = Concatenate(axis=1)(pooled_maps)
    merged = Flatten()(merged)
    merged = Dropout(0.1)(merged)

    prob = Dense(1, activation="sigmoid")(merged)

    net = Model(inputs=tokens, outputs=prob)
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return net

### Attention Layer

In [15]:
class Attention(Layer):
    """Attention pooling that collapses the time axis of a 3D input.

    For an input ``x`` of shape ``(batch, step_dim, features)`` the layer computes
    a score ``tanh(x . W + b)`` per time step, exponentiates it, zeroes masked
    steps, normalises the scores over the time axis and returns the weighted sum
    of the time steps, i.e. a tensor of shape ``(batch, features)``.

    Parameters
    ----------
    step_dim : int
        Number of time steps of the input (used to reshape the scores).
    W_regularizer, b_regularizer
        Optional regularizers for the weight vector and the bias.
    W_constraint, b_constraint
        Optional constraints for the weight vector and the bias.
    bias : bool, default True
        Whether a per-time-step bias is added to the scores.
    **kwargs
        Forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        # Initialise the base layer first so that the attributes set below
        # (in particular supports_masking) cannot be reset by it.
        super(Attention, self).__init__(**kwargs)
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0  # set in build() from the input shape

    def build(self, input_shape):
        """Create the weight vector (one entry per feature) and the optional bias."""
        assert len(input_shape) == 3
        # `shape` is passed by keyword so the call does not rely on the legacy
        # positional-argument handling of add_weight.
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            # One bias value per time step.
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        """Return None: the time axis is reduced, so no mask is propagated."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over the time axis."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Flatten to (batch * steps, features), project onto W, then restore
        # the (batch, steps) layout of the scores.
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        if mask is not None:
            # Masked time steps contribute nothing to the weighted sum.
            a *= K.cast(mask, K.floatx())

        # Normalise over time; epsilon guards against a zero denominator.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Output shape is ``(batch, features)``."""
        return input_shape[0],  self.features_dim

    def get_config(self):
        """Return the constructor arguments so a saved model can rebuild the layer.

        Without this, the serialized config lacks ``step_dim`` and the layer
        cannot be re-instantiated from a saved model.
        """
        config = {
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(Attention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

### Recurrent (LSTM / GRU) Models

In [16]:
def model_lstm_atten(embedding_matrix):
    """Build and compile a two-layer bidirectional LSTM with attention pooling.

    Embedding -> BiLSTM(128) -> BiLSTM(64) -> Attention over ``maxlen`` steps ->
    Dense(64, relu) -> Dense(1, sigmoid).

    Parameters
    ----------
    embedding_matrix : array-like
        Initial weights for the embedding layer.

    Returns
    -------
    Model
        Model compiled with binary cross-entropy, Adam and the accuracy metric.
    """
    tokens = Input(shape=(maxlen,))
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix])(tokens)
    seq_first = Bidirectional(LSTM(128, return_sequences=True))(embedded)
    seq_second = Bidirectional(LSTM(64, return_sequences=True))(seq_first)
    context = Attention(maxlen)(seq_second)
    hidden = Dense(64, activation="relu")(context)
    prob = Dense(1, activation="sigmoid")(hidden)

    net = Model(inputs=tokens, outputs=prob)
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return net

In [24]:
def model_gru_srk_atten(embedding_matrix):
    """Build and compile a single bidirectional GRU with attention pooling.

    Embedding -> BiGRU(64) -> Attention over ``maxlen`` steps -> Dense(16, relu)
    -> Dropout(0.1) -> Dense(1, sigmoid).

    Parameters
    ----------
    embedding_matrix : array-like
        Initial weights for the embedding layer.

    Returns
    -------
    Model
        Model compiled with binary cross-entropy, Adam and the accuracy metric.
    """
    tokens = Input(shape=(maxlen,))
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix])(tokens)
    seq = Bidirectional(GRU(64, return_sequences=True))(embedded)
    context = Attention(maxlen)(seq)
    hidden = Dense(16, activation="relu")(context)
    hidden = Dropout(0.1)(hidden)
    prob = Dense(1, activation="sigmoid")(hidden)

    net = Model(inputs=tokens, outputs=prob)
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return net

In [25]:
def model_lstm_du(embedding_matrix):
    """Build and compile a bidirectional GRU with average + max pooling.

    Despite the name, the recurrent layer is a GRU. The sequence output is
    reduced with global average pooling and global max pooling; the two results
    are concatenated and passed through Dense(64, relu) -> Dropout(0.1) ->
    Dense(1, sigmoid).

    Parameters
    ----------
    embedding_matrix : array-like
        Initial weights for the embedding layer.

    Returns
    -------
    Model
        Model compiled with binary cross-entropy, Adam and the accuracy metric.
    """
    tokens = Input(shape=(maxlen,))
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix])(tokens)
    seq = Bidirectional(GRU(64, return_sequences=True))(embedded)

    # Two complementary summaries of the sequence, joined feature-wise.
    mean_summary = GlobalAveragePooling1D()(seq)
    peak_summary = GlobalMaxPooling1D()(seq)
    merged = concatenate([mean_summary, peak_summary])

    hidden = Dense(64, activation="relu")(merged)
    hidden = Dropout(0.1)(hidden)
    prob = Dense(1, activation="sigmoid")(hidden)

    net = Model(inputs=tokens, outputs=prob)
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net

In [26]:
def model_gru_atten_3(embedding_matrix):
    """Build and compile a three-layer bidirectional GRU with attention pooling.

    The embedding layer is frozen (``trainable=False``). Embedding -> BiGRU(128)
    -> BiGRU(100) -> BiGRU(64) -> Attention over ``maxlen`` steps ->
    Dense(1, sigmoid).

    Parameters
    ----------
    embedding_matrix : array-like
        Fixed weights for the embedding layer.

    Returns
    -------
    Model
        Model compiled with binary cross-entropy, Adam and the accuracy metric.
    """
    tokens = Input(shape=(maxlen,))
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(tokens)
    seq_first = Bidirectional(GRU(128, return_sequences=True))(embedded)
    seq_second = Bidirectional(GRU(100, return_sequences=True))(seq_first)
    seq_third = Bidirectional(GRU(64, return_sequences=True))(seq_second)
    context = Attention(maxlen)(seq_third)
    prob = Dense(1, activation="sigmoid")(context)

    net = Model(inputs=tokens, outputs=prob)
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return net

## Train, Predict and Blend

In [17]:
def train_pred(model, epochs=2):
    """Train ``model`` one epoch at a time and return its validation / test predictions.

    Relies on the notebook-level arrays ``train_X``, ``train_y``, ``val_X``,
    ``val_y`` and ``test_X``. After every epoch the validation predictions are
    scored with F1 at thresholds 0.10 .. 0.50 (step 0.01) and the best score is
    printed. The returned validation predictions and score are those of the
    LAST epoch, not of the best epoch.

    Parameters
    ----------
    model : compiled Keras model
        Model exposing ``fit`` and ``predict``.
    epochs : int, default 2
        Number of single-epoch training rounds; must be at least 1.

    Returns
    -------
    tuple
        ``(pred_val_y, pred_test_y, best_score)``.

    Raises
    ------
    ValueError
        If ``epochs`` is smaller than 1 (there would be nothing to return).
    """
    # Fail fast: with zero epochs the original code ran a full test prediction
    # and then crashed on the unassigned validation results.
    if epochs < 1:
        raise ValueError("epochs must be >= 1, got {}".format(epochs))

    for _ in range(epochs):
        model.fit(train_X, train_y, batch_size=512, epochs=1, validation_data=(val_X, val_y))
        pred_val_y = model.predict([val_X], batch_size=1024, verbose=0)

        # Best validation F1 over the candidate decision thresholds.
        best_score = 0.0
        for thresh in np.arange(0.1, 0.501, 0.01):
            thresh = np.round(thresh, 2)
            score = metrics.f1_score(val_y, (pred_val_y > thresh).astype(int))
            if score > best_score:
                best_score = score

        print("Val F1 Score: {:.4f}".format(best_score))

    pred_test_y = model.predict([test_X], batch_size=1024, verbose=0)
    return pred_val_y, pred_test_y, best_score

### Train Models

In [None]:
# One [val_predictions, test_predictions, best_val_f1, label] entry per trained model.
# NOTE(review): re-running a training cell appends a duplicate entry; re-run this
# cell first to start from an empty list.
outputs = []

In [32]:
# CNN on the averaged embedding matrix, trained for 2 epochs; results recorded as 'CNN'.
model1 = model_cnn(embedding_matrix)
pred_val_y, pred_test_y, best_score = train_pred(model1, epochs = 2)
outputs.append([pred_val_y, pred_test_y, best_score, 'CNN'])

Train on 1201632 samples, validate on 104490 samples
Epoch 1/1
Val F1 Score: 0.6519
Train on 1201632 samples, validate on 104490 samples
Epoch 1/1
Val F1 Score: 0.6470


In [33]:
# The second positional argument of Model.save is `overwrite`, not a file mode;
# the default already overwrites an existing file.
model1.save('model1.h5')

In [34]:
# Reversed-input CNN on the averaged embedding matrix; results recorded as 'CNN flip'.
model2 = model_cnn_flip(embedding_matrix)
pred_val_y, pred_test_y, best_score = train_pred(model2, epochs = 2)
outputs.append([pred_val_y, pred_test_y, best_score, 'CNN flip'])

Train on 1201632 samples, validate on 104490 samples
Epoch 1/1
Val F1 Score: 0.6515
Train on 1201632 samples, validate on 104490 samples
Epoch 1/1
Val F1 Score: 0.6501


In [35]:
# The second positional argument of Model.save is `overwrite`, not a file mode;
# the default already overwrites an existing file.
model2.save('model2.h5')

In [120]:
# Two-layer BiLSTM + attention on the averaged embedding matrix; recorded as 'LSTM atten'.
model3 = model_lstm_atten(embedding_matrix)
pred_val_y, pred_test_y, best_score = train_pred(model3, epochs = 2)
outputs.append([pred_val_y, pred_test_y, best_score, 'LSTM atten'])

Train on 1201632 samples, validate on 104490 samples
Epoch 1/1
Val F1 Score: 0.6545
Train on 1201632 samples, validate on 104490 samples
Epoch 1/1
Val F1 Score: 0.6617


In [121]:
# The second positional argument of Model.save is `overwrite`, not a file mode;
# the default already overwrites an existing file.
model3.save('model3.h5')

In [122]:
# Single BiGRU + attention on the averaged embedding matrix; recorded as 'GRU srk atten'.
model4 = model_gru_srk_atten(embedding_matrix)
pred_val_y, pred_test_y, best_score = train_pred(model4, epochs = 2)
outputs.append([pred_val_y, pred_test_y, best_score, 'GRU srk atten'])

Train on 1201632 samples, validate on 104490 samples
Epoch 1/1
Val F1 Score: 0.6550
Train on 1201632 samples, validate on 104490 samples
Epoch 1/1
Val F1 Score: 0.6579


In [123]:
# The second positional argument of Model.save is `overwrite`, not a file mode;
# the default already overwrites an existing file.
model4.save('model4.h5')

In [124]:
# BiGRU with average + max pooling on the averaged embedding matrix; recorded as 'LSTM du'.
model5 = model_lstm_du(embedding_matrix)
pred_val_y, pred_test_y, best_score = train_pred(model5, epochs = 2)
outputs.append([pred_val_y, pred_test_y, best_score, 'LSTM du'])

Train on 1201632 samples, validate on 104490 samples
Epoch 1/1
Val F1 Score: 0.6560
Train on 1201632 samples, validate on 104490 samples
Epoch 1/1
Val F1 Score: 0.6603


In [125]:
# The second positional argument of Model.save is `overwrite`, not a file mode;
# the default already overwrites an existing file.
model5.save('model5.h5')

In [126]:
# Three-layer BiGRU + attention (frozen embeddings) on the averaged matrix; recorded as 'GRU atten 3'.
model6 = model_gru_atten_3(embedding_matrix)
pred_val_y, pred_test_y, best_score = train_pred(model6, epochs = 2)
outputs.append([pred_val_y, pred_test_y, best_score, 'GRU atten 3'])

Train on 1201632 samples, validate on 104490 samples
Epoch 1/1
Val F1 Score: 0.6336
Train on 1201632 samples, validate on 104490 samples
Epoch 1/1
Val F1 Score: 0.6504


In [127]:
# The second positional argument of Model.save is `overwrite`, not a file mode;
# the default already overwrites an existing file.
model6.save('model6.h5')

In [39]:
# Two-layer BiLSTM + attention initialised from the GloVe-only matrix.
model7 = model_lstm_atten(embedding_matrix1)

In [40]:
# Train model7 for 2 epochs; the results are appended to `outputs` in a later cell.
pred_val_y, pred_test_y, best_score = train_pred(model7, epochs = 2)

Train on 1201632 samples, validate on 104490 samples
Epoch 1/1
Val F1 Score: 0.6598
Train on 1201632 samples, validate on 104490 samples
Epoch 1/1
Val F1 Score: 0.6652


In [42]:
# Record model7's results. This must run before another model overwrites
# pred_val_y / pred_test_y / best_score.
outputs.append([pred_val_y, pred_test_y, best_score, 'LSTM atten glove'])

In [41]:
# The second positional argument of Model.save is `overwrite`, not a file mode;
# the default already overwrites an existing file.
model7.save('model7.h5')

In [47]:
# Single BiGRU + attention on the GloVe-only matrix; recorded as 'GRU srk atten glove'.
model8 = model_gru_srk_atten(embedding_matrix1)
pred_val_y, pred_test_y, best_score = train_pred(model8, epochs = 2)
outputs.append([pred_val_y, pred_test_y, best_score, 'GRU srk atten glove'])

Train on 1201632 samples, validate on 104490 samples
Epoch 1/1
Val F1 Score: 0.6558
Train on 1201632 samples, validate on 104490 samples
Epoch 1/1
Val F1 Score: 0.6613


In [60]:
# The second positional argument of Model.save is `overwrite`, not a file mode;
# the default already overwrites an existing file.
model8.save('model8.h5')

In [64]:
# BiGRU with average + max pooling on the GloVe-only matrix; recorded as 'LSTM du glove'.
model9 = model_lstm_du(embedding_matrix1)
pred_val_y, pred_test_y, best_score = train_pred(model9, epochs = 2)
outputs.append([pred_val_y, pred_test_y, best_score, 'LSTM du glove'])

Train on 1201632 samples, validate on 104490 samples
Epoch 1/1
Val F1 Score: 0.6557
Train on 1201632 samples, validate on 104490 samples
Epoch 1/1
Val F1 Score: 0.6633


In [67]:
# The second positional argument of Model.save is `overwrite`, not a file mode;
# the default already overwrites an existing file.
model9.save('model9.h5')

In [111]:
# CNN on the GloVe-only matrix, trained for 2 epochs; the results are appended
# to `outputs` in a later cell.
model10 = model_cnn(embedding_matrix1)
pred_val_y, pred_test_y, best_score = train_pred(model10, epochs = 2)

Train on 1201632 samples, validate on 104490 samples
Epoch 1/1
Val F1 Score: 0.6458
Train on 1201632 samples, validate on 104490 samples
Epoch 1/1
Val F1 Score: 0.6491


In [112]:
# Record model10's results. This must run before another model overwrites
# pred_val_y / pred_test_y / best_score.
outputs.append([pred_val_y, pred_test_y, best_score, 'CNN glove'])

In [115]:
# The second positional argument of Model.save is `overwrite`, not a file mode;
# the default already overwrites an existing file.
model10.save('CNN_glove.h5')

In [116]:
# Reversed-input CNN on the GloVe-only matrix; recorded as 'CNN flip glove'.
model11 = model_cnn_flip(embedding_matrix1)
pred_val_y, pred_test_y, best_score = train_pred(model11, epochs = 2)
outputs.append([pred_val_y, pred_test_y, best_score, 'CNN flip glove'])

Train on 1201632 samples, validate on 104490 samples
Epoch 1/1
Val F1 Score: 0.6477
Train on 1201632 samples, validate on 104490 samples
Epoch 1/1
Val F1 Score: 0.6498


In [130]:
# The second positional argument of Model.save is `overwrite`, not a file mode;
# the default already overwrites an existing file.
model11.save('CNN_flip_glove.h5')

In [119]:
# Three-layer BiGRU + attention (frozen embeddings) on the GloVe-only matrix; recorded as 'GRU atten 3 glove'.
model12 = model_gru_atten_3(embedding_matrix1)
pred_val_y, pred_test_y, best_score = train_pred(model12, epochs = 2)
outputs.append([pred_val_y, pred_test_y, best_score, 'GRU atten 3 glove'])

Train on 1201632 samples, validate on 104490 samples
Epoch 1/1
Val F1 Score: 0.6435
Train on 1201632 samples, validate on 104490 samples
Epoch 1/1
Val F1 Score: 0.6550


In [131]:
# The second positional argument of Model.save is `overwrite`, not a file mode;
# the default already overwrites an existing file.
model12.save('GRU_3_glove.h5')

### Blending

In [132]:
# Order the recorded models by validation F1, weakest first (in-place sort),
# then list each score with its label.
outputs.sort(key=lambda x: x[2])
for output in outputs:
    print(output[2], output[3])

0.6470461560548418 CNN
0.6490960575038117 CNN glove
0.6498450641876937 CNN flip glove
0.6501303643154112 CNN flip
0.6510047281323877 GRU atten 3
0.6549674092113745 GRU atten 3 glove
0.6580165521110956 LSTM atten
0.6593295928643361 LSTM du
0.6604736916749399 GRU srk atten
0.6613148188327186 GRU srk atten glove
0.6632911392405064 LSTM du glove
0.6652069501341998 LSTM atten glove


In [158]:
# Keep every model except the three with the lowest validation F1.
# This relies on `outputs` already being sorted in ascending score order.
# A slice makes a new list, so `outputs` itself is left untouched.
output = outputs[3:]

In [160]:
# Blend: average the validation predictions of the models kept in `output`.
pred_val_y = np.mean([output[i][0] for i in range(len(output))], axis = 0)

# Score the blend with F1 at thresholds 0.10 .. 0.50 (step 0.01).
thresholds = []
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    res = metrics.f1_score(val_y, (pred_val_y > thresh).astype(int))
    thresholds.append([thresh, res])
    print("F1 score at threshold {0} is {1}".format(thresh, res))
    
# Highest F1 first; the sort is stable, so ties keep the lower threshold in front.
thresholds.sort(key=lambda x: x[1], reverse=True)
best_thresh = thresholds[0][0]
print("Best threshold: ", best_thresh)

F1 score at threshold 0.1 is 0.6100257621296694
F1 score at threshold 0.11 is 0.6184557853057633
F1 score at threshold 0.12 is 0.6260341043390174
F1 score at threshold 0.13 is 0.6326366559485531
F1 score at threshold 0.14 is 0.6377082724349605
F1 score at threshold 0.15 is 0.6430738119312437
F1 score at threshold 0.16 is 0.648207484432622
F1 score at threshold 0.17 is 0.6518273240127546
F1 score at threshold 0.18 is 0.6552475864216755
F1 score at threshold 0.19 is 0.6592143488695213
F1 score at threshold 0.2 is 0.661632
F1 score at threshold 0.21 is 0.6625607779578606
F1 score at threshold 0.22 is 0.6638655462184875
F1 score at threshold 0.23 is 0.6670208527028821
F1 score at threshold 0.24 is 0.6702913142703719
F1 score at threshold 0.25 is 0.6723271294661051
F1 score at threshold 0.26 is 0.6726186388297507
F1 score at threshold 0.27 is 0.6738057158751131
F1 score at threshold 0.28 is 0.6748207507380852
F1 score at threshold 0.29 is 0.6754373488835158
F1 score at threshold 0.3 is 0.67

In [59]:
# Blend the test predictions from the same model subset (`output`) that produced
# the validation blend and `best_thresh`. Averaging over all of `outputs` would
# apply the tuned threshold to a different ensemble than the one it was tuned on.
pred_test_y = np.mean([entry[1] for entry in output], axis=0)