In [1]:
import pandas as pd
import numpy as np
from random import shuffle

import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords

from keras import backend as K
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Flatten, Embedding, CuDNNLSTM, Bidirectional, concatenate, Dropout, LeakyReLU
from keras.optimizers import Adam

Using TensorFlow backend.


In [2]:
# F1 metrics function
def f1(y_true, y_pred):
    '''returns the f1 score given targets and predictions'''
    
    def recall(y_true, y_pred):
        true_pos = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_pos = K.sum(K.round(K.clip(y_true, 0, 1)))
        rec = true_pos / (possible_pos + K.epsilon())
        return rec
    
    def precision(y_true, y_pred):
        true_pos = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_pos = K.sum(K.round(K.clip(y_pred, 0, 1)))
        prec = true_pos / (predicted_pos + K.epsilon())
        return prec
    
    _precision = precision(y_true, y_pred)
    _recall = recall(y_true, y_pred)
    return 2 * ((_precision * _recall) / (_precision + _recall + K.epsilon()))

In [3]:
df = pd.read_csv('D:/Datasets/hackerearth/hm_train.csv')
df.drop(['num_sentence'], axis=1, inplace=True)
labels = df.predicted_category
df.drop(['predicted_category'], axis=1, inplace=True)

print(df.shape, len(labels))

(60321, 3) 60321


In [4]:
df.head()

Unnamed: 0,hmid,reflection_period,cleaned_hm
0,27673,24h,I went on a successful date with someone I fel...
1,27674,24h,I was happy when my son got 90% marks in his e...
2,27675,24h,I went to the gym this morning and did yoga.
3,27676,24h,We had a serious talk with some friends of our...
4,27677,24h,I went with grandchildren to butterfly display...


In [5]:
df.cleaned_hm = df.cleaned_hm.str.lower()
SEQ_LEN = 60

In [6]:
def remove_stopwords(sentence):
    # removing some known errors and differences between american and british english
    sentence = sentence.replace("\n", " ").replace(";", " ; ").replace(":", " : ").replace(",", " , ").replace(".", " . ").replace("?", " ? ").replace("/", " / ").replace("\\", " \ ").replace("'s", "").replace("n't", " not").replace("travelled", "traveled").replace("traveller", "traveler").replace("cancelled", "canceled").replace("favourite", "favorite").replace("i'm", "i am").replace("i've", "i have").replace("colour", "color").replace("neighbour", "neighbor").replace("jewellery", "jewelry").replace("theatre", "theater").replace("i'd", "i would").replace("didnt", "did not").replace("doesnt", "does not").replace("wasnt", "was not").replace("programme", "program").replace("organise", "organize")
    
    split = sentence.split()
    if len(split) > SEQ_LEN:
        return ' '.join([w for w in sentence.split() if w not in stopwords.words('english')])
    else:
        return sentence

In [7]:
df['cleaned_hm2'] = df.cleaned_hm.apply(remove_stopwords)
df.drop(['cleaned_hm'], axis=1, inplace=True)
df.head()

Unnamed: 0,hmid,reflection_period,cleaned_hm2
0,27673,24h,i went on a successful date with someone i fel...
1,27674,24h,i was happy when my son got 90% marks in his e...
2,27675,24h,i went to the gym this morning and did yoga .
3,27676,24h,we had a serious talk with some friends of our...
4,27677,24h,i went with grandchildren to butterfly display...


In [8]:
t = Tokenizer()
t.fit_on_texts(df.cleaned_hm2)
VOCAB_SIZE = len(t.word_index) + 1

In [9]:
encoded_train_set = t.texts_to_sequences(df.cleaned_hm2)
len(encoded_train_set)

60321

In [10]:
df['tokens'] = encoded_train_set
df.head()

Unnamed: 0,hmid,reflection_period,cleaned_hm2,tokens
0,27673,24h,i went on a successful date with someone i fel...,"[1, 21, 16, 3, 762, 314, 11, 282, 1, 90, 12593..."
1,27674,24h,i was happy when my son got 90% marks in his e...,"[1, 7, 12, 22, 2, 62, 17, 2277, 1423, 10, 80, ..."
2,27675,24h,i went to the gym this morning and did yoga .,"[1, 21, 4, 6, 393, 37, 91, 5, 101, 929]"
3,27676,24h,we had a serious talk with some friends of our...,"[25, 19, 3, 1615, 312, 11, 46, 48, 13, 4418, 1..."
4,27677,24h,i went with grandchildren to butterfly display...,"[1, 21, 11, 1903, 4, 3801, 4198, 20, 12595, 12..."


In [11]:
df.drop(['cleaned_hm2'], axis=1, inplace=True)
df.head()

Unnamed: 0,hmid,reflection_period,tokens
0,27673,24h,"[1, 21, 16, 3, 762, 314, 11, 282, 1, 90, 12593..."
1,27674,24h,"[1, 7, 12, 22, 2, 62, 17, 2277, 1423, 10, 80, ..."
2,27675,24h,"[1, 21, 4, 6, 393, 37, 91, 5, 101, 929]"
3,27676,24h,"[25, 19, 3, 1615, 312, 11, 46, 48, 13, 4418, 1..."
4,27677,24h,"[1, 21, 11, 1903, 4, 3801, 4198, 20, 12595, 12..."


In [12]:
padded_train_docs = pad_sequences(encoded_train_set, maxlen=SEQ_LEN, padding='post')
train_paddocs = []
for doc in padded_train_docs:
    train_paddocs.append(list(doc))
df['tokens2'] = train_paddocs

lengths = []
for doc in train_paddocs:
    lengths.append(len(doc))
    
print(np.mean(lengths))
df.drop(['tokens'], axis=1, inplace=True)
df.rename(index=str, columns={'tokens2': 'tokens'}, inplace=True)
df.head()

60.0


Unnamed: 0,hmid,reflection_period,tokens
0,27673,24h,"[1, 21, 16, 3, 762, 314, 11, 282, 1, 90, 12593..."
1,27674,24h,"[1, 7, 12, 22, 2, 62, 17, 2277, 1423, 10, 80, ..."
2,27675,24h,"[1, 21, 4, 6, 393, 37, 91, 5, 101, 929, 0, 0, ..."
3,27676,24h,"[25, 19, 3, 1615, 312, 11, 46, 48, 13, 4418, 1..."
4,27677,24h,"[1, 21, 11, 1903, 4, 3801, 4198, 20, 12595, 12..."


In [13]:
embeddings_index = gensim.models.KeyedVectors.load_word2vec_format('D:/Datasets/embeddings/Word2Vec/GoogleNews-vectors-negative300.bin', binary=True)
print(VOCAB_SIZE)

embedding_matrix = np.zeros((VOCAB_SIZE, 300))
count = 0
for word, i in t.word_index.items():
    try:
        embedding_vector = embeddings_index[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        print(f'Did not find {word}')
        count += 1

count

20810
Did not find a
Did not find to
Did not find and
Did not find of
Did not find 
Did not find 10
Did not find mturk
Did not find 24
Did not find 20
Did not find 100
Did not find 30
Did not find 15
Did not find 50
Did not find 12
Did not find 00
Did not find 2017
Did not find 25
Did not find 40
Did not find 11
Did not find 500
Did not find 13
Did not find 000
Did not find 200
Did not find i'll
Did not find donat
Did not find 14
Did not find 300
Did not find 60
Did not find travelling
Did not find judgements
Did not find 250
Did not find 18
Did not find 80
Did not find 10th
Did not find 1000
Did not find 45
Did not find 16
Did not find 70
Did not find 17
Did not find 90
Did not find morty
Did not find ooty
Did not find 2000
Did not find ps4
Did not find 75
Did not find fianca
Did not find 30th
Did not find eudaimonia
Did not find jigarthanda
Did not find a3i
Did not find iave
Did not find 2016
Did not find 150
Did not find 99
Did not find thatas
Did not find learnt
Did not find euda

Did not find keats'
Did not find aashwin
Did not find chainsmoker
Did not find youad
Did not find aoh
Did not find pulwama
Did not find 3mths
Did not find udemy
Did not find 'reverse
Did not find danville
Did not find rachael
Did not find abeading
Did not find it
Did not find 66
Did not find obgyn
Did not find isbell
Did not find perumal
Did not find taskrabbit
Did not find bhagubali
Did not find 190
Did not find festival'
Did not find 110
Did not find feeled
Did not find taipei
Did not find labtop
Did not find chromecast
Did not find 30p
Did not find tirumala
Did not find comedians'
Did not find show'
Did not find enkement
Did not find gova
Did not find employeesa
Did not find sons'
Did not find plr
Did not find favourable
Did not find favrote
Did not find alton
Did not find ziva
Did not find surajkund
Did not find haagen
Did not find bhk
Did not find tumour
Did not find beaufort
Did not find attravtive
Did not find shakeology
Did not find enjojment
Did not find hesi
Did not find ku

Did not find foyt
Did not find 1983
Did not find calistoga
Did not find ramsay
Did not find iworked
Did not find april'17
Did not find 205
Did not find hayes'
Did not find dooney
Did not find freeleech
Did not find deocrations
Did not find spikeball
Did not find 2600
Did not find 2020
Did not find momas
Did not find rpcied
Did not find finalised
Did not find jillian
Did not find 325
Did not find emmanuel
Did not find camgirl
Did not find witg
Did not find hard
Did not find aeropostale
Did not find joplin
Did not find caspian
Did not find aileen
Did not find diabilities
Did not find matlock
Did not find bhagat
Did not find heroas
Did not find qol
Did not find happiness'
Did not find erasmus
Did not find rotterdam
Did not find thirumala
Did not find dyi
Did not find mendocino
Did not find peoms
Did not find peom
Did not find dantdm
Did not find 9month
Did not find iep
Did not find cartain
Did not find listeining
Did not find ppppotty
Did not find attendees'
Did not find imgur
Did not f

2687

In [14]:
embedding_matrix.shape

(20810, 300)

In [15]:
df['24h'] = df.reflection_period == '24h'
df['3m'] = df.reflection_period == '3m'
df.drop(['reflection_period'], axis=1, inplace=True)
df.head()

Unnamed: 0,hmid,tokens,24h,3m
0,27673,"[1, 21, 16, 3, 762, 314, 11, 282, 1, 90, 12593...",True,False
1,27674,"[1, 7, 12, 22, 2, 62, 17, 2277, 1423, 10, 80, ...",True,False
2,27675,"[1, 21, 4, 6, 393, 37, 91, 5, 101, 929, 0, 0, ...",True,False
3,27676,"[25, 19, 3, 1615, 312, 11, 46, 48, 13, 4418, 1...",True,False
4,27677,"[1, 21, 11, 1903, 4, 3801, 4198, 20, 12595, 12...",True,False


In [16]:
print(df.shape, len(labels))

(60321, 4) 60321


In [17]:
labels_to_cats = {'achievement':      (1, 0, 0, 0, 0, 0, 0),
                  'affection':        (0, 1, 0, 0, 0, 0, 0),
                  'enjoy_the_moment': (0, 0, 1, 0, 0, 0, 0),
                  'nature':           (0, 0, 0, 1, 0, 0, 0),
                  'exercise':         (0, 0, 0, 0, 1, 0, 0),
                  'bonding':          (0, 0, 0, 0, 0, 1, 0),
                  'leisure':          (0, 0, 0, 0, 0, 0, 1)}

cats_to_labels = dict()
for k, v in labels_to_cats.items():
    cats_to_labels[v] = k

y = []
for label in labels:
    y.append(labels_to_cats[label])

y = np.array(y)
print(len(y))
print(len(df.tokens[0]))

60321
60


In [18]:
def shuffle():
    x_train, x_val, y_train, y_val = train_test_split(df, y, test_size=0.2)
    x_train_24h = x_train['24h']
    x_train_3m = x_train['3m']
    x_val_24h = x_val['24h']
    x_val_3m = x_val['3m']
    x_train_new = []
    for element in x_train.tokens:
        x_train_new.append(np.array(element))
    x_train_new = np.array(x_train_new)

    x_val_new = []
    for element in x_val.tokens:
        x_val_new.append(np.array(element))
    x_val_new = np.array(x_val_new)

    return x_train_new, x_val_new, x_train_24h, x_train_3m, x_val_24h, x_val_3m, y_train, y_val

In [19]:
x_train_new, x_val_new, x_train_24h, x_train_3m, x_val_24h, x_val_3m, y_train, y_val = shuffle()

In [20]:
input_tensor = Input(shape=(SEQ_LEN,), dtype='int32')
e = Embedding(VOCAB_SIZE, 300, weights=[embedding_matrix], input_length=SEQ_LEN, trainable=False)(input_tensor)
x = Bidirectional(CuDNNLSTM(256, return_sequences=True))(e)
x = Bidirectional(CuDNNLSTM(128, return_sequences=True))(x)
x = Bidirectional(CuDNNLSTM(64, return_sequences=False))(x)
x_2 = Input(shape=(1,), dtype='float32')
x_3 = Input(shape=(1,), dtype='float32')
output = concatenate([x, x_2, x_3])
output = Dense(128, activation='relu')(output)
output = Dropout(0.5)(output)
output = Dense(7, activation='softmax')(output)
model = Model([input_tensor, x_2, x_3], output)

In [21]:
checkpoint = ModelCheckpoint('D:/Datasets/hackerearth/models/word2vec-lstm-6-model1.hdf5', monitor='val_acc', save_best_only=True, verbose=1, mode='max')
model.compile(optimizer=Adam(lr=1e-3, decay=1e-6), loss='categorical_crossentropy', metrics=['accuracy', f1])

In [22]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 60)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 60, 300)      6243000     input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 60, 512)      1142784     embedding_1[0][0]                
__________________________________________________________________________________________________
bidirectional_2 (Bidirectional) (None, 60, 256)      657408      bidirectional_1[0][0]            
__________________________________________________________________________________________________
bidirectio

In [23]:
model.fit([x_train_new, x_train_24h, x_train_3m], y_train,
           validation_data=([x_val_new, x_val_24h, x_val_3m], y_val),
           callbacks=[checkpoint],
           epochs=20,
           verbose=1)

Train on 48256 samples, validate on 12065 samples
Epoch 1/20

Epoch 00001: val_acc improved from -inf to 0.85852, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model1.hdf5
Epoch 2/20

Epoch 00002: val_acc improved from 0.85852 to 0.86813, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model1.hdf5
Epoch 3/20

Epoch 00003: val_acc improved from 0.86813 to 0.87725, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model1.hdf5
Epoch 4/20

Epoch 00004: val_acc improved from 0.87725 to 0.87998, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model1.hdf5
Epoch 5/20

Epoch 00005: val_acc improved from 0.87998 to 0.89059, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model1.hdf5
Epoch 6/20

Epoch 00006: val_acc did not improve from 0.89059
Epoch 7/20

Epoch 00007: val_acc did not improve from 0.89059
Epoch 8/20

Epoch 00008: val_acc did not improve from 0.89059
Epoch 9/20

Epoch 00009: val_acc did not improve from 0.8

<keras.callbacks.History at 0x1b8da777940>

In [24]:
score1 = model.evaluate([x_val_new, x_val_24h, x_val_3m], y_val, batch_size=256, verbose=1)



In [25]:
score1

[0.6476768623631474, 0.8910899296174443, 0.8917144951512099]

In [26]:
x_train_new, x_val_new, x_train_24h, x_train_3m, x_val_24h, x_val_3m, y_train, y_val = shuffle()

In [27]:
input_tensor = Input(shape=(SEQ_LEN,), dtype='int32')
e = Embedding(VOCAB_SIZE, 300, weights=[embedding_matrix], input_length=SEQ_LEN, trainable=False)(input_tensor)
x = Bidirectional(CuDNNLSTM(256, return_sequences=True))(e)
x = Bidirectional(CuDNNLSTM(128, return_sequences=True))(x)
x = Bidirectional(CuDNNLSTM(64, return_sequences=False))(x)
x_2 = Input(shape=(1,), dtype='float32')
x_3 = Input(shape=(1,), dtype='float32')
output = concatenate([x, x_2, x_3])
output = Dense(128, activation='relu')(output)
output = Dropout(0.5)(output)
output = Dense(7, activation='softmax')(output)
model = Model([input_tensor, x_2, x_3], output)

In [28]:
checkpoint = ModelCheckpoint('D:/Datasets/hackerearth/models/word2vec-lstm-6-model2.hdf5', monitor='val_acc', save_best_only=True, verbose=1, mode='max')
model.compile(optimizer=Adam(lr=1e-3, decay=1e-6), loss='categorical_crossentropy', metrics=['accuracy', f1])

In [29]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 60)           0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 60, 300)      6243000     input_4[0][0]                    
__________________________________________________________________________________________________
bidirectional_4 (Bidirectional) (None, 60, 512)      1142784     embedding_2[0][0]                
__________________________________________________________________________________________________
bidirectional_5 (Bidirectional) (None, 60, 256)      657408      bidirectional_4[0][0]            
__________________________________________________________________________________________________
bidirectio

In [30]:
model.fit([x_train_new, x_train_24h, x_train_3m], y_train,
           validation_data=([x_val_new, x_val_24h, x_val_3m], y_val),
           callbacks=[checkpoint],
           epochs=20,
           verbose=1)

Train on 48256 samples, validate on 12065 samples
Epoch 1/20

Epoch 00001: val_acc improved from -inf to 0.85595, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model2.hdf5
Epoch 2/20

Epoch 00002: val_acc improved from 0.85595 to 0.87037, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model2.hdf5
Epoch 3/20

Epoch 00003: val_acc improved from 0.87037 to 0.87053, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model2.hdf5
Epoch 4/20

Epoch 00004: val_acc did not improve from 0.87053
Epoch 5/20

Epoch 00005: val_acc improved from 0.87053 to 0.88462, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model2.hdf5
Epoch 6/20

Epoch 00006: val_acc improved from 0.88462 to 0.88504, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model2.hdf5
Epoch 7/20

Epoch 00007: val_acc improved from 0.88504 to 0.88993, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model2.hdf5
Epoch 8/20

Epoch 00008: val_acc did no

<keras.callbacks.History at 0x1bbb845e5c0>

In [31]:
score2 = model.evaluate([x_val_new, x_val_24h, x_val_3m], y_val, batch_size=256, verbose=1)
score2



[0.6919101040441369, 0.8861997514160352, 0.8861684659927453]

In [32]:
x_train_new, x_val_new, x_train_24h, x_train_3m, x_val_24h, x_val_3m, y_train, y_val = shuffle()

In [33]:
input_tensor = Input(shape=(SEQ_LEN,), dtype='int32')
e = Embedding(VOCAB_SIZE, 300, weights=[embedding_matrix], input_length=SEQ_LEN, trainable=False)(input_tensor)
x = Bidirectional(CuDNNLSTM(128, return_sequences=True))(e)
x = Bidirectional(CuDNNLSTM(64, return_sequences=False))(x)
x_2 = Input(shape=(1,), dtype='float32')
x_3 = Input(shape=(1,), dtype='float32')
output = concatenate([x, x_2, x_3])
output = Dense(128, activation='relu')(output)
output = Dropout(0.5)(output)
output = Dense(7, activation='softmax')(output)
model = Model([input_tensor, x_2, x_3], output)

In [34]:
checkpoint = ModelCheckpoint('D:/Datasets/hackerearth/models/word2vec-lstm-6-model3.hdf5', monitor='val_acc', save_best_only=True, verbose=1, mode='max')
model.compile(optimizer=Adam(lr=1e-3, decay=1e-6), loss='categorical_crossentropy', metrics=['accuracy', f1])

In [35]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 60)           0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 60, 300)      6243000     input_7[0][0]                    
__________________________________________________________________________________________________
bidirectional_7 (Bidirectional) (None, 60, 256)      440320      embedding_3[0][0]                
__________________________________________________________________________________________________
bidirectional_8 (Bidirectional) (None, 128)          164864      bidirectional_7[0][0]            
__________________________________________________________________________________________________
input_8 (I

In [36]:
model.fit([x_train_new, x_train_24h, x_train_3m], y_train,
           validation_data=([x_val_new, x_val_24h, x_val_3m], y_val),
           callbacks=[checkpoint],
           epochs=20,
           verbose=1)

Train on 48256 samples, validate on 12065 samples
Epoch 1/20

Epoch 00001: val_acc improved from -inf to 0.85603, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model3.hdf5
Epoch 2/20

Epoch 00002: val_acc improved from 0.85603 to 0.87029, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model3.hdf5
Epoch 3/20

Epoch 00003: val_acc improved from 0.87029 to 0.88222, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model3.hdf5
Epoch 4/20

Epoch 00004: val_acc improved from 0.88222 to 0.88264, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model3.hdf5
Epoch 5/20

Epoch 00005: val_acc improved from 0.88264 to 0.89051, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model3.hdf5
Epoch 6/20

Epoch 00006: val_acc did not improve from 0.89051
Epoch 7/20

Epoch 00007: val_acc improved from 0.89051 to 0.89092, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model3.hdf5
Epoch 8/20

Epoch 00008: val_acc did no

<keras.callbacks.History at 0x1b8da777a20>

In [37]:
score3 = model.evaluate([x_val_new, x_val_24h, x_val_3m], y_val, batch_size=256, verbose=1)
score3



[0.7172850785885814, 0.8942395358376122, 0.8941674241424842]

In [38]:
x_train_new, x_val_new, x_train_24h, x_train_3m, x_val_24h, x_val_3m, y_train, y_val = shuffle()

In [39]:
input_tensor = Input(shape=(SEQ_LEN,), dtype='int32')
e = Embedding(VOCAB_SIZE, 300, weights=[embedding_matrix], input_length=SEQ_LEN, trainable=False)(input_tensor)
x = Bidirectional(CuDNNLSTM(128, return_sequences=True))(e)
x = Bidirectional(CuDNNLSTM(64, return_sequences=False))(x)
x_2 = Input(shape=(1,), dtype='float32')
x_3 = Input(shape=(1,), dtype='float32')
output = concatenate([x, x_2, x_3])
output = Dense(128, activation='relu')(output)
output = Dropout(0.5)(output)
output = Dense(7, activation='softmax')(output)
model = Model([input_tensor, x_2, x_3], output)

In [40]:
checkpoint = ModelCheckpoint('D:/Datasets/hackerearth/models/word2vec-lstm-6-model4.hdf5', monitor='val_acc', save_best_only=True, verbose=1, mode='max')
model.compile(optimizer=Adam(lr=1e-3, decay=1e-6), loss='categorical_crossentropy', metrics=['accuracy', f1])

In [41]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_10 (InputLayer)           (None, 60)           0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 60, 300)      6243000     input_10[0][0]                   
__________________________________________________________________________________________________
bidirectional_9 (Bidirectional) (None, 60, 256)      440320      embedding_4[0][0]                
__________________________________________________________________________________________________
bidirectional_10 (Bidirectional (None, 128)          164864      bidirectional_9[0][0]            
__________________________________________________________________________________________________
input_11 (

In [42]:
model.fit([x_train_new, x_train_24h, x_train_3m], y_train,
           validation_data=([x_val_new, x_val_24h, x_val_3m], y_val),
           callbacks=[checkpoint],
           epochs=20,
           verbose=1)

Train on 48256 samples, validate on 12065 samples
Epoch 1/20

Epoch 00001: val_acc improved from -inf to 0.86042, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model4.hdf5
Epoch 2/20

Epoch 00002: val_acc improved from 0.86042 to 0.86855, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model4.hdf5
Epoch 3/20

Epoch 00003: val_acc improved from 0.86855 to 0.87576, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model4.hdf5
Epoch 4/20

Epoch 00004: val_acc improved from 0.87576 to 0.87717, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model4.hdf5
Epoch 5/20

Epoch 00005: val_acc improved from 0.87717 to 0.88695, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model4.hdf5
Epoch 6/20

Epoch 00006: val_acc did not improve from 0.88695
Epoch 7/20

Epoch 00007: val_acc did not improve from 0.88695
Epoch 8/20

Epoch 00008: val_acc improved from 0.88695 to 0.89175, saving model to D:/Datasets/hackerearth/models/word2

<keras.callbacks.History at 0x1bbbea37748>

In [43]:
score4 = model.evaluate([x_val_new, x_val_24h, x_val_3m], y_val, batch_size=256, verbose=1)
score4



[0.6838089465600942, 0.8910070452460895, 0.8914537867618407]

In [44]:
x_train_new, x_val_new, x_train_24h, x_train_3m, x_val_24h, x_val_3m, y_train, y_val = shuffle()

In [45]:
input_tensor = Input(shape=(SEQ_LEN,), dtype='int32')
e = Embedding(VOCAB_SIZE, 300, weights=[embedding_matrix], input_length=SEQ_LEN, trainable=False)(input_tensor)
x = Bidirectional(CuDNNLSTM(128, return_sequences=True))(e)
x = Bidirectional(CuDNNLSTM(64, return_sequences=False))(x)
x_2 = Input(shape=(1,), dtype='float32')
x_3 = Input(shape=(1,), dtype='float32')
output = concatenate([x, x_2, x_3])
output = Dense(256, activation='relu')(output)
output = Dropout(0.5)(output)
output = Dense(64, activation='relu')(output)
output = Dropout(0.2)(output)
output = Dense(7, activation='softmax')(output)
model = Model([input_tensor, x_2, x_3], output)

In [46]:
checkpoint = ModelCheckpoint('D:/Datasets/hackerearth/models/word2vec-lstm-6-model5.hdf5', monitor='val_acc', save_best_only=True, verbose=1, mode='max')
model.compile(optimizer=Adam(lr=1e-3, decay=1e-6), loss='categorical_crossentropy', metrics=['accuracy', f1])

In [47]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_13 (InputLayer)           (None, 60)           0                                            
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 60, 300)      6243000     input_13[0][0]                   
__________________________________________________________________________________________________
bidirectional_11 (Bidirectional (None, 60, 256)      440320      embedding_5[0][0]                
__________________________________________________________________________________________________
bidirectional_12 (Bidirectional (None, 128)          164864      bidirectional_11[0][0]           
__________________________________________________________________________________________________
input_14 (

In [48]:
model.fit([x_train_new, x_train_24h, x_train_3m], y_train,
           validation_data=([x_val_new, x_val_24h, x_val_3m], y_val),
           callbacks=[checkpoint],
           epochs=20,
           verbose=1)

Train on 48256 samples, validate on 12065 samples
Epoch 1/20

Epoch 00001: val_acc improved from -inf to 0.85760, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model5.hdf5
Epoch 2/20

Epoch 00002: val_acc improved from 0.85760 to 0.86448, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model5.hdf5
Epoch 3/20

Epoch 00003: val_acc improved from 0.86448 to 0.87634, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model5.hdf5
Epoch 4/20

Epoch 00004: val_acc improved from 0.87634 to 0.88131, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model5.hdf5
Epoch 5/20

Epoch 00005: val_acc improved from 0.88131 to 0.88346, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model5.hdf5
Epoch 6/20

Epoch 00006: val_acc improved from 0.88346 to 0.88587, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model5.hdf5
Epoch 7/20

Epoch 00007: val_acc improved from 0.88587 to 0.88595, saving model to D:/Datasets/hacker

<keras.callbacks.History at 0x1bbbdf27c88>

In [49]:
score5 = model.evaluate([x_val_new, x_val_24h, x_val_3m], y_val, batch_size=256, verbose=1)
score5



[0.7376303530312454, 0.8871114795602213, 0.8870814942641718]

In [50]:
x_train_new, x_val_new, x_train_24h, x_train_3m, x_val_24h, x_val_3m, y_train, y_val = shuffle()

In [51]:
input_tensor = Input(shape=(SEQ_LEN,), dtype='int32')
e = Embedding(VOCAB_SIZE, 300, weights=[embedding_matrix], input_length=SEQ_LEN, trainable=False)(input_tensor)
x = Bidirectional(CuDNNLSTM(128, return_sequences=True))(e)
x = Bidirectional(CuDNNLSTM(64, return_sequences=False))(x)
x_2 = Input(shape=(1,), dtype='float32')
x_3 = Input(shape=(1,), dtype='float32')
output = concatenate([x, x_2, x_3])
output = Dense(256, activation='relu')(output)
output = Dropout(0.5)(output)
output = Dense(64, activation='relu')(output)
output = Dropout(0.2)(output)
output = Dense(7, activation='softmax')(output)
model = Model([input_tensor, x_2, x_3], output)

In [52]:
checkpoint = ModelCheckpoint('D:/Datasets/hackerearth/models/word2vec-lstm-6-model6.hdf5', monitor='val_acc', save_best_only=True, verbose=1, mode='max')
model.compile(optimizer=Adam(lr=1e-3, decay=1e-6), loss='categorical_crossentropy', metrics=['accuracy', f1])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_16 (InputLayer)           (None, 60)           0                                            
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 60, 300)      6243000     input_16[0][0]                   
__________________________________________________________________________________________________
bidirectional_13 (Bidirectional (None, 60, 256)      440320      embedding_6[0][0]                
__________________________________________________________________________________________________
bidirectional_14 (Bidirectional (None, 128)          164864      bidirectional_13[0][0]           
__________________________________________________________________________________________________
input_17 (

In [53]:
model.fit([x_train_new, x_train_24h, x_train_3m], y_train,
           validation_data=([x_val_new, x_val_24h, x_val_3m], y_val),
           callbacks=[checkpoint],
           epochs=20,
           verbose=1)

Train on 48256 samples, validate on 12065 samples
Epoch 1/20

Epoch 00001: val_acc improved from -inf to 0.86051, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model6.hdf5
Epoch 2/20

Epoch 00002: val_acc improved from 0.86051 to 0.87816, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model6.hdf5
Epoch 3/20

Epoch 00003: val_acc improved from 0.87816 to 0.87965, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model6.hdf5
Epoch 4/20

Epoch 00004: val_acc improved from 0.87965 to 0.88703, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model6.hdf5
Epoch 5/20

Epoch 00005: val_acc improved from 0.88703 to 0.89018, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model6.hdf5
Epoch 6/20

Epoch 00006: val_acc did not improve from 0.89018
Epoch 7/20

Epoch 00007: val_acc improved from 0.89018 to 0.89283, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model6.hdf5
Epoch 8/20

Epoch 00008: val_acc improv

<keras.callbacks.History at 0x1bbc3597550>

In [54]:
score6 = model.evaluate([x_val_new, x_val_24h, x_val_3m], y_val, batch_size=256, verbose=1)
score6



[0.7967415176483293, 0.8779113137025434, 0.8778340134342034]

In [55]:
x_train_new, x_val_new, x_train_24h, x_train_3m, x_val_24h, x_val_3m, y_train, y_val = shuffle()

In [56]:
input_tensor = Input(shape=(SEQ_LEN,), dtype='int32')
e = Embedding(VOCAB_SIZE, 300, weights=[embedding_matrix], input_length=SEQ_LEN, trainable=False)(input_tensor)
x = Bidirectional(CuDNNLSTM(128, return_sequences=True))(e)
x = Bidirectional(CuDNNLSTM(64, return_sequences=False))(x)
x_2 = Input(shape=(1,), dtype='float32')
x_3 = Input(shape=(1,), dtype='float32')
output = concatenate([x, x_2, x_3])
output = Dense(256, activation='relu')(output)
output = Dropout(0.6)(output)
output = Dense(128, activation='relu')(output)
output = Dropout(0.5)(output)
output = Dense(64, activation='relu')(output)
output = Dropout(0.4)(output)
output = Dense(7, activation='softmax')(output)
model = Model([input_tensor, x_2, x_3], output)

In [57]:
checkpoint = ModelCheckpoint('D:/Datasets/hackerearth/models/word2vec-lstm-6-model7.hdf5', monitor='val_acc', save_best_only=True, verbose=1, mode='max')
model.compile(optimizer=Adam(lr=1e-3, decay=1e-6), loss='categorical_crossentropy', metrics=['accuracy', f1])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_19 (InputLayer)           (None, 60)           0                                            
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, 60, 300)      6243000     input_19[0][0]                   
__________________________________________________________________________________________________
bidirectional_15 (Bidirectional (None, 60, 256)      440320      embedding_7[0][0]                
__________________________________________________________________________________________________
bidirectional_16 (Bidirectional (None, 128)          164864      bidirectional_15[0][0]           
__________________________________________________________________________________________________
input_20 (

In [58]:
model.fit([x_train_new, x_train_24h, x_train_3m], y_train,
           validation_data=([x_val_new, x_val_24h, x_val_3m], y_val),
           callbacks=[checkpoint],
           epochs=20,
           verbose=1)

Train on 48256 samples, validate on 12065 samples
Epoch 1/20

Epoch 00001: val_acc improved from -inf to 0.84269, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model7.hdf5
Epoch 2/20

Epoch 00002: val_acc improved from 0.84269 to 0.85661, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model7.hdf5
Epoch 3/20

Epoch 00003: val_acc improved from 0.85661 to 0.87667, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model7.hdf5
Epoch 4/20

Epoch 00004: val_acc did not improve from 0.87667
Epoch 5/20

Epoch 00005: val_acc improved from 0.87667 to 0.88413, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model7.hdf5
Epoch 6/20

Epoch 00006: val_acc did not improve from 0.88413
Epoch 7/20

Epoch 00007: val_acc did not improve from 0.88413
Epoch 8/20

Epoch 00008: val_acc improved from 0.88413 to 0.89200, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model7.hdf5
Epoch 9/20

Epoch 00009: val_acc did not improve from 0.8

<keras.callbacks.History at 0x1bbc3597390>

In [59]:
score7 = model.evaluate([x_val_new, x_val_24h, x_val_3m], y_val, batch_size=256, verbose=1)
score7



[0.6367931431005764, 0.89200165775669, 0.8928826240647447]

In [60]:
x_train_new, x_val_new, x_train_24h, x_train_3m, x_val_24h, x_val_3m, y_train, y_val = shuffle()

In [61]:
input_tensor = Input(shape=(SEQ_LEN,), dtype='int32')
e = Embedding(VOCAB_SIZE, 300, weights=[embedding_matrix], input_length=SEQ_LEN, trainable=False)(input_tensor)
x = Bidirectional(CuDNNLSTM(128, return_sequences=True))(e)
x = Bidirectional(CuDNNLSTM(64, return_sequences=False))(x)
x_2 = Input(shape=(1,), dtype='float32')
x_3 = Input(shape=(1,), dtype='float32')
output = concatenate([x, x_2, x_3])
output = Dense(256, activation='relu')(output)
output = Dropout(0.6)(output)
output = Dense(128, activation='relu')(output)
output = Dropout(0.5)(output)
output = Dense(64, activation='relu')(output)
output = Dropout(0.4)(output)
output = Dense(7, activation='softmax')(output)
model = Model([input_tensor, x_2, x_3], output)

In [62]:
checkpoint = ModelCheckpoint('D:/Datasets/hackerearth/models/word2vec-lstm-6-model8.hdf5', monitor='val_acc', save_best_only=True, verbose=1, mode='max')
model.compile(optimizer=Adam(lr=1e-3, decay=1e-6), loss='categorical_crossentropy', metrics=['accuracy', f1])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_22 (InputLayer)           (None, 60)           0                                            
__________________________________________________________________________________________________
embedding_8 (Embedding)         (None, 60, 300)      6243000     input_22[0][0]                   
__________________________________________________________________________________________________
bidirectional_17 (Bidirectional (None, 60, 256)      440320      embedding_8[0][0]                
__________________________________________________________________________________________________
bidirectional_18 (Bidirectional (None, 128)          164864      bidirectional_17[0][0]           
__________________________________________________________________________________________________
input_23 (

In [63]:
model.fit([x_train_new, x_train_24h, x_train_3m], y_train,
           validation_data=([x_val_new, x_val_24h, x_val_3m], y_val),
           callbacks=[checkpoint],
           epochs=20,
           verbose=1)

Train on 48256 samples, validate on 12065 samples
Epoch 1/20

Epoch 00001: val_acc improved from -inf to 0.84244, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model8.hdf5
Epoch 2/20

Epoch 00002: val_acc improved from 0.84244 to 0.86158, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model8.hdf5
Epoch 3/20

Epoch 00003: val_acc improved from 0.86158 to 0.86871, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model8.hdf5
Epoch 4/20

Epoch 00004: val_acc improved from 0.86871 to 0.87526, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model8.hdf5
Epoch 5/20

Epoch 00005: val_acc improved from 0.87526 to 0.87667, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model8.hdf5
Epoch 6/20

Epoch 00006: val_acc improved from 0.87667 to 0.87891, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model8.hdf5
Epoch 7/20

Epoch 00007: val_acc improved from 0.87891 to 0.88090, saving model to D:/Datasets/hacker

<keras.callbacks.History at 0x1bbc3597f98>

In [64]:
score8 = model.evaluate([x_val_new, x_val_24h, x_val_3m], y_val, batch_size=256, verbose=1)
score8



[0.7222323112859798, 0.887940323323172, 0.88859429980925]

In [65]:
x_train_new, x_val_new, x_train_24h, x_train_3m, x_val_24h, x_val_3m, y_train, y_val = shuffle()

In [66]:
input_tensor = Input(shape=(SEQ_LEN,), dtype='int32')
e = Embedding(VOCAB_SIZE, 300, weights=[embedding_matrix], input_length=SEQ_LEN, trainable=False)(input_tensor)
x = Bidirectional(CuDNNLSTM(128, return_sequences=True))(e)
x = Bidirectional(CuDNNLSTM(64, return_sequences=False))(x)
x_2 = Input(shape=(1,), dtype='float32')
x_3 = Input(shape=(1,), dtype='float32')
output = concatenate([x, x_2, x_3])
output = Dense(256)(output)
output = LeakyReLU(alpha=0.1)(output)
output = Dropout(0.6)(output)
output = Dense(128)(output)
output = LeakyReLU(alpha=0.1)(output)
output = Dropout(0.5)(output)
output = Dense(64)(output)
output = LeakyReLU(alpha=0.1)(output)
output = Dropout(0.4)(output)
output = Dense(7, activation='softmax')(output)
model = Model([input_tensor, x_2, x_3], output)

In [67]:
checkpoint = ModelCheckpoint('D:/Datasets/hackerearth/models/word2vec-lstm-6-model9.hdf5', monitor='val_acc', save_best_only=True, verbose=1, mode='max')
model.compile(optimizer=Adam(lr=1e-3, decay=1e-6), loss='categorical_crossentropy', metrics=['accuracy', f1])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_25 (InputLayer)           (None, 60)           0                                            
__________________________________________________________________________________________________
embedding_9 (Embedding)         (None, 60, 300)      6243000     input_25[0][0]                   
__________________________________________________________________________________________________
bidirectional_19 (Bidirectional (None, 60, 256)      440320      embedding_9[0][0]                
__________________________________________________________________________________________________
bidirectional_20 (Bidirectional (None, 128)          164864      bidirectional_19[0][0]           
__________________________________________________________________________________________________
input_26 (

In [68]:
model.fit([x_train_new, x_train_24h, x_train_3m], y_train,
           validation_data=([x_val_new, x_val_24h, x_val_3m], y_val),
           callbacks=[checkpoint],
           epochs=20,
           verbose=1)

Train on 48256 samples, validate on 12065 samples
Epoch 1/20

Epoch 00001: val_acc improved from -inf to 0.85280, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model9.hdf5
Epoch 2/20

Epoch 00002: val_acc improved from 0.85280 to 0.86838, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model9.hdf5
Epoch 3/20

Epoch 00003: val_acc improved from 0.86838 to 0.87824, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model9.hdf5
Epoch 4/20

Epoch 00004: val_acc improved from 0.87824 to 0.88396, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model9.hdf5
Epoch 5/20

Epoch 00005: val_acc improved from 0.88396 to 0.88462, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model9.hdf5
Epoch 6/20

Epoch 00006: val_acc improved from 0.88462 to 0.88943, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model9.hdf5
Epoch 7/20

Epoch 00007: val_acc did not improve from 0.88943
Epoch 8/20

Epoch 00008: val_acc improv

<keras.callbacks.History at 0x1bbd45b55c0>

In [69]:
score9 = model.evaluate([x_val_new, x_val_24h, x_val_3m], y_val, batch_size=256, verbose=1)
score9



[0.6636065094193333, 0.8938251140401218, 0.8946281663026743]

In [70]:
x_train_new, x_val_new, x_train_24h, x_train_3m, x_val_24h, x_val_3m, y_train, y_val = shuffle()

In [71]:
input_tensor = Input(shape=(SEQ_LEN,), dtype='int32')
e = Embedding(VOCAB_SIZE, 300, weights=[embedding_matrix], input_length=SEQ_LEN, trainable=False)(input_tensor)
x = Bidirectional(CuDNNLSTM(128, return_sequences=True))(e)
x = Bidirectional(CuDNNLSTM(64, return_sequences=False))(x)
x_2 = Input(shape=(1,), dtype='float32')
x_3 = Input(shape=(1,), dtype='float32')
output = concatenate([x, x_2, x_3])
output = Dense(256)(output)
output = LeakyReLU(alpha=0.1)(output)
output = Dropout(0.6)(output)
output = Dense(128)(output)
output = LeakyReLU(alpha=0.1)(output)
output = Dropout(0.5)(output)
output = Dense(64)(output)
output = LeakyReLU(alpha=0.1)(output)
output = Dropout(0.4)(output)
output = Dense(7, activation='softmax')(output)
model = Model([input_tensor, x_2, x_3], output)

In [72]:
checkpoint = ModelCheckpoint('D:/Datasets/hackerearth/models/word2vec-lstm-6-model10.hdf5', monitor='val_acc', save_best_only=True, verbose=1, mode='max')
model.compile(optimizer=Adam(lr=1e-3, decay=1e-6), loss='categorical_crossentropy', metrics=['accuracy', f1])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_28 (InputLayer)           (None, 60)           0                                            
__________________________________________________________________________________________________
embedding_10 (Embedding)        (None, 60, 300)      6243000     input_28[0][0]                   
__________________________________________________________________________________________________
bidirectional_21 (Bidirectional (None, 60, 256)      440320      embedding_10[0][0]               
__________________________________________________________________________________________________
bidirectional_22 (Bidirectional (None, 128)          164864      bidirectional_21[0][0]           
__________________________________________________________________________________________________
input_29 (

In [73]:
model.fit([x_train_new, x_train_24h, x_train_3m], y_train,
           validation_data=([x_val_new, x_val_24h, x_val_3m], y_val),
           callbacks=[checkpoint],
           epochs=20,
           verbose=1)

Train on 48256 samples, validate on 12065 samples
Epoch 1/20

Epoch 00001: val_acc improved from -inf to 0.85562, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model10.hdf5
Epoch 2/20

Epoch 00002: val_acc improved from 0.85562 to 0.86283, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model10.hdf5
Epoch 3/20

Epoch 00003: val_acc improved from 0.86283 to 0.86821, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model10.hdf5
Epoch 4/20

Epoch 00004: val_acc did not improve from 0.86821
Epoch 5/20

Epoch 00005: val_acc improved from 0.86821 to 0.87816, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model10.hdf5
Epoch 6/20

Epoch 00006: val_acc improved from 0.87816 to 0.89291, saving model to D:/Datasets/hackerearth/models/word2vec-lstm-6-model10.hdf5
Epoch 7/20

Epoch 00007: val_acc did not improve from 0.89291
Epoch 8/20

Epoch 00008: val_acc did not improve from 0.89291
Epoch 9/20

Epoch 00009: val_acc improved from 0.892

<keras.callbacks.History at 0x1bbd934f390>

In [74]:
score10 = model.evaluate([x_val_new, x_val_24h, x_val_3m], y_val, batch_size=256, verbose=1)
score10



[0.6692936087366066, 0.8906755077409092, 0.8915371985243803]

## Testing

In [18]:
df_test = pd.read_csv('D:/Datasets/hackerearth/hm_test.csv')
df_test.drop(['num_sentence'], axis=1, inplace=True)
df_test.head()

Unnamed: 0,hmid,reflection_period,cleaned_hm
0,88305,3m,I spent the weekend in Chicago with my friends.
1,88306,3m,We moved back into our house after a remodel. ...
2,88307,3m,My fiance proposed to me in front of my family...
3,88308,3m,I ate lobster at a fancy restaurant with some ...
4,88309,3m,I went out to a nice restaurant on a date with...


In [19]:
df_test.cleaned_hm = df_test.cleaned_hm.str.lower()
df_test['cleaned_hm2'] = df_test.cleaned_hm.apply(remove_stopwords)
df_test.drop(['cleaned_hm'], axis=1, inplace=True)
df_test.head()

Unnamed: 0,hmid,reflection_period,cleaned_hm2
0,88305,3m,i spent the weekend in chicago with my friends .
1,88306,3m,we moved back into our house after a remodel ....
2,88307,3m,my fiance proposed to me in front of my family...
3,88308,3m,i ate lobster at a fancy restaurant with some ...
4,88309,3m,went nice restaurant date wife . popular resta...


In [20]:
encoded_test_set = t.texts_to_sequences(df_test.cleaned_hm2)
lengths = []
for doc in encoded_test_set:
    lengths.append(len(doc))
    
np.mean(lengths)

16.205356476761246

In [21]:
df_test['tokens'] = encoded_test_set
df_test.head()

Unnamed: 0,hmid,reflection_period,cleaned_hm2,tokens
0,88305,3m,i spent the weekend in chicago with my friends .,"[1, 198, 6, 172, 10, 1928, 11, 2, 48]"
1,88306,3m,we moved back into our house after a remodel ....,"[25, 416, 96, 160, 58, 107, 44, 3, 5123, 25, 1..."
2,88307,3m,my fiance proposed to me in front of my family...,"[2, 682, 1762, 4, 9, 10, 562, 13, 2, 49, 10, 6..."
3,88308,3m,i ate lobster at a fancy restaurant with some ...,"[1, 158, 4647, 20, 3, 1517, 239, 11, 46, 48]"
4,88309,3m,went nice restaurant date wife . popular resta...,"[21, 81, 239, 314, 88, 1710, 239, 151, 55, 511..."


In [22]:
df_test.drop(['cleaned_hm2'], axis=1, inplace=True)
df_test.head()

Unnamed: 0,hmid,reflection_period,tokens
0,88305,3m,"[1, 198, 6, 172, 10, 1928, 11, 2, 48]"
1,88306,3m,"[25, 416, 96, 160, 58, 107, 44, 3, 5123, 25, 1..."
2,88307,3m,"[2, 682, 1762, 4, 9, 10, 562, 13, 2, 49, 10, 6..."
3,88308,3m,"[1, 158, 4647, 20, 3, 1517, 239, 11, 46, 48]"
4,88309,3m,"[21, 81, 239, 314, 88, 1710, 239, 151, 55, 511..."


In [23]:
padded_test_set = pad_sequences(encoded_test_set, maxlen=SEQ_LEN, padding='post')
paddocs_test = []
for doc in padded_test_set:
    paddocs_test.append(list(doc))
    
df_test['tokens2'] = paddocs_test
lengths = []
for doc in paddocs_test:
    lengths.append(len(doc))
np.mean(lengths)

60.0

In [24]:
df_test.drop(['tokens'], axis=1, inplace=True)
df_test.rename(index=str, columns={'tokens2': 'tokens'}, inplace=True)
df_test.head()

Unnamed: 0,hmid,reflection_period,tokens
0,88305,3m,"[1, 198, 6, 172, 10, 1928, 11, 2, 48, 0, 0, 0,..."
1,88306,3m,"[25, 416, 96, 160, 58, 107, 44, 3, 5123, 25, 1..."
2,88307,3m,"[2, 682, 1762, 4, 9, 10, 562, 13, 2, 49, 10, 6..."
3,88308,3m,"[1, 158, 4647, 20, 3, 1517, 239, 11, 46, 48, 0..."
4,88309,3m,"[21, 81, 239, 314, 88, 1710, 239, 151, 55, 511..."


In [25]:
df_test['24h'] = df_test.reflection_period == '24h'
df_test['3m'] = df_test.reflection_period == '3m'
df_test.drop(['reflection_period'], axis=1, inplace=True)
df_test.head()

Unnamed: 0,hmid,tokens,24h,3m
0,88305,"[1, 198, 6, 172, 10, 1928, 11, 2, 48, 0, 0, 0,...",False,True
1,88306,"[25, 416, 96, 160, 58, 107, 44, 3, 5123, 25, 1...",False,True
2,88307,"[2, 682, 1762, 4, 9, 10, 562, 13, 2, 49, 10, 6...",False,True
3,88308,"[1, 158, 4647, 20, 3, 1517, 239, 11, 46, 48, 0...",False,True
4,88309,"[21, 81, 239, 314, 88, 1710, 239, 151, 55, 511...",False,True


In [26]:
print(df_test.shape)

(40213, 4)


In [27]:
x_test = df_test.tokens
x_test_new = []
for element in x_test:
    x_test_new.append(np.array(element))
x_test_new = np.array(x_test_new)

In [30]:
x_test_24h = df_test['24h']
x_test_3m = df_test['3m']

In [32]:
from keras.models import load_model

In [34]:
model = load_model('D:/Datasets/hackerearth/models/word2vec-lstm-6-model1.hdf5', custom_objects={'f1': f1})

In [35]:
preds = model.predict([x_test_new, x_test_24h, x_test_3m], batch_size=256, verbose=1)



In [39]:
np.save('D:/Datasets/hackerearth/models/word2vec-lstm-6-model1-preds.npy', preds)

In [40]:
model = load_model('D:/Datasets/hackerearth/models/word2vec-lstm-6-model2.hdf5', custom_objects={'f1': f1})

In [41]:
preds = model.predict([x_test_new, x_test_24h, x_test_3m], batch_size=256, verbose=1)



In [42]:
np.save('D:/Datasets/hackerearth/models/word2vec-lstm-6-model2-preds.npy', preds)

In [43]:
model = load_model('D:/Datasets/hackerearth/models/word2vec-lstm-6-model3.hdf5', custom_objects={'f1': f1})
preds = model.predict([x_test_new, x_test_24h, x_test_3m], batch_size=256, verbose=1)
np.save('D:/Datasets/hackerearth/models/word2vec-lstm-6-model3-preds.npy', preds)



In [44]:
model = load_model('D:/Datasets/hackerearth/models/word2vec-lstm-6-model4.hdf5', custom_objects={'f1': f1})
preds = model.predict([x_test_new, x_test_24h, x_test_3m], batch_size=256, verbose=1)
np.save('D:/Datasets/hackerearth/models/word2vec-lstm-6-model4-preds.npy', preds)



In [45]:
model = load_model('D:/Datasets/hackerearth/models/word2vec-lstm-6-model5.hdf5', custom_objects={'f1': f1})
preds = model.predict([x_test_new, x_test_24h, x_test_3m], batch_size=256, verbose=1)
np.save('D:/Datasets/hackerearth/models/word2vec-lstm-6-model5-preds.npy', preds)



In [46]:
model = load_model('D:/Datasets/hackerearth/models/word2vec-lstm-6-model6.hdf5', custom_objects={'f1': f1})
preds = model.predict([x_test_new, x_test_24h, x_test_3m], batch_size=256, verbose=1)
np.save('D:/Datasets/hackerearth/models/word2vec-lstm-6-model6-preds.npy', preds)



In [47]:
model = load_model('D:/Datasets/hackerearth/models/word2vec-lstm-6-model7.hdf5', custom_objects={'f1': f1})
preds = model.predict([x_test_new, x_test_24h, x_test_3m], batch_size=256, verbose=1)
np.save('D:/Datasets/hackerearth/models/word2vec-lstm-6-model7-preds.npy', preds)



In [48]:
model = load_model('D:/Datasets/hackerearth/models/word2vec-lstm-6-model8.hdf5', custom_objects={'f1': f1})
preds = model.predict([x_test_new, x_test_24h, x_test_3m], batch_size=256, verbose=1)
np.save('D:/Datasets/hackerearth/models/word2vec-lstm-6-model8-preds.npy', preds)



In [49]:
model = load_model('D:/Datasets/hackerearth/models/word2vec-lstm-6-model9.hdf5', custom_objects={'f1': f1})
preds = model.predict([x_test_new, x_test_24h, x_test_3m], batch_size=256, verbose=1)
np.save('D:/Datasets/hackerearth/models/word2vec-lstm-6-model9-preds.npy', preds)



In [50]:
model = load_model('D:/Datasets/hackerearth/models/word2vec-lstm-6-model10.hdf5', custom_objects={'f1': f1})
preds = model.predict([x_test_new, x_test_24h, x_test_3m], batch_size=256, verbose=1)
np.save('D:/Datasets/hackerearth/models/word2vec-lstm-6-model10-preds.npy', preds)



## Non-trainable hard-voting
Used to generate `submission_word2vec-lstm-6-hard.csv`

In [53]:
predictions = []
for i in range(1, 11):
    pred = np.load(f'D:/Datasets/hackerearth/models/word2vec-lstm-6-model{i}-preds.npy')
    predictions.append(pred)
    
predictions = np.array(predictions)

In [54]:
predictions.shape

(10, 40213, 7)

In [75]:
votes = []

for i in range(40213):
    count = np.zeros((11,))
    for j in range(10):
        count[np.argmax(predictions[j][i])] += 1
    
    dummy = np.zeros((7,))
    dummy[np.argmax(count)] = 1
    votes.append(dummy)

In [76]:
categories = []
for vote in votes:
    categories.append(cats_to_labels[tuple(vote)])

In [77]:
categories[:10]

['bonding',
 'achievement',
 'affection',
 'bonding',
 'affection',
 'leisure',
 'achievement',
 'affection',
 'leisure',
 'bonding']

In [71]:
df_test.head()

Unnamed: 0,hmid,tokens,24h,3m
0,88305,"[1, 198, 6, 172, 10, 1928, 11, 2, 48, 0, 0, 0,...",False,True
1,88306,"[25, 416, 96, 160, 58, 107, 44, 3, 5123, 25, 1...",False,True
2,88307,"[2, 682, 1762, 4, 9, 10, 562, 13, 2, 49, 10, 6...",False,True
3,88308,"[1, 158, 4647, 20, 3, 1517, 239, 11, 46, 48, 0...",False,True
4,88309,"[21, 81, 239, 314, 88, 1710, 239, 151, 55, 511...",False,True


In [72]:
df_pred = df_test.copy()
df_pred.drop(['tokens', '24h', '3m'], axis=1, inplace=True)
df_pred.head()

Unnamed: 0,hmid
0,88305
1,88306
2,88307
3,88308
4,88309


In [78]:
len(categories)

40213

In [79]:
df_pred['predicted_category'] = categories

In [80]:
df_pred.to_csv('D:/Datasets/hackerearth/submission_word2vec-lstm-6-hard.csv', index=False)

## Non-trainable soft voting
Used to generate `submission_word2vec-lstm-6-soft.csv`

In [1]:
import numpy as np
import pandas as pd

In [2]:
predictions = []
for i in range(1, 11):
    pred = np.load(f'D:/Datasets/hackerearth/models/word2vec-lstm-6-model{i}-preds.npy')
    predictions.append(pred)
    
predictions = np.array(predictions)    

In [3]:
predictions.shape

(10, 40213, 7)

In [13]:
votes = []

for i in range(40213):
    vote = np.zeros((7,))
    for j in range(10):
        vote += predictions[j][i]
        
    dummy = np.zeros((7,))
    dummy[np.argmax(vote)] = 1
    votes.append(dummy)

In [14]:
categories = []
for vote in votes:
    categories.append(cats_to_labels[tuple(vote)])

In [15]:
categories[:10]

['bonding',
 'achievement',
 'affection',
 'bonding',
 'affection',
 'leisure',
 'achievement',
 'affection',
 'leisure',
 'bonding']

In [16]:
df_test = pd.read_csv('D:/Datasets/hackerearth/hm_test.csv')
df_test.drop(['num_sentence'], axis=1, inplace=True)
df_test.head()

Unnamed: 0,hmid,reflection_period,cleaned_hm
0,88305,3m,I spent the weekend in Chicago with my friends.
1,88306,3m,We moved back into our house after a remodel. ...
2,88307,3m,My fiance proposed to me in front of my family...
3,88308,3m,I ate lobster at a fancy restaurant with some ...
4,88309,3m,I went out to a nice restaurant on a date with...


In [18]:
df_test.drop(['reflection_period', 'cleaned_hm'], axis=1, inplace=True)

In [19]:
df_test.head()

Unnamed: 0,hmid
0,88305
1,88306
2,88307
3,88308
4,88309


In [20]:
df_test['predicted_category'] = categories

In [21]:
df_test.shape

(40213, 2)

In [22]:
df_test.head()

Unnamed: 0,hmid,predicted_category
0,88305,bonding
1,88306,achievement
2,88307,affection
3,88308,bonding
4,88309,affection


In [23]:
df_test.to_csv('D:/Datasets/hackerearth/submission_word2vec-lstm-6-soft.csv', index=False)

## Check if both methods differ

In [1]:
import numpy as np
import pandas as pd

In [2]:
df_soft = pd.read_csv('D:/Datasets/hackerearth/submission_word2vec-lstm-6-soft.csv')

In [5]:
df_hard = pd.read_csv('D:/Datasets/hackerearth/submission_word2vec-lstm-6-hard.csv')

In [8]:
count = 0
for i in range(len(df_hard)):
    if df_hard.predicted_category[i] == df_soft.predicted_category[i]:
        count +=1

In [9]:
count

39724

In [10]:
count * 100 / len(df_hard)

98.78397533136051

Soft voting and hard voting is 98% similar