In [1]:
import tensorflow as tf
import tensorflow_hub as hub

import pandas as pd
import numpy as np

import os
import re
import nltk
import gensim

from random import shuffle
from nltk.corpus import stopwords
from gensim.models.word2vec import Word2Vec
from sklearn.model_selection import train_test_split

from keras import backend as K
from keras.models import Sequential, Model, load_model
from keras.engine import Layer
from keras.layers import Input, Dense, Flatten, Dropout, Embedding, CuDNNLSTM, Bidirectional, concatenate
from keras.optimizers import Adam, Adagrad
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
def f1(y_true, y_pred):
    '''Batch-wise F1 score: harmonic mean of precision and recall.

    Both tensors are clipped to [0, 1] and rounded before counting, so a
    prediction counts as positive once it rounds to 1. K.epsilon() is added
    to every denominator to avoid division by zero.
    '''

    def _positives(tensor):
        # number of entries that round to 1 after clamping into [0, 1]
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    eps = K.epsilon()
    hits = _positives(y_true * y_pred)

    _precision = hits / (_positives(y_pred) + eps)
    _recall = hits / (_positives(y_true) + eps)

    harmonic = (_precision * _recall) / (_precision + _recall + eps)
    return 2 * harmonic

In [3]:
class ElmoEmbeddingLayer(Layer):
    '''Keras layer that embeds raw sentence strings with the TF-Hub ELMo v2 module.

    `call` squeezes axis 1 of the input, so the layer expects one string per
    row. It returns the module's 'default' output, whose shape is reported by
    `compute_output_shape` as (batch, self.dimensions).
    '''

    def __init__(self, **kwargs):
        # Both attributes are assigned before the base-class initialiser runs.
        self.dimensions = 1024
        self.trainable = True
        super().__init__(**kwargs)

    def build(self, input_shape):
        '''Create the hub module and register its trainable variables on this layer.'''
        module_name = '{}_module'.format(self.name)
        self.elmo = hub.Module('https://tfhub.dev/google/elmo/2', trainable=self.trainable, name=module_name)

        # Variables created under the module's name scope are appended to this
        # layer's trainable weights so that Keras treats them as its own.
        scope_pattern = "^{}_module/.*".format(self.name)
        self.trainable_weights += K.tf.trainable_variables(scope=scope_pattern)
        super().build(input_shape)

    def call(self, x, mask=None):
        '''Embed a batch of sentences; `mask` is accepted but not used.'''
        sentences = K.squeeze(K.cast(x, tf.string), axis=1)
        outputs = self.elmo(sentences, as_dict=True, signature='default')
        return outputs['default']

    def compute_mask(self, inputs, mask=None):
        '''Mask is true wherever the input string differs from '--PAD--'.'''
        return K.not_equal(inputs, '--PAD--')

    def compute_output_shape(self, input_shape):
        '''Keep the batch dimension and report `self.dimensions` features.'''
        batch_size = input_shape[0]
        return (batch_size, self.dimensions)

In [4]:
# Read the training CSV, keep the target column aside as `labels`, then drop
# `num_sentence` and the target column from the feature frame.
df = pd.read_csv('D:/Datasets/hackerearth/hm_train.csv')
labels = df['predicted_category']
df = df.drop(columns=['num_sentence', 'predicted_category'])
print(df.shape, len(labels))

(60321, 3) 60321


In [5]:
df.head()

Unnamed: 0,hmid,reflection_period,cleaned_hm
0,27673,24h,I went on a successful date with someone I fel...
1,27674,24h,I was happy when my son got 90% marks in his e...
2,27675,24h,I went to the gym this morning and did yoga.
3,27676,24h,We had a serious talk with some friends of our...
4,27677,24h,I went with grandchildren to butterfly display...


In [6]:
# Token-count threshold: remove_stopwords() only strips stopwords from
# sentences that have more whitespace-separated tokens than this.
SEQ_LEN = 60

In [7]:
def remove_stopwords(sentence):
    '''Normalise a sentence and, only when it is long, strip English stopwords.

    Steps:
      1. Apply a fixed list of textual replacements: pad punctuation with
         spaces, expand a few contractions, and map some British spellings
         to American ones.
      2. If the result has more than SEQ_LEN whitespace-separated tokens,
         return the tokens that are not NLTK English stopwords, joined by
         single spaces. Otherwise return the normalised sentence unchanged
         (padding spaces included).

    Args:
        sentence: the text to clean (a str).

    Returns:
        The cleaned sentence as a str.
    '''
    # Replacements run sequentially in exactly this order, so later patterns
    # see the output of earlier ones. Matching is case-sensitive: "i'm" is
    # rewritten but "I'm" is left as is.
    replacements = (
        ("\n", " "),
        (";", " ; "),
        (":", " : "),
        (",", " , "),
        (".", " . "),
        ("?", " ? "),
        ("/", " / "),
        # Properly escaped backslash; the previous " \ " literal relied on an
        # invalid escape sequence, which Python warns about.
        ("\\", " \\ "),
        ("'s", ""),
        ("n't", " not"),
        ("travelled", "traveled"),
        ("traveller", "traveler"),
        ("cancelled", "canceled"),
        ("favourite", "favorite"),
        ("i'm", "i am"),
        ("i've", "i have"),
        ("colour", "color"),
        ("neighbour", "neighbor"),
        ("jewellery", "jewelry"),
        ("theatre", "theater"),
        ("i'd", "i would"),
        ("didnt", "did not"),
        ("doesnt", "does not"),
        ("wasnt", "was not"),
        ("programme", "program"),
        ("organise", "organize"),
    )
    for old, new in replacements:
        sentence = sentence.replace(old, new)

    tokens = sentence.split()
    if len(tokens) <= SEQ_LEN:
        return sentence

    # Load the stopword list once per call, not once per token, and use a set
    # for O(1) membership tests.
    stop_set = set(stopwords.words('english'))
    return ' '.join(w for w in tokens if w not in stop_set)

In [8]:
# Run remove_stopwords over every row of `cleaned_hm` and store the result in
# a new `cleaned_hm2` column.
df['cleaned_hm2'] = df.cleaned_hm.apply(remove_stopwords)

In [9]:
# `cleaned_hm2` now holds the processed text, so drop the original column
# before previewing the frame.
df = df.drop(columns=['cleaned_hm'])
df.head()

Unnamed: 0,hmid,reflection_period,cleaned_hm2
0,27673,24h,I went on a successful date with someone I fel...
1,27674,24h,I was happy when my son got 90% marks in his e...
2,27675,24h,I went to the gym this morning and did yoga .
3,27676,24h,We had a serious talk with some friends of our...
4,27677,24h,I went with grandchildren to butterfly display...


In [10]:
# One-hot target vector for each of the seven categories. A name's position
# in the tuple below is the index of its single 1.
labels_to_cats = {
    name: tuple(1 if slot == idx else 0 for slot in range(7))
    for idx, name in enumerate((
        'achievement',
        'affection',
        'enjoy_the_moment',
        'nature',
        'exercise',
        'bonding',
        'leisure',
    ))
}

In [11]:
# Reverse lookup: one-hot tuple -> category name.
cats_to_labels = {one_hot: name for name, one_hot in labels_to_cats.items()}

# Encode each row's label as its one-hot vector and stack them into an array.
y = np.array([labels_to_cats[label] for label in labels])
len(y)

60321

In [12]:
# Hold out 20% of the rows for validation. The fixed random_state makes the
# split reproducible across kernel restarts; without it a different split is
# drawn on every run, so validation scores are not comparable between runs.
x_train, x_val, y_train, y_val = train_test_split(df, y, test_size=0.2, random_state=42)
print(x_train.shape, x_val.shape)
print(y_train.shape, y_val.shape)

(48256, 3) (12065, 3)
(48256, 7) (12065, 7)


In [13]:
# Pull the cleaned sentences out of each split as 1-D NumPy arrays, one
# sentence per element. These arrays are what model.fit / model.evaluate
# receive as inputs.
x_train_new = np.array([np.array(text) for text in x_train.cleaned_hm2])
x_val_new = np.array([np.array(text) for text in x_val.cleaned_hm2])

print(x_val_new.shape, x_train_new.shape)

(12065,) (48256,)


In [14]:
x_train_new[:5]

array(['meatball sandwich for dinner',
       'I took a walk down my favorite trail on a beautiful sunny day and took photos of the scenery . ',
       'My fiance finding a new job and it pays way more than his present one . ',
       '3 months ago I got a promotion at my job and it made me so happy because all my hard work had been accounted for .  ',
       "I have a daughter .  She is my life ,  I'm very happy for my daughter ,  her name is BIHU  ,  she is very beautiful and sweet ,  I'm very happy for this .  "],
      dtype='<U4906')

In [15]:
y_train[:5]

array([[0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0]])

In [16]:
# Classifier on top of ELMo: one raw string per sample -> ElmoEmbeddingLayer
# -> Dense(128, relu) -> 7-unit softmax (one unit per entry in labels_to_cats).
input_tensor = Input(shape=(1,), dtype='string')
embedding = ElmoEmbeddingLayer()(input_tensor)
dense = Dense(128, activation='relu')(embedding)
pred = Dense(7, activation='softmax')(dense)
model = Model(inputs=[input_tensor], outputs=pred)

INFO:tensorflow:Using C:\Users\AMANDE~1\AppData\Local\Temp\tfhub_modules to cache modules.
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [17]:
# Categorical cross-entropy pairs with the one-hot targets; accuracy and the
# custom batch-wise f1 metric are tracked alongside the loss.
model.compile(optimizer=Adam(lr=1e-3, decay=1e-6), loss='categorical_crossentropy', metrics=['accuracy', f1])

In [18]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1)                 0         
_________________________________________________________________
elmo_embedding_layer_1 (Elmo (None, 1024)              4         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               131200    
_________________________________________________________________
dense_2 (Dense)              (None, 7)                 903       
Total params: 132,107
Trainable params: 132,107
Non-trainable params: 0
_________________________________________________________________


In [None]:
# First training run: 5 epochs. No validation data is passed, so only
# training metrics are reported during fitting.
model.fit(x_train_new, y_train, batch_size=8, epochs=5, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5

In [None]:
# Checkpoint after the first 5 epochs.
# NOTE(review): the model uses the custom ElmoEmbeddingLayer and the custom f1
# metric; reloading presumably needs both passed via `custom_objects` - confirm.
model.save('D:/Datasets/hackerearth/models/ELMo-d128-e5.h5')

In [None]:
# Validation metrics for the 5-epoch checkpoint. `score` is still bound for
# the display cell that follows; `score_e5` keeps this result available after
# a later evaluation rebinds `score`, so the runs can be compared.
score_e5 = score = model.evaluate(x_val_new, y_val, batch_size=16, verbose=1)

In [None]:
score

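**TODO:** store each evaluation in its own score variable so that the runs can be compared. Previously recorded scores (`[loss, accuracy, f1]`, following the metrics passed to `model.compile`):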
<br>
[0.3796765271084861, 0.8582677165354331, 0.8589423810798401]
<br>
[0.362189538052504, 0.8653957728968089, 0.8657008712142521]

In [None]:
# Continue training the same `model` for 5 more epochs. Assuming the earlier
# fit cell was run, this brings the total to 10 epochs.
model.fit(x_train_new, y_train, batch_size=8, epochs=5, verbose=1)

In [None]:
# Checkpoint after the second 5-epoch training run.
model.save('D:/Datasets/hackerearth/models/ELMo-d128-e10.h5')

In [None]:
# Validation metrics after the second training run. `score` is still bound
# for the display cell that follows; `score_e10` gives this run its own name
# so it is not lost if `score` is rebound again.
score_e10 = score = model.evaluate(x_val_new, y_val, batch_size=16, verbose=1)

In [None]:
score

In [31]:
# Fresh input tensor and ELMo layer for a second model. This rebinds
# `input_tensor` and `embedding`, which the first model's cell also defined.
input_tensor = Input(shape=(1,), dtype='string')
embedding = ElmoEmbeddingLayer()(input_tensor)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [33]:
embedding.shape

TensorShape([Dimension(None), Dimension(1024)])

In [None]:
# Second architecture: ELMo -> bidirectional CuDNN LSTM -> Dense(128, relu)
# -> 7-unit softmax.
# NOTE(review): `embedding` is 2-D here (ElmoEmbeddingLayer.compute_output_shape
# returns (batch, 1024)), whereas Keras recurrent layers normally expect 3-D
# (batch, timesteps, features) input. This cell presumably fails as written.
# TODO: confirm, and feed a per-token sequence output if an LSTM is intended.
lstm = Bidirectional(CuDNNLSTM(128, return_sequences=False))(embedding)
dense = Dense(128, activation='relu')(lstm)
pred = Dense(7, activation='softmax')(dense)
model2 = Model(inputs=[input_tensor], outputs=pred)