In [1]:
import numpy as np
import pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, concatenate, Lambda
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.optimizers import Adam
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.models import load_model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Exploratory Data Analysis

In [2]:
# Load the raw train and test question sets for exploration
# (paths are relative to the notebook's working directory).
train_df = pd.read_csv("../data/train.csv")
test_df = pd.read_csv("../data/test.csv")

In [3]:
train_df.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [4]:
train_df['target'].values

array([0, 0, 0, ..., 0, 0, 0])

In [5]:
from sklearn.metrics import f1_score

In [6]:
train_df.question_text.str.len().describe().round(1)

count    1306122.0
mean          70.7
std           38.8
min            1.0
25%           45.0
50%           60.0
75%           85.0
max         1017.0
Name: question_text, dtype: float64

In [7]:
test_df.question_text.str.len().describe().round(1)

count    56370.0
mean        70.5
std         38.7
min         11.0
25%         45.0
50%         60.0
75%         85.0
max        588.0
Name: question_text, dtype: float64

In [8]:
from nltk import word_tokenize

In [9]:
train_df[train_df.question_text.str.len() > 100].target.value_counts()

0    187558
1     32373
Name: target, dtype: int64

In [10]:
# Share of positive (target == 1) questions among those longer than 100 characters.
# Computed from the frame rather than from hand-copied counts so it stays in sync with the data.
train_df[train_df.question_text.str.len() > 100].target.mean()

0.1471961660702675

In [11]:
train_df['char_length'] = train_df.question_text.str.len()

In [12]:
test_df[test_df.question_text.str.len() < 20].question_text.head()

16        What is fapping?
632    What is NuAge Skin?
732          Is UPSE good?
907        How can I post?
973    Who created idioms?
Name: question_text, dtype: object

In [13]:
positive = train_df[train_df.target == 1]

In [14]:
positive.head()

Unnamed: 0,qid,question_text,target,char_length
22,0000e91571b60c2fb487,Has the United States become the largest dicta...,1,67
30,00013ceca3f624b09f42,Which babies are more sweeter to their parents...,1,86
110,0004a7fcb2bf73076489,If blacks support school choice and mandatory ...,1,102
114,00052793eaa287aff1e1,I am gay boy and I love my cousin (boy). He is...,1,137
115,000537213b01fd77b58a,Which races have the smallest penis?,1,36


In [15]:
train_df.iloc[107]

qid                                           000485e6dd4b149fe051
question_text    Can I start freelancing after finishing Udacit...
target                                                           0
char_length                                                     75
Name: 107, dtype: object

In [16]:
del train_df, test_df

### Baseline models (CNN and BiLSTM with attention)

In [17]:
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook as tqdm
import time

In [18]:
embed_size = 300 # dimensionality of each word vector
max_features = 95000 # vocabulary cap: how many unique words to keep (i.e. number of rows in the embedding matrix)
maxlen = 70 # number of word tokens kept per question; sequences are padded/truncated to this length

In [19]:
def data_preparation(predict=False, sub_train=True):
    """Read the question CSVs and convert them to padded word-index sequences.

    Reads ``../data/train.csv`` (and ``../data/test.csv`` when ``predict`` is
    true), holds out 8% of the training rows for validation, fits a Keras
    ``Tokenizer`` on the training questions only, and pads/truncates every
    question to the module-level ``maxlen`` tokens. The vocabulary is capped
    at the module-level ``max_features``. Progress and timings are printed.

    Args:
        predict: when true, the test set is also loaded, tokenized and padded.
        sub_train: when true, the training frame goes through
            ``DataFrame.sample`` before the split. With ``frac=1`` this only
            shuffles the rows; lower ``frac`` to train on a subsample.

    Returns:
        ``(train_X, val_X, train_y, val_y, word_index)`` when ``predict`` is
        false, otherwise
        ``(train_X, val_X, test_X, train_y, val_y, word_index, tokenizer)``.
    """
    start_time = time.time()
    train_df = pd.read_csv("../data/train.csv")
    if sub_train:
        # Seeded: an unseeded shuffle here would make the seeded split below
        # produce a different train/validation partition on every run.
        train_df = train_df.sample(frac=1, random_state=2018)
    print("Train shape : ",train_df.shape)
    if predict:
        test_df = pd.read_csv("../data/test.csv")
        print("Test shape : ",test_df.shape)
        test_X = test_df["question_text"].fillna("_##_").values
    ## split to train and val
    train_df, val_df = train_test_split(train_df, test_size=0.08, random_state=2018)

    ## fill up the missing values
    # Same placeholder as the test set; a NaN would otherwise reach the
    # tokenizer as a float instead of a string.
    train_X = train_df["question_text"].fillna("_##_").values
    val_X = val_df["question_text"].fillna("_##_").values

    ## Tokenize the sentences
    # filters='' keeps punctuation attached to tokens instead of stripping it.
    tokenizer = Tokenizer(num_words=max_features, filters='')
    print('fitting text to tokenizer..')
    check_point1 = time.time()
    tokenizer.fit_on_texts(list(train_X))
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    check_point2 = time.time()
    print('fitting took {:.2f} seconds to finish'.format(check_point2 - check_point1))

    print('transforming text to sequence of word indices..')
    train_X = tokenizer.texts_to_sequences(train_X)
    val_X = tokenizer.texts_to_sequences(val_X)
    check_point3 = time.time()
    print('transforming took {:.2f} seconds to finish'.format(check_point3 - check_point2))
    if predict:
        test_X = tokenizer.texts_to_sequences(test_X)

    ## Pad the sentences
    print('padding sentence to the same length..')
    train_X = pad_sequences(train_X, maxlen=maxlen)
    val_X = pad_sequences(val_X, maxlen=maxlen)
    check_point4 = time.time()
    print('padding took {:.2f} seconds to finish'.format(check_point4 - check_point3))

    if predict:
        test_X = pad_sequences(test_X, maxlen=maxlen)

    print('it took {:.2f} seconds to finish data prepartation'.format(time.time() - start_time))

    ## Get the target values
    train_y = train_df['target'].values
    val_y = val_df['target'].values

    if predict:
        return train_X, val_X, test_X, train_y, val_y, word_index, tokenizer
    else:
        return train_X, val_X, train_y, val_y, word_index

In [20]:
train_X, val_X, test_X, train_y, val_y, word_index, tokenizer = data_preparation(predict=True)

Train shape :  (391837, 3)
Test shape :  (56370, 2)
fitting text to tokenizer..
Found 202128 unique tokens.
fitting took 5.24 seconds to finish
transforming text to sequence of word indices..
transforming took 4.23 seconds to finish
padding sentence to the same length..
padding took 2.37 seconds to finish
it took 15.23 seconds to finish data prepartation


In [21]:
import pickle

# Persist the fitted tokenizer (written to the current working directory) so the
# same word-to-index mapping can be reloaded later without refitting.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [22]:
def load_embedding(word_index, embedding_fname='glove.840B.300d.txt'):
    """Build an embedding matrix for ``word_index`` from a pretrained vector file.

    The file ``../data/<embedding_fname>`` is read as one entry per line: a
    word followed by its space-separated coefficients. Rows for words that
    have no pretrained vector keep a random draw from a normal distribution
    matching the mean and standard deviation of all pretrained coefficients.

    Args:
        word_index: mapping from word to integer index.
        embedding_fname: name of the embedding file inside ``../data/``.

    Returns:
        Array of shape ``(min(max_features, len(word_index)), dim)`` where
        ``dim`` is the vector width found in the file and ``max_features`` is
        the module-level vocabulary cap.
    """
    EMBEDDING_FILE = '../data/' + embedding_fname

    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    # Context manager so the file handle is closed once parsing is done.
    with open(EMBEDDING_FILE, encoding='utf-8') as emb_file:
        embeddings_index = dict(get_coefs(*line.split(" ")) for line in emb_file)

    # np.stack expects a sequence; newer NumPy versions warn about or reject
    # non-sequence iterables such as a dict view, hence the explicit list.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    emb_dim = all_embs.shape[1]

    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, emb_dim))
    for word, i in tqdm(word_index.items()):
        # Guard on the actual row count: when the vocabulary is smaller than
        # max_features an index equal to len(word_index) would otherwise fall
        # one row past the end of the matrix.
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

In [23]:
embedding_matrix1 = load_embedding(word_index)

HBox(children=(IntProgress(value=0, max=202128), HTML(value='')))




In [24]:
class Attention(Layer):
    """Attention pooling over the time axis of a 3D tensor.

    For an input of shape ``(batch, step_dim, features_dim)`` the layer scores
    every time step with a learned vector ``W`` (plus an optional per-step
    bias), squashes the scores with ``tanh``, normalises their exponentials
    across time, and returns the weighted sum of the time steps, i.e. a
    tensor of shape ``(batch, features_dim)``.

    Args:
        step_dim: number of time steps in the input (its axis 1).
        W_regularizer, b_regularizer: optional regularizers for ``W`` / ``b``.
        W_constraint, b_constraint: optional constraints for ``W`` / ``b``.
        bias: whether to add the per-step bias before the ``tanh``.
        **kwargs: forwarded to ``Layer.__init__``.
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        # Filled in by build() once the input shape is known.
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create ``W`` (one weight per feature) and ``b`` (one bias per time step)."""
        assert len(input_shape) == 3
        # Keyword arguments instead of the legacy positional-shape call form.
        self.W = self.add_weight(name='{}_W'.format(self.name),
                                 shape=(input_shape[-1],),
                                 initializer=self.init,
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight(name='{}_b'.format(self.name),
                                     shape=(input_shape[1],),
                                     initializer='zero',
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # The time axis is summed away in call(), so no mask is passed on.
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Score each time step: flatten to (batch*steps, features), project
        # onto W, then fold back to (batch, steps).
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # Masked time steps get zero weight before normalisation.
        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # epsilon keeps the division finite when every weight is zero.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0],  self.features_dim

    def get_config(self):
        """Return the constructor arguments so a saved model can rebuild the layer.

        Without this, the serialized config lacks ``step_dim`` and the layer
        cannot be re-instantiated when a saved model is loaded.
        """
        config = {
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(Attention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [25]:
def model_lstm_atten(embedding_matrix):
    """Build and compile a stacked bidirectional-LSTM classifier with attention pooling.

    Uses the module-level ``maxlen``, ``max_features`` and ``embed_size``.

    Args:
        embedding_matrix: initial weights for the embedding layer.

    Returns:
        A compiled Keras ``Model`` mapping a ``(maxlen,)`` sequence of word
        indices to a single sigmoid output (binary cross-entropy loss, adam
        optimizer, accuracy metric).
    """
    tokens = Input(shape=(maxlen,))
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix])(tokens)

    # Both recurrent layers emit the full sequence so that the attention
    # layer has one vector per time step to pool over.
    hidden = Bidirectional(LSTM(128, return_sequences=True))(embedded)
    hidden = Bidirectional(LSTM(64, return_sequences=True))(hidden)
    pooled = Attention(maxlen)(hidden)

    dense = Dense(64, activation="relu")(pooled)
    prob = Dense(1, activation="sigmoid")(dense)

    model = Model(inputs=tokens, outputs=prob)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [26]:
def model_cnn(embedding_matrix):
    """Build and compile a multi-kernel-size convolutional text classifier.

    Uses the module-level ``maxlen``, ``max_features`` and ``embed_size``.

    Args:
        embedding_matrix: initial weights for the embedding layer.

    Returns:
        A compiled Keras ``Model`` mapping a ``(maxlen,)`` sequence of word
        indices to a single sigmoid output (binary cross-entropy loss, adam
        optimizer, accuracy metric).
    """
    kernel_heights = [1,2,3,5]
    n_filters = 36

    tokens = Input(shape=(maxlen,))
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix])(tokens)
    # Add a trailing channel axis so the sequence can be fed to Conv2D.
    embedded = Reshape((maxlen, embed_size, 1))(embedded)

    branches = []
    for height in kernel_heights:
        # Each kernel spans the full embedding width and `height` consecutive tokens.
        conv = Conv2D(n_filters, kernel_size=(height, embed_size),
                      kernel_initializer='he_normal', activation='elu')(embedded)
        # Pool window of maxlen - height + 1 rows; presumably the whole conv
        # output along the token axis under Conv2D's default padding/stride.
        pooled = MaxPool2D(pool_size=(maxlen - height + 1, 1))(conv)
        branches.append(pooled)

    merged = Concatenate(axis=1)(branches)
    merged = Flatten()(merged)
    merged = Dropout(0.1)(merged)

    prob = Dense(1, activation="sigmoid")(merged)

    model = Model(inputs=tokens, outputs=prob)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [27]:
# Alternative baseline: model = model_cnn(embedding_matrix1)
model = model_lstm_atten(embedding_matrix1)
for e in range(2):
    # One epoch per fit() call so the validation F1 can be reported after each epoch.
    model.fit(train_X, train_y, batch_size=512, epochs=1, validation_data=(val_X, val_y))
    pred_val_y = model.predict([val_X], batch_size=1024, verbose=0)

    # Scan thresholds 0.10 .. 0.50 in steps of 0.01 and keep the first one with the
    # highest validation F1; the defaults stand if no threshold scores above zero.
    best_thresh, best_score = 0.5, 0.0
    for thresh in np.round(np.arange(0.1, 0.501, 0.01), 2):
        score = f1_score(val_y, (pred_val_y > thresh).astype(int))
        if score > best_score:
            best_thresh, best_score = thresh, score

    print("Val F1 Score: {:.4f}".format(best_score))

# pred_test_y = model.predict([test_X], batch_size=1024, verbose=0)

Train on 360490 samples, validate on 31347 samples
Epoch 1/1
Val F1 Score: 0.6111
Train on 360490 samples, validate on 31347 samples
Epoch 1/1
Val F1 Score: 0.6192


In [None]:
model.predict(test_X)

In [None]:


# The second positional argument of Model.save is `overwrite` (a bool), not a file
# mode, so the stray 'w' is dropped; overwriting is already the default.
model.save('my_model2.h5')

In [None]:
# A model containing the custom Attention layer can only be deserialized when the
# class is supplied through custom_objects; otherwise Keras reports an unknown layer.
model2 = load_model('my_model2.h5', custom_objects={'Attention': Attention})

In [None]:
model2.predict(test_X)

### Best decision threshold found on the validation set

In [None]:
best_thresh

### Working prediction function (slow: it reloads the tokenizer and the model on every call)

In [None]:
def predict_label(query='What is happiness?', maxlen = 70, best_thresh=0.9,
                  tokenizer_path='../data/tokenizer.pickle',
                  model_path='../data/my_model2.h5'):
    """Classify a single question with the saved tokenizer and model.

    The tokenizer and the model are loaded from disk on every call, which is
    why the function is slow; the elapsed time and the raw score are printed.

    Args:
        query: the question text to classify.
        maxlen: length the token sequence is padded/truncated to.
        best_thresh: score above which the question is labelled 1.
        tokenizer_path: path of the pickled tokenizer.
        model_path: path of the saved Keras model.

    Returns:
        1 if the model score exceeds ``best_thresh``, otherwise 0.
    """
    from keras.models import load_model
    import pickle
    from keras.preprocessing.sequence import pad_sequences
    import time
    start = time.time()
    # loading
    # NOTE(review): pickle.load can execute arbitrary code; only point
    # tokenizer_path at files produced by a trusted run.
    with open(tokenizer_path, 'rb') as handle:
        tokenizer = pickle.load(handle)
    # texts_to_sequences expects a list of texts; a bare string would be
    # iterated character by character, scoring single letters instead of
    # the question.
    test_X = tokenizer.texts_to_sequences([query])
    test_X = pad_sequences(test_X, maxlen=maxlen)
    # The custom Attention layer must be supplied explicitly, otherwise Keras
    # cannot deserialize a model that contains it.
    model = load_model(model_path, custom_objects={'Attention': Attention})
    score = model.predict(test_X)
    print('took {:.2f} seconds to finish'.format(time.time() - start))
    print(score)
    if score[0] > best_thresh:
        return 1
    else:
        return 0

In [None]:
predict_label('How are you?')