In [None]:
import numpy as np, pandas as pd, random as rn, os, gc, re

# Seed every random source with the same value: NumPy, Python's `random`
# module, and TensorFlow's graph-level seed.
seed = 32
np.random.seed(seed)
rn.seed(seed)
import tensorflow as tf
# Restrict TensorFlow to a single thread for both intra-op and inter-op pools.
# NOTE(review): presumably to limit run-to-run nondeterminism from parallel
# op scheduling — confirm this is the intent.
session_conf = tf.ConfigProto(intra_op_parallelism_threads = 1,
                              inter_op_parallelism_threads = 1)
tf.set_random_seed(seed) 
sess = tf.Session(graph = tf.get_default_graph(), config = session_conf)
from keras import backend as K
# Make Keras use the session configured above.
K.set_session(sess)

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import f1_score

from keras.layers import Input, Dense, CuDNNLSTM, Bidirectional, Activation, Conv1D
from keras.layers import Dropout, Embedding, GlobalMaxPooling1D, MaxPooling1D
from keras.layers import Add, Flatten, BatchNormalization, GlobalAveragePooling1D
from keras.layers import concatenate, SpatialDropout1D, CuDNNGRU
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Model, load_model
from keras.optimizers import Adam, RMSprop, SGD
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Input frames; the notebook reads their "question_text" column (and the
# "target" column of `train`) further down.
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")

# Pre-trained word vectors in text form (one word followed by its values per line).
embedding_file = "../input/embeddings/glove.840B.300d/glove.840B.300d.txt"
embed_size = 300       # embedding width; re-derived from the loaded vectors later
max_features = 100000  # vocabulary cap passed to the Tokenizer as num_words
max_len = 60           # sequences are padded/truncated to this many tokens

batch_size = 1024      # mini-batch size used by the model.fit calls
epochs = 10            # number of epochs used by the model.fit calls

In [None]:
# English contractions mapped to their expanded forms.
# This is the first of the three replacement tables that clean_text iterates over.
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", 
                       "'cause": "because", "could've": "could have", "couldn't": "could not", 
                       "didn't": "did not",  "doesn't": "does not", "don't": "do not", 
                       "hadn't": "had not", "hasn't": "has not", "haven't": "have not", 
                       "he'd": "he would","he'll": "he will", "he's": "he is", 
                       "how'd": "how did", 
                       "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  
                       "I'd": "I would", "I'd've": "I would have", "I'll": "I will", 
                       "I'll've": "I will have","I'm": "I am", "I've": "I have", 
                       "i'd": "i would", 
                       "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have",
                       "i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", 
                       "it'd've": "it would have", "it'll": "it will", 
                       "it'll've": "it will have",
                       "it's": "it is", "let's": "let us", "ma'am": "madam", 
                       "mayn't": "may not", 
                       "might've": "might have","mightn't": "might not",
                       "mightn't've": "might not have", "must've": "must have", 
                       "mustn't": "must not", "mustn't've": "must not have", 
                       "needn't": "need not", 
                       "needn't've": "need not have","o'clock": "of the clock", 
                       "oughtn't": "ought not", "oughtn't've": "ought not have", 
                       "shan't": "shall not", "sha'n't": "shall not", 
                       "shan't've": "shall not have", 
                       "she'd": "she would", "she'd've": "she would have", 
                       "she'll": "she will", 
                       "she'll've": "she will have", "she's": "she is", 
                       "should've": "should have", 
                       "shouldn't": "should not", "shouldn't've": "should not have", 
                       "so've": "so have", "so's": "so as", "this's": "this is",
                       "that'd": "that would", 
                       "that'd've": "that would have", "that's": "that is", 
                       "there'd": "there would", 
                       "there'd've": "there would have", "there's": "there is", 
                       "here's": "here is",
                       "they'd": "they would", "they'd've": "they would have", 
                       "they'll": "they will", 
                       "they'll've": "they will have", "they're": "they are", 
                       "they've": "they have", 
                       "to've": "to have", "wasn't": "was not", "we'd": "we would", 
                       "we'd've": "we would have", "we'll": "we will", 
                       "we'll've": "we will have", 
                       "we're": "we are", "we've": "we have", "weren't": "were not", 
                       "what'll": "what will", "what'll've": "what will have", 
                       "what're": "what are",  
                       "what's": "what is", "what've": "what have", "when's": "when is", 
                       "when've": "when have", "where'd": "where did", "where's": "where is",
                       "where've": "where have", "who'll": "who will", 
                       "who'll've": "who will have", 
                       "who's": "who is", "who've": "who have", "why's": "why is", 
                       "why've": "why have", "will've": "will have", "won't": "will not", 
                       "won't've": "will not have", "would've": "would have", 
                       "wouldn't": "would not", 
                       "wouldn't've": "would not have", "y'all": "you all", 
                       "y'all'd": "you all would",
                       "y'all'd've": "you all would have", "y'all're": "you all are",
                       "y'all've": "you all have","you'd": "you would", 
                       "you'd've": "you would have", 
                       "you'll": "you will", "you'll've": "you will have", "you're": "you are", 
                       "you've": "you have" }

# Spelling normalisation table: British spellings, common typos, run-together
# words and a few abbreviations, each mapped to the form to substitute.
# This is the second replacement table that clean_text iterates over.
mispell_dict = {'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 
                'travelling': 'traveling', 
                'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 
                'labour': 'labor', 
                'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 
                'youtu ': 'youtube ', 'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 
                'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 
                'howcan': 'how can', 
                'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I', 
                'theBest': 'the best', 'howdoes': 'how does', 'mastrubation': 'masturbation', 
                'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis', 
                'Etherium': 'Ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data', 
                '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 
                'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 
                'demonitisation': 'demonetization', 'demonitization': 'demonetization', 
                'demonetisation': 'demonetization'}

# Special characters mapped to plain-text stand-ins (typographic quotes and
# dashes, currency signs, math and Greek symbols).
# This is the third replacement table that clean_text iterates over.
# NOTE(review): the key '“' is listed twice on the third line; the repeat is
# harmless (same value) but may have been meant as a different quote character.
punct_mapping = {"‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ", 
                 "×": "x", "²": "2", "—": "-", "–": "-", "’": "'", "_": "-", "`": "'", 
                 '“': '"', '”': '"', '“': '"', "£": "e", '∞': 'infinity', 'θ': 'theta', 
                 '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a', '−': '-', 
                 'β': 'beta', '∅': '', '³': '3', 'π': 'pi', }    

# Characters that clean_punct surrounds with spaces, processed in list order.
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', 
          '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
          '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  
          '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '“', '★', '”', 
          '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', 
          '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '▒', '：', '¼', '⊕', '▼', 
          '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 
          'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»', 
          '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', 
          '¹', '≤', '‡', '√', 'β', 'α', '∅', 'θ', '÷', '₹']

def clean_punct(x):
    """Return ``str(x)`` with every entry of ``puncts`` padded by one space on each side.

    The module-level ``puncts`` list is walked in order; each occurrence of an
    entry is rewritten as ``' <entry> '``. Non-string input is converted with
    ``str`` first.
    """
    text = str(x)
    for mark in puncts:
        # Splitting on the mark and re-joining with the padded mark is the same
        # substitution as a global replace; text without the mark is unchanged.
        text = f' {mark} '.join(text.split(mark))
    return text

def clean_text(x, mappings=None):
    """Lower-case ``x`` and apply the replacement tables by substring substitution.

    The text used to be lower-cased before any replacement ran, so table keys
    containing capital letters (e.g. ``"I'd"``, ``'Qoura'``, ``'doI'``) could
    never match. Those mixed-case keys are now applied to the text as written,
    before lower-casing; all remaining keys are applied to the lower-cased text
    exactly as before.

    Parameters
    ----------
    x : the text to clean; converted with ``str`` (as ``clean_punct`` does).
    mappings : optional iterable of ``{old: new}`` dicts, applied in order.
        Defaults to ``[contraction_mapping, mispell_dict, punct_mapping]``.

    Returns
    -------
    The cleaned, fully lower-cased string.
    """
    if mappings is None:
        mappings = [contraction_mapping, mispell_dict, punct_mapping]
    mappings = list(mappings)  # iterated twice below

    x = str(x)

    # Pass 1: case-sensitive keys must see the original casing to match at all.
    for mapping in mappings:
        for word, replacement in mapping.items():
            if word != word.lower():
                x = x.replace(word, replacement)

    x = x.lower()

    # Pass 2: lower-case keys against lower-cased text. Mixed-case keys are
    # skipped because they cannot occur in lower-cased text.
    for mapping in mappings:
        for word, replacement in mapping.items():
            if word == word.lower():
                x = x.replace(word, replacement)
    return x

# Pad punctuation in the question text of both frames. Passing the function
# directly to apply is equivalent to wrapping it in a lambda.
train["question_text"] = train["question_text"].apply(clean_punct)
test["question_text"] = test["question_text"].apply(clean_punct)
print("Text cleaning completed!")

In [None]:
# Split each class 80/20 by row position so both partitions keep the class ratio.
# Note: `train` is rebound here, so re-running this cell shrinks it further.
sincere = train[train["target"] == 0]
insincere = train[train["target"] == 1]

val = pd.concat([sincere.iloc[int(len(sincere)*0.8):],
                 insincere.iloc[int(len(insincere)*0.8):]])
train = pd.concat([sincere.iloc[:int(len(sincere)*0.8)],
                   insincere.iloc[:int(len(insincere)*0.8)]])

In [None]:
# Features are the cleaned question strings; labels are the 0/1 targets.
X_train, y_train = train["question_text"], train["target"].values
X_val, y_val = val["question_text"], val["target"].values

# The filter set contains a literal backslash. In a normal string literal "\]"
# is an invalid escape sequence, so a raw string is used; the resulting
# characters are unchanged.
tokenizer = Tokenizer(num_words = max_features, 
                      filters = r'"#$%&()*+/:;<=>@[\]^_`{|}~',
                      lower = True)
# The vocabulary is fitted on the training split only.
tokenizer.fit_on_texts(X_train)

# Convert text to integer id sequences with the fitted vocabulary.
X_train = tokenizer.texts_to_sequences(X_train)
X_val = tokenizer.texts_to_sequences(X_val)
X_test = tokenizer.texts_to_sequences(test["question_text"])

# Bring every sequence to exactly max_len ids.
X_train = pad_sequences(X_train, maxlen = max_len)
X_val = pad_sequences(X_val, maxlen = max_len)
X_test = pad_sequences(X_test, maxlen = max_len)

In [None]:
# Draw one random ordering per split (train first, then validation) and apply
# the same ordering to features and labels so rows stay aligned.
train_idx = np.random.permutation(len(X_train))
val_idx = np.random.permutation(len(X_val))

X_train, y_train = X_train[train_idx], y_train[train_idx]
X_val, y_val = X_val[val_idx], y_val[val_idx]

In [None]:
def get_coefs(word,*arr):
    """Turn the fields of one embedding line into ``(word, float32 vector)``."""
    return word, np.asarray(arr, dtype='float32')

# Read the embedding file with an explicit encoding and close the handle when
# done; previously the file object was left open and used the platform default.
with open(embedding_file, encoding="utf-8") as emb_file:
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in emb_file)

# np.stack needs a real sequence, not a dict view.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# Tokenizer word indices start at 1 (0 is the padding id), so a vocabulary of
# N words needs N + 1 rows. Without the "+ 1" a vocabulary smaller than
# max_features indexed one past the end of the matrix.
nb_words = min(max_features, len(word_index) + 1)
# Rows for words without a pre-trained vector keep this random initialisation,
# drawn with the mean and std of the pre-trained values.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector
        
print("There are {} words being used.".format(nb_words))

In [None]:
from keras.engine import Layer, InputSpec
from keras.layers import K

class Attention(Layer):
    """Attention pooling that collapses axis 1 of a 3-D input into a weighted sum.

    For every position along axis 1 a scalar score ``tanh(x . W + b)`` is
    computed, exponentiated, optionally masked, and normalised over axis 1.
    The output is the sum of the inputs weighted by those normalised scores,
    with shape ``(input_shape[0], input_shape[-1])``.
    """
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """Store the layer configuration.

        step_dim: length of axis 1 of the input; used in ``call`` to reshape
            the flattened scores back to ``(-1, step_dim)``.
        W_regularizer, b_regularizer: resolved through ``regularizers.get``.
        W_constraint, b_constraint: resolved through ``constraints.get``.
        bias: when true, a per-position bias is created in ``build`` and added
            to the scores.
        **kwargs: forwarded to the ``Layer`` constructor.
        """
        # NOTE(review): this flag is assigned before Layer.__init__ runs;
        # confirm the base initialiser does not reset it.
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        # Placeholder; the real value is taken from the input shape in build().
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the weights: ``W`` sized by the last axis, ``b`` sized by axis 1."""
        assert len(input_shape) == 3

        # One weight per feature (last input axis).
        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            # One bias per position along axis 1.
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        """Return ``None``: no mask is passed on to downstream layers."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Flatten to (-1, features_dim), project every row onto W, then fold
        # the scalar scores back to (-1, step_dim).
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        if mask is not None:
            # Zero the weights at masked positions before normalising.
            a *= K.cast(mask, K.floatx())

        # Normalise over axis 1; epsilon guards against a zero denominator.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        # Add a trailing axis so the weights broadcast across the feature axis.
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Return ``(input_shape[0], features_dim)``."""
        return input_shape[0],  self.features_dim

In [None]:
def visual_model(hist = None):
    """Show two figures from a training history: accuracy, then loss.

    ``hist`` must expose a ``history`` mapping with the keys ``"acc"``,
    ``"val_acc"``, ``"loss"`` and ``"val_loss"``. Each figure plots the training
    series followed by the validation series against the epoch index.
    """
    # (history key, figure title, y-axis label) for each figure, in display order.
    panels = (
        ("acc", "Model Accuracy", "accuracy"),
        ("loss", "Model Loss", "loss"),
    )
    for key, title, y_label in panels:
        plt.figure(figsize = (10, 8))
        plt.plot(hist.history[key])
        plt.plot(hist.history["val_" + key])
        plt.title(title)
        plt.xlabel("epochs")
        plt.ylabel(y_label)
        plt.legend(["train", "val"], loc = "upper left")
        plt.show()

In [None]:
from sklearn.metrics import roc_curve, precision_recall_curve

def threshold_search(y_true, y_proba, plot=False):
    """Find the decision threshold that maximises F1 on the given predictions.

    Parameters
    ----------
    y_true : true binary labels, passed to ``precision_recall_curve``.
    y_proba : predicted scores, passed to ``precision_recall_curve``.
    plot : when true, draw F1 against threshold and mark the best point.

    Returns
    -------
    (best_th, best_score) : the threshold with the highest F1 and that F1 value.
    """
    precision, recall, thresholds = precision_recall_curve(y_true, y_proba)
    # precision/recall have one more entry than thresholds; pad so the arrays align.
    thresholds = np.append(thresholds, 1.001) 
    # F1 = 2pr / (p + r). The final PR point has recall 0, so the former
    # 2 / (1/p + 1/r) form divided by zero on every call (RuntimeWarning).
    # Points where p + r == 0 get F1 = 0, the value the old form also produced.
    denom = precision + recall
    F = np.divide(2 * precision * recall, denom,
                  out=np.zeros_like(denom, dtype=float), where=denom > 0)
    best_score = np.max(F)
    best_th = thresholds[np.argmax(F)]
    if plot:
        plt.plot(thresholds, F, '-b')
        plt.plot([best_th], [best_score], '*r')
        plt.show()
    return best_th, best_score 

### Simple LSTM Model

In [None]:
def build_lstm_model(units, dr):
    """Build and compile the single-layer bidirectional LSTM classifier.

    Pipeline: frozen pre-trained embedding -> spatial dropout (rate ``dr``) ->
    bidirectional CuDNNLSTM (``units`` per direction) -> Attention pooling ->
    Dense(128) -> batch norm -> ReLU -> sigmoid output.

    Reads ``max_len``, ``nb_words``, ``embed_size`` and ``embedding_matrix``
    from module scope. Compiled with binary cross-entropy, Adam and accuracy.
    """
    inp = Input(shape = (max_len, ))
    embedded = Embedding(nb_words, embed_size, input_length = max_len,
                         weights = [embedding_matrix], trainable = False)(inp)

    # Sequence encoder
    sequence = SpatialDropout1D(dr)(embedded)
    sequence = Bidirectional(CuDNNLSTM(units, return_sequences = True))(sequence)
    pooled = Attention(max_len)(sequence)

    # Classification head: linear projection, normalise, then activate
    hidden = Dense(128)(pooled)
    hidden = BatchNormalization()(hidden)
    hidden = Activation("relu")(hidden)

    out = Dense(1, activation = "sigmoid")(hidden)

    model = Model(inputs = inp, outputs = out)
    model.compile(loss = "binary_crossentropy", optimizer = Adam(), metrics = ["accuracy"])
    return model

In [None]:
# Single BiLSTM + attention: 64 recurrent units, spatial dropout 0.5.
units = 64
dr = 0.5
model = build_lstm_model(units, dr)
slstm_hist = model.fit(
    X_train, y_train,
    validation_data = (X_val, y_val),
    batch_size = batch_size,
    epochs = epochs,
    verbose = 2,
)

In [None]:
# Score the validation split, then pick the cut-off that maximises F1 on it.
val_pred = model.predict(X_val, verbose = 1, batch_size = 256)

threshold, score = threshold_search(y_true = y_val, y_proba = val_pred)
print("F1 score at threshold {0} is {1}".format(threshold, score))

### Double LSTM Model

In [None]:
def build_lstm_model(units, dr):
    """Build and compile the two-layer bidirectional LSTM classifier.

    Pipeline: frozen pre-trained embedding -> spatial dropout (rate ``dr``) ->
    two stacked bidirectional CuDNNLSTM layers (``units`` per direction) ->
    Attention pooling -> Dense(64, ReLU) -> batch norm -> ReLU -> sigmoid output.

    Note: this reuses the name of the single-layer builder defined earlier in
    the notebook and replaces it once this cell has run.

    Reads ``max_len``, ``nb_words``, ``embed_size`` and ``embedding_matrix``
    from module scope. Compiled with binary cross-entropy, Adam and accuracy.
    """
    inp = Input(shape = (max_len, ))
    embedded = Embedding(nb_words, embed_size, input_length = max_len,
                         weights = [embedding_matrix], trainable = False)(inp)

    # Sequence encoder: two recurrent layers, both returning full sequences
    sequence = SpatialDropout1D(dr)(embedded)
    sequence = Bidirectional(CuDNNLSTM(units, return_sequences = True))(sequence)
    sequence = Bidirectional(CuDNNLSTM(units, return_sequences = True))(sequence)
    pooled = Attention(max_len)(sequence)

    # Classification head (ReLU is applied both inside Dense and after batch norm)
    hidden = Dense(64, activation = "relu")(pooled)
    hidden = BatchNormalization()(hidden)
    hidden = Activation("relu")(hidden)

    out = Dense(1, activation = "sigmoid")(hidden)

    model = Model(inputs = inp, outputs = out)
    model.compile(loss = "binary_crossentropy", optimizer = Adam(), metrics = ["accuracy"])
    return model

In [None]:
# Two stacked BiLSTMs + attention: 64 recurrent units, spatial dropout 0.5.
units = 64
dr = 0.5
model = build_lstm_model(units, dr)
dlstm_hist = model.fit(
    X_train, y_train,
    validation_data = (X_val, y_val),
    batch_size = batch_size,
    epochs = epochs,
    verbose = 2,
)

In [None]:
# Score the validation split, then pick the cut-off that maximises F1 on it.
val_pred = model.predict(X_val, verbose = 1, batch_size = 256)

threshold, score = threshold_search(y_true = y_val, y_proba = val_pred)
print("F1 score at threshold {0} is {1}".format(threshold, score))

### GRU Model (two stacked bidirectional GRU layers)

In [None]:
def build_rnn_model(units, dr):
    """Build and compile the two-layer bidirectional GRU classifier.

    Pipeline: frozen pre-trained embedding -> spatial dropout (rate ``dr``) ->
    two stacked bidirectional CuDNNGRU layers (``units`` per direction) ->
    Attention pooling -> Dense(64, ReLU) -> batch norm -> ReLU -> sigmoid output.

    Reads ``max_len``, ``nb_words``, ``embed_size`` and ``embedding_matrix``
    from module scope. Compiled with binary cross-entropy, Adam and accuracy.
    """
    inp = Input(shape = (max_len, ))
    embedded = Embedding(nb_words, embed_size, input_length = max_len,
                         weights = [embedding_matrix], trainable = False)(inp)

    # Sequence encoder: two recurrent layers, both returning full sequences
    sequence = SpatialDropout1D(dr)(embedded)
    sequence = Bidirectional(CuDNNGRU(units, return_sequences = True))(sequence)
    sequence = Bidirectional(CuDNNGRU(units, return_sequences = True))(sequence)
    pooled = Attention(max_len)(sequence)

    # Classification head (ReLU is applied both inside Dense and after batch norm)
    hidden = Dense(64, activation = "relu")(pooled)
    hidden = BatchNormalization()(hidden)
    hidden = Activation("relu")(hidden)

    out = Dense(1, activation = "sigmoid")(hidden)

    model = Model(inputs = inp, outputs = out)
    model.compile(loss = "binary_crossentropy", optimizer = Adam(), metrics = ["accuracy"])
    return model

In [None]:
# Two stacked BiGRUs + attention: 64 recurrent units, spatial dropout 0.5.
units = 64
dr = 0.5
model = build_rnn_model(units, dr)
rnn_hist = model.fit(
    X_train, y_train,
    validation_data = (X_val, y_val),
    batch_size = batch_size,
    epochs = epochs,
    verbose = 2,
)

In [None]:
# Score the validation split, then pick the cut-off that maximises F1 on it.
val_pred = model.predict(X_val, verbose = 1, batch_size = 256)

threshold, score = threshold_search(y_true = y_val, y_proba = val_pred)
print("F1 score at threshold {0} is {1}".format(threshold, score))

### RCNN Model

In [None]:
def build_rcnn_model(units,filters, dr):
    """Build and compile the recurrent-convolutional classifier.

    Pipeline: frozen pre-trained embedding -> spatial dropout (rate ``dr``) ->
    bidirectional CuDNNLSTM (``units`` per direction) -> Conv1D with ``filters``
    kernels of width 2 and ReLU -> global average pooling concatenated with
    global max pooling -> sigmoid output.

    Reads ``max_len``, ``nb_words``, ``embed_size`` and ``embedding_matrix``
    from module scope. Compiled with binary cross-entropy, Adam and accuracy.
    """
    inp = Input(shape = (max_len, ))
    embedded = Embedding(nb_words, embed_size, input_length = max_len,
                         weights = [embedding_matrix], trainable = False)(inp)

    # Recurrent encoder followed by a width-2 convolution over its outputs
    sequence = SpatialDropout1D(dr)(embedded)
    sequence = Bidirectional(CuDNNLSTM(units, return_sequences = True))(sequence)
    conv = Conv1D(filters, 2, activation = "relu",
                  kernel_initializer = "glorot_uniform")(sequence)

    # Summarise the convolved sequence two ways and join them
    avg_pool = GlobalAveragePooling1D()(conv)
    max_pool = GlobalMaxPooling1D()(conv)
    main = concatenate([avg_pool, max_pool])

    out = Dense(1, activation = "sigmoid")(main)

    model = Model(inputs = inp, outputs = out)
    model.compile(loss = "binary_crossentropy", optimizer = Adam(), metrics = ["accuracy"])
    return model

In [None]:
# BiLSTM + Conv1D: 64 recurrent units, 32 convolution filters, spatial dropout 0.5.
units = 64
filters = 32
dr = 0.5
model = build_rcnn_model(units, filters, dr)
rcnn_hist = model.fit(
    X_train, y_train,
    validation_data = (X_val, y_val),
    batch_size = batch_size,
    epochs = epochs,
    verbose = 2,
)

In [None]:
# Score the validation split, then pick the cut-off that maximises F1 on it.
val_pred = model.predict(X_val, verbose = 1, batch_size = 256)

threshold, score = threshold_search(y_true = y_val, y_proba = val_pred)
print("F1 score at threshold {0} is {1}".format(threshold, score))

### Double LSTM Model with Attention and Pooling

In [None]:
from keras.layers import RepeatVector, TimeDistributed, concatenate

def build_rcnn_model(units,filters, dr):
    """Build and compile the two-layer BiLSTM classifier with three pooled summaries.

    Pipeline: frozen pre-trained embedding -> spatial dropout (rate ``dr``) ->
    two stacked bidirectional CuDNNLSTM layers (``units`` per direction) ->
    concatenation of Attention, global max and global average pooling ->
    Dense(64, ReLU) -> sigmoid output.

    ``filters`` is accepted but not used by this architecture. This definition
    reuses the name of the convolutional builder defined earlier in the
    notebook and replaces it once this cell has run.

    Reads ``max_len``, ``nb_words``, ``embed_size`` and ``embedding_matrix``
    from module scope. Compiled with binary cross-entropy, Adam and accuracy.
    """
    inp = Input(shape = (max_len, ))
    embedded = Embedding(nb_words, embed_size, input_length = max_len,
                         weights = [embedding_matrix], trainable = False)(inp)

    # Sequence encoder: two recurrent layers, both returning full sequences
    sequence = SpatialDropout1D(dr)(embedded)
    sequence = Bidirectional(CuDNNLSTM(units, return_sequences = True))(sequence)
    sequence = Bidirectional(CuDNNLSTM(units, return_sequences = True))(sequence)

    # Three summaries of the same encoded sequence
    atten = Attention(max_len)(sequence)
    max_pool = GlobalMaxPooling1D()(sequence)
    avg_pool = GlobalAveragePooling1D()(sequence)
    merged = concatenate([atten, max_pool, avg_pool])
    hidden = Dense(64, activation = "relu")(merged)

    out = Dense(1, activation = "sigmoid")(hidden)

    model = Model(inputs = inp, outputs = out)
    model.compile(loss = "binary_crossentropy", optimizer = Adam(), metrics = ["accuracy"])
    return model

In [None]:
# Two stacked BiLSTMs with attention/max/avg pooling: 64 units, dropout 0.5.
# `filters` is still passed because the builder's signature expects it.
units = 64
filters = 32
dr = 0.5
model = build_rcnn_model(units, filters, dr)
dlstm_pool_hist = model.fit(
    X_train, y_train,
    validation_data = (X_val, y_val),
    batch_size = batch_size,
    epochs = epochs,
    verbose = 2,
)

In [None]:
# Score the validation split, then pick the cut-off that maximises F1 on it.
val_pred = model.predict(X_val, verbose = 1, batch_size = 256)

threshold, score = threshold_search(y_true = y_val, y_proba = val_pred)
print("F1 score at threshold {0} is {1}".format(threshold, score))

### Capsule Network

In [None]:
from keras.engine import Layer, InputSpec
from keras.layers import K

def squash(x, axis=-1):
    """Rescale ``x`` along ``axis`` by dividing it by its (epsilon-padded) L2 norm.

    Computes ``x / sqrt(sum(x**2, axis) + epsilon)``. This is a plain
    normalisation, not a norm-dependent squashing scale.
    """
    squared_norm = K.sum(K.square(x), axis, keepdims=True)
    # epsilon goes inside the square root so an all-zero vector does not divide by zero
    return x / K.sqrt(squared_norm + K.epsilon())

# A Capsule Implement with Pure Keras
class Capsule(Layer):
    """Capsule layer with iterative routing over a sequence of input vectors.

    Each input vector is projected into ``num_capsule`` candidate vectors of
    size ``dim_capsule``. The candidates are then combined over the input
    positions with coupling weights that are refined for ``routings`` iterations.
    """
    def __init__(self, num_capsule, dim_capsule, routings=3, kernel_size=(9, 1), share_weights=True,
                 activation='default', **kwargs):
        """Store the layer configuration.

        num_capsule: number of output capsules.
        dim_capsule: size of each output capsule vector.
        routings: number of routing iterations run in ``call``.
        kernel_size: stored on the instance; not read by ``build`` or ``call``.
        share_weights: when true, one projection is shared by all input
            positions; otherwise each position gets its own projection.
        activation: ``'default'`` selects ``squash``; any other value is wrapped
            in a Keras ``Activation`` layer.
        **kwargs: forwarded to the ``Layer`` constructor.
        """
        super(Capsule, self).__init__(**kwargs)
        self.num_capsule = num_capsule
        self.dim_capsule = dim_capsule
        self.routings = routings
        self.kernel_size = kernel_size
        self.share_weights = share_weights
        if activation == 'default':
            self.activation = squash
        else:
            self.activation = Activation(activation)

    def build(self, input_shape):
        """Create the projection kernel ``W``; its first axis depends on ``share_weights``."""
        super(Capsule, self).build(input_shape)
        input_dim_capsule = input_shape[-1]
        if self.share_weights:
            # Leading axis of 1: the same kernel is used at every input position.
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(1, input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),
                                     # shape=self.kernel_size,
                                     initializer='glorot_uniform',
                                     trainable=True)
        else:
            # One kernel per input position (second-to-last input axis).
            input_num_capsule = input_shape[-2]
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(input_num_capsule,
                                            input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),
                                     initializer='glorot_uniform',
                                     trainable=True)

    def call(self, u_vecs):
        """Project the inputs, run the routing iterations, and return the capsule outputs."""
        if self.share_weights:
            u_hat_vecs = K.conv1d(u_vecs, self.W)
        else:
            u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])

        batch_size = K.shape(u_vecs)[0]
        input_num_capsule = K.shape(u_vecs)[1]
        # Split the projected last axis into (num_capsule, dim_capsule).
        u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule,
                                            self.num_capsule, self.dim_capsule))
        u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
        # final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule]

        # Routing logits start at zero, so the first softmax gives uniform weights.
        b = K.zeros_like(u_hat_vecs[:, :, :, 0])  # shape = [None, num_capsule, input_num_capsule]
        for i in range(self.routings):
            # Permute so the softmax (last axis) runs over the output capsules,
            # then permute both tensors back.
            b = K.permute_dimensions(b, (0, 2, 1))  # shape = [None, input_num_capsule, num_capsule]
            c = K.softmax(b)
            c = K.permute_dimensions(c, (0, 2, 1))
            b = K.permute_dimensions(b, (0, 2, 1))
            # Weighted sum of the candidates over the input positions, then activation.
            outputs = self.activation(tf.keras.backend.batch_dot(c, u_hat_vecs, [2, 2]))
            if i < self.routings - 1:
                # New logits from the agreement between outputs and candidates;
                # b is replaced here, not accumulated across iterations.
                b = tf.keras.backend.batch_dot(outputs, u_hat_vecs, [2, 3])

        return outputs

    def compute_output_shape(self, input_shape):
        """Return ``(None, num_capsule, dim_capsule)``."""
        return (None, self.num_capsule, self.dim_capsule)

In [None]:
from keras.initializers import glorot_uniform, orthogonal

def build_capsule_model(units = 40, dr = 0.3, 
                        num_capsules = 10, dim_capsules = 10, routs = 4):
    """Build and compile the BiLSTM + capsule classifier.

    Pipeline: frozen pre-trained embedding -> spatial dropout (rate ``dr``) ->
    bidirectional CuDNNLSTM (``units`` per direction) -> Capsule layer with
    ``num_capsules`` capsules of size ``dim_capsules`` and ``routs`` routing
    iterations -> flatten -> Dense(128) -> ReLU -> dropout (rate ``dr - 0.2``)
    -> sigmoid output.

    Dropout layers and initialisers are seeded with the module-level ``seed``.
    Also reads ``max_len``, ``nb_words``, ``embed_size`` and
    ``embedding_matrix`` from module scope. Compiled with binary cross-entropy,
    Adam and accuracy.
    """
    inp = Input(shape = (max_len, ))
    embedded = Embedding(nb_words, embed_size, input_length = max_len,
                         weights = [embedding_matrix], trainable = False)(inp)

    # Sequence encoder with explicitly seeded initialisers
    sequence = SpatialDropout1D(dr, seed = seed)(embedded)
    recurrent = CuDNNLSTM(units,
                          kernel_initializer = glorot_uniform(seed = seed),
                          recurrent_initializer = orthogonal(gain = 1.0, seed = seed),
                          return_sequences = True)
    sequence = Bidirectional(recurrent)(sequence)

    # Capsule routing over the encoded sequence, flattened for the dense head
    capsules = Capsule(num_capsule = num_capsules, dim_capsule = dim_capsules,
                       routings = routs, share_weights = True)(sequence)
    flat = Flatten()(capsules)

    # Classification head
    main = Dense(128, kernel_initializer = glorot_uniform(seed = seed))(flat)
    main = Activation("relu")(main)
    main = Dropout(dr-0.2, seed = seed)(main)

    out = Dense(1, activation = "sigmoid",
                kernel_initializer = glorot_uniform(seed = seed))(main)

    model = Model(inputs = inp, outputs = out)
    model.compile(loss = "binary_crossentropy",
                  optimizer = Adam(),
                  metrics = ["accuracy"])
    return model

In [None]:
# Hyperparameters for the capsule model. They were previously declared here but
# ignored: the builder call repeated literals, and `units` was declared as 62
# while 40 was actually passed. The variables are now the single source of
# truth and hold the values the model was really built with.
units, dr = 40, 0.3
num_capsules = 10
dim_capsules = 10
routs = 4
model = build_capsule_model(units = units, dr = dr, 
                            num_capsules = num_capsules, dim_capsules = dim_capsules,
                            routs = routs)
capsule_hist = model.fit(X_train, y_train, batch_size = batch_size, epochs = epochs, 
                         validation_data = (X_val, y_val), verbose = 2)

In [None]:
# Score the validation split, then pick the cut-off that maximises F1 on it.
val_pred = model.predict(X_val, verbose = 1, batch_size = 256)

threshold, score = threshold_search(y_true = y_val, y_proba = val_pred)
print("F1 score at threshold {0} is {1}".format(threshold, score))

## Recurrent Weighted Average (RWA) Model

In [None]:
import numpy as np

from keras.layers import Recurrent
import keras.backend as K
from keras import activations
from keras import initializers
from keras import regularizers
from keras import constraints
from keras.engine import Layer
from keras.engine import InputSpec
from keras.legacy import interfaces


class RWA(Recurrent):
    """Recurrent Weighted Average (RWA) layer.

    At every time step the layer folds a gated feature vector into a running
    attention-weighted average and emits an activation of that average as the
    new hidden state. The running numerator ``n`` and denominator ``d`` are
    rescaled by the largest attention logit seen so far (``a_max``) so the
    exponentials stay bounded.

    # Arguments
        units: dimensionality of the hidden state / output.
        activation: applied to the running average ``n / d`` to produce ``h``.
        recurrent_activation: applied to the gate pre-activation ``g`` and to
            the learned initial-attention vector that seeds ``h``.
        features_initializer / _regularizer / _constraint: settings of the
            ``(input_dim, units)`` kernel that produces ``u``.
        recurrent_initializer / _regularizer / _constraint: settings of the
            ``(input_dim + units, units)`` kernel that produces ``g``.
        average_initializer / _regularizer / _constraint: settings of the
            ``(input_dim + units, units)`` kernel that produces the attention
            logits ``a``.
        initial_attention_initializer / _regularizer / _constraint: settings
            of the ``(units,)`` vector added to the initial hidden state.
        bias_initializer / _regularizer / _constraint: shared by the feature
            and recurrent bias vectors.
        **kwargs: forwarded to ``Recurrent`` (e.g. ``return_sequences``,
            ``go_backwards``).

    Dropout, masking, unrolling and stateful operation are not implemented.

    # References
     - [Machine Learning on Sequential Data Using a Recurrent Weighted Average](https://arxiv.org/abs/1703.01253)
    """
    @interfaces.legacy_recurrent_support
    def __init__(self, units,
                 activation='tanh',
                 recurrent_activation='tanh',
                 features_initializer='glorot_uniform',
                 recurrent_initializer='glorot_uniform',
                 average_initializer = 'glorot_uniform',
                 initial_attention_initializer = 'zeros',
                 bias_initializer='zeros',
                 features_regularizer=None,
                 recurrent_regularizer=None,
                 average_regularizer=None,
                 initial_attention_regularizer = None,
                 bias_regularizer=None,
                 features_constraint=None,
                 recurrent_constraint=None,
                 average_constraint=None,
                 initial_attention_constraint = None,
                 bias_constraint=None,
                 **kwargs):
        super(RWA, self).__init__(**kwargs)
        self.units = units
        self.activation = activations.get(activation)
        self.recurrent_activation = activations.get(recurrent_activation)
        self.features_initializer = initializers.get(features_initializer)
        self.recurrent_initializer = initializers.get(recurrent_initializer)
        self.average_initializer = initializers.get(average_initializer)
        self.initial_attention_initializer = initializers.get(initial_attention_initializer)
        self.bias_initializer = initializers.get(bias_initializer)

        self.features_regularizer = regularizers.get(features_regularizer)
        self.recurrent_regularizer = regularizers.get(recurrent_regularizer)
        self.average_regularizer = regularizers.get(average_regularizer)
        self.initial_attention_regularizer = regularizers.get(initial_attention_regularizer)
        self.bias_regularizer = regularizers.get(bias_regularizer)

        self.features_constraint = constraints.get(features_constraint)
        self.recurrent_constraint = constraints.get(recurrent_constraint)
        self.average_constraint = constraints.get(average_constraint)
        self.initial_attention_constraint = constraints.get(initial_attention_constraint)
        self.bias_constraint = constraints.get(bias_constraint)

        # `go_backwards` is deliberately NOT reset here: the base constructor
        # has already taken it from **kwargs, and wrappers such as
        # Bidirectional rely on it to obtain a layer that reads the sequence
        # in reverse. Forcing it to False made both directions run forward.
        # Masking, unrolling and stateful operation stay forced off.
        self.supports_masking = False
        self.unroll = False
        self.stateful = False

    def call(self, inputs, mask=None, training=None, initial_state=None):
        """Run the recurrence over `inputs` (samples, time, input_dim).

        Initial states are taken, in order of precedence, from the tail of a
        list-valued `inputs`, from `initial_state`, or from
        `get_initial_states`. Returns the full output sequence when
        `return_sequences` is set, otherwise the last output.

        Raises:
            ValueError: if the number of initial states differs from the
                number of states the layer keeps.
        """
        if isinstance(inputs, list):
            initial_states = list(inputs[1:])
            inputs = inputs[0]
        elif initial_state is not None:
            # Previously this value was computed and then overwritten by the
            # default zero states; honour it instead.
            if isinstance(initial_state, (list, tuple)):
                initial_states = list(initial_state)
            else:
                initial_states = [initial_state]
        else:
            initial_states = self.get_initial_states(inputs)

        if len(initial_states) != len(self.states):
            raise ValueError('Layer has ' + str(len(self.states)) +
                             ' states but was passed ' +
                             str(len(initial_states)) +
                             ' initial states.')
        input_shape = K.int_shape(inputs)
        constants = self.get_constants(inputs, training=training)
        preprocessed_input = self.preprocess_input(inputs, training=training)

        # Seed the hidden state with the learned initial-attention vector.
        initial_states[0] = (initial_states[0] +
                             self.recurrent_activation(self.initial_attention))

        last_output, outputs, states = K.rnn(self.step,
                                             preprocessed_input,
                                             initial_states,
                                             go_backwards=self.go_backwards,
                                             mask=mask,
                                             constants=constants,
                                             unroll=self.unroll,
                                             input_length=input_shape[1])

        if self.return_sequences:
            return outputs
        return last_output

    def build(self, input_shape):
        """Create the layer weights and the input / state specs."""
        if isinstance(input_shape, list):
            input_shape = input_shape[0]

        batch_size = input_shape[0] if self.stateful else None
        self.input_dim = input_shape[2]
        self.input_spec[0] = InputSpec(shape=(batch_size, None, self.input_dim))
        # states: h, d, n, a_max
        state_shape = (batch_size, None, self.units) if self.stateful else (batch_size, self.units)
        self.state_spec = [InputSpec(shape=state_shape) for _ in range(4)]
        self.states = [None, None, None, None]

        joined_dim = self.input_dim + self.units

        # W_u and b_u
        self.features_kernel = self.add_weight(shape=(self.input_dim, self.units),
                                               name='features_kernel',
                                               initializer=self.features_initializer,
                                               regularizer=self.features_regularizer,
                                               constraint=self.features_constraint)
        self.features_bias = self.add_weight(shape=(self.units,),
                                             name='features_bias',
                                             initializer=self.bias_initializer,
                                             regularizer=self.bias_regularizer,
                                             constraint=self.bias_constraint)

        # W_g and b_g
        self.recurrent_kernel = self.add_weight(shape=(joined_dim, self.units),
                                                name='recurrent_kernel',
                                                initializer=self.recurrent_initializer,
                                                regularizer=self.recurrent_regularizer,
                                                constraint=self.recurrent_constraint)
        self.recurrent_bias = self.add_weight(shape=(self.units,),
                                              name='recurrent_bias',
                                              initializer=self.bias_initializer,
                                              regularizer=self.bias_regularizer,
                                              constraint=self.bias_constraint)

        # W_a
        self.average_kernel = self.add_weight(shape=(joined_dim, self.units),
                                              name='average_kernel',
                                              initializer=self.average_initializer,
                                              regularizer=self.average_regularizer,
                                              constraint=self.average_constraint)

        # s
        self.initial_attention = self.add_weight(shape=(self.units,),
                                                 name='initial_attention',
                                                 initializer=self.initial_attention_initializer,
                                                 regularizer=self.initial_attention_regularizer,
                                                 constraint=self.initial_attention_constraint)

        self.built = True

    def preprocess_input(self, inputs, training=None):
        """No input preprocessing is needed; return `inputs` unchanged."""
        return inputs

    def get_initial_states(self, inputs):
        """Return the initial [h, d, n, a_max] states for a batch.

        h, d and n start as zeros of shape (samples, units); a_max starts at
        -1e38 (cast to the state dtype) so the first attention logit always
        becomes the running maximum.
        """
        # Build an all-zero tensor of shape (samples, units) from the input so
        # the batch dimension stays symbolic.
        zero_state = K.zeros_like(inputs)  # (samples, timesteps, input_dim)
        zero_state = K.sum(zero_state, axis=(1, 2))  # (samples,)
        zero_state = K.expand_dims(zero_state)  # (samples, 1)
        zero_state = K.tile(zero_state, [1, self.units])  # (samples, units)

        dtype = zero_state.dtype.name
        # ndarray.item() replaces the deprecated np.asscalar().
        min_value = np.array([1E38]).astype(dtype).item()

        initial_states = [zero_state for _ in range(len(self.states) - 1)]
        initial_states.append(zero_state - min_value)
        return initial_states

    def get_constants(self, inputs, training=None):
        """The step function uses no constants."""
        return []

    def step(self, inputs, states):
        """Advance the recurrence by one time step.

        `states` is [h, d, n, a_max]; returns the new hidden state and the
        updated state list in the same order.
        """
        h, d, n, a_max = states[0], states[1], states[2], states[3]

        inputs_joined = K.concatenate([inputs, h], axis=-1)
        u = K.dot(inputs, self.features_kernel)
        u = K.bias_add(u, self.features_bias)

        g = K.dot(inputs_joined, self.recurrent_kernel)
        g = K.bias_add(g, self.recurrent_bias)

        a = K.dot(inputs_joined, self.average_kernel)

        z = u * self.recurrent_activation(g)

        # Rescale the running sums by the new maximum logit before adding the
        # current term, so no exponent is ever positive.
        a_newmax = K.maximum(a_max, a)
        exp_diff = K.exp(a_max - a_newmax)
        exp_scaled = K.exp(a - a_newmax)

        n = n*exp_diff + z*exp_scaled
        d = d*exp_diff + exp_scaled
        h = self.activation(n/d)

        return h, [h, d, n, a_newmax]

    def get_config(self):
        """Return the serialisable constructor arguments merged with the base config."""
        config = {'units': self.units,
                  'activation': activations.serialize(self.activation),
                  'recurrent_activation': activations.serialize(self.recurrent_activation),
                  'features_initializer': initializers.serialize(self.features_initializer),
                  'recurrent_initializer': initializers.serialize(self.recurrent_initializer),
                  'average_initializer': initializers.serialize(self.average_initializer),
                  'initial_attention_initializer': initializers.serialize(self.initial_attention_initializer),
                  'bias_initializer': initializers.serialize(self.bias_initializer),
                  'features_regularizer': regularizers.serialize(self.features_regularizer),
                  'recurrent_regularizer': regularizers.serialize(self.recurrent_regularizer),
                  'average_regularizer': regularizers.serialize(self.average_regularizer),
                  'initial_attention_regularizer': regularizers.serialize(self.initial_attention_regularizer),
                  'bias_regularizer': regularizers.serialize(self.bias_regularizer),
                  'features_constraint': constraints.serialize(self.features_constraint),
                  'recurrent_constraint': constraints.serialize(self.recurrent_constraint),
                  'average_constraint': constraints.serialize(self.average_constraint),
                  'initial_attention_constraint': constraints.serialize(self.initial_attention_constraint),
                  'bias_constraint': constraints.serialize(self.bias_constraint),
                 }
        base_config = super(RWA, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [None]:
def build_rwa_model(units = 40, dr = 0.3):
    """Build and compile the bidirectional-RWA classifier.

    The frozen embedding is passed through spatial dropout, a bidirectional
    RWA layer and a Capsule layer; the capsule output is max- and
    average-pooled, concatenated and fed to a 128-unit ReLU layer before the
    sigmoid output.

    # Arguments
        units: number of RWA units per direction.
        dr: spatial-dropout rate; the dense dropout uses `dr - 0.2`.

    # Returns
        A compiled Keras `Model` (binary cross-entropy, Adam, accuracy).
    """
    tokens = Input(shape = (max_len, ))
    embedded = Embedding(nb_words, embed_size, input_length = max_len,
                         weights = [embedding_matrix], trainable = False)(tokens)

    seq = SpatialDropout1D(dr, seed = seed)(embedded)
    seq = Bidirectional(RWA(units, return_sequences = True))(seq)
    caps = Capsule(5, 5)(seq)

    # Pool the capsule output two ways and join the results.
    pooled = concatenate([GlobalMaxPooling1D()(caps),
                          GlobalAveragePooling1D()(caps)])

    hidden = Dense(128)(pooled)
    hidden = Activation("relu")(hidden)
    hidden = Dropout(dr-0.2, seed = seed)(hidden)

    prob = Dense(1, activation = "sigmoid")(hidden)

    model = Model(inputs = tokens, outputs = prob)
    model.compile(loss = "binary_crossentropy",
                  optimizer = Adam(),
                  metrics = ["accuracy"])
    return model

In [None]:
# Build the RWA model and fit it, passing (X_val, y_val) as validation data.
model = build_rwa_model(units = 64, dr = 0.3)
rwa_hist = model.fit(X_train, y_train,
                     batch_size = batch_size,
                     epochs = epochs,
                     validation_data = (X_val, y_val),
                     verbose = 2)

In [None]:
# Score the RWA model on the validation split and report the (threshold, score)
# pair returned by threshold_search.
val_pred = model.predict(X_val,
                         batch_size = 256,
                         verbose = 1)
threshold, score = threshold_search(y_val, val_pred)
print("F1 score at threshold {0} is {1}".format(threshold, score))

### DPCNN Model

In [None]:
from keras.layers import PReLU, add

# Shared Conv1D / Dense settings for the DPCNN: the layers themselves are
# linear because the non-linearity is supplied by separate PReLU layers.
act, pad, kernel_ini = "linear", "same", "he_uniform"


def _prelu_conv(tensor, units, kernel_size):
    """Apply PReLU followed by a `units`-filter Conv1D with the shared settings."""
    tensor = PReLU()(tensor)
    return Conv1D(units, kernel_size = kernel_size, padding = pad, activation = act,
                  kernel_initializer = kernel_ini)(tensor)


def build_dpcnn_model(units = 0, k = 0, num_block = 0, dr = 0.0):
    """Build and compile the DPCNN classifier.

    # Arguments
        units: number of filters in every convolution.
        k: kernel size of the convolutions inside the residual blocks.
        num_block: number of down-sampling residual blocks (must be >= 1).
        dr: spatial-dropout rate applied to the embedding.

    # Returns
        A compiled Keras `Model` (binary cross-entropy, Adam, accuracy).

    # Raises
        ValueError: if `num_block` is smaller than 1. Previously this case
            failed later with an unbound-variable error.
    """
    if num_block < 1:
        raise ValueError("num_block must be at least 1, got {}".format(num_block))

    inp = Input(shape = (max_len, ))
    emb = Embedding(nb_words, embed_size, weights = [embedding_matrix],
                    input_length = max_len, trainable = False)(inp)
    emb = SpatialDropout1D(dr)(emb)

    # Project the embedding to `units` channels with two 1x1 convolutions so
    # it can be added to the output of the first residual block.
    emb_short_cut = _prelu_conv(emb, units, 1)
    emb_short_cut = _prelu_conv(emb_short_cut, units, 1)

    # Main blocks: two convolutions, residual add, then stride-2 max pooling.
    block, short_cut = emb, emb_short_cut
    for b in range(1, num_block + 1):
        if b > 1:
            short_cut = block
        block = _prelu_conv(block, units, k)
        block = _prelu_conv(block, units, k)
        block = add([short_cut, block])
        block = MaxPooling1D(pool_size = 3, strides = 2, padding = pad)(block)

    # Final block: same residual unit without down-sampling, then global pooling.
    short_cut = block
    block = _prelu_conv(block, units, k)
    block = _prelu_conv(block, units, k)
    block = add([short_cut, block])
    max_pool = GlobalMaxPooling1D()(block)
    avg_pool = GlobalAveragePooling1D()(block)
    block = concatenate([max_pool, avg_pool])

    # Output head: 128 -> 64 -> 1. The 64-unit layer now consumes the
    # 128-unit layer's output; it used to read `block` directly, which left
    # the 128-unit layer disconnected from the model.
    out_put = Dense(128, activation = act)(block)
    out_put = PReLU()(out_put)
    out_put = Dense(64, activation = act)(out_put)
    out_put = PReLU()(out_put)

    out_put = Dense(1, activation = "sigmoid")(out_put)
    model = Model(inputs = inp, outputs = out_put)
    model.compile(loss = "binary_crossentropy", optimizer = Adam(), metrics = ["accuracy"])
    return model

In [None]:
# Build the DPCNN model and fit it, passing (X_val, y_val) as validation data.
model = build_dpcnn_model(units = 64,
                          k = 3,
                          num_block = 3,
                          dr = 0.3)
dpcnn_hist = model.fit(X_train, y_train,
                       batch_size = batch_size,
                       epochs = epochs,
                       validation_data = (X_val, y_val),
                       verbose = 2)

In [None]:
# Score the DPCNN model on the validation split and report the
# (threshold, score) pair returned by threshold_search.
val_pred = model.predict(X_val,
                         batch_size = 256,
                         verbose = 1)
threshold, score = threshold_search(y_val, val_pred)
print("F1 score at threshold {0} is {1}".format(threshold, score))

### QRNN Model

In [None]:
from keras.layers import Wrapper
import keras.backend as K

class DropConnect(Wrapper):
    """Wrapper that applies dropout to the wrapped layer's weights.

    During training the wrapped layer's `kernel` (and `bias`, when it has
    one) are replaced by dropped-out versions before the layer is called;
    outside training the weights are used unchanged.

    # Arguments
        layer: the layer whose weights are dropped.
        prob: drop rate handed to `K.dropout`. The wrapper is only active for
            0 < prob < 1, so the default of 1. disables it.
    """
    def __init__(self, layer, prob=1., **kwargs):
        self.prob = prob
        self.layer = layer
        super(DropConnect, self).__init__(layer, **kwargs)
        if 0. < self.prob < 1.:
            self.uses_learning_phase = True

    def build(self, input_shape):
        """Build the wrapped layer if needed, then mark the wrapper as built."""
        if not self.layer.built:
            self.layer.build(input_shape)
            self.layer.built = True
        super(DropConnect, self).build()

    def compute_output_shape(self, input_shape):
        """The wrapper does not change the wrapped layer's output shape."""
        return self.layer.compute_output_shape(input_shape)

    def call(self, x):
        """Drop weights (training phase only) and run the wrapped layer on `x`."""
        if 0. < self.prob < 1.:
            # NOTE(review): the attributes are overwritten with the dropped
            # tensors, so calling the same wrapper more than once would stack
            # dropout on dropout; it is called once per instance here.
            for attr in ("kernel", "bias"):
                weight = getattr(self.layer, attr, None)
                # Layers built without a bias either lack the attribute or
                # hold None; skip instead of passing None to K.dropout.
                if weight is None:
                    continue
                setattr(self.layer, attr,
                        K.in_train_phase(K.dropout(weight, self.prob), weight))
        return self.layer.call(x)

In [None]:
from keras import backend as K
from keras import activations, initializers, regularizers, constraints
from keras.layers import Layer, InputSpec
from keras.utils.conv_utils import conv_output_length

def _dropout(x, level, noise_shape=None, seed=None):
    """Apply `K.dropout` to `x`, then multiply the result by `1 - level`.

    The multiplication compensates for the scaling done by the dropout, so
    kept entries retain their original magnitude.
    """
    dropped = K.dropout(x, level, noise_shape, seed)
    keep_fraction = 1. - level
    return dropped * keep_fraction

class QRNN(Layer):
    '''Quasi RNN

    A causal convolution over the time axis produces, for every step, the
    candidate `z`, forget gate `f` and output gate `o`; a light element-wise
    recurrence then combines them (see `step`).

    # Arguments
        units: dimension of the internal projections and the final output.
        window_size: width of the convolution window along the time axis; the
            input is left-padded with `window_size - 1` steps.
        stride: stride of the convolution along the time axis.
        return_sequences: return the whole output sequence instead of the
            last output.
        go_backwards: passed to `K.rnn` to process the sequence in reverse.
        stateful: carry the last state of each batch over to the next batch.
        unroll: passed to `K.rnn`; requires a known time dimension.
        activation: activation applied to the candidate `z`.
        kernel_* / bias_* / activity_regularizer: usual Keras weight options.
        dropout: when strictly between 0 and 1, dropout is applied to the
            forget-gate slice in `preprocess_input` during training.
        use_bias: add a bias after the convolution.
        input_dim, input_length: optional input shape, turned into
            `input_shape` when `input_dim` is given.

    # References
        - [Quasi-recurrent Neural Networks](http://arxiv.org/abs/1611.01576)
    '''
    def __init__(self, units, window_size=2, stride=1,
                 return_sequences=False, go_backwards=False, 
                 stateful=False, unroll=False, activation='tanh',
                 kernel_initializer='uniform', bias_initializer='zero',
                 kernel_regularizer=None, bias_regularizer=None,
                 activity_regularizer=None,
                 kernel_constraint=None, bias_constraint=None, 
                 dropout=0, use_bias=True, input_dim=None, input_length=None,
                 **kwargs):
        self.return_sequences = return_sequences
        self.go_backwards = go_backwards
        self.stateful = stateful
        self.unroll = unroll

        self.units = units 
        self.window_size = window_size
        self.strides = (stride, 1)

        self.use_bias = use_bias
        self.activation = activations.get(activation)
        self.kernel_initializer = initializers.get(kernel_initializer)
        self.bias_initializer = initializers.get(bias_initializer)
        self.kernel_regularizer = regularizers.get(kernel_regularizer)
        self.bias_regularizer = regularizers.get(bias_regularizer)
        self.activity_regularizer = regularizers.get(activity_regularizer)
        self.kernel_constraint = constraints.get(kernel_constraint)
        self.bias_constraint = constraints.get(bias_constraint)

        self.dropout = dropout
        self.supports_masking = True
        self.input_spec = [InputSpec(ndim=3)]
        self.input_dim = input_dim
        self.input_length = input_length
        if self.input_dim:
            kwargs['input_shape'] = (self.input_length, self.input_dim)
        super(QRNN, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the convolution kernel / bias and the input and state specs."""
        if isinstance(input_shape, list):
            input_shape = input_shape[0]

        batch_size = input_shape[0] if self.stateful else None
        self.input_dim = input_shape[2]
        self.input_spec = InputSpec(shape=(batch_size, None, self.input_dim))
        self.state_spec = InputSpec(shape=(batch_size, self.units))

        self.states = [None]
        if self.stateful:
            self.reset_states()

        # One kernel yields z, f and o at once, hence `units * 3` output channels.
        kernel_shape = (self.window_size, 1, self.input_dim, self.units * 3)
        self.kernel = self.add_weight(name='kernel',
                                      shape=kernel_shape,
                                      initializer=self.kernel_initializer,
                                      regularizer=self.kernel_regularizer,
                                      constraint=self.kernel_constraint)
        if self.use_bias:
            self.bias = self.add_weight(name='bias', 
                                        shape=(self.units * 3,),
                                        initializer=self.bias_initializer,
                                        regularizer=self.bias_regularizer,
                                        constraint=self.bias_constraint)

        self.built = True

    def compute_output_shape(self, input_shape):
        """Return (samples, length, units) or (samples, units) depending on `return_sequences`."""
        if isinstance(input_shape, list):
            input_shape = input_shape[0]

        length = input_shape[1]
        if length:
            # The input is left-padded by window_size - 1 before a 'valid' convolution.
            length = conv_output_length(length + self.window_size - 1,
                                        self.window_size, 'valid',
                                        self.strides[0])
        if self.return_sequences:
            return (input_shape[0], length, self.units)
        else:
            return (input_shape[0], self.units)

    def compute_mask(self, inputs, mask):
        """Propagate the mask only when the full sequence is returned."""
        if self.return_sequences:
            return mask
        else:
            return None

    def get_initial_states(self, inputs):
        """Return a list with one all-zero (samples, units) tensor per state."""
        # build an all-zero tensor of shape (samples, units)
        initial_state = K.zeros_like(inputs)  # (samples, timesteps, input_dim)
        initial_state = K.sum(initial_state, axis=(1, 2))  # (samples,)
        initial_state = K.expand_dims(initial_state)  # (samples, 1)
        initial_state = K.tile(initial_state, [1, self.units])  # (samples, units)
        initial_states = [initial_state for _ in range(len(self.states))]
        return initial_states

    def reset_states(self, states=None):
        """Create, zero or overwrite the persistent states of a stateful layer.

        Raises:
            AttributeError: if the layer is not stateful.
            RuntimeError: if the layer has no input spec yet.
            ValueError: if the batch size is unknown, or `states` has the
                wrong length or shape.
        """
        if not self.stateful:
            raise AttributeError('Layer must be stateful.')
        if not self.input_spec:
            raise RuntimeError('Layer has never been called '
                               'and thus has no states.')

        batch_size = self.input_spec.shape[0]
        if not batch_size:
            raise ValueError('If a QRNN is stateful, it needs to know '
                             'its batch size. Specify the batch size '
                             'of your input tensors: \n'
                             '- If using a Sequential model, '
                             'specify the batch size by passing '
                             'a `batch_input_shape` '
                             'argument to your first layer.\n'
                             '- If using the functional API, specify '
                             'the time dimension by passing a '
                             '`batch_shape` argument to your Input layer.')

        if self.states[0] is None:
            self.states = [K.zeros((batch_size, self.units))
                           for _ in self.states]
        elif states is None:
            for state in self.states:
                K.set_value(state, np.zeros((batch_size, self.units)))
        else:
            if not isinstance(states, (list, tuple)):
                states = [states]
            if len(states) != len(self.states):
                # A space was missing before 'state values' in this message.
                raise ValueError('Layer ' + self.name + ' expects ' +
                                 str(len(self.states)) + ' states, '
                                 'but it received ' + str(len(states)) +
                                 ' state values. Input received: ' +
                                 str(states))
            for index, (value, state) in enumerate(zip(states, self.states)):
                if value.shape != (batch_size, self.units):
                    raise ValueError('State ' + str(index) +
                                     ' is incompatible with layer ' +
                                     self.name + ': expected shape=' +
                                     str((batch_size, self.units)) +
                                     ', found shape=' + str(value.shape))
                K.set_value(state, value)

    def __call__(self, inputs, initial_state=None, **kwargs):
        """Call the layer, optionally threading `initial_state` through as an input."""
        # If `initial_state` is specified,
        # and if it a Keras tensor,
        # then add it to the inputs and temporarily
        # modify the input spec to include the state.
        if initial_state is not None:
            if hasattr(initial_state, '_keras_history'):
                # Compute the full input spec, including state
                input_spec = self.input_spec
                state_spec = self.state_spec
                if not isinstance(state_spec, list):
                    state_spec = [state_spec]
                self.input_spec = [input_spec] + state_spec

                # Compute the full inputs, including state
                if not isinstance(initial_state, (list, tuple)):
                    initial_state = [initial_state]
                inputs = [inputs] + list(initial_state)

                # Perform the call
                output = super(QRNN, self).__call__(inputs, **kwargs)

                # Restore original input spec
                self.input_spec = input_spec
                return output
            else:
                kwargs['initial_state'] = initial_state
        return super(QRNN, self).__call__(inputs, **kwargs)

    def call(self, inputs, mask=None, initial_state=None, training=None):
        """Run the convolution and the recurrence over `inputs`.

        Initial states come, in order of precedence, from the tail of a
        list-valued `inputs`, from `initial_state`, from the persistent
        states of a stateful layer, or from `get_initial_states`.

        Raises:
            ValueError: on a state-count mismatch, or when unrolling is
                requested with an unknown time dimension.
        """
        # input shape: `(samples, time (padded with zeros), input_dim)`
        # note that the .build() method of subclasses MUST define
        # self.input_spec and self.state_spec with complete input shapes.
        if isinstance(inputs, list):
            initial_states = inputs[1:]
            inputs = inputs[0]
        elif initial_state is not None:
            # This branch used to be `pass`, leaving `initial_states`
            # undefined and raising a NameError just below.
            if isinstance(initial_state, (list, tuple)):
                initial_states = list(initial_state)
            else:
                initial_states = [initial_state]
        elif self.stateful:
            initial_states = self.states
        else:
            initial_states = self.get_initial_states(inputs)

        if len(initial_states) != len(self.states):
            raise ValueError('Layer has ' + str(len(self.states)) +
                             ' states but was passed ' +
                             str(len(initial_states)) +
                             ' initial states.')
        input_shape = K.int_shape(inputs)
        if self.unroll and input_shape[1] is None:
            raise ValueError('Cannot unroll a RNN if the '
                             'time dimension is undefined. \n'
                             '- If using a Sequential model, '
                             'specify the time dimension by passing '
                             'an `input_shape` or `batch_input_shape` '
                             'argument to your first layer. If your '
                             'first layer is an Embedding, you can '
                             'also use the `input_length` argument.\n'
                             '- If using the functional API, specify '
                             'the time dimension by passing a `shape` '
                             'or `batch_shape` argument to your Input layer.')
        # Forward the caller's `training` flag (None keeps the previous
        # behaviour of following the global learning phase).
        constants = self.get_constants(inputs, training=training)
        preprocessed_input = self.preprocess_input(inputs, training=training)

        last_output, outputs, states = K.rnn(self.step, preprocessed_input,
                                            initial_states,
                                            go_backwards=self.go_backwards,
                                            mask=mask,
                                            constants=constants,
                                            unroll=self.unroll,
                                            input_length=input_shape[1])
        if self.stateful:
            updates = []
            for i in range(len(states)):
                updates.append((self.states[i], states[i]))
            self.add_update(updates, inputs)

        # Properly set learning phase
        if 0 < self.dropout < 1:
            last_output._uses_learning_phase = True
            outputs._uses_learning_phase = True

        if self.return_sequences:
            return outputs
        else:
            return last_output

    def preprocess_input(self, inputs, training=None):
        """Compute the z / f / o pre-activations for every time step.

        Returns a (samples, time, 3 * units) tensor laid out as [z, f, o]
        along the last axis.
        """
        if self.window_size > 1:
            # Pad on the left only, so each step sees current and past inputs.
            inputs = K.temporal_padding(inputs, (self.window_size-1, 0))
        inputs = K.expand_dims(inputs, 2)  # add a dummy dimension

        output = K.conv2d(inputs, self.kernel, strides=self.strides,
                          padding='valid',
                          data_format='channels_last')
        output = K.squeeze(output, 2)  # remove the dummy dimension
        if self.use_bias:
            output = K.bias_add(output, self.bias, data_format='channels_last')

        if self.dropout is not None and 0. < self.dropout < 1.:
            z = output[:, :, :self.units]
            f = output[:, :, self.units:2 * self.units]
            o = output[:, :, 2 * self.units:]
            f = K.in_train_phase(1 - _dropout(1 - f, self.dropout), f, training=training)
            return K.concatenate([z, f, o], -1)
        else:
            return output

    def step(self, inputs, states):
        """One recurrence step: output = o * (f * prev_output + (1 - f) * z)."""
        prev_output = states[0]

        z = inputs[:, :self.units]
        f = inputs[:, self.units:2 * self.units]
        o = inputs[:, 2 * self.units:]

        z = self.activation(z)
        # NOTE(review): with dropout active the forget gate is used without a
        # sigmoid -- confirm this is intended.
        f = f if self.dropout is not None and 0. < self.dropout < 1. else K.sigmoid(f)
        o = K.sigmoid(o)

        output = f * prev_output + (1 - f) * z
        output = o * output

        return output, [output]

    def get_constants(self, inputs, training=None):
        """The step function uses no constants."""
        return []
 
    def get_config(self):
        """Return the serialisable constructor arguments merged with the base config."""
        config = {'units': self.units,
                  'window_size': self.window_size,
                  'stride': self.strides[0],
                  'return_sequences': self.return_sequences,
                  'go_backwards': self.go_backwards,
                  'stateful': self.stateful,
                  'unroll': self.unroll,
                  'use_bias': self.use_bias,
                  'dropout': self.dropout,
                  'activation': activations.serialize(self.activation),
                  'kernel_initializer': initializers.serialize(self.kernel_initializer),
                  'bias_initializer': initializers.serialize(self.bias_initializer),
                  'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
                  'bias_regularizer': regularizers.serialize(self.bias_regularizer),
                  'activity_regularizer': regularizers.serialize(self.activity_regularizer),
                  'kernel_constraint': constraints.serialize(self.kernel_constraint),
                  'bias_constraint': constraints.serialize(self.bias_constraint),
                  'input_dim': self.input_dim,
                  'input_length': self.input_length}
        base_config = super(QRNN, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [None]:
def build_qrnn_model(units = 40, dr = 0.3):
    """Build and compile the stacked-QRNN binary classifier.

    Architecture: non-trainable embedding initialised from
    `embedding_matrix` -> spatial dropout -> one plain QRNN -> two
    DropConnect-wrapped QRNNs -> attention, global max pooling and global
    average pooling concatenated -> DropConnect-wrapped Dense(128, relu)
    -> Dense(1, sigmoid).

    Args:
        units: first positional argument of every QRNN layer.
        dr: rate of the spatial dropout and `prob` of every DropConnect
            wrapper.

    Returns:
        The model, compiled with binary cross-entropy loss, the Adam
        optimizer and the accuracy metric.

    Reads the notebook-level names `max_len`, `nb_words`, `embed_size`,
    `embedding_matrix` and `seed`.
    """
    def make_qrnn():
        # Every QRNN layer in the stack shares the same settings.
        return QRNN(units, window_size = 3, stride = 1, return_sequences = True)

    tokens = Input(shape = (max_len, ))
    embedded = Embedding(nb_words, embed_size, input_length = max_len,
                         weights = [embedding_matrix], trainable = False)(tokens)

    seq = SpatialDropout1D(dr, seed = seed)(embedded)
    seq = make_qrnn()(seq)
    for _ in range(2):
        seq = DropConnect(make_qrnn(), prob = dr)(seq)

    # Three summaries of the sequence, joined into one feature vector.
    features = concatenate([Attention(max_len)(seq),
                            GlobalMaxPooling1D()(seq),
                            GlobalAveragePooling1D()(seq)])
    hidden = DropConnect(Dense(128, activation = "relu"), prob = dr)(features)

    probability = Dense(1, activation = "sigmoid")(hidden)
    model = Model(inputs = tokens, outputs = probability)
    model.compile(loss = "binary_crossentropy",
                  optimizer = Adam(),
                  metrics = ["accuracy"])

    return model

In [None]:
# Train the QRNN classifier and keep the returned history object so it can be
# compared with the other models' histories.
model = build_qrnn_model(dr = 0.3, units = 62)
qrnn_hist = model.fit(
    X_train,
    y_train,
    validation_data = (X_val, y_val),
    epochs = epochs,
    batch_size = batch_size,
    verbose = 2,
)

In [None]:
# Model outputs on the validation split.
val_pred = model.predict(
    X_val,
    verbose = 1,
    batch_size = 256,
)

# Pick the decision threshold from the validation predictions; the score
# returned alongside it is reported as F1 below.
threshold, score = threshold_search(y_val, val_pred)
print("F1 score at threshold {0} is {1}".format(threshold, score))

Without wrapping the stacked QRNN layers in DropConnect, model performance is terrible.

## Model Evaluation

In [None]:
# Training histories of the models, keyed by the name shown in the legends.
models_hist = {
    "simple_lstm": slstm_hist,
    "double_lstm": dlstm_hist,
    "rnn": rnn_hist,
    "rcnn": rcnn_hist,
    "double_lstm_with_pooling": dlstm_pool_hist,
    "capsule_net": capsule_hist,
    "rwa": rwa_hist,
    "dpcnn": dpcnn_hist,
    "qrnn": qrnn_hist,
}

# One figure per accuracy curve (train, then validation); each figure draws
# one line per model, is saved to disk and then displayed.
for metric_key, plot_title, out_file in (
        ("acc", "Model Train Accuracy", "Train_Accuracy.png"),
        ("val_acc", "Model Val Accuracy", "Val_Accuracy.png")):
    plt.figure(figsize = (10, 8))
    for model_hist in models_hist.values():
        plt.plot(model_hist.history[metric_key])
    plt.title(plot_title)
    plt.xlabel("epochs")
    plt.ylabel("accuracy")
    plt.legend(list(models_hist), loc = "upper left")
    plt.savefig(out_file)
    plt.show()

In [None]:
# One figure per loss curve (train, then validation); each figure draws one
# line per model in `models_hist`, is saved to disk and then displayed.
for metric_key, plot_title, out_file in (
        ("loss", "Model Train Loss", "Train_Loss.png"),
        ("val_loss", "Model Val Loss", "Val_Loss.png")):
    plt.figure(figsize = (10, 8))
    for model_hist in models_hist.values():
        plt.plot(model_hist.history[metric_key])
    plt.title(plot_title)
    plt.xlabel("epochs")
    plt.ylabel("loss")
    plt.legend(list(models_hist), loc = "upper left")
    plt.savefig(out_file)
    plt.show()

### Make Prediction

In [None]:
# Score the test set with the same inference batch size used for the
# validation predictions (256) instead of Keras' default of 32, which only
# makes inference over the test set slower.
test_prob = model.predict(X_test, batch_size = 256, verbose = 1)

# Binarise with the threshold selected on the validation split; the raw model
# outputs stay available in `test_prob`, the 0/1 labels go to `test_pred`.
test_pred = (test_prob > threshold).astype(int)

In [None]:
# Build the submission table: the test `qid` column plus the thresholded
# predictions, written to CSV without the index; the last line previews it.
submission = test[["qid"]].assign(prediction = test_pred)
submission.to_csv("submission.csv", index = False)
submission.head()