In [1]:
import numpy as np, pandas as pd, random as rn, os, gc, re, time
# Reference timestamp taken at notebook start (presumably used later to report elapsed time).
start = time.time()
# One seed shared by every random number generator configured below.
seed = 32
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting it
# here probably only affects child processes - confirm it has the intended effect.
os.environ['PYTHONHASHSEED'] = str(seed)
# Cap OpenMP worker threads at 4.
os.environ['OMP_NUM_THREADS'] = '4'
# Seed NumPy's global RNG and Python's `random` module (imported as `rn`).
np.random.seed(seed)
rn.seed(seed)
import tensorflow as tf
# TF1-style session config with a single thread for both intra-op and inter-op
# parallelism (presumably to reduce run-to-run nondeterminism - TODO confirm).
session_conf = tf.ConfigProto(intra_op_parallelism_threads = 1,
                              inter_op_parallelism_threads = 1)
# Seed TensorFlow's graph-level RNG before the session is created.
tf.set_random_seed(seed)
sess = tf.Session(graph = tf.get_default_graph(), config = session_conf)
from keras import backend as K
# Make Keras run on the session configured above.
K.set_session(sess)

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import f1_score, precision_recall_curve, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold

from keras.layers import Input, Dense, CuDNNLSTM, Bidirectional, Activation, Conv1D
from keras.layers import Dropout, Embedding, GlobalMaxPooling1D, MaxPooling1D, AlphaDropout
from keras.layers import Add, Flatten, BatchNormalization, GlobalAveragePooling1D
from keras.layers import concatenate, SpatialDropout1D, CuDNNGRU, Lambda, GaussianDropout, GaussianNoise
from keras.layers import PReLU, ReLU, ELU
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from keras.initializers import he_normal, he_uniform, glorot_normal
from keras.initializers import glorot_uniform, zeros, orthogonal
from keras.models import Model, load_model
from keras.optimizers import Adam, RMSprop, SGD
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Load the training set; every missing cell is replaced by the literal string "missing".
train = pd.read_csv("../input/train.csv").fillna("missing")

# Paths of the pretrained embedding text files (GloVe 840B and Paragram SL999).
embedding_file1 = "../input/embeddings/glove.840B.300d/glove.840B.300d.txt"
embedding_file2 = "../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt"

# Width of one embedding vector.
embed_size = 300
# Vocabulary cap passed to the tokenizer and used to size the embedding matrix.
max_features = 100000
# Length every token sequence is padded/truncated to.
max_len = 60

Using TensorFlow backend.


In [2]:
"""
misspell list (quora vs. glove)
"""
# Replacement table used by replace_typical_misspell(): every key found in a question
# is replaced by its value. The keys are joined into one regex alternation by
# _get_mispell() with no word boundaries, so they match as plain substrings.
# Some keys/values carry a leading or trailing space; those spaces are part of the
# match/replacement text.
# NOTE(review): several keys are prefixes of other keys (e.g. 'Brexit' / 'Brexiteers',
# 'incel' / 'incels'); which one wins depends on how _get_mispell() orders the alternation.
mispell_dict = {
        'Terroristan': 'terrorist Pakistan',
        'terroristan': 'terrorist Pakistan',
        'BIMARU': 'Bihar, Madhya Pradesh, Rajasthan, Uttar Pradesh',
        'Hinduphobic': 'Hindu phobic',
        'hinduphobic': 'Hindu phobic',
        'Hinduphobia': 'Hindu phobic',
        'hinduphobia': 'Hindu phobic',
        'Babchenko': 'Arkady Arkadyevich Babchenko faked death',
        'Boshniaks': 'Bosniaks',
        'Dravidanadu': 'Dravida Nadu',
        'mysoginists': 'misogynists',
        'MGTOWS': 'Men Going Their Own Way',
        'mongloid': 'Mongoloid',
        'unsincere': 'insincere',
        'meninism': 'male feminism',
        'jewplicate': 'jewish replicate',
        'unoin': 'Union',
        'daesh': 'Islamic State of Iraq and the Levant',
        'Kalergi': 'Coudenhove-Kalergi',
        'Bhakts': 'Bhakt',
        'bhakts': 'Bhakt',
        'Tambrahms': 'Tamil Brahmin',
        'Pahul': 'Amrit Sanskar',
        'SJW': 'social justice warrior',
        'SJWs': 'social justice warrior',
        'incel': ' involuntary celibates',
        'incels': ' involuntary celibates',
        'emiratis': 'Emiratis',
        'weatern': 'western',
        'westernise': 'westernize',
        'Pizzagate': 'Pizzagate conspiracy theory',
        'naïve': 'naive',
        'Skripal': 'Sergei Skripal',
        'Remainers': 'British remainer',
        'remainers': 'British remainer',
        'bremainer': 'British remainer',
        'antibrahmin': 'anti Brahminism',
        'HYPSM': 'Harvard, Yale, Princeton, Stanford, MIT',
        'HYPS': 'Harvard, Yale, Princeton, Stanford',
        'kompromat': 'compromising material',
        'Tharki': 'pervert',
        'tharki': 'pervert',
        'mastuburate': 'masturbate',
        'Zoë': 'Zoe',
        'indans': 'Indian',
        'xender': 'gender',
        'Naxali ': 'Naxalite ',
        'Naxalities': 'Naxalites',
        'Bathla': 'Namit Bathla',
        'Mewani': 'Indian politician Jignesh Mevani',
        'clichéd': 'cliche',
        'cliché': 'cliche',
        'clichés': 'cliche',
        'Wjy': 'Why',
        'Fadnavis': 'Indian politician Devendra Fadnavis',
        'Awadesh': 'Indian engineer Awdhesh Singh',
        'Awdhesh': 'Indian engineer Awdhesh Singh',
        'Khalistanis': 'Sikh separatist movement',
        'madheshi': 'Madheshi',
        'BNBR': 'Be Nice, Be Respectful',
        'Bolsonaro': 'Jair Bolsonaro',
        'XXXTentacion': 'Tentacion',
        'Padmavat': 'Indian Movie Padmaavat',
        'Žižek': 'Slovenian philosopher Slavoj Žižek',
        'Adityanath': 'Indian monk Yogi Adityanath',
        'Brexit': 'British Exit',
        'Brexiter': 'British Exit supporter',
        'Brexiters': 'British Exit supporters',
        'Brexiteer': 'British Exit supporter',
        'Brexiteers': 'British Exit supporters',
        'Brexiting': 'British Exit',
        'Brexitosis': 'British Exit disorder',
        'brexit': 'British Exit',
        'brexiters': 'British Exit supporters',
        'jallikattu': 'Jallikattu',
        'fortnite': 'Fortnite ',
        'Swachh': 'Swachh Bharat mission campaign ',
        'Quorans': 'Quoran',
        'Qoura ': 'Quora ',
        'quoras': 'Quora',
        'Quroa': 'Quora',
        'QUORA': 'Quora',
        'narcissit': 'narcissist',
        # extra in sample
        'Doklam': 'Tibet',
        'Drumpf': 'Donald Trump fool',
        'Drumpfs': 'Donald Trump fools',
        'Strzok': 'Hillary Clinton scandal',
        'rohingya': 'Rohingya ',
        'wumao': 'cheap Chinese stuff',
        'wumaos': 'cheap Chinese stuff',
        'Sanghis': 'Sanghi',
        'Tamilans': 'Tamils',
        'biharis': 'Biharis',
        'Rejuvalex': 'hair growth formula',
        'Feku': 'Fake',
        'deplorables': 'deplorable',
        'muhajirs': 'Muslim immigrant',
        'Gujratis': 'Gujarati',
        'Chutiya': 'Fucker',
        'Chutiyas': 'Fucker',
        'thighing': 'masturbate',
        '卐': 'Nazi Germany',
        'Pribumi': 'Native Indonesian',
        'Gurmehar': 'Gurmehar Kaur Indian student activist',
        'Novichok': 'Soviet Union agents',
        'Khazari': 'Khazars',
        'Demonetization': 'demonetization',
        'demonetisation': 'demonetization',
        'demonitisation': 'demonetization',
        'demonitization': 'demonetization',
        'demonetisation': 'demonetization',  # repeats the key three lines up (same value)
        'cryptocurrencies': 'cryptocurrency',
        'Hindians': 'North Indian who hate British',
        'vaxxer': 'vocal nationalist ',
        'remoaner': 'remainer ',
        'bremoaner': 'British remainer ',
        'Jewism': 'Judaism',
        'Eroupian': 'European',
        'WMAF': 'White male married Asian female',
        'moeslim': 'Muslim',
        'cishet': 'cisgender and heterosexual person',
        'Eurocentric': 'Eurocentrism ',
        'Jewdar': 'Jew dar',
        'Asifa': 'abduction, rape, murder case ',
        'marathis': 'Marathi',
        'Trumpanzees': 'Trump chimpanzee fool',
        'Crimean': 'Crimea people ',
        'atrracted': 'attract',
        'LGBT': 'lesbian, gay, bisexual, transgender',
        'Boshniak': 'Bosniaks ',
        'Myeshia': 'widow of Green Beret killed in Niger',
        'demcoratic': 'Democratic',
        'raaping': 'rape',
        'Dönmeh': 'Islam',
        'feminazism': 'feminism nazi',
        'langague': 'language',
        'Hongkongese': 'HongKong people',
        'hongkongese': 'HongKong people',
        'Kashmirians': 'Kashmirian',
        'Chodu': 'fucker',
        'penish': 'penis',
        'micropenis': 'tiny penis',
        'Madridiots': 'Real Madrid idiot supporters',
        'Ambedkarite': 'Dalit Buddhist movement ',
        'ReleaseTheMemo': 'cry for the right and Trump supporters',
        'harrase': 'harass',
        'Barracoon': 'Black slave',
        'Castrater': 'castration',
        'castrater': 'castration',
        'Rapistan': 'Pakistan rapist',
        'rapistan': 'Pakistan rapist',
        'Turkified': 'Turkification',
        'turkified': 'Turkification',
        'Dumbassistan': 'dumb ass Pakistan',
        'facetards': 'Facebook retards',
        'rapefugees': 'rapist refugee',
        'superficious': 'superficial',
        # extra from kagglers
        'colour': 'color',
        'centre': 'center',
        'favourite': 'favorite',
        'travelling': 'traveling',
        'counselling': 'counseling',
        'theatre': 'theater',
        'cancelled': 'canceled',
        'labour': 'labor',
        'organisation': 'organization',
        'wwii': 'world war 2',
        'citicise': 'criticize',
        'youtu ': 'youtube ',
        'sallary': 'salary',
        'Whta': 'What',
        'narcisist': 'narcissist',
        'narcissit': 'narcissist',  # repeats a key from the first section (same value)
        'howdo': 'how do',
        'whatare': 'what are',
        'howcan': 'how can',
        'howmuch': 'how much',
        'howmany': 'how many',
        'whydo': 'why do',
        'doI': 'do I',
        'theBest': 'the best',
        'howdoes': 'how does',
        'mastrubation': 'masturbation',
        'mastrubate': 'masturbate',
        'mastrubating': 'masturbating',
        'pennis': 'penis',
        'Etherium': 'Ethereum',
        'bigdata': 'big data',
        '2k17': '2017',
        '2k18': '2018',
        'qouta': 'quota',
        'exboyfriend': 'ex boyfriend',
        'airhostess': 'air hostess',
        'whst': 'what',
        'watsapp': 'whatsapp',
        # extra
        'bodyshame': 'body shaming',
        'bodyshoppers': 'body shopping',
        'bodycams': 'body cams',
        'Cananybody': 'Can any body',
        'deadbody': 'dead body',
        'deaddict': 'de addict',
        'Northindian': 'North Indian ',
        'northindian': 'north Indian ',
        'northkorea': 'North Korea',
        'Whykorean': 'Why Korean',
        'koreaboo': 'Korea boo ',
        'Brexshit': 'British Exit bullshit',
        'shithole': 'shithole ',
        'shitpost': 'shit post',
        'shitslam': 'shit Islam',
        'shitlords': 'shit lords',
        'Fck': 'Fuck',
        'fck': 'fuck',
        'Clickbait': 'click bait ',
        'clickbait': 'click bait ',
        'mailbait': 'mail bait',
        'healhtcare': 'healthcare',
        'trollbots': 'troll bots',
        'trollled': 'trolled',
        'trollimg': 'trolling',
        'cybertrolling': 'cyber trolling',
        'sickular': 'India sick secular ',
        'suckimg': 'sucking',
        'Idiotism': 'idiotism',
        'Niggerism': 'Nigger',
        'Niggeriah': 'Nigger'}

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re

# Lookup table and compiled pattern shared by every call below.
mispellings, mispellings_re = _get_mispell(mispell_dict)
def replace_typical_misspell(text):
    """Return ``text`` with every occurrence of a known misspelling replaced.

    Each regex match is looked up in ``mispellings`` and swapped for its value.
    """
    return mispellings_re.sub(lambda hit: mispellings[hit.group(0)], text)

# Regex fragments (not literals) whose matches get padded with spaces.
# Raw strings keep '\W' a regex escape instead of an invalid Python string escape.
# Compiled once at definition time rather than on every call.
_SPACING_MISSPELL_RE = re.compile('(%s)' % '|'.join([
    r'(F|f)uck',
    r'Trump',
    r'\W(A|a)nti',
    r'(W|w)hy',
    r'(W|w)hat',
    r'How',
    r'care\W',
    r'\Wover',
    r'gender',
    r'people',
]))

def spacing_misspell(text):
    """Surround selected words with spaces so run-together words get split.

    Example: 'Howcan' -> ' How can', 'whatare you' -> ' what are you'.
    Fragments anchored with ``\\W`` include the adjacent non-word character in the
    padded match. Text containing none of the fragments is returned unchanged.

    # Arguments
        text: input string.

    # Returns
        The string with a space inserted before and after every match.
    """
    # Group 1 is the outer group wrapping the whole alternation.
    return _SPACING_MISSPELL_RE.sub(r" \1 ", text)

# Characters that space_punct() isolates with surrounding spaces. Besides ordinary
# punctuation and symbols the list contains a few accented/Greek letters.
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&',
          '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
          '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',
          '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '“', '★', '”',
          '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾',
          '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '▒', '：', '¼', '⊕', '▼',
          '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲',
          'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»',
          '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø',
          '¹', '≤', '‡', '√', 'β', 'α', '∅', 'θ', '÷', '₹']

def space_punct(x):
    """Return ``str(x)`` with a space added on both sides of every listed character."""
    spaced = str(x)
    for mark in puncts:
        # replace() leaves the string untouched when the character is absent.
        spaced = spaced.replace(mark, f' {mark} ')
    return spaced

# Fix known misspellings first, then pad punctuation with spaces.
train["question_text"] = (
    train["question_text"]
    .apply(replace_typical_misspell)
    .apply(space_punct)
)
print("Text cleaning completed!")

Text cleaning completed!


In [3]:
def add_num_features(data):
    """Add simple length features for ``question_text`` to ``data`` (in place).

    # Arguments
        data: DataFrame with a ``question_text`` column.

    # Returns
        The same DataFrame with two extra columns:
        ``total_length`` (characters per question) and ``num_words``
        (runs of non-whitespace per question). Missing questions get 0 for both.
    """
    questions = data["question_text"]
    # .str.len() yields NaN for missing values (apply(len) would raise on them),
    # so the fillna(0) fallback can actually take effect.
    data["total_length"] = questions.str.len().fillna(0)
    # Raw string keeps '\S' a regex escape rather than an invalid string escape.
    data["num_words"] = questions.str.count(r"\S+").fillna(0)

    return data

def scale_num_features(tr, val, te):
    """Standardise three feature sets with statistics learned from ``tr`` only.

    # Arguments
        tr: training features; the scaler is fitted on these.
        val: validation features.
        te: test features.

    # Returns
        Tuple ``(tr, val, te)`` of the transformed inputs, in that order.
    """
    fitted_scaler = StandardScaler().fit(tr)
    return tuple(fitted_scaler.transform(split) for split in (tr, val, te))

# train = add_num_features(train)

In [4]:
def get_glove(embedding_file):
    """Read a GloVe-style text file into a ``{token: vector}`` dict.

    Every line is split on single spaces: the first field is the token, the
    remaining fields become a float32 NumPy vector.

    # Arguments
        embedding_file: path of the embedding text file (read as UTF-8).

    # Returns
        dict mapping each token to its 1-D float32 array.
    """
    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    # Context manager so the (very large) file is closed as soon as it is parsed.
    with open(embedding_file, encoding="utf8") as handle:
        embeddings_index = dict(get_coefs(*line.split(" ")) for line in handle)

    return embeddings_index

def get_para(embedding_file):
    """Read a Paragram-style text file into a ``{token: vector}`` dict.

    The file is decoded as UTF-8 with undecodable bytes dropped. Lines of 100
    characters or fewer are skipped; every other line is split on single spaces
    into a token followed by its float32 vector.

    # Arguments
        embedding_file: path of the embedding text file.

    # Returns
        dict mapping each token to its 1-D float32 array.
    """
    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    # Context manager so the file handle is released once parsing is done.
    with open(embedding_file, encoding="utf8", errors='ignore') as handle:
        embeddings_index = dict(get_coefs(*line.split(" "))
                                for line in handle if len(line) > 100)

    return embeddings_index

# Parse the GloVe file into a {token: vector} dict; the Paragram load below is disabled.
glove_index = get_glove(embedding_file1)
# para_index = get_para(embedding_file2)

In [5]:
def get_embed(tokenizer = None, embeddings_index = None, emb_mean = None, emb_std = None,
              max_words = None, embed_dim = None):
    """Build the embedding matrix for the tokenizer's vocabulary.

    Rows are first drawn from ``N(emb_mean, emb_std)``; every word that has a
    pretrained vector then gets its row overwritten with that vector.

    # Arguments
        tokenizer: object exposing ``word_index`` (word -> integer id).
        embeddings_index: mapping word -> pretrained vector of length ``embed_dim``.
        emb_mean: mean of the normal distribution used for the initial rows.
        emb_std: standard deviation of that distribution.
        max_words: cap on the number of rows; defaults to the module-level
            ``max_features``.
        embed_dim: vector width; defaults to the module-level ``embed_size``.

    # Returns
        ``(nb_words, embedding_matrix)`` where ``embedding_matrix`` has shape
        ``(nb_words, embed_dim)``.
    """
    if max_words is None:
        max_words = max_features
    if embed_dim is None:
        embed_dim = embed_size

    word_index = tokenizer.word_index
    # Keras tokenizers number words from 1 (0 is reserved), so a vocabulary of
    # n words needs n + 1 rows. Without the +1 a vocabulary smaller than the cap
    # indexed one row past the end of the matrix.
    nb_words = min(max_words, len(word_index) + 1)
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_dim))
    for word, i in word_index.items():
        # Guard against the actual row count, not only the configured cap.
        if i >= nb_words: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector

    return nb_words, embedding_matrix

In [6]:
# punctuation = "".join(puncts)
# Case-preserving tokenizer limited to max_features words. Characters not listed in
# `filters` (e.g. , . ! ? ') are not stripped by the tokenizer.
# NOTE(review): "\]" in the filters literal is not a recognised string escape, so the
# filter set contains both a backslash and "]"; newer Python versions warn about such
# escapes - consider spelling the backslash explicitly.
tokenizer = Tokenizer(num_words = max_features, 
                      filters = '"#$%&()*+/:;-<=>@[\]^_`{|}~', 
#                       filters = punctuation,
                      lower = False)
# Build the vocabulary from the cleaned training questions.
tokenizer.fit_on_texts(train["question_text"])

# Turn every question into a list of word ids, then bring all lists to length max_len.
train_token = tokenizer.texts_to_sequences(train["question_text"])
train_seq = pad_sequences(train_token, maxlen = max_len)
# Drop the unpadded id lists once the padded array exists.
del train_token; gc.collect()

0

In [7]:
# Labels from the "target" column as a plain array, row-aligned with `train`.
target = train["target"].values
# sincere = train[train["target"] == 0]["qid"]
# insincere = train[train["target"] == 1]["qid"]

# n_sincere = len(sincere)
# n_insincere = len(insincere)

In [8]:
# Build the GloVe embedding matrix for the fitted tokenizer. emb_mean / emb_std are
# hard-coded (presumably the mean/std of all GloVe values, computed offline - TODO confirm).
nb_words, embedding_matrix1 = get_embed(tokenizer = tokenizer, embeddings_index = glove_index, 
                                        emb_mean = -0.005838499, 
                                        emb_std = 0.48782197)
# Disabled: second (Paragram) matrix and element-wise averaging of the two.
# nb_words, embedding_matrix2 = get_embed(tokenizer = tokenizer, embeddings_index = para_index, 
#                                         emb_mean = -0.0053247833, 
#                                         emb_std = 0.49346462)
# embedding_matrix = np.mean([embedding_matrix1, embedding_matrix2], axis = 0)
# del embedding_matrix1, embedding_matrix2; gc.collect()

print("Embedding matrix completed!")

Embedding matrix completed!


In [9]:
from keras.engine import Layer, InputSpec
from keras.layers import K

def squash(x, axis=-1):
    """Rescale ``x`` to (almost) unit L2 norm along ``axis``.

    Divides ``x`` by ``sqrt(sum(x**2) + epsilon)``. The earlier variant, which scaled
    by ``sqrt(n) / (0.5 + n)`` with ``n`` the squared norm, was dropped because the
    squared norm is really small.
    """
    squared_norm = K.sum(K.square(x), axis, keepdims=True)
    return x / K.sqrt(squared_norm + K.epsilon())

# A Capsule Implement with Pure Keras
class Capsule(Layer):
    """Capsule layer with iterative routing, written with Keras backend ops.

    # Arguments
        num_capsule: number of output capsules.
        dim_capsule: length of each output capsule vector.
        routings: number of routing iterations run in ``call``.
        kernel_size: stored on the instance; not used by the code in this class.
        share_weights: if True a single transformation kernel is shared by all
            input positions; otherwise one kernel per input position is created.
        activation: ``'default'`` uses the module-level ``squash``; any other value
            is wrapped in a Keras ``Activation`` layer.

    # Output shape
        ``(batch, num_capsule, dim_capsule)``.
    """

    def __init__(self, num_capsule, dim_capsule, routings=3, kernel_size=(9, 1), share_weights=True,
                 activation='default', **kwargs):
        super(Capsule, self).__init__(**kwargs)
        self.num_capsule = num_capsule
        self.dim_capsule = dim_capsule
        self.routings = routings
        self.kernel_size = kernel_size
        self.share_weights = share_weights
        if activation == 'default':
            self.activation = squash
        else:
            self.activation = Activation(activation)

    def build(self, input_shape):
        """Create the transformation kernel ``W`` for the given input shape."""
        super(Capsule, self).build(input_shape)
        input_dim_capsule = input_shape[-1]
        if self.share_weights:
            # One kernel of width 1 applied at every input position.
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(1, input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),
                                     # shape=self.kernel_size,
                                     initializer='glorot_uniform',
                                     trainable=True)
        else:
            # A separate kernel for each of the input_shape[-2] input positions.
            input_num_capsule = input_shape[-2]
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(input_num_capsule,
                                            input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),
                                     initializer='glorot_uniform',
                                     trainable=True)

    def call(self, u_vecs):
        """Project the inputs and run ``self.routings`` routing iterations.

        Returns the activated output capsules of the last iteration.
        """
        # Project every input vector to num_capsule * dim_capsule values.
        if self.share_weights:
            u_hat_vecs = K.conv1d(u_vecs, self.W)
        else:
            u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])

        batch_size = K.shape(u_vecs)[0]
        input_num_capsule = K.shape(u_vecs)[1]
        u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule,
                                            self.num_capsule, self.dim_capsule))
        u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
        # final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule]

        # Routing logits start at zero.
        b = K.zeros_like(u_hat_vecs[:, :, :, 0])  # shape = [None, num_capsule, input_num_capsule]
        for i in range(self.routings):
            # Move num_capsule to the last axis for the softmax, then restore the layout.
            b = K.permute_dimensions(b, (0, 2, 1))  # shape = [None, input_num_capsule, num_capsule]
            c = K.softmax(b)
            c = K.permute_dimensions(c, (0, 2, 1))
            b = K.permute_dimensions(b, (0, 2, 1))
            # Coupling-weighted sum over the input capsules, followed by the activation.
            outputs = self.activation(tf.keras.backend.batch_dot(c, u_hat_vecs, [2, 2]))
            if i < self.routings - 1:
                # New logits: dot product of outputs and projected inputs over dim_capsule.
                # Note that b is overwritten here, not accumulated.
                b = tf.keras.backend.batch_dot(outputs, u_hat_vecs, [2, 3])

        return outputs

    def compute_output_shape(self, input_shape):
        """Return ``(None, num_capsule, dim_capsule)`` regardless of the input length."""
        return (None, self.num_capsule, self.dim_capsule)

In [10]:
class Attention(Layer):
    """Attention pooling over the time axis of a 3-D input.

    Each timestep gets a scalar score ``tanh(x_t . W + b_t)``; the exponentiated
    scores are normalised over time and used to form a weighted sum of the
    timesteps.

    # Arguments
        step_dim: number of timesteps; the scores are reshaped to
            ``(-1, step_dim)``, so it has to equal the input's second dimension.
        W_regularizer, b_regularizer: regularizers for the weight vector / bias.
        W_constraint, b_constraint: constraints for the weight vector / bias.
        bias: whether to add a per-timestep bias to the scores.

    # Input shape
        3-D tensor (asserted in ``build``).
    # Output shape
        ``(batch, features_dim)`` where ``features_dim`` is the input's last dimension.
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        # NOTE(review): this flag is assigned before the base-class constructor runs;
        # depending on the Keras version the base constructor may reset it - confirm.
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        # Placeholder; set to the real feature size in build().
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the weight vector (one value per feature) and the optional bias."""
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            # One bias value per timestep.
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        """Return None: no mask is passed on to downstream layers."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Score per timestep: flatten to (batch*steps, features), multiply by W,
        # then fold back to (batch, steps).
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # Zero the weights of masked timesteps before normalising.
        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # Normalise over time; epsilon guards against a zero sum.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Return ``(batch, features_dim)``."""
        return input_shape[0],  self.features_dim

In [11]:
class CyclicLR(Callback):
    """This callback implements a cyclical learning rate policy (CLR).
    The method cycles the learning rate between two boundaries with
    some constant frequency, as detailed in this paper (https://arxiv.org/abs/1506.01186).
    The amplitude of the cycle can be scaled on a per-iteration or 
    per-cycle basis.
    This class has three built-in policies, as put forth in the paper.
    "triangular":
        A basic triangular cycle w/ no amplitude scaling.
    "triangular2":
        A basic triangular cycle that scales initial amplitude by half each cycle.
    "exp_range":
        A cycle that scales initial amplitude by gamma**(cycle iterations) at each 
        cycle iteration.
    For more detail, please see paper.
    
    # Example
        ```python
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., mode='triangular')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```
    
    Class also supports custom scaling functions:
        ```python
            clr_fn = lambda x: 0.5*(1+np.sin(x*np.pi/2.))
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., scale_fn=clr_fn,
                                scale_mode='cycle')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```    
    # Arguments
        base_lr: initial learning rate which is the
            lower boundary in the cycle.
        max_lr: upper boundary in the cycle. Functionally,
            it defines the cycle amplitude (max_lr - base_lr).
            The lr at any cycle is the sum of base_lr
            and some scaling of the amplitude; therefore 
            max_lr may not actually be reached depending on
            scaling function.
        step_size: number of training iterations per
            half cycle. Authors suggest setting step_size
            2-8 x training iterations in epoch.
        mode: one of {triangular, triangular2, exp_range}.
            Default 'triangular'.
            Values correspond to policies detailed above.
            If scale_fn is not None, this argument is ignored.
        gamma: constant in 'exp_range' scaling function:
            gamma**(cycle iterations)
        scale_fn: Custom scaling policy defined by a single
            argument lambda function, where 
            0 <= scale_fn(x) <= 1 for all x >= 0.
            mode parameter is ignored 
        scale_mode: {'cycle', 'iterations'}.
            Defines whether scale_fn is evaluated on 
            cycle number or cycle iterations (training
            iterations since start of cycle). Default is 'cycle'.

    # Raises
        ValueError: if scale_fn is None and mode is not one of the
            three built-in policies.
    """

    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
                 gamma=1., scale_fn=None, scale_mode='cycle'):
        super(CyclicLR, self).__init__()

        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn is None:
            if self.mode == 'triangular':
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = lambda x: 1/(2.**(x-1))
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = lambda x: gamma**(x)
                self.scale_mode = 'iterations'
            else:
                # Fail at construction instead of with an AttributeError on the
                # first batch, when scale_fn would be found missing.
                raise ValueError(
                    "mode must be 'triangular', 'triangular2' or 'exp_range' "
                    "when scale_fn is None, got %r" % (mode,))
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        # clr_iterations restarts with every _reset(); trn_iterations counts all batches.
        self.clr_iterations = 0.
        self.trn_iterations = 0.
        # Per-batch record of lr, iteration count and the logged metrics.
        self.history = {}

        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None,
               new_step_size=None):
        """Resets cycle iterations.
        Optional boundary/step size adjustment.
        """
        if new_base_lr is not None:
            self.base_lr = new_base_lr
        if new_max_lr is not None:
            self.max_lr = new_max_lr
        if new_step_size is not None:
            self.step_size = new_step_size
        self.clr_iterations = 0.
        
    def clr(self):
        """Return the learning rate for the current value of clr_iterations."""
        cycle = np.floor(1+self.clr_iterations/(2*self.step_size))
        # Position inside the current cycle: 1 at both ends, 0 at the peak.
        x = np.abs(self.clr_iterations/self.step_size - 2*cycle + 1)
        # The amplitude scaling is driven either by the cycle number or by the
        # iteration count, depending on scale_mode.
        scale_arg = cycle if self.scale_mode == 'cycle' else self.clr_iterations
        return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(scale_arg)
        
    def on_train_begin(self, logs=None):
        """Set the optimizer's lr to base_lr, or resume the cycle if already started."""
        if self.clr_iterations == 0:
            K.set_value(self.model.optimizer.lr, self.base_lr)
        else:
            K.set_value(self.model.optimizer.lr, self.clr())        
            
    def on_batch_end(self, epoch, logs=None):
        """Record the batch in ``history`` and set the lr for the next batch.

        The first positional argument is named ``epoch`` but this hook runs at the
        end of every batch; the value is not used.
        """
        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1

        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.trn_iterations)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)
        
        K.set_value(self.model.optimizer.lr, self.clr())

In [12]:
class KMaxPooling(Layer):
    """
    k-max pooling: for every feature channel of a 3-D input keep the k largest
    values found along axis 1 and return them flattened.
    """

    def __init__(self, k=1, **kwargs):
        super().__init__(**kwargs)
        self.input_spec = InputSpec(ndim = 3)
        self.k = k

    def compute_output_shape(self, input_shape):
        # k values are kept for each of the input_shape[2] channels.
        n_channels = input_shape[2]
        return (input_shape[0], n_channels * self.k)

    def call(self, inputs):
        # top_k works on the last axis, so bring the pooled axis there first.
        channels_first = tf.transpose(inputs, [0, 2, 1])

        # Only the values are needed; the indices are discarded.
        largest, _ = tf.nn.top_k(channels_first, k=self.k, sorted=True, name=None)

        # Collapse (channels, k) into a single axis.
        return Flatten()(largest)

In [13]:
from keras.optimizers import Optimizer
from keras import backend as K
import six
import copy
from six.moves import zip
from keras.utils.generic_utils import serialize_keras_object
from keras.utils.generic_utils import deserialize_keras_object
from keras.legacy import interfaces

class AdamW(Optimizer):
    """Adam optimizer with decoupled weight decay (AdamW).

    Identical to the Adam update except that ``eta_t * weight_decay * p`` is
    subtracted from every parameter directly in the update step, where ``eta_t``
    is the current learning rate divided by the learning rate given at construction.
    The six places that differ from plain Adam are marked
    "decoupled weight decay (k/6)".

    # Arguments
        lr: float >= 0. Learning rate.
        beta_1: float, 0 < beta < 1. Generally close to 1.
        beta_2: float, 0 < beta < 1. Generally close to 1.
        epsilon: float >= 0. Fuzz factor.
        decay: float >= 0. Learning rate decay over each update.
        weight_decay: float >= 0. Decoupled weight decay over each update.
    # References
        - [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8)
        - [Optimization for Deep Learning Highlights in 2017](http://ruder.io/deep-learning-optimization-2017/index.html)
        - [Fixing Weight Decay Regularization in Adam](https://arxiv.org/abs/1711.05101)
    """

    def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, weight_decay=1e-4,  # decoupled weight decay (1/6)
                 epsilon=1e-8, decay=0., **kwargs):
        super(AdamW, self).__init__(**kwargs)
        with K.name_scope(self.__class__.__name__):
            self.iterations = K.variable(0, dtype='int64', name='iterations')
            self.lr = K.variable(lr, name='lr')
            # Plain Python snapshot of the constructor lr; denominator of eta_t below.
            self.init_lr = lr # decoupled weight decay (2/6)
            self.beta_1 = K.variable(beta_1, name='beta_1')
            self.beta_2 = K.variable(beta_2, name='beta_2')
            self.decay = K.variable(decay, name='decay')
            self.wd = K.variable(weight_decay, name='weight_decay') # decoupled weight decay (3/6)
        self.epsilon = epsilon
        self.initial_decay = decay

    @interfaces.legacy_get_updates_support
    def get_updates(self, loss, params):
        """Build and return the list of update ops for one optimisation step."""
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]
        wd = self.wd # decoupled weight decay (4/6)

        lr = self.lr
        # Optional time-based decay of the learning rate.
        if self.initial_decay > 0:
            lr *= (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))
        # Ratio of the current lr to the constructor lr; scales the weight decay.
        # NOTE(review): divides by init_lr, so lr=0 at construction would divide by zero.
        eta_t = lr / self.init_lr # decoupled weight decay (5/6)

        # Bias-corrected step size.
        t = K.cast(self.iterations, K.floatx()) + 1
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                     (1. - K.pow(self.beta_1, t)))

        # First- and second-moment accumulators, one pair per parameter.
        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        self.weights = [self.iterations] + ms + vs

        for p, g, m, v in zip(params, grads, ms, vs):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
            # Adam step plus the decay term applied to the parameter itself.
            p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - eta_t * wd * p # decoupled weight decay (6/6)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates

    def get_config(self):
        """Return the optimizer configuration, including ``weight_decay``."""
        config = {'lr': float(K.get_value(self.lr)),
                  'beta_1': float(K.get_value(self.beta_1)),
                  'beta_2': float(K.get_value(self.beta_2)),
                  'decay': float(K.get_value(self.decay)),
                  'weight_decay': float(K.get_value(self.wd)),
                  'epsilon': self.epsilon}
        base_config = super(AdamW, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [14]:
from keras.layers import Wrapper
import keras.backend as K

class DropConnect(Wrapper):
    """Wrapper that applies dropout to the wrapped layer's weights during training.

    # Arguments
        layer: the layer to wrap; ``call`` reads and reassigns its ``kernel`` and
            ``bias`` attributes.
        prob: value passed to ``K.dropout`` as the dropout level. Dropout is only
            applied when ``0 < prob < 1``, so the default of 1. disables it.
    """

    def __init__(self, layer, prob=1., **kwargs):
        self.prob = prob
        self.layer = layer
        super(DropConnect, self).__init__(layer, **kwargs)
        # The layer behaves differently in training and inference only when dropout is active.
        if 0. < self.prob < 1.:
            self.uses_learning_phase = True

    def build(self, input_shape):
        """Build the wrapped layer if necessary, then mark this wrapper as built."""
        if not self.layer.built:
            self.layer.build(input_shape)
            self.layer.built = True
        super(DropConnect, self).build()

    def compute_output_shape(self, input_shape):
        """Delegate to the wrapped layer."""
        return self.layer.compute_output_shape(input_shape)

    def call(self, x):
        """Run the wrapped layer, with dropped-out kernel and bias in the training phase."""
        if 0. < self.prob < 1.:
            # The wrapped layer's attributes are replaced by train-phase-conditional tensors.
            # NOTE(review): assumes the wrapped layer has both `kernel` and `bias`;
            # a layer created without a bias would presumably fail here - confirm.
            self.layer.kernel = K.in_train_phase(K.dropout(self.layer.kernel, self.prob), self.layer.kernel)
            self.layer.bias = K.in_train_phase(K.dropout(self.layer.bias, self.prob), self.layer.bias)
        return self.layer.call(x)

In [15]:
def get_f1(true, val):
    """Find the decision threshold that maximises the F1 score.

    Args:
        true: ground-truth binary labels, passed to ``precision_recall_curve``.
        val: predicted scores for the positive class.

    Returns:
        Tuple ``(best_threshold, best_score)``: the candidate threshold with
        the highest F1 and that F1 value.
    """
    precision, recall, thresholds = precision_recall_curve(true, val)
    # precision/recall hold one more entry than thresholds (the final
    # precision=1, recall=0 point); pad so that indices line up.
    thresholds = np.append(thresholds, 1.001)

    # F1 = 2pr / (p + r). Points where p + r == 0 get F1 = 0 instead of
    # dividing by zero; the previous 2 / (1/p + 1/r) form always hit 1/0 on
    # the final recall=0 point and emitted a RuntimeWarning.
    denom = precision + recall
    F = np.divide(2 * precision * recall, denom,
                  out = np.zeros_like(denom, dtype = float),
                  where = denom > 0)

    best_score = np.max(F)
    best_threshold = thresholds[np.argmax(F)]

    return best_threshold, best_score

In [16]:
def build_lstm_model(units = 40, dr = 0.3, 
                     num_capsules = 10, dim_capsules = 10, 
                     routs = 4, _k = 2,
                     nb_words = nb_words, embedding_matrix = embedding_matrix1):
    """Build and compile the BiLSTM -> Capsule -> BiGRU classifier.

    Args:
        units: number of units of each recurrent layer (per direction).
        dr: rate of the spatial dropout applied to the embeddings.
        num_capsules, dim_capsules, routs: passed to ``Capsule`` as
            ``num_capsule``, ``dim_capsule`` and ``routings``.
        _k: accepted for compatibility; not used by the current architecture.
        nb_words: vocabulary size of the embedding layer.
        embedding_matrix: pre-trained weights loaded into the (frozen)
            embedding layer.

    The defaults of ``nb_words`` and ``embedding_matrix`` are taken from the
    notebook globals at the moment this cell is executed.

    Returns:
        A compiled Keras ``Model`` with a single sigmoid output, trained with
        binary cross-entropy and ``AdamW(weight_decay = 0.5e-4)``.
    """
    def kernel_init():
        return glorot_normal(seed = seed)

    def recurrent_init():
        return orthogonal(gain = 1.0, seed = seed)

    tokens = Input(shape = (max_len, ))

    # Frozen pre-trained embeddings followed by spatial dropout.
    embedded = Embedding(nb_words, embed_size, input_length = max_len,
                         weights = [embedding_matrix], trainable = False)(tokens)
    seq = SpatialDropout1D(dr, seed = seed)(embedded)

    # Recurrent stack: BiLSTM -> Capsule -> BiGRU (the latter also returns its states).
    lstm = CuDNNLSTM(units,
                     kernel_initializer = kernel_init(),
                     recurrent_initializer = recurrent_init(),
                     return_sequences = True)
    seq = Bidirectional(lstm)(seq)
    seq = Capsule(num_capsule = num_capsules,
                  dim_capsule = dim_capsules,
                  routings = routs)(seq)
    gru = CuDNNGRU(units,
                   kernel_initializer = kernel_init(),
                   recurrent_initializer = recurrent_init(),
                   return_sequences = True, return_state = True)
    # Only the first of the two returned states is fed to the classifier head.
    seq, x_h, x_c = Bidirectional(gru)(seq)

    # Sequence summaries. Attention receives num_capsules, presumably as the
    # sequence length produced by the capsule layer (verify against Attention).
    att = Attention(num_capsules)(seq)
    avg_pool = GlobalAveragePooling1D()(seq)
    max_pool = GlobalMaxPooling1D()(seq)

    # Classifier head: DropConnect-wrapped dense layer, ReLU, sigmoid output.
    features = concatenate([x_h, att, avg_pool, max_pool])
    hidden = DropConnect(Dense(256, kernel_initializer = kernel_init()), 0.1)(features)
    hidden = Activation("relu")(hidden)
    out = Dense(1, activation = "sigmoid",
                kernel_initializer = kernel_init())(hidden)

    model = Model(inputs = tokens, outputs = out)
    model.compile(loss = "binary_crossentropy",
                  optimizer = AdamW(weight_decay = 0.5e-4),
                  metrics = None)

    return model

In [17]:
# Cross-validated training: fit one model per stratified fold, keep the
# weights of the epoch with the lowest validation loss, and collect the
# out-of-fold predictions plus the F1-optimal threshold of every fold.
# The hyper-parameters below are globals that later cells read as well.
fold = 5
batch_size = 1024
epochs = 9
units, dr = 64, 0.3
num_capsules = 10
dim_capsules = 10
routs = 3
k = 3  # forwarded to build_lstm_model as `_k`

# Out-of-fold predictions (one row per entry of `target`) and per-fold thresholds.
oof_pred = np.zeros((len(target), 1))
thresholds = [0]*fold

kfold = StratifiedKFold(n_splits = fold, shuffle = True, random_state = seed)
fold_idx = kfold.split(train_seq, target)

for i, (train_idx, val_idx) in enumerate(fold_idx):
    f = i+1  # 1-based fold number, used in the checkpoint file name
    print("-"*70)
    print("Training {}/{} fold".format(i+1, fold))
    
#     train_idx = pd.concat([sincere[:int(n_sincere*(i/fold))], 
#                            sincere[int(n_sincere*(f/fold)):],
#                            insincere[:int(n_insincere*(i/fold))], 
#                            insincere[int(n_insincere*(f/fold)):]]).index
#     val_idx = pd.concat([sincere[int(n_sincere*(i/fold)):int(n_sincere*(f/fold))],
#                          insincere[int(n_insincere*(i/fold)):int(n_insincere*(f/fold))]]).index
    
    X_train, y_train = train_seq[train_idx], target[train_idx]
    X_val, y_val = train_seq[val_idx], target[val_idx]
    
#     test_num = test[["total_length", "num_words"]]
#     train_num, val_num, test_num = scale_num_features(train_num, val_num, test_num)
    
    # Shuffle the training part of the fold with a fixed seed. `train_idx` is
    # reused here: from this point on it holds positions within X_train, not
    # indices into train_seq. `val_idx` is left untouched.
    np.random.seed(seed)
    train_idx = np.random.permutation(len(X_train))
#     val_idx = np.random.permutation(len(X_val))

    X_train = X_train[train_idx]
#     X_val = X_val[val_idx]
#     train_num = train_num[train_idx]
    y_train = y_train[train_idx]
#     y_val = y_val[val_idx]

    # Save weights only, and only when the validation loss improves.
    best_model = "fold_{}_best_model.h5".format(f)
    check_point = ModelCheckpoint(best_model, monitor = "val_loss", mode = "min",
                                  save_best_only = True, 
                                  save_weights_only = True,
                                  verbose = 1)
    # Learning-rate schedule between base_lr and max_lr; step_size is four
    # times the number of full batches in this fold's training set.
    clr = CyclicLR(base_lr = 0.0008, max_lr = 0.0025,
                   step_size = 4*int(len(X_train)/batch_size), 
                   mode = "exp_range",
                   gamma = 0.99994)

    # Start every fold from a fresh backend graph/session.
    # NOTE(review): this presumably also discards the seeded, thread-limited
    # session installed in the first cell - confirm whether the TensorFlow
    # seed should be re-applied here for reproducibility.
    K.clear_session()
    model = build_lstm_model(units = units, dr = dr, 
                             num_capsules = num_capsules, dim_capsules = dim_capsules, 
                             routs = routs, _k = k,
                             nb_words = nb_words, embedding_matrix = embedding_matrix1)
    
    lstm_hist = model.fit(X_train, y_train, batch_size = batch_size, epochs = epochs, 
                          validation_data = (X_val, y_val), 
                          callbacks = [check_point, clr],
                          verbose = 2)

    # Score the validation part with the best checkpoint, not the last epoch.
    model.load_weights(best_model)
    val_pred = model.predict(X_val, batch_size = batch_size, verbose = 2)
    threshold, score = get_f1(y_val, val_pred)
    # val_idx still indexes the full training set, so rows line up with `target`.
    oof_pred[val_idx] = val_pred
    thresholds[i] = threshold
    
    print("F1 score at threshold {} is {}.\n".format(threshold, score))
    del model; gc.collect()

----------------------------------------------------------------------
Training 1/5 fold
Train on 1044897 samples, validate on 261225 samples
Epoch 1/9
 - 136s - loss: 0.1311 - val_loss: 0.1114

Epoch 00001: val_loss improved from inf to 0.11142, saving model to fold_1_best_model.h5
Epoch 2/9
 - 131s - loss: 0.1093 - val_loss: 0.1036

Epoch 00002: val_loss improved from 0.11142 to 0.10363, saving model to fold_1_best_model.h5
Epoch 3/9
 - 131s - loss: 0.1040 - val_loss: 0.1019

Epoch 00003: val_loss improved from 0.10363 to 0.10191, saving model to fold_1_best_model.h5
Epoch 4/9
 - 131s - loss: 0.1009 - val_loss: 0.0991

Epoch 00004: val_loss improved from 0.10191 to 0.09906, saving model to fold_1_best_model.h5
Epoch 5/9
 - 131s - loss: 0.0979 - val_loss: 0.0982

Epoch 00005: val_loss improved from 0.09906 to 0.09815, saving model to fold_1_best_model.h5
Epoch 6/9
 - 131s - loss: 0.0949 - val_loss: 0.0981

Epoch 00006: val_loss improved from 0.09815 to 0.09806, saving model to fold_1_

  after removing the cwd from sys.path.


F1 score at threshold 0.3771718442440033 is 0.6931580172213172.

----------------------------------------------------------------------
Training 2/5 fold
Train on 1044897 samples, validate on 261225 samples
Epoch 1/9
 - 133s - loss: 0.1313 - val_loss: 0.1093

Epoch 00001: val_loss improved from inf to 0.10927, saving model to fold_2_best_model.h5
Epoch 2/9
 - 131s - loss: 0.1090 - val_loss: 0.1036

Epoch 00002: val_loss improved from 0.10927 to 0.10357, saving model to fold_2_best_model.h5
Epoch 3/9
 - 131s - loss: 0.1042 - val_loss: 0.1010

Epoch 00003: val_loss improved from 0.10357 to 0.10095, saving model to fold_2_best_model.h5
Epoch 4/9
 - 131s - loss: 0.1007 - val_loss: 0.0996

Epoch 00004: val_loss improved from 0.10095 to 0.09965, saving model to fold_2_best_model.h5
Epoch 5/9
 - 131s - loss: 0.0976 - val_loss: 0.0989

Epoch 00005: val_loss improved from 0.09965 to 0.09893, saving model to fold_2_best_model.h5
Epoch 6/9
 - 131s - loss: 0.0946 - val_loss: 0.0978

Epoch 00006: v

In [18]:
# Show the per-fold thresholds, then choose a single threshold on the pooled
# out-of-fold predictions; `threshold` is the cut-off used for the test set.
fold_threshold_report = "Thresholds of each fold: \n{}".format(thresholds)
print(fold_threshold_report)

threshold, score = get_f1(true = target, val = oof_pred)
oof_report = "F1 score at threshold {:.4f} is {:.4f}".format(threshold, score)
print(oof_report)

Thresholds of each fold: 
[0.3771718442440033, 0.3656523823738098, 0.35094016790390015, 0.36642345786094666, 0.39201632142066956]
F1 score at threshold 0.3659 is 0.6918


  after removing the cwd from sys.path.


In [19]:
# Predict the test set with the best checkpoint of every fold, average the
# probabilities and binarise them with the out-of-fold threshold.
test = pd.read_csv("../input/test.csv").fillna("missing")

num_test = test.shape[0]
pred = np.zeros((num_test, 1))

# Text clean-up before tokenizing; presumably mirrors the preprocessing
# applied to the training text (confirm against the training cells).
test["question_text"] = test["question_text"].apply(replace_typical_misspell)
test["question_text"] = test["question_text"].apply(space_punct)

test_token = tokenizer.texts_to_sequences(test["question_text"])
X_test = pad_sequences(test_token, maxlen = max_len)
del test_token; gc.collect()

# Iterate over exactly the `fold` checkpoints that are averaged below, rather
# than a hard-coded count.
for f in range(1, fold + 1):
    # As in the training loop: fresh graph per model, freed after use, so the
    # models do not pile up in a single graph.
    K.clear_session()
    model = build_lstm_model(units = units, dr = dr, 
                             num_capsules = num_capsules, dim_capsules = dim_capsules, 
                             routs = routs, _k = k,
                             nb_words = nb_words, embedding_matrix = embedding_matrix1)
    model.load_weights("fold_{}_best_model.h5".format(f))
    pred += model.predict(X_test, batch_size = batch_size, verbose = 2)
    del model; gc.collect()

pred = pred/fold
pred = (pred > threshold).astype(int)

In [20]:
# Attach the binary predictions, write the two-column submission file and
# display its first rows.
submission_cols = ["qid", "prediction"]
test["prediction"] = pred
test[submission_cols].to_csv("submission.csv", index = False)
test[submission_cols].head()

Unnamed: 0,qid,prediction
0,00014894849d00ba98a9,0
1,000156468431f09b3cae,0
2,000227734433360e1aae,0
3,0005e06fbe3045bd2a92,0
4,00068a0f7f41f50fc399,0


In [21]:
# Wall-clock seconds elapsed since `start` was taken in the first cell.
end = time.time()
elapsed = end - start
print("Train and Predict completed in {}".format(elapsed))

Train and Predict completed in 6331.5123999118805
