# Quora Toxic Questions Competition
by Luis Andrade

**Background**

Quora is a platform that empowers people to learn from each other. On Quora, people can ask questions and connect with others who contribute unique insights and quality answers. A key challenge is to weed out insincere questions -- those founded upon false premises, or that intend to make a statement rather than look for helpful answers.

**Goal**

The goal of this competition was to develop models that identify and flag insincere questions. 

**Keywords**
* NLP
* Word embeddings
* Deep Learning
* Binary classification

**Notebook Structure**
1. Data loading and preprocessing
2. Embeddings creation
3. Model creation
4. Training & evaluation
5. Submission

In [3]:
import os
import time
import numpy as np 
import pandas as pd 
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalAveragePooling1D
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, concatenate
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.optimizers import Adam
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, optimizers, layers

%matplotlib inline

In [4]:
# Show the competition files and the pre-trained embedding folders that are available
for input_dir in ("../input", "../input/embeddings"):
    print(os.listdir(input_dir))


['train.csv', 'test.csv', 'sample_submission.csv', 'embeddings']
['GoogleNews-vectors-negative300', 'wiki-news-300d-1M', 'paragram_300_sl999', 'glove.840B.300d']


In [5]:
# Global hyper-parameters read by the preprocessing, embedding and model cells below
embed_size = 300 # dimensionality of each word vector
max_features = 120000 # vocabulary cap: how many unique words to keep (i.e. number of rows in the embedding matrix)
maxlen = 70 # number of tokens every question is padded/truncated to

## Data loading & preprocessing

In [18]:
# Punctuation marks and symbols that clean_text isolates as separate, space-delimited tokens.
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

# Built once at definition time: maps every entry of `puncts` to itself wrapped
# in single spaces. Re-run this cell if `puncts` is edited, otherwise the table
# goes stale.
_PUNCT_TABLE = str.maketrans({p: f' {p} ' for p in puncts})


def clean_text(x):
    """Return `x` with every character listed in `puncts` surrounded by spaces.

    The padding is done in a single pass with ``str.translate`` instead of one
    ``str.replace`` scan per punctuation mark. Because every entry of `puncts`
    is a distinct single character, the result is identical to the chained
    replacements.

    Args:
        x: the text to clean (a ``str``).

    Returns:
        A new string; characters not in `puncts` are left untouched.
    """
    return x.translate(_PUNCT_TABLE)

In [7]:
def load_and_prec():
    """Load the competition CSVs and convert the questions to padded id matrices.

    Pipeline: lowercase the question text, space-pad punctuation with
    ``clean_text``, hold out 0.1% of the training rows for validation, fit a
    Keras ``Tokenizer`` (capped at ``max_features`` words) on the training
    questions only, convert every split to integer sequences padded/truncated
    to ``maxlen``, then shuffle the training and validation splits.

    Side effects: prints the frame shapes and reseeds NumPy's global RNG with
    2018.

    Returns:
        Tuple ``(train_X, val_X, test_X, train_y, val_y, word_index)`` where the
        ``*_X`` values are the padded sequence arrays, ``*_y`` the target
        arrays and ``word_index`` the tokenizer's word -> index mapping.
    """
    train_df = pd.read_csv("../input/train.csv")
    test_df = pd.read_csv("../input/test.csv")

    for df in (train_df, test_df):
        lowered = df["question_text"].str.lower()
        # na_action="ignore" lets missing questions pass through untouched;
        # calling clean_text on NaN would raise TypeError before the fillna
        # below ever had a chance to run.
        df["question_text"] = lowered.map(clean_text, na_action="ignore")

    print("Train shape : ",train_df.shape)
    print("Test shape : ",test_df.shape)

    ## split to train and val (only 0.1% of the rows are held out)
    train_df, val_df = train_test_split(train_df, test_size=0.001, random_state=2018)

    ## fill up the missing values
    train_X = train_df["question_text"].fillna("_##_").values
    val_X = val_df["question_text"].fillna("_##_").values
    test_X = test_df["question_text"].fillna("_##_").values

    ## Tokenize the sentences (vocabulary is learned from the training split only)
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(train_X))
    train_X = tokenizer.texts_to_sequences(train_X)
    val_X = tokenizer.texts_to_sequences(val_X)
    test_X = tokenizer.texts_to_sequences(test_X)

    ## Pad the sentences
    train_X = pad_sequences(train_X, maxlen=maxlen)
    val_X = pad_sequences(val_X, maxlen=maxlen)
    test_X = pad_sequences(test_X, maxlen=maxlen)

    ## Get the target values
    train_y = train_df['target'].values
    val_y = val_df['target'].values

    # Shuffle features and targets with the same permutation. The seed and the
    # order of the two permutation draws are kept as-is because they determine
    # the global RNG state seen by later np.random calls.
    np.random.seed(2018)
    trn_idx = np.random.permutation(len(train_X))
    val_idx = np.random.permutation(len(val_X))

    train_X = train_X[trn_idx]
    val_X = val_X[val_idx]
    train_y = train_y[trn_idx]
    val_y = val_y[val_idx]

    return train_X, val_X, test_X, train_y, val_y, tokenizer.word_index

## Embedding Creation

In [8]:
def get_embeddings(embeddings_index, word_index, name, max_words=None):
    """Build the embedding matrix for the words in `word_index`.

    Rows are first drawn from a normal distribution with the same mean and
    standard deviation as the pre-trained vectors; every word found in
    `embeddings_index` then has its row overwritten with the pre-trained vector.

    Args:
        embeddings_index: mapping word -> 1-D vector (all of equal length).
        word_index: mapping word -> integer row index.
        name: label of the embedding source; currently unused.
        max_words: upper bound on the number of rows. Defaults to the
            module-level ``max_features`` when ``None``.

    Returns:
        Array of shape ``(min(max_words, len(word_index) + 1), vector_length)``.
        NOTE(review): the models pass ``max_features`` as the Embedding input
        dimension, so a vocabulary smaller than the cap would still need
        padding there.
    """
    if max_words is None:
        max_words = max_features

    # np.stack needs a real sequence; a dict_values view is rejected by newer NumPy.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    # +1 because the largest index in word_index can equal len(word_index)
    # (assumes 1-based indices with 0 reserved, as produced by Keras' Tokenizer),
    # so len(word_index) rows alone would overflow for a small vocabulary.
    nb_words = min(max_words, len(word_index) + 1)
    # Warm-start embedding matrix with normal distribution with same mean
    # and standard deviation as the pre-trained embeddings
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

    for word, i in word_index.items():
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

In [9]:
def load_glove(word_index):
    """Build the embedding matrix for `word_index` from the GloVe 840B 300d vectors.

    Args:
        word_index: mapping word -> integer row index.

    Returns:
        The matrix produced by ``get_embeddings`` for the GloVe vectors.
    """
    EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'

    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    # The context manager closes the file once parsed; the explicit encoding
    # removes the dependence on the platform default.
    with open(EMBEDDING_FILE, encoding="utf8") as f:
        # Each line is "<word> <v1> <v2> ..." separated by single spaces.
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in f)

    return get_embeddings(embeddings_index, word_index, "glove")


In [10]:
def load_fasttext(word_index):
    """Build the embedding matrix for `word_index` from the wiki-news 300d vectors.

    Args:
        word_index: mapping word -> integer row index.

    Returns:
        The matrix produced by ``get_embeddings`` for the fastText vectors.
    """
    EMBEDDING_FILE = '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'

    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    # The context manager closes the file once parsed; the explicit encoding
    # removes the dependence on the platform default.
    with open(EMBEDDING_FILE, encoding="utf8") as f:
        # Lines of 100 characters or fewer are skipped (presumably the short
        # header line of the .vec format).
        embeddings_index = dict(get_coefs(*o.split(" "))
                                for o in f if len(o) > 100)

    return get_embeddings(embeddings_index, word_index, "fasttext")

In [11]:
def load_para(word_index):
    """Build the embedding matrix for `word_index` from the Paragram SL999 vectors.

    Args:
        word_index: mapping word -> integer row index.

    Returns:
        The matrix produced by ``get_embeddings`` for the Paragram vectors.
    """
    EMBEDDING_FILE = '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'

    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    # Undecodable bytes are dropped (errors='ignore'); the context manager
    # closes the file once it has been parsed.
    with open(EMBEDDING_FILE, encoding="utf8", errors='ignore') as f:
        # Lines of 100 characters or fewer are skipped.
        embeddings_index = dict(get_coefs(*o.split(" "))
                                for o in f if len(o) > 100)

    return get_embeddings(embeddings_index, word_index, "para")

## Model Creation

In this section, different deep learning models are created.

In [12]:
def model_cnn(embedding_matrix):
    """Build and compile the multi-kernel text CNN.

    The embedded question is treated as a (maxlen, embed_size, 1) image. One
    Conv2D branch per kernel height spans the full embedding width and is
    max-pooled over all positions; the branches are concatenated, flattened,
    passed through dropout and fed to a sigmoid unit.

    Args:
        embedding_matrix: initial weights for the Embedding layer.

    Returns:
        The compiled Keras model (binary cross-entropy loss, Adam optimizer).
    """
    kernel_heights = (1, 2, 3, 5)
    filters_per_kernel = 36

    tokens = Input(shape=(maxlen,))
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix])(tokens)
    embedded = Reshape((maxlen, embed_size, 1))(embedded)

    def conv_branch(height):
        # Convolve `height` consecutive words, then keep the strongest response
        # of each filter over the whole question.
        feature_map = Conv2D(filters_per_kernel,
                             kernel_size=(height, embed_size),
                             kernel_initializer='he_normal',
                             activation='relu')(embedded)
        return MaxPool2D(pool_size=(maxlen - height + 1, 1))(feature_map)

    pooled_branches = [conv_branch(height) for height in kernel_heights]

    features = Concatenate(axis=1)(pooled_branches)
    features = Flatten()(features)
    features = Dropout(0.1)(features)

    prediction = Dense(1, activation="sigmoid")(features)

    cnn = Model(inputs=tokens, outputs=prediction)
    cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return cnn

In [13]:
class Attention(Layer):
    """Attention pooling over the time axis of a 3-D sequence tensor.

    For an input of shape (batch, steps, features) the layer scores every step
    with a learned vector ``W`` (plus an optional per-step bias ``b``), squashes
    the scores with tanh, normalises their exponentials over the steps and
    returns the weighted sum of the step vectors, shape (batch, features).

    Args:
        step_dim: number of time steps in the input; the scores are reshaped to
            (-1, step_dim), so it must match the input's second dimension.
        W_regularizer, b_regularizer: optional regularizers for ``W`` / ``b``.
        W_constraint, b_constraint: optional constraints for ``W`` / ``b``.
        bias: whether to add the per-step bias to the scores.
        **kwargs: forwarded to ``Layer.__init__``.
    """
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        # Set to the real feature size in build().
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create ``W`` (one weight per feature) and, if enabled, ``b`` (one per step)."""
        assert len(input_shape) == 3

        # shape is passed by keyword so the call does not depend on the legacy
        # positional-shape form of add_weight.
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # The time axis is summed away, so no mask is propagated downstream.
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over its time axis."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Score every step: flatten to (batch*steps, features), project onto W,
        # then fold back to (batch, steps).
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # Zero the weight of masked steps before normalising.
        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # epsilon guards against division by zero when every step is masked.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0],  self.features_dim

    def get_config(self):
        """Return the constructor arguments so a saved model can rebuild the layer."""
        config = {
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(Attention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [14]:
def model_lstm_atten(embedding_matrix):
    """Build and compile two stacked bidirectional LSTMs with attention pooling.

    The Embedding layer is initialised from `embedding_matrix` and frozen
    (``trainable=False``). Its output runs through bidirectional CuDNNLSTM
    layers of 128 and 64 units, is pooled by ``Attention`` over the ``maxlen``
    steps, and passes a 64-unit ReLU layer before the sigmoid output.

    Args:
        embedding_matrix: weights for the frozen Embedding layer.

    Returns:
        The compiled Keras model (binary cross-entropy loss, Adam optimizer).
    """
    tokens = Input(shape=(maxlen,))
    frozen_embedding = Embedding(max_features, embed_size,
                                 weights=[embedding_matrix], trainable=False)
    hidden = frozen_embedding(tokens)

    for units in (128, 64):
        hidden = Bidirectional(CuDNNLSTM(units, return_sequences=True))(hidden)

    pooled = Attention(maxlen)(hidden)
    dense = Dense(64, activation="relu")(pooled)
    probability = Dense(1, activation="sigmoid")(dense)

    network = Model(inputs=tokens, outputs=probability)
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return network

In [15]:
def model_lstm_du(embedding_matrix):
    """Build and compile a bidirectional GRU with average + max pooling.

    A single bidirectional CuDNNGRU (64 units) reads the embedded question. Its
    per-step outputs are summarised by global average pooling and global max
    pooling, the two summaries are concatenated and passed through a 64-unit
    ReLU layer with dropout before the sigmoid output.

    Args:
        embedding_matrix: initial weights for the Embedding layer.

    Returns:
        The compiled Keras model (binary cross-entropy loss, Adam optimizer).
    """
    tokens = Input(shape=(maxlen,))
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix])(tokens)
    states = Bidirectional(CuDNNGRU(64, return_sequences=True))(embedded)

    mean_summary = GlobalAveragePooling1D()(states)
    peak_summary = GlobalMaxPool1D()(states)
    merged = concatenate([mean_summary, peak_summary])

    hidden = Dense(64, activation="relu")(merged)
    hidden = Dropout(0.1)(hidden)
    probability = Dense(1, activation="sigmoid")(hidden)

    network = Model(inputs=tokens, outputs=probability)
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

## Training & Evaluation

In [20]:
def train_pred(model, epochs=2, verbose=0):
    """Fit `model` on the training split and predict on validation and test.

    Reads the module-level arrays ``train_X``, ``train_y``, ``val_X``,
    ``val_y`` and ``test_X``.

    Args:
        model: a compiled Keras model.
        epochs: number of training epochs.
        verbose: verbosity forwarded to ``model.predict`` only; ``model.fit``
            runs with its default verbosity.

    Returns:
        Tuple ``(pred_val_y, pred_test_y)`` with the model's predictions for
        the validation and test inputs.
    """
    model.fit(train_X, train_y, batch_size=512, epochs=epochs,
              validation_data=(val_X, val_y))
    pred_val_y = model.predict([val_X], batch_size=1024, verbose=verbose)
    pred_test_y = model.predict([test_X], batch_size=1024, verbose=verbose)

    # The per-epoch accuracy/loss curves are available on model.history.history
    # after fit() if plotting is needed.
    return pred_val_y, pred_test_y

In [17]:
# Build the padded train/validation/test matrices, the targets and the tokenizer vocabulary
train_X, val_X, test_X, train_y, val_y, word_index = load_and_prec()
# Embedding matrix for word_index built from the GloVe 840B vectors
emb_matrix_glove = load_glove(word_index)
print("Done with glove")
# Disabled: blend GloVe with Paragram by averaging the two matrices element-wise
#emb_matrix_para = load_para(word_index)
#print("Done with paragram")
#embedding_matrix = np.mean([emb_matrix_glove, emb_matrix_para], axis = 0)
#print("Finished embedding matrix")

Train shape :  (1306122, 3)
Test shape :  (375806, 2)


  This is separate from the ipykernel package so we can avoid doing imports until


Done with glove
Done with paragram
Finished embedding matrix


In [21]:
# Maps a model label to its (validation predictions, test predictions) pair
outputs = {}

# NOTE(review): all three models receive the GloVe matrix, so the "cnn para"
# label below looks stale; the labels are only used as dictionary keys.
model1 = model_cnn(emb_matrix_glove)
pred_val_y, pred_test_y = train_pred(model1, epochs=3, verbose=3)
outputs["cnn para"] = (pred_val_y, pred_test_y)

model2 = model_lstm_atten(emb_matrix_glove)
pred_val_y, pred_test_y = train_pred(model2, epochs=3, verbose=3)
outputs["bilstm glove"] = (pred_val_y, pred_test_y)

# pred_val_y / pred_test_y are rebound on every call, so after this cell they
# hold the predictions of model3 only.
model3 = model_lstm_du(emb_matrix_glove)
pred_val_y, pred_test_y = train_pred(model3, epochs=3, verbose=3)
outputs["lstm maxpool mix"] = (pred_val_y, pred_test_y)

Train on 1304815 samples, validate on 1307 samples
Epoch 1/1
Train on 1304815 samples, validate on 1307 samples
Epoch 1/1
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 1304815 samples, validate on 1307 samples
Epoch 1/1


In [22]:
# "Stack" all model predictions by averaging over their outputs
# Element-wise mean of every model's validation predictions
val_predictions = [val_pred for val_pred, _ in outputs.values()]
pred_val_y = np.mean(val_predictions, axis=0)


In [23]:
def find_best_threshold(pred_val_y, val_y):
    """Return the decision threshold with the highest F1 score on the validation data.

    Candidate thresholds run from 0.1 to 0.5 in steps of 0.01 (rounded to two
    decimals). The F1 score of each candidate is printed; on ties the lowest
    threshold wins.

    Args:
        pred_val_y: predicted scores for the validation questions.
        val_y: true 0/1 targets for the validation questions.

    Returns:
        The best-scoring threshold.
    """
    best_thresh, best_score = None, None
    for raw in np.arange(0.1, 0.501, 0.01):
        candidate = np.round(raw, 2)
        score = metrics.f1_score(val_y, (pred_val_y > candidate).astype(int))
        print("F1 score at threshold {0} is {1}".format(candidate, score))
        # Strict comparison keeps the first (lowest) threshold among equal scores.
        if best_score is None or score > best_score:
            best_thresh, best_score = candidate, score

    print("Best threshold: ", best_thresh)
    return best_thresh

In [24]:
# Find the best threshold on the averaged validation predictions
best_thresh = find_best_threshold(pred_val_y, val_y)
# Average the test predictions of every model as well: the threshold was tuned
# on the validation ensemble, so it has to be applied to the matching test
# ensemble rather than to the last trained model's predictions alone.
pred_test_y = np.mean([output[1] for output in outputs.values()], axis=0)
pred_test_y = (pred_test_y > best_thresh).astype(int)


F1 score at threshold 0.1 is 0.6135458167330677
F1 score at threshold 0.11 is 0.6178861788617885
F1 score at threshold 0.12 is 0.6229508196721312
F1 score at threshold 0.13 is 0.6244725738396625
F1 score at threshold 0.14 is 0.6324786324786326
F1 score at threshold 0.15 is 0.6406926406926408
F1 score at threshold 0.16 is 0.6371681415929205
F1 score at threshold 0.17 is 0.6371681415929205
F1 score at threshold 0.18 is 0.6457399103139013
F1 score at threshold 0.19 is 0.6425339366515838
F1 score at threshold 0.2 is 0.6481481481481481
F1 score at threshold 0.21 is 0.6635071090047393
F1 score at threshold 0.22 is 0.6571428571428573
F1 score at threshold 0.23 is 0.6570048309178744
F1 score at threshold 0.24 is 0.6601941747572816
F1 score at threshold 0.25 is 0.6699507389162562
F1 score at threshold 0.26 is 0.6799999999999999
F1 score at threshold 0.27 is 0.6868686868686867
F1 score at threshold 0.28 is 0.6938775510204082
F1 score at threshold 0.29 is 0.6871794871794871
F1 score at threshold 

## Submission

In [25]:
# Re-read only the question ids; the test rows are never reordered during
# preprocessing, so the predictions line up with the file's row order.
test_df = pd.read_csv("../input/test.csv", usecols=["qid"])
out_df = pd.DataFrame({"qid":test_df["qid"].values})
# pred_test_y holds the 0/1 labels obtained by thresholding in the previous cell
out_df['prediction'] = pred_test_y
out_df.to_csv("submission.csv", index=False)