# Load packages

In [1]:
%matplotlib inline
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import numpy as np
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Dense, Input, LSTM, Embedding, Bidirectional, GlobalMaxPool1D, Activation, CuDNNLSTM
from keras.layers.normalization import BatchNormalization
import gc
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import accuracy_score
## Parameters
maxwords_question = 100
maxlen_word = 300 
maxwords_vocabulary = 50000 

Using TensorFlow backend.


# Load and preprocess data

In [2]:
# Load data
data_df = pd.read_csv("../input/train.csv")

# Split into train and test
train_df, test_df = train_test_split(data_df, test_size = 0.1, random_state=55)

## Lower case all alphabets
train_df["processed_text"] = train_df["question_text"].apply(lambda x: x.lower())
test_df["processed_text"] = test_df["question_text"].apply(lambda x: x.lower())

## Tokenizer -- turns each question into a sequence of integers (each integer being the index of a token in a dictionary)
tokenizer = Tokenizer(num_words = maxwords_vocabulary)
tokenizer.fit_on_texts(list(train_df["processed_text"]) + list(test_df["processed_text"]))

# Tokenize train and test data
train_df_tokens = tokenizer.texts_to_sequences(train_df["processed_text"])
train_df_labels = train_df["target"].values
test_df_tokens = tokenizer.texts_to_sequences(test_df["processed_text"])

## Pad the tokenized sequences to length maxwords_question
train_df_tokens = pad_sequences(train_df_tokens, maxlen = maxwords_question)
test_df_tokens = pad_sequences(test_df_tokens, maxlen = maxwords_question)


# Create validation dataset 

In [None]:
## Create and tokenize validation dataset
train_df2, val_df2 = train_test_split(train_df, test_size=0.05, random_state=42)
print("======Train data========")
print(train_df2.target.value_counts())
train_df2_tokens = tokenizer.texts_to_sequences(train_df2["processed_text"])
train_df2_labels = train_df2["target"].values
print("====Validation data=======")
print(val_df2.target.value_counts())
val_df2_tokens = tokenizer.texts_to_sequences(val_df2["processed_text"])
val_df2_labels = val_df2["target"].values
val_df2.head()

## Pad the tokenized sequences to length maxwords_question
train_df2_tokens = pad_sequences(train_df2_tokens, maxlen = maxwords_question)
val_df2_tokens = pad_sequences(val_df2_tokens, maxlen = maxwords_question)

# Load GloVe Embeddings

In [3]:
## Load GloVe embeddings from disk
embeddingGlovePath = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
def split_key_value(item): 
    key, *value = item.split(" ")
    return key, np.asarray(value, dtype='float32')
embeddings_dict = {}
for item in open(embeddingGlovePath):
    key, value = split_key_value(item)
    embeddings_dict[key] = value

## Make embeddings matrix from the loaded GloVe embeddings
word_index = tokenizer.word_index
glove_embs = np.stack(embeddings_dict.values())
glove_embs_mean, glove_embs_std = glove_embs.mean(), glove_embs.std()
glove_embs_size = glove_embs.shape[1]
del glove_embs
glove_embs_matrix = np.random.normal(glove_embs_mean, glove_embs_std, (maxwords_vocabulary, glove_embs_size))
for word, idx in word_index.items():
    if idx >= maxwords_vocabulary: 
        continue
    if word in embeddings_dict.keys():
        word_emb_vector = embeddings_dict[word]
    if word_emb_vector is not None: 
        glove_embs_matrix[idx] = word_emb_vector
del embeddings_dict
gc.collect()

0

# Bi-directional LSTM

In [4]:
inp = Input(shape = (maxwords_question,) )
x = Embedding(maxwords_vocabulary, maxlen_word, weights=[glove_embs_matrix])(inp)
x = Bidirectional(CuDNNLSTM(128, return_sequences=False))(x)
x = Dense(64, activation="relu")(x)
x = Dense(8, activation="relu")(x)
x = Dense(1, activation="sigmoid")(x)
model = Model(inputs = inp, outputs = x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 300)          15000000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               440320    
_________________________________________________________________
dense_1 (Dense)              (None, 64)                16448     
_________________________________________________________________
dense_2 (Dense)              (None, 8)                 520       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 9         
Total params: 15,457,297
Trainable params: 15,457,297
Non-trainable params: 0
________________________________________________________________

# Train the model

In [5]:
model.fit(train_df_tokens, train_df_labels, batch_size = 256, epochs=2, shuffle = True)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fe2c4bce828>

# Predict test labels

In [6]:
test_pred = model.predict([test_df_tokens], batch_size=512, verbose=1) > 0.35
print('Mean Accuracy (test data): ', accuracy_score(test_pred, test_df.target))
print('F1 score insincere questions (test data): ',f1_score(test_pred, test_df.target, pos_label=1))

Mean Accuracy (test data):  0.9576841508885027
F1 score insincere questions (test data):  0.676348304737366
