In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate everything under the Kaggle input directory, printing one full path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# Any results you write to the current directory are saved as output.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

#df_train.columns

In [None]:
# Read the competition's train and test CSVs from the Kaggle input directory.
train_csv = "../input/quora-insincere-questions-classification/train.csv"
test_csv = "../input/quora-insincere-questions-classification/test.csv"
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

# Quick sanity check on the (rows, columns) of each frame.
print("Training Data Shape", df_train.shape)
print("Testing Data Shape", df_test.shape)

In [None]:
# Hold out 10% of the training rows for validation (seeded, so the split is repeatable).
# NOTE(review): this rebinds df_train, so re-running the cell without reloading the CSV
# splits the already-reduced frame a second time.
df_train, df_val = train_test_split(df_train, test_size = 0.1, random_state = 2020)

# Parameters
embed_size = 300      # second argument of the Embedding layers (vector width)
max_features = 50000  # first argument of the Embedding layers (vocabulary size)
max_length = 100      # maxlen passed to pad_sequences


def _question_texts(frame):
    """Return the question_text column as an array, with missing entries set to "na_vals"."""
    return frame['question_text'].fillna("na_vals").values


# Missing values
x_train = _question_texts(df_train)
x_val = _question_texts(df_val)
x_test = _question_texts(df_test)

# Labels exist only for the train/validation frames.
y_train = df_train['target']
y_val = df_val['target']

In [None]:
# Number of training questions left after the validation hold-out.
len(x_train)

In [None]:
# Preprocessing text
# Cap the vocabulary at max_features. With num_words set, texts_to_sequences only emits
# indices below max_features, which is the range Embedding(max_features, ...) can look up.
# An uncapped Tokenizer numbers every distinct training word, so its indices can fall
# outside the embedding table.
tokenizer = Tokenizer(num_words = max_features)
tokenizer.fit_on_texts(x_train)  # vocabulary is built from the training split only
sequences_train = tokenizer.texts_to_sequences(x_train)
sequences_val = tokenizer.texts_to_sequences(x_val)
sequences_test = tokenizer.texts_to_sequences(x_test)

# Padding: bring every sequence to exactly max_length token ids.
padded_train = pad_sequences(sequences_train, maxlen = max_length)
padded_val = pad_sequences(sequences_val, maxlen = max_length)
padded_test = pad_sequences(sequences_test, maxlen = max_length)

In [None]:
# Baseline: no external embeddings (such as Word2Vec or GloVe); the embedding layer is
# learned from scratch on the vocabulary of the training dataset.
import keras

model = keras.Sequential([
    keras.layers.Embedding(max_features, embed_size),
    # Average the token embeddings over the sequence axis to get one vector per question.
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(32, activation = 'relu'),
    keras.layers.Dense(8, activation = 'relu'),
    # Single sigmoid unit, paired with the binary cross-entropy loss below.
    keras.layers.Dense(1, activation = 'sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# would append a stray "None" to the output.
model.summary()

In [None]:
# Train the baseline model; the returned history holds the per-epoch metrics used for plotting.
num_epochs = 10
history = model.fit(
    padded_train,
    y_train,
    batch_size = 512,
    epochs = num_epochs,
    validation_data = (padded_val, y_val),
)

In [None]:
import matplotlib.pyplot as plt


def plot_graphs(history, string):
  """Plot a training metric and its validation counterpart against epochs.

  Args:
    history: object whose ``history`` attribute maps metric names to
      per-epoch value lists (here, the value returned by ``model.fit``).
    string: metric name to plot; the ``'val_' + string`` series is drawn
      on the same axes.
  """
  val_key = 'val_' + string
  for key in (string, val_key):
    plt.plot(history.history[key])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, val_key])
  plt.show()
  
# Training vs. validation curves from the baseline model's fit history.
plot_graphs(history, "accuracy")
plot_graphs(history, "loss")

# Bidirectional GRU Model
from keras.layers import CuDNNGRU

# Same embedding front-end and dense head shape as the baseline, with a bidirectional
# GRU inserted before pooling.
modelGRU = keras.Sequential([
    keras.layers.Embedding(max_features, embed_size),
    # return_sequences=True keeps one output per timestep so the pooling layer
    # below has a sequence to average over.
    keras.layers.Bidirectional(CuDNNGRU(64, return_sequences = True)),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(16, activation = 'relu'),
    keras.layers.Dense(8, activation = 'relu'),
    keras.layers.Dense(1, activation = 'sigmoid'),
])
modelGRU.compile(loss='binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
# Summarise the GRU model itself, not the baseline `model`. summary() prints the
# table directly and returns None, so it is not wrapped in print().
modelGRU.summary()

Embeddings Available
* GoogleNews-vectors-negative300 - https://code.google.com/archive/p/word2vec/
* glove.840B.300d - https://nlp.stanford.edu/projects/glove/
* paragram_300_sl999 - https://cogcomp.org/page/resource_view/106
* wiki-news-300d-1M - https://fasttext.cc/docs/en/english-vectors.html

# Using Pre-Trained embeddings
#!ls ../input/quora-insincere-questions-classification/embeddings/
import zipfile
from subprocess import check_output

# Unpack the bundled pre-trained embeddings into the working directory. The context
# manager closes the archive even if extraction fails part-way through.
with zipfile.ZipFile("../input/quora-insincere-questions-classification/embeddings.zip", 'r') as zip_ref:
    zip_ref.extractall(".")

# Show what ended up in the extracted "embeddings" directory.
print(check_output(["ls", "embeddings"]).decode("utf8"))

# Not using Standard Preprocessing but custom preprocessing and building vocab from the data

In [None]:
from tqdm import tqdm
# Hook tqdm into pandas; per the tqdm docs this registers the progress_* methods
# (e.g. progress_apply) on pandas objects.
tqdm.pandas()