# Quora insincere question classifier
 Detect toxic content to improve online conversation

In [1]:
import pandas as pd
import keras
import string

from nltk.corpus import stopwords 
from nltk.stem.snowball import SnowballStemmer

from sklearn import preprocessing
from keras.preprocessing.text import Tokenizer

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## 1. Load Data

In [2]:
# Read the training and test CSV files into DataFrames.
# For a quick look at the data, call X_train.head(), X_train.shape or
# test_set.shape right after loading.
X_train = pd.read_csv("../Dataset/train.csv")
test_set = pd.read_csv("../Dataset/test.csv")

Use only the first 1,000,000 rows of the training set

In [3]:
# Keep only the leading 1,000,000 rows (positional slice) of the training frame.
X_train = X_train.iloc[:1000000]

## 2. Preprocess question_text column

Split the data into features and target label

In [4]:
# Pull the label column and the raw question text out of the training frame
# in one step; from here on X_train is a Series of question strings.
y_train, X_train = X_train['target'], X_train["question_text"]

# The test data only contributes its question text.
test_set = test_set["question_text"]

Surround punctuation with spaces and stem the words

In [5]:
from nltk.corpus import stopwords 
from nltk.stem.snowball import SnowballStemmer

# Translation table that surrounds every punctuation character with spaces,
# so punctuation becomes its own whitespace-delimited token. Built once
# instead of once per row.
punctuation_table = str.maketrans(
    {key: " {0} ".format(key) for key in string.punctuation}
)

stemmer = SnowballStemmer("english")


def _pad_and_stem(text):
    """Return *text* with punctuation space-padded and every word stemmed.

    ``SnowballStemmer.stem`` operates on a single word. Handing it a whole
    question treats the question as one long word, so only the very end of
    the string is stemmed. The text is therefore split on whitespace and
    each token is stemmed individually before being re-joined with single
    spaces.
    """
    padded = str(text).translate(punctuation_table)
    return " ".join(stemmer.stem(word) for word in padded.split())


X_train = X_train.apply(_pad_and_stem)
test_set = test_set.apply(_pad_and_stem)

## 3. Label encoding

Define and fit tokenizer 

In [None]:
# Stack the training and test questions into one Series so the tokenizer's
# vocabulary is built from every question in both sets.
fullData = pd.concat((X_train, test_set), ignore_index=True)

# Default Tokenizer (no num_words cap) fitted on the combined text.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(fullData)

Transform the text into sequences of integer word indices

In [None]:
# Replace each question with its list of integer word indices, using the
# fitted tokenizer; training set first, then test set.
X_train, test_set = map(tokenizer.texts_to_sequences, (X_train, test_set))

## 4. One hot encoding

Encode each row as a fixed-length binary bag-of-words vector

In [None]:
import operator
from functools import reduce

# Turn every index sequence into a binary presence vector of length
# num_words (500).
# NOTE(review): this rebinds `tokenizer` to a fresh, unfitted Tokenizer; the
# vocabulary fitted earlier is no longer reachable through this name.
tokenizer = Tokenizer(num_words=500)

X_train, test_set = (
    tokenizer.sequences_to_matrix(sequences, mode='binary')
    for sequences in (X_train, test_set)
)

One hot encode Labels

In [None]:
# Expand the target Series into indicator columns, one per distinct label value.
y_train = pd.get_dummies(data=y_train)

## 5. Split features into training and validation set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible. Note the training split is bound to lower-case `x_train`,
# while `X_train` keeps the full feature matrix.
x_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=0.2, random_state=28
)

Define the f1 score function

In [None]:
import keras.backend as K

def f1(y_true, y_pred):
    """Return the F1 score of *y_pred* against *y_true* as a backend tensor.

    Both inputs are clipped to [0, 1] and rounded before counting, so
    predictions are effectively thresholded at 0.5. ``K.epsilon()`` is added
    to every denominator to avoid division by zero.
    """
    eps = K.epsilon()

    # Counts over all elements of the tensors.
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))

    precision = hits / (predicted_positives + eps)
    recall = hits / (actual_positives + eps)

    # Harmonic mean of precision and recall.
    return 2*((precision*recall)/(precision+recall+eps))

## 6. Define and train the model

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

# Feed-forward classifier declared as a single layer list:
#   input (width = number of feature columns in x_train)
#   -> Dense(128) + tanh
#   -> Dense(2) + sigmoid   (two outputs, matching the two label columns)
QuoraClassifier = Sequential([
    Dense(128, input_dim=x_train.shape[1]),
    Activation('tanh'),
    Dense(2),
    Activation('sigmoid'),
])

# Adam optimiser with binary cross-entropy loss; the custom f1 function is
# the only tracked metric.
QuoraClassifier.compile(optimizer="adam", loss="binary_crossentropy", metrics=[f1])

QuoraClassifier.summary()

In [None]:
# Train for 20 epochs in mini-batches of 100, reporting loss and f1 on the
# held-out validation split after every epoch.
# `epochs` is the Keras 2 keyword; the Keras 1 spelling `nb_epoch` is
# deprecated and is rejected by later releases.
history = QuoraClassifier.fit(
    x_train,
    y_train.values,
    epochs=20,
    batch_size=100,
    verbose=1,
    validation_data=(X_test, y_test.values),
)

## 7. Model evaluation

In [None]:
# evaluate() returns [loss, metric_1, ...]. The model was compiled with
# metrics=[f1], so score[1] is the F1 score, not accuracy.
score = QuoraClassifier.evaluate(X_test, y_test.values, verbose=1)
print("F1 score: ", score[1])

## 8. Predicting on the test set 

Predict class scores for the test set and apply the decision threshold

In [None]:
# Score the test matrix and binarise every output in one expression:
# values strictly above 0.35 become 1, everything else 0.
test_pred = (QuoraClassifier.predict(test_set) > 0.35).astype(int)

Transform it into a dataframe

In [None]:
df = pd.DataFrame(data=test_pred)
# Convert the thresholded two-column output back to a single 0/1 label.
# Column 1 already holds the thresholded positive-class decision, so it is
# used directly. idxmax(axis=1) returns the FIRST maximal column, so any row
# where both columns are 1 (or both 0) would always collapse to label 0 and
# the positive-class threshold would be ignored for those rows.
s2 = df[1]

Save Predictions to Submission file

In [None]:
#save to submission
submission = pd.read_csv("../input/sample_submission.csv")
submission['prediction'] = s2
submission.to_csv('submission.csv', index=False)