In [1]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

Using Theano backend.


In [2]:
# Path to the pre-trained GloVe word vectors (the file name indicates 50-d vectors).
EMBEDDING_FILE = "../glove.6B.50d.txt"

# Training and test tables, read from the notebook's working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
# Tunable settings, kept together so they are easy to find and change.
SEED = 42             # seed for NumPy's global RNG (used below by np.random.normal)
embedding_size = 50   # width of every row in the embedding matrix
max_features = 20000  # passed to Tokenizer(num_words=...) as the vocabulary cap
maxlen = 100          # sequence length passed to pad_sequences and to the model Input

# Seed once, up front, so the randomly initialised embedding rows are the same
# on every Restart & Run All.
np.random.seed(SEED)

In [4]:
# Label columns read from the training table and used as the model targets.
list_classes = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]
y = train[list_classes].values

# Raw comment text for both splits; missing comments become the "_na_" placeholder.
list_sentences_train = train["comment_text"].fillna("_na_").values
list_sentences_test = test["comment_text"].fillna("_na_").values

In [5]:
# Fit the vocabulary on the training comments only; `max_features` is handed
# to the tokenizer as its `num_words` cap.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list_sentences_train.tolist())

In [6]:
# Convert every comment into its sequence of integer token ids.
# NOTE(review): this rebinds the raw-text variables, so the cell cannot be
# re-run without first re-running the cell that loads the text.
list_sentences_train, list_sentences_test = (
    tokenizer.texts_to_sequences(texts)
    for texts in (list_sentences_train, list_sentences_test)
)

In [7]:
# Bring every token sequence to the fixed length `maxlen` expected by the model input.
X_train, X_test = [
    pad_sequences(token_ids, maxlen=maxlen)
    for token_ids in (list_sentences_train, list_sentences_test)
]

In [8]:
def get_coefs(word, *arr):
    """Pair a token with its embedding vector.

    Parameters
    ----------
    word : the token itself (first field of an embedding-file line).
    *arr : the remaining fields, i.e. the vector components; numeric strings
        are accepted and converted.

    Returns
    -------
    tuple
        ``(word, vector)`` where ``vector`` is a 1-D float32 NumPy array.
    """
    vector = np.asarray(arr, dtype=np.float32)
    return word, vector

In [9]:
# Parse the embedding file into {token: float32 vector}.
# The file is opened in a `with` block so the handle is closed deterministically
# instead of being left to the garbage collector. Blank lines are skipped
# because they would otherwise call get_coefs() with no arguments and raise.
# `line.split()` with no separator already discards leading/trailing whitespace.
with open(EMBEDDING_FILE, encoding="utf8") as embedding_file:
    embeddings_index = dict(
        get_coefs(*line.split())
        for line in embedding_file
        if line.strip()
    )

In [10]:
# Stack every pre-trained vector into one 2-D array to measure its overall
# mean and standard deviation; these parameterise the random initialisation
# of words that have no pre-trained vector.
# np.stack needs a real sequence: a dict_values view is deprecated as input
# since NumPy 1.16 and rejected by newer releases, so materialise a list first.
all_embedding = np.stack(list(embeddings_index.values()))
embedding_mean = all_embedding.mean()
embedding_std  = all_embedding.std()


In [11]:
# Show the statistics that will drive the random initialisation below.
(embedding_mean, embedding_std)

(0.020940498, 0.6441043)

In [12]:
# Build the initial weights for the Embedding layer: one row per token id.
word_index = tokenizer.word_index

# Keras' Tokenizer numbers words from 1 (id 0 is reserved), so a vocabulary of
# V words needs V + 1 rows. Using len(word_index) alone left the matrix one row
# short whenever the vocabulary was smaller than `max_features`.
num_words = min(max_features, len(word_index) + 1)

# Start from random rows drawn with the same mean/std as the pre-trained
# vectors, so words without a pre-trained vector still get a plausible one.
embedding_matrix = np.random.normal(embedding_mean, embedding_std, (num_words, embedding_size))

for word, i in word_index.items():
    # Guard against the real row count, not `max_features`; otherwise an id
    # equal to the matrix height would raise IndexError.
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [13]:
# Bidirectional-LSTM classifier with one sigmoid output per label column.
inp = Input(shape=(maxlen,))

# Size the embedding layer from the weight matrix itself rather than from
# `max_features`: the matrix can have fewer rows than `max_features` when the
# fitted vocabulary is small, and a mismatch makes weight loading fail.
x = Embedding(embedding_matrix.shape[0], embedding_size, weights=[embedding_matrix])(inp)
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
# Collapse the time axis by keeping the maximum of each feature over all steps.
x = GlobalMaxPool1D()(x)
x = Dense(50, activation='relu')(x)
x = Dropout(0.1)(x)
# Six independent sigmoid units, paired with binary cross-entropy below.
x = Dense(6, activation='sigmoid')(x)

model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  'RNN dropout is no longer supported with the Theano backend '


In [14]:
# Train for two epochs in mini-batches of 32, holding out 10% of the
# training rows for validation.
model.fit(
    X_train,
    y,
    batch_size=32,
    epochs=2,
    validation_split=0.1,
)



Train on 143613 samples, validate on 15958 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x2aad7b82da0>

In [15]:
# Score the padded test sequences; the result has one column per model output.
y_test = model.predict(
    X_test,
    batch_size=1024,
    verbose=1,
)



In [16]:
# Start the submission frame from the test ids. Take an explicit copy: the
# frame gets new columns assigned later, and assigning into a selection of
# `test` triggers pandas' SettingWithCopyWarning.
hamse_na_ho_payega = test[['id']].copy()

In [18]:
# Add one prediction column per label; column k of `y_test` is written under
# the k-th name in `list_classes`. enumerate() replaces the hand-maintained
# counter, which also leaked a global `i` into the notebook namespace.
for class_idx, topic in enumerate(list_classes):
    hamse_na_ho_payega[topic] = y_test[:, class_idx]

In [19]:
# Write the submission file next to the notebook, without the row index.
hamse_na_ho_payega.to_csv(
    "hamse_na_ho_payega.csv",
    index=False,
)