In [None]:
import keras as k
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import CuDNNGRU , CuDNNLSTM , Bidirectional , GRU
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback

In [None]:
file = open("glove.twitter.27B.100d.txt")
num_vocab = 0
word_to_vec = {}
for line in file.readlines():
  
    row = line.strip().split(' ')
    word = row[0]
    vec = row[1:] 
    num_vocab += 1
    word_to_vec[word] = vec
      
print("Glove preprocessed")

file.close()

In [3]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
submission = pd.read_csv("sample_submission.csv")

In [4]:
X_train = train["comment_text"].fillna("Nothing").values
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
X_test = test["comment_text"].fillna("Nothing").values


In [5]:
tokenizer = Tokenizer(num_words = num_vocab)

In [6]:
tokenizer.fit_on_texts(list(X_train) + list(X_test))

In [7]:
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [8]:
maxlen = 100

X_train = pad_sequences(X_train , maxlen)
X_test = pad_sequences(X_test , maxlen)

In [9]:
word_index = tokenizer.word_index

In [10]:
min_words = min(len(word_index) , num_vocab)
print(min_words)

394787


In [14]:
embedding_matrix = np.random.randn(min_words , 100) * 0.001

In [15]:
added_words = 0
for word , i in word_index.items():
      if i >= min_words:
        continue
    vector = word_to_vec.get(word)
    if vector:
        added_words+=1
        embedding_matrix[i] = vector

In [16]:
added_words

130494

In [17]:
X_train , X_val , y_train , y_val = train_test_split(X_train , y_train , train_size = 0.95 , random_state = 0)



In [18]:
class RocEval(Callback):
    def __init__(self , validation_data = () , interval = 1):
        super(Callback , self).__init__()
        self.interval = interval
        self.X_val , self.y_val = validation_data
    
    def on_epoch_end(self , epoch , logs = {}):
        y_pred = self.model.predict(self.X_val, verbose=0)
        score = roc_auc_score(self.y_val, y_pred)
        print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [19]:
del tokenizer
del word_to_vec

In [45]:
def model(input_shape):
    inp = k.layers.Input(shape = input_shape)
    X = k.layers.Embedding(input_dim = min_words , output_dim = 100 , weights = [embedding_matrix])(inp)
    X = k.layers.SpatialDropout1D(0.25)(X)
    X = Bidirectional(GRU(50 , return_sequences = False , recurrent_dropout = 0.25))(X)
    X = k.layers.Dense(6 , activation = 'sigmoid')(X)
  
    return k.Model(inputs = inp , outputs = X)
  

In [48]:
toxic_model = model((100,))

In [49]:
toxic_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_6 (Embedding)      (None, 100, 100)          39478700  
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 100, 100)          0         
_________________________________________________________________
bidirectional_6 (Bidirection (None, 100)               45300     
_________________________________________________________________
dense_4 (Dense)              (None, 6)                 606       
Total params: 39,524,606
Trainable params: 39,524,606
Non-trainable params: 0
_________________________________________________________________


In [50]:
toxic_model.compile(optimizer = 'rmsprop' , loss = 'binary_crossentropy' , metrics = ['accuracy'])

In [51]:
RocAuc = RocEval(validation_data=(X_val, y_val), interval=1)

In [52]:
def Roc_score(y_true , y_pred):
  return roc_auc_score(y_true , y_pred)

In [53]:
saver = k.callbacks.ModelCheckpoint("my_model.h5", monitor='val_loss', verbose=2, save_best_only=True)

In [54]:
toxic_model.fit(X_train, y_train, batch_size=1024, epochs=10, validation_data=(X_val, y_val),
                 callbacks=[RocAuc , saver])

Train on 151592 samples, validate on 7979 samples
Epoch 1/10

 ROC-AUC - epoch: 1 - score: 0.963825 


Epoch 00001: val_loss improved from inf to 0.05689, saving model to my_model.h5
Epoch 2/10

 ROC-AUC - epoch: 2 - score: 0.978510 


Epoch 00002: val_loss improved from 0.05689 to 0.04623, saving model to my_model.h5
Epoch 3/10

 ROC-AUC - epoch: 3 - score: 0.982313 


Epoch 00003: val_loss improved from 0.04623 to 0.04356, saving model to my_model.h5
Epoch 4/10

 ROC-AUC - epoch: 4 - score: 0.984541 


Epoch 00004: val_loss improved from 0.04356 to 0.04216, saving model to my_model.h5
Epoch 5/10

 ROC-AUC - epoch: 5 - score: 0.986245 


Epoch 00005: val_loss improved from 0.04216 to 0.04111, saving model to my_model.h5
Epoch 6/10

 ROC-AUC - epoch: 6 - score: 0.986953 


Epoch 00006: val_loss improved from 0.04111 to 0.04048, saving model to my_model.h5
Epoch 7/10

KeyboardInterrupt: ignored

In [51]:
prediction1 = toxic_model.predict(X_test , batch_size = 1024 , verbose = 1)



In [None]:
best_model = load_model("my_model.h5")
prediction2 = best_model.predict(X_test , batch_size = 1024 , verbose = 1)

In [85]:
prediction = (prediction1 + prediction2) / 2

In [None]:
prediction = prediction.argmax(1)

In [52]:
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = prediction
submission.to_csv("submission.csv" , index = False)