In [1]:
import os 
# Every relative path used later in this notebook ("../input", "../models",
# "../submit") is resolved against this working directory.  The location
# defaults to the original machine-specific path, but can be overridden by
# setting the JIGSAW_HOME environment variable so the notebook can run on
# another machine without being edited.
os.chdir(
    os.environ.get(
        "JIGSAW_HOME",
        "/mnt/disks/kaggle/jigsaw-toxic-comment-classification-challenge/home",
    )
)

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from keras.models import Sequential, Model
from keras.layers import Dense, Conv1D, Input, MaxPool1D, Concatenate, Embedding, Flatten
from keras.optimizers import Adam

In [3]:
# Load the competition files (paths are relative to the working directory set
# in the first cell): training data, test data and the submission template.
train, test, sample_submission = map(
    pd.read_csv,
    ("../input/train.csv", "../input/test.csv", "../input/sample_submission.csv"),
)

In [5]:
# Per-character occurrence counts over all comments, stored as
# [count_in_train, count_in_test].  Train is scanned before test, so the
# dict keeps characters in first-seen order with train characters ahead of
# characters that only occur in test.
dict_char = {}
for slot, frame in enumerate((train, test)):
    for text in frame["comment_text"].values:
        for ch in text:
            # New characters start at [0, 0]; then bump the train (0) or test (1) slot.
            dict_char.setdefault(ch, [0, 0])[slot] += 1

In [6]:
# Character ids start at 3 because 0, 1 and 2 are reserved:
# 0 = empty, 1 = start marker, 2 = end marker.
# Ids are handed out in the iteration order of dict_char.
char_index = {ch: idx for idx, ch in enumerate(dict_char, 3)}
# One past the largest id assigned (the model cell passes this to Embedding).
cno = len(char_index) + 3

In [7]:
# Length, in characters, of the longest comment found in either train or test.
TEXT_LENGTH = max(
    max(map(len, train["comment_text"].values)),
    max(map(len, test["comment_text"].values)),
)

In [9]:
# Encode every training comment as a fixed-width row of character ids:
#   column 0            -> 1 (start marker)
#   columns 1..len      -> char_index of each character
#   column len + 1      -> 2 (end marker)
#   remaining columns   -> 0 (left as initialised by np.zeros)
# The width is TEXT_LENGTH + 2 to leave room for the two markers.
train_comments = train["comment_text"].values
X_train_char = np.zeros((train.shape[0], TEXT_LENGTH + 2), dtype="int32")
for row, text in enumerate(train_comments):
    X_train_char[row, 0] = 1
    for pos, ch in enumerate(text, start=1):
        X_train_char[row, pos] = char_index[ch]
    X_train_char[row, len(text) + 1] = 2

In [26]:
# Target matrix: every column of `train` from the third one onward, as int32.
Y_train = train.iloc[:, 2:].values.astype("int32")

In [28]:
# Character-level 1-D CNN.
# Layers are listed in the order they are stacked; passing the list to
# Sequential adds them one after another.
model = Sequential([
    # Map each character id to an 8-dimensional vector.
    Embedding(cno, 8, input_length=TEXT_LENGTH+2),
    # Stage 1: three width-7 convolutions with 16 filters, then pool by 10.
    Conv1D(16, 7, activation="relu", padding="same"),
    Conv1D(16, 7, activation="relu", padding="same"),
    Conv1D(16, 7, activation="relu", padding="same"),
    MaxPool1D(10),
    # Stage 2: three width-7 convolutions with 32 filters, then pool by 100.
    Conv1D(32, 7, activation="relu", padding="same"),
    Conv1D(32, 7, activation="relu", padding="same"),
    Conv1D(32, 7, activation="relu", padding="same"),
    MaxPool1D(100),
    # Stage 3: two width-1 convolutions before flattening.
    Conv1D(32, 1, activation="relu", padding="same"),
    Conv1D(32, 1, activation="relu", padding="same"),
    Flatten(),
    # Dense head: six sigmoid outputs, one per target column.
    Dense(128, activation="relu"),
    Dense(6, activation="sigmoid"),
])

model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(0.001),
    metrics=['accuracy'],
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 5002, 8)           44360     
_________________________________________________________________
conv1d_30 (Conv1D)           (None, 5002, 16)          912       
_________________________________________________________________
conv1d_31 (Conv1D)           (None, 5002, 16)          1808      
_________________________________________________________________
conv1d_32 (Conv1D)           (None, 5002, 16)          1808      
_________________________________________________________________
max_pooling1d_8 (MaxPooling1 (None, 500, 16)           0         
_________________________________________________________________
conv1d_33 (Conv1D)           (None, 500, 32)           3616      
_________________________________________________________________
conv1d_34 (Conv1D)           (None, 500, 32)           7200      
__________

In [29]:
# Train for 10 epochs with mini-batches of 32, holding out 20% of the rows for
# validation (the recorded run below shows 127656 training / 31915 validation
# samples).  The returned History object is the cell's displayed output.
model.fit(X_train_char, Y_train, batch_size=32, epochs=10, validation_split=0.2)

Train on 127656 samples, validate on 31915 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fd7fab32ac8>

In [30]:
# Create the directory that will hold saved models.  exist_ok=True keeps this
# cell re-runnable: a plain os.mkdir raises FileExistsError as soon as the
# directory already exists (e.g. on a second "Run All").
os.makedirs("../models", exist_ok=True)

In [31]:
# Save the trained model to disk; the "../models" directory must already exist.
model.save("../models/first_model.h5")

In [32]:
# Encode the test comments with the same layout used for the training matrix:
# a leading 1 (start marker), one char_index id per character, a trailing 2
# (end marker), and zeros in every remaining column up to TEXT_LENGTH + 2.
test_comments = test["comment_text"].values
X_test_char = np.zeros((test.shape[0], TEXT_LENGTH + 2), dtype="int32")
for row, text in enumerate(test_comments):
    X_test_char[row, 0] = 1
    for pos, ch in enumerate(text, start=1):
        X_test_char[row, pos] = char_index[ch]
    X_test_char[row, len(text) + 1] = 2

In [33]:
# Run the trained model on the encoded test comments; the next cell's output
# shows the result has one row per test comment and 6 columns.
pred_test = model.predict(X_test_char)

In [34]:
# Sanity check: expect (number of test rows, 6).
pred_test.shape

(153164, 6)

In [35]:
# Work on a copy so the loaded sample_submission frame stays untouched.
sub1 = sample_submission.copy()

In [37]:
# Overwrite every column after the first one (`id`) with the predictions.
# The assignment is positional, so it assumes the submission's label columns
# are in the same order as the label columns used to build Y_train
# (train.iloc[:, 2:]) -- NOTE(review): confirm the column orders match.
sub1.iloc[:, 1:] = pred_test

In [38]:
# Preview the first rows of the filled-in submission.
sub1.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.994524,0.1126254,0.950155,0.028201,0.739588,0.097122
1,0000247867823ef7,0.013503,1.193749e-05,0.001597,0.000102,0.002074,0.000849
2,00013b17ad220c46,0.025214,4.437889e-05,0.00313,0.000343,0.004499,0.001786
3,00017563c3f7919a,0.001357,8.285598e-08,0.000141,1e-06,0.000124,5.4e-05
4,00017695ad8997eb,0.031977,6.83598e-05,0.003991,0.000499,0.005923,0.002247


In [39]:
# Create the directory that will hold submission files.  exist_ok=True keeps
# this cell re-runnable: a plain os.mkdir raises FileExistsError as soon as the
# directory already exists (e.g. on a second "Run All").
os.makedirs("../submit", exist_ok=True)

In [40]:
# Write the submission file; index=False leaves the DataFrame's row index out
# of the CSV.  The "../submit" directory must already exist.
sub1.to_csv("../submit/sub1_hrd_0211.csv", index=False)