In [1]:
import numpy as np
import pandas as pd

In [2]:
# Fixed seed so the shuffle below yields the same row order on every run.
SEED = 42

train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
# Shuffle the training rows. The order matters downstream: the fit call uses
# validation_split, which carves the validation set out of the rows in the
# order given, so an unseeded shuffle would produce a different
# train/validation split every time the notebook is re-run.
train = train.sample(frac=1, random_state=SEED)

In [3]:
# Summary statistics of the "toxic" label column; for a 0/1 label the mean
# is the share of rows labelled 1.
train["toxic"].describe()

count    95851.000000
mean         0.096368
std          0.295097
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: toxic, dtype: float64

In [3]:
# Labels to model. Only "toxic" is active; the full label set would be
# ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"].
list_classes = ["toxic"]
y = train[list_classes].values

# Raw comment strings for both splits, with missing comments replaced by "...".
list_sentences_train, list_sentences_test = (
    frame["comment_text"].fillna("...").values for frame in (train, test)
)

In [6]:
from keras.preprocessing import text, sequence

max_features = 20000  # passed to the tokenizer as num_words (vocabulary cap)
maxlen = 256          # fixed sequence length handed to pad_sequences

# The vocabulary is fitted on the training comments only.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))

# Convert each split to integer sequences (train first, then test).
list_tokenized_train, list_tokenized_test = (
    tokenizer.texts_to_sequences(sentences)
    for sentences in (list_sentences_train, list_sentences_test)
)

# Bring every sequence to length `maxlen` so the model gets a fixed-width input.
X_t, X_te = (
    sequence.pad_sequences(tokenized, maxlen=maxlen)
    for tokenized in (list_tokenized_train, list_tokenized_test)
)

In [9]:
from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout, GRU, Conv1D, Reshape, MaxPooling1D
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.regularizers import l2

# Layer sizes referenced by get_model().
CONV_SIZE = 64  # number of filters in each Conv1D layer
LSTM_SIZE = 64  # units of the LSTM wrapped in Bidirectional
EMB_SIZE = 64  # width of the embedding vectors

def get_model_orig():
    """Build and compile the baseline classifier.

    Architecture: token ids -> Embedding -> GlobalMaxPool1D -> Dropout(0.1)
    -> sigmoid Dense layer with one unit per entry in ``list_classes``.

    Reads the module-level ``maxlen``, ``max_features``, ``EMB_SIZE`` and
    ``list_classes``.

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy loss, the Adam
        optimizer, and accuracy / binary cross-entropy metrics.
    """
    inp = Input(shape=(maxlen, ))
    # Use the shared embedding-width constant instead of a private copy of it.
    x = Embedding(max_features, EMB_SIZE)(inp)
    x = GlobalMaxPool1D()(x)
    x = Dropout(0.1)(x)
    # One sigmoid output per label, matching get_model(), so the baseline
    # keeps working if list_classes is widened beyond the single "toxic" label.
    x = Dense(len(list_classes), activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy', 'binary_crossentropy'])

    return model


def get_model():
    """Build and compile the convolutional + recurrent classifier.

    Architecture: token ids -> L2-regularised Embedding -> four
    (Conv1D with kernel size 2 and SELU activation, MaxPooling1D with pool
    size 2) stages -> Bidirectional LSTM -> sigmoid Dense layer with one
    unit per entry in ``list_classes``.

    Reads the module-level ``maxlen``, ``max_features``, ``EMB_SIZE``,
    ``CONV_SIZE``, ``LSTM_SIZE`` and ``list_classes``.

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy loss, the Adam
        optimizer, and accuracy / binary cross-entropy metrics.
    """
    tokens = Input(shape=(maxlen, ))
    features = Embedding(max_features, EMB_SIZE, embeddings_regularizer=l2(1e-3))(tokens)

    # Four identical convolution + pooling stages, applied back to back.
    for _ in range(4):
        features = Conv1D(CONV_SIZE, 2, strides=1, padding='valid', activation='selu')(features)
        features = MaxPooling1D(2)(features)

    features = Bidirectional(LSTM(LSTM_SIZE))(features)
    outputs = Dense(len(list_classes), activation="sigmoid")(features)

    model = Model(inputs=tokens, outputs=outputs)
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy', 'binary_crossentropy'],
    )
    return model

# Instantiate the baseline architecture (get_model_orig, not get_model) and
# print its layer table.
model = get_model_orig()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 256)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 256, 64)           1280000   
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 64)                0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 1,280,065
Trainable params: 1,280,065
Non-trainable params: 0
_________________________________________________________________


In [10]:
batch_size = 32
# Effectively an upper bound: the EarlyStopping callback below monitors
# val_loss with patience=5.
epochs = 20000

# Weights file written by the checkpoint callback (only when val_loss improves).
file_path = "weights_base.best.hdf5"
checkpoint = ModelCheckpoint(
    file_path,
    monitor='val_loss',
    save_best_only=True,
)
early = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=5,
)
callbacks_list = [checkpoint, early]

model.fit(
    X_t,
    y,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.05,
    callbacks=callbacks_list,
)

Train on 91058 samples, validate on 4793 samples
Epoch 1/20000
Epoch 2/20000
Epoch 3/20000
Epoch 4/20000
Epoch 5/20000
Epoch 6/20000
Epoch 7/20000
Epoch 8/20000


<keras.callbacks.History at 0x7f6be8a9efd0>

In [21]:
# Reload the weights stored at file_path (the checkpoint file) before predicting.
model.load_weights(file_path)
y_test = model.predict(X_te)

# Overwrite the modelled label column(s) in the submission template and save it.
sample_submission = pd.read_csv("./data/sample_submission.csv")
sample_submission[list_classes] = y_test
sample_submission.to_csv("baseline.csv", index=False)