In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Any results you write to the current directory are saved as output.

from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras import backend as K


##########################################################
# DATA PREPROCESSING
##########################################################

# Tokenizer vocabulary limit (passed as ``num_words``) and the fixed length
# every token sequence is padded/truncated to.
max_features = 20000
maxlen = 100

# Label columns pulled out of the training CSV as the target matrix.
list_classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

# --- Training data ---------------------------------------------------------
# Load the training rows and shuffle them (frac=1 keeps every row).
train = pd.read_csv("data/train.csv").sample(frac=1)

# Missing comments are replaced by the placeholder string "CVxTz".
list_sentences_train = train["comment_text"].fillna("CVxTz").values
y = train[list_classes].values

# The tokenizer is fitted on the training comments only; the same fitted
# instance is reused for the test comments further down.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list_sentences_train.tolist())

list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
X_t = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)

# --- Test data -------------------------------------------------------------
test = pd.read_csv("data/test.csv")
list_sentences_test = test["comment_text"].fillna("CVxTz").values
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)


##########################################################
# MODEL
##########################################################

def get_model(embed_size=32, num_classes=None):
    """Build and compile the bidirectional-LSTM comment classifier.

    Architecture, in order: embedding -> dropout -> bidirectional LSTM
    (returning the full sequence) -> global max pooling over time ->
    dropout -> two small ReLU dense layers, each followed by dropout ->
    sigmoid output layer with one unit per label.

    The input length and vocabulary size are read from the module-level
    ``maxlen`` and ``max_features``.

    Args:
        embed_size: Dimension of the learned token embeddings.
        num_classes: Number of output units. When ``None`` (the default),
            it is taken from ``len(list_classes)`` so the output layer
            always matches the label matrix built from that list instead
            of being pinned to a literal 6.

    Returns:
        A compiled Keras ``Model`` using binary cross-entropy loss, the
        Adam optimizer and the accuracy metric.
    """
    if num_classes is None:
        # Keep the output width in sync with the label columns used for y.
        num_classes = len(list_classes)

    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    x = Dropout(0.125)(x)
    x = Bidirectional(LSTM(16, return_sequences=True))(x)
    # Collapse the time axis by keeping the per-feature maximum.
    x = GlobalMaxPool1D()(x)
    x = Dropout(0.125)(x)
    x = Dense(8, activation="relu")(x)
    x = Dropout(0.25)(x)
    x = Dense(8, activation="relu")(x)
    x = Dropout(0.25)(x)
    # Sigmoid (not softmax): each label gets its own independent score.
    x = Dense(num_classes, activation="sigmoid")(x)

    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model


# Training hyper-parameters.
batch_size = 256
epochs = 5

# Weights file name. It is not used by the training call below; only the
# disabled ``load_weights`` line refers to it.
file_path = "weights_base_balanced.hdf5"

# Build the network and print its layer table.
model = get_model()
model.summary()

# validation_split=0.1 asks Keras to hold out a tenth of the training
# arrays for validation.
model.fit(
    X_t,
    y,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)

# Disabled: restore previously saved weights.
# model.load_weights(file_path)

print("Predicting...")
y_test = model.predict(X_te)
print("done")



Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 32)           640000    
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 32)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 32)           6272      
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 32)                0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 8)                 264       
__________