In [1]:
########################################
## import packages
########################################
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd
import operator

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, SpatialDropout1D, Reshape, Conv2D, MaxPooling2D, merge, Flatten, Bidirectional, CuDNNGRU, add, Conv1D, MaxPooling1D
from keras.layers.merge import concatenate
from keras.models import Model, Sequential
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint


import sys

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
path = 'Dataset/'
#EMBEDDING_FILE='features/fast-text-300.txt'
EMBEDDING_FILE='features/glove.840B.300d.txt'
TRAIN_DATA_FILE=path + 'train.csv'
TEST_DATA_FILE=path + 'test.csv'

MAX_SEQUENCE_LENGTH = 700
MAX_NB_WORDS = 100000
EMBEDDING_DIM = 300

In [3]:
########################################
## index word vectors
########################################
print('Indexing word vectors')

#Glove Vectors
embeddings_index = {}
f = open(EMBEDDING_FILE, 'r', encoding='utf-8')
for line in f:
    values = line.split()
    try:
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    except:
        print("Err on ", values[:3])
f.close()

print('Total %s word vectors.' % len(embeddings_index))

Indexing word vectors
Err on  ['.', '.', '.']
Err on  ['at', 'name@domain.com', '0.0061218']
Err on  ['.', '.', '.']
Err on  ['to', 'name@domain.com', '0.33865']
Err on  ['.', '.', '0.035974']
Err on  ['.', '.', '.']
Err on  ['email', 'name@domain.com', '0.33529']
Err on  ['or', 'name@domain.com', '0.48374']
Err on  ['contact', 'name@domain.com', '0.016426']
Err on  ['Email', 'name@domain.com', '0.37344']
Err on  ['on', 'name@domain.com', '0.037295']
Err on  ['At', 'Killerseats.com', '-0.13854']
Err on  ['by', 'name@domain.com', '0.6882']
Err on  ['in', 'mylot.com', '-0.18148']
Err on  ['emailing', 'name@domain.com', '0.39173']
Err on  ['Contact', 'name@domain.com', '0.14933']
Err on  ['at', 'name@domain.com', '0.44321']
Err on  ['•', 'name@domain.com', '-0.13288']
Err on  ['at', 'Amazon.com', '-0.5275']
Err on  ['is', 'name@domain.com', '-0.1197']
Total 2195884 word vectors.


In [4]:
import pandas as pd
train_df = pd.read_csv(TRAIN_DATA_FILE)
test_df = pd.read_csv(TEST_DATA_FILE)

In [5]:
########################################
# Load the cleaned words
########################################

cl_path = 'features/cleanwords.txt'
clean_word_dict = {}
with open(cl_path, 'r', encoding='utf-8') as cl:
    for line in cl:
        line = line.strip('\n')
        typo, correct = line.split(',')
        clean_word_dict[typo] = correct

In [6]:
########################################
## process texts in datasets
########################################
import re

print('Processing text dataset')
from collections import defaultdict
# Regex to remove all Non-Alpha Numeric and space
special_character_removal=re.compile(r'[^a-z\d ]',re.IGNORECASE)

# regex to replace all numerics
replace_numbers=re.compile(r'\d+',re.IGNORECASE)
word_count_dict = defaultdict(int)
toxic_dict = {}

def text_to_wordlist(text, remove_stopwords=False, stem_words=False, count_null_words=True, clean_wiki_tokens=True):
    # Clean the text, with the option to remove stopwords and to stem words.
    # dirty words
    text = text.lower()
    text = re.sub(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", "", text)
    text = re.sub(r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}", "", text)
    
    if clean_wiki_tokens:
        # Clean the image
        text = re.sub(r"image:[a-zA-Z0-9]*.jpg", " ", text)
        text = re.sub(r"image:[a-zA-Z0-9]*.png", " ", text)
        text = re.sub(r"image:[a-zA-Z0-9]*.gif", " ", text)
        text = re.sub(r"image:[a-zA-Z0-9]*.bmp", " ", text)

        # Special wiki token
        text = re.sub(r"{{[a-zA-Z0-9]*}}", " ", text)
        text = re.sub(r'"{2,}', " ", text)
        text = re.sub(r'={2,}', " ", text)
        text = re.sub(r':{2,}', " ", text)
        
        #text = re.sub(, "", text)
    
    for typo, correct in clean_word_dict.items():
        text = re.sub(typo, " " + correct + " ", text)

    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"i’m", "i am", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = replace_numbers.sub('', text)
    #text = special_character_removal.sub('',text)

    if count_null_words:
        text = text.split()
        for t in text:
            word_count_dict[t] += 1
        text = " ".join(text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)

    return (text)

list_sentences_train = train_df["comment_text"].fillna("no comment").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train_df[list_classes].values
list_sentences_test = test_df["comment_text"].fillna("no comment").values

comments = []
for text in list_sentences_train:
    comments.append(text_to_wordlist(text))
    
test_comments=[]
for text in list_sentences_test:
    test_comments.append(text_to_wordlist(text))

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(comments + test_comments)

sequences = tokenizer.texts_to_sequences(comments)
test_sequences = tokenizer.texts_to_sequences(test_comments)

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', y.shape)

test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of test_data tensor:', test_data.shape)

Processing text dataset
Found 332483 unique tokens
Shape of data tensor: (159571, 700)
Shape of label tensor: (159571, 6)
Shape of test_data tensor: (153164, 700)


In [7]:
########################################
## prepare embeddings
########################################
print('Preparing embedding matrix')
nb_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))

null_words = open('null-word.txt', 'w', encoding='utf-8')

for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        null_words.write(word + ', ' + str(word_count_dict[word]) +'\n')
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
    else:
        null_words.write(word + ', ' + str(word_count_dict[word]) + '\n')
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

#24146

Preparing embedding matrix
Null word embeddings: 24784


In [8]:
test_comments[:20]

['yo bitch ja rule is more succesful then you will ever be whats up with you and hating you sad mofuckas i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me ja rule is about pride in da music man dont diss that shit on him and nothin is wrong bein like tupac he was a brother too fuckin white boys get things right next time',
 'from rfc the title is fine as it is imo',
 '" sources * zawe ashton on lapland — "',
 ': if you have a look back at the source the information i updated was the correct form i can only guess the source had not updated i shall update the information once again but thank you for your message',
 'i do not anonymously edit articles at all',
 'thank you for understanding i think very highly of you and would not revert without discussion',
 'please do not add nonsense to wikipedia such edits are considered vandalism and quickly undone if you would like to experiment please use the sandbox instead thank you -',
 ': dear god this sit

In [9]:
# sort null word
null_count = {}
with open('null-word.txt', 'r', encoding='utf-8') as nullword:
    for line in nullword:
        w, c = line.strip('\n').split(', ')
        null_count[w] = int(c)
null_count = sorted(null_count.items(), key=operator.itemgetter(1), reverse=True)
with open('null-word.txt', 'w', encoding='utf-8') as output:
    for w, c in null_count:
        output.write(w + ", " + str(c) + '\n')

# Model Zoo

In [10]:
import keras.backend as K
from keras.layers import Dense, Activation, Multiply, Add, Lambda
import keras.initializers
 
def highway_layers(value, n_layers, activation="tanh", gate_bias=-3):
    dim = K.int_shape(value)[-1]
    gate_bias_initializer = keras.initializers.Constant(gate_bias)
    for i in range(n_layers):     
        gate = Dense(units=dim, bias_initializer=gate_bias_initializer)(value)
        gate = Activation("sigmoid")(gate)
        negated_gate = Lambda(
            lambda x: 1.0 - x,
            output_shape=(dim,))(gate)
        transformed = Dense(units=dim)(value)
        transformed = Activation(activation)(value)
        transformed_gated = Multiply()([gate, transformed])
        identity_gated = Multiply()([negated_gate, value])
        value = Add()([transformed_gated, identity_gated])
    return value

In [11]:
########################################
## define the RNN with Attention model structure
########################################

from keras.optimizers import RMSprop
from keras.layers import AveragePooling1D, GlobalMaxPooling1D, GlobalAveragePooling1D, Lambda
from keras import optimizers
adam_optimizer = optimizers.Adam(lr=1e-3, clipvalue=1, clipnorm=1)

def get_dropout_bi_gru():
    recurrent_units = 48
    dropout_rate = 0.35
    dense_size = 32
    input_layer = Input(shape=(MAX_SEQUENCE_LENGTH,))
    embedding_layer = Embedding(nb_words,
            EMBEDDING_DIM,
            weights=[embedding_matrix],
            input_length=MAX_SEQUENCE_LENGTH,
            trainable=False)(input_layer)
    embedding_layer = SpatialDropout1D(0.15)(embedding_layer)
    x = Bidirectional(CuDNNGRU(recurrent_units, return_sequences=True))(embedding_layer)
    x = Dropout(0.35)(x)
    x = Bidirectional(CuDNNGRU(recurrent_units, return_sequences=True))(x)
    
    last = Lambda(lambda t: t[:, -1])(x)
    maxpool = GlobalMaxPooling1D()(x)
    average = GlobalAveragePooling1D()(x)
    concatenated = concatenate([last, maxpool, average], axis=1)
    x = Dropout(0.5)(maxpool)
    x = Dense(72, activation="relu")(x)
    output_layer = Dense(6, activation="sigmoid")(x)
    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(loss='binary_crossentropy',
                  optimizer=adam_optimizer,
    metrics=['accuracy'])
    return model

In [12]:
from __future__ import absolute_import, division

import sys
from os.path import dirname
from keras import initializers
from keras.engine import InputSpec, Layer
from keras import backend as K


class AttentionWeightedAverage(Layer):
    """
    Computes a weighted average of the different channels across timesteps.
    Uses 1 parameter pr. channel to compute the attention value for a single timestep.
    """

    def __init__(self, return_attention=False, **kwargs):
        self.init = initializers.get('uniform')
        self.supports_masking = True
        self.return_attention = return_attention
        super(AttentionWeightedAverage, self).__init__(** kwargs)

    def build(self, input_shape):
        self.input_spec = [InputSpec(ndim=3)]
        assert len(input_shape) == 3

        self.W = self.add_weight(shape=(input_shape[2], 1),
                                 name='{}_W'.format(self.name),
                                 initializer=self.init)
        self.trainable_weights = [self.W]
        super(AttentionWeightedAverage, self).build(input_shape)

    def call(self, x, mask=None):
        # computes a probability distribution over the timesteps
        # uses 'max trick' for numerical stability
        # reshape is done to avoid issue with Tensorflow
        # and 1-dimensional weights
        logits = K.dot(x, self.W)
        x_shape = K.shape(x)
        logits = K.reshape(logits, (x_shape[0], x_shape[1]))
        ai = K.exp(logits - K.max(logits, axis=-1, keepdims=True))

        # masked timesteps have zero weight
        if mask is not None:
            mask = K.cast(mask, K.floatx())
            ai = ai * mask
        att_weights = ai / (K.sum(ai, axis=1, keepdims=True) + K.epsilon())
        weighted_input = x * K.expand_dims(att_weights)
        result = K.sum(weighted_input, axis=1)
        if self.return_attention:
            return [result, att_weights]
        return result

    def get_output_shape_for(self, input_shape):
        return self.compute_output_shape(input_shape)

    def compute_output_shape(self, input_shape):
        output_len = input_shape[2]
        if self.return_attention:
            return [(input_shape[0], output_len), (input_shape[0], input_shape[1])]
        return (input_shape[0], output_len)

    def compute_mask(self, input, input_mask=None):
        if isinstance(input_mask, list):
            return [None] * len(input_mask)
        else:
            return None

In [13]:
########################################
## define the RNN with Attention model structure
########################################

from keras import optimizers
adam_optimizer = optimizers.Adam(lr=1e-3, clipvalue=1, clipnorm=1, decay=1e-10)

def get_plain_attention_rnn():
    recurrent_units = 72
    dropout_rate = 0.35
    dense_size = 32
    input_layer = Input(shape=(MAX_SEQUENCE_LENGTH,))
    embedding_layer = Embedding(nb_words,
            EMBEDDING_DIM,
            weights=[embedding_matrix],
            input_length=MAX_SEQUENCE_LENGTH,
            trainable=False)(input_layer)
    embedding_layer = SpatialDropout1D(0.25)(embedding_layer)
    x = Bidirectional(CuDNNGRU(recurrent_units, return_sequences=True))(embedding_layer)
    x = Dropout(0.4)(x)
    x = Bidirectional(CuDNNGRU(recurrent_units, return_sequences=True))(x)
    
    last = Lambda(lambda t: t[:, -1])(x)
    maxpool = GlobalMaxPooling1D()(x)
    attn = AttentionWeightedAverage()(x)
    average = GlobalAveragePooling1D()(x)
    
    concatenated = concatenate([attn, maxpool, last, average], axis=1)
    x = Dropout(0.62)(concatenated)
    x = Dense(72, activation="relu")(x)
    output_layer = Dense(6, activation="sigmoid")(x)
    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(loss='binary_crossentropy',
                  optimizer=adam_optimizer,
    metrics=['accuracy'])
    return model

In [14]:
from keras.layers import Conv1D, GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate
from keras.engine import Layer, InputSpec
from keras.layers import Flatten
import tensorflow as tf

class KMaxPooling(Layer):
    """
    K-max pooling layer that extracts the k-highest activations from a sequence (2nd dimension).
    TensorFlow backend.
    """
    def __init__(self, k=1, **kwargs):
        super().__init__(**kwargs)
        self.input_spec = InputSpec(ndim=3)
        self.k = k

    def compute_output_shape(self, input_shape):
        return (input_shape[0], (input_shape[2] * self.k))

    def call(self, inputs):
        
        # swap last two dimensions since top_k will be applied along the last dimension
        shifted_input = tf.transpose(inputs, [0, 2, 1])
        
        # extract top_k, returns two tensors [values, indices]
        top_k = tf.nn.top_k(shifted_input, k=self.k, sorted=True, name=None)[0]
        
        # return flattened output
        return Flatten()(top_k)

In [15]:
########################################
## define the RNN with Attention model structure
########################################

from keras import optimizers
adam_optimizer = optimizers.Adam(lr=1e-3)

def get_kmax_text_cnn():
    embedding_layer = Embedding(nb_words,
            EMBEDDING_DIM,
            weights=[embedding_matrix],
            input_length=MAX_SEQUENCE_LENGTH,
            trainable=False)

    filter_nums = 120
    drop = 0.5
    
    comment_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences= embedding_layer(comment_input)
    embedded_sequences = SpatialDropout1D(0.2)(embedded_sequences)
    
    conv_0 = Conv1D(filter_nums, 1, kernel_initializer="normal", padding="valid", activation="relu")(embedded_sequences)
    conv_1 = Conv1D(filter_nums, 2, kernel_initializer="normal", padding="valid", activation="relu")(embedded_sequences)
    conv_2 = Conv1D(filter_nums, 3, kernel_initializer="normal", padding="valid", activation="relu")(embedded_sequences)

    #conv_0 = Conv1D(filter_nums / 2, 1, kernel_initializer="normal", padding="valid", activation="relu")(conv_0)
    #conv_1 = Conv1D(filter_nums / 2, 2, kernel_initializer="normal", padding="valid", activation="relu")(conv_1)
    #conv_2 = Conv1D(filter_nums / 2, 3, strides=2, kernel_initializer="normal", padding="valid", activation="relu")(conv_2)

    maxpool_0 = KMaxPooling(k=4)(conv_0)
    maxpool_1 = KMaxPooling(k=4)(conv_1)
    maxpool_2 = KMaxPooling(k=4)(conv_2)

    merged_tensor = merge([maxpool_0, maxpool_1, maxpool_2], mode='concat', concat_axis=1)
    output = Dropout(drop)(merged_tensor)
    output = Dense(units=120, activation="relu")(output)
    output = Dropout(0.3)(output)
    output = Dense(units=6, activation='sigmoid')(output)

    model = Model(inputs=comment_input, outputs=output)
    model.compile(loss='binary_crossentropy', optimizer=adam_optimizer, metrics=['accuracy'])
    return model

In [16]:
########################################
## define the RNN with Attention model structure
########################################

from keras import optimizers
adam_optimizer = optimizers.Adam(lr=1e-3)

def get_higway_cnn():
    embedding_layer = Embedding(nb_words,
            EMBEDDING_DIM,
            weights=[embedding_matrix],
            input_length=MAX_SEQUENCE_LENGTH,
            trainable=False)

    filter_nums = 120
    drop = 0.5
    
    comment_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences= embedding_layer(comment_input)

    conv_0 = Conv1D(filter_nums, 1, kernel_initializer="normal", padding="valid", activation="relu")(embedded_sequences)
    conv_1 = Conv1D(filter_nums, 2, kernel_initializer="normal", padding="valid", activation="relu")(embedded_sequences)
    conv_2 = Conv1D(filter_nums, 3, kernel_initializer="normal", padding="valid", activation="relu")(embedded_sequences)

    maxpool_0 = KMaxPooling(k=4)(conv_0)
    maxpool_1 = KMaxPooling(k=4)(conv_1)
    maxpool_2 = KMaxPooling(k=4)(conv_2)

    merged_tensor = merge([maxpool_0, maxpool_1, maxpool_2], mode='concat', concat_axis=1)
    output = Dropout(drop)(merged_tensor)
    output = Dense(units=96)(output)
    output = BatchNormalization()(output)
    output = Activation('relu')(output)
    output = highway_layers(output, 3)
    output = Dense(units=6, activation='sigmoid')(output)

    model = Model(inputs=comment_input, outputs=output)
    model.compile(loss='binary_crossentropy', optimizer=adam_optimizer, metrics=['accuracy'])
    return model

In [17]:
from keras.layers import concatenate, Reshape, Permute

# 10-Fold Cross Valiadtion

In [18]:
from sklearn.metrics import roc_auc_score

import numpy as np

STAMP = 'pavel_rnn_%.2f_%.2f'%(0.5,0.5)

def _train_model_by_auc(model, batch_size, train_x, train_y, val_x, val_y):
    best_auc = -1
    best_weights = None
    best_epoch = 0

    current_epoch = 1

    while True:
        model.fit(train_x, train_y, batch_size=batch_size, epochs=1, validation_data=[val_x, val_y])
        y_pred = model.predict(val_x, batch_size=batch_size)
        current_auc = roc_auc_score(val_y, y_pred)
        print("Epoch {} auc {:.6f} best_auc {:.6f}".format(current_epoch, current_auc, best_auc))
        current_epoch += 1
        if best_auc < current_auc or best_auc == -1:
            best_auc = current_auc
            best_weights = model.get_weights()
            best_epoch = current_epoch
        else:
            if current_epoch - best_epoch == 5:
                break

    model.set_weights(best_weights)
    return model, best_auc

def _train_model_by_logloss(model, batch_size, train_x, train_y, val_x, val_y, fold_id):
    early_stopping =EarlyStopping(monitor='val_loss', patience=6)
    bst_model_path = STAMP + str(fold_id) + '.h5'
    model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)
    hist = model.fit(train_x, train_y,
        validation_data=(val_x, val_y),
        epochs=50, batch_size=batch_size, shuffle=True,
        callbacks=[early_stopping, model_checkpoint])
    bst_val_score = min(hist.history['val_loss'])
    predictions = model.predict(val_x)
    auc = roc_auc_score(val_y, predictions)
    return model, bst_val_score, auc, predictions

def train_folds(X, y, fold_count, batch_size, get_model_func):
    fold_size = len(X) // fold_count
    models = []
    fold_predictions = []
    score = 0
    total_auc = 0
    for fold_id in range(0, fold_count):
        fold_start = fold_size * fold_id
        fold_end = fold_start + fold_size

        if fold_id == fold_size - 1:
            fold_end = len(X)

        train_x = np.concatenate([X[:fold_start], X[fold_end:]])
        train_y = np.concatenate([y[:fold_start], y[fold_end:]])

        val_x = X[fold_start:fold_end]
        val_y = y[fold_start:fold_end]
    
        print("In fold #", fold_id)
        model, bst_val_score, auc, fold_prediction = _train_model_by_logloss(get_model_func(), batch_size, train_x, train_y, val_x, val_y, fold_id)
        score += bst_val_score
        total_auc += auc
        fold_predictions.append(fold_prediction)
        models.append(model)
    return models, score / fold_count, total_auc / fold_count, fold_predictions

In [19]:
models, val_loss, total_auc, fold_predictions = train_folds(data, y, 10, 256, get_plain_attention_rnn)

In fold # 0
Train on 143614 samples, validate on 15957 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
In fold # 1
Train on 143614 samples, validate on 15957 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
In fold # 2
Train on 143614 samples, validate on 15957 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
In fold # 3
Train on 143614 samples, validate on 15957 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
In fold # 4
Train on 143614 samples, validate on 15957 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/5

Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50


In [20]:
print("Overall val-loss:", val_loss, "AUC", total_auc)

Overall val-loss: 0.03917059923720672 AUC 0.989076054755853


In [21]:
print("Overall val-loss:", val_loss, "AUC", total_auc)

Overall val-loss: 0.03917059923720672 AUC 0.989076054755853


## Predections

In [22]:
train_fold_preditcions = np.concatenate(fold_predictions, axis=0)

In [23]:
training_auc = roc_auc_score(y[:-1], train_fold_preditcions)
print("Training AUC", training_auc)

Training AUC 0.9886303316374511


In [24]:
#test_data = test_df
CLASSES = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
submit_path_prefix = "results/rnn/dropout-glove-bigru-attall-lp-ct-" + str(MAX_SEQUENCE_LENGTH) 

print("Predicting testing results...")
test_predicts_list = []
for fold_id, model in enumerate(models):
    test_predicts = model.predict(test_data, batch_size=256, verbose=1)
    test_predicts_list.append(test_predicts)
    np.save("predict_path/", test_predicts)

test_predicts = np.zeros(test_predicts_list[0].shape)
for fold_predict in test_predicts_list:
    test_predicts += fold_predict
test_predicts /= len(test_predicts_list)

test_ids = test_df["id"].values
test_ids = test_ids.reshape((len(test_ids), 1))

test_predicts = pd.DataFrame(data=test_predicts, columns=CLASSES)
test_predicts["id"] = test_ids
test_predicts = test_predicts[["id"] + CLASSES]
submit_path = submit_path_prefix + "-L{:4f}-A{:4f}.csv".format(val_loss, total_auc)
test_predicts.to_csv(submit_path, index=False)

Predicting testing results...


In [25]:
print("Predicting training results...")

train_ids = train_df["id"].values
train_ids = train_ids.reshape((len(train_ids), 1))

train_predicts = pd.DataFrame(data=train_fold_preditcions, columns=CLASSES) # IT MISS THE LAST ONE's label
train_predicts["id"] = train_ids[:-1]
train_predicts = train_predicts[["id"] + CLASSES]
submit_path = submit_path_prefix + "-Train-L{:4f}-A{:4f}.csv".format(val_loss, training_auc)
train_predicts.to_csv(submit_path, index=False)

Predicting training results...
