In [0]:
! pip install keras

In [0]:
# Glove
# !wget http://nlp.stanford.edu/data/wordvecs/glove.840B.300d.zip -c
# !unzip glove.840B.300d.zip -d glove.840B.300d/

# Twitter
# !wget http://nlp.stanford.edu/data/glove.twitter.27B.zip -c
# !unzip glove.twitter.27B.zip -d glove.twitter.27B.200d/

# FastText
!wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/crawl-300d-2M.vec.zip
!unzip crawl-300d-2M.vec.zip -d crawl-300d-2M.vec/

In [0]:
import numpy as np
np.random.seed(32)
import pandas as pd

! pip install pydrive
# these classes allow you to request the Google drive API
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive 
from google.colab import auth 
from oauth2client.client import GoogleCredentials

# 1. Authenticate the Colab user and build a PyDrive client from the
#    application-default credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# 2. Copy each Drive file into the notebook VM's working directory and parse it.
#    train: https://drive.google.com/open?id=1Rk9F9YUMuG9JtuhCaUck1Lxid3OISc1-
#    test:  https://drive.google.com/open?id=1CH6MLJYHK6rtC-p_4kK7Ms17CIisowGZ
drive_csvs = (
    ('1Rk9F9YUMuG9JtuhCaUck1Lxid3OISc1-', 'train.csv'),
    ('1CH6MLJYHK6rtC-p_4kK7Ms17CIisowGZ', 'test.csv'),
)
loaded = []
for file_id, csv_name in drive_csvs:
    downloaded = drive.CreateFile({'id': file_id})
    downloaded.GetContentFile(csv_name)
    loaded.append(pd.read_csv(csv_name))
train, test = loaded

In [0]:
from google.colab import files

def download_file(file_name):
  """Pass `file_name` to `files.download` from `google.colab`."""
  files.download(file_name)

In [0]:
# Width of one word vector; used for the embedding matrix and Embedding layers.
embed_size = 300
# Vocabulary cap handed to the Tokenizer as `num_words`.
max_features = 150000 
# Sequence length passed to `pad_sequences` as `maxlen`.
max_text_len = 150

# Path of the pre-trained word-vector file that is loaded below; the
# alternatives are kept commented out.
# NOTE(review): the twitter file name says 200d, so `embed_size` would
# presumably have to change with it — confirm before switching.
# EMBEDDING_FILE = "glove.840B.300d/glove.840B.300d.txt"
# EMBEDDING_FILE = "glove.twitter.27B.200d/glove.twitter.27B.200d.txt"
EMBEDDING_FILE = "crawl-300d-2M.vec/crawl-300d-2M.vec"

In [0]:
import sys, os, re, csv, codecs, gc
os.environ["OMP_NUM_THREADS"] = "4"
import nltk
nltk.download("punkt")

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D
from keras.layers import Bidirectional, Add, Flatten, add, PReLU, MaxPooling1D
from keras.optimizers import Adam, RMSprop, Nadam
from keras.layers import SpatialDropout1D, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers import concatenate, GRU, CuDNNGRU, CuDNNLSTM, TimeDistributed
from keras.layers.normalization import BatchNormalization
from keras.models import Model, load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split, KFold
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras import backend as K
from keras.engine import InputSpec, Layer

import logging
from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback

class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set after epochs.

    Args:
        validation_data: `(X_val, y_val)` pair; it is unpacked in `__init__`,
            so the default empty tuple raises `ValueError`.
        interval: evaluate whenever `epoch % interval == 0` (epochs are
            0-based, so the first epoch is always evaluated).
    """

    def __init__(self, validation_data=(), interval=1):
        # Start the MRO lookup at this class so that Callback.__init__ runs;
        # `super(Callback, self)` skipped the base-class initialiser.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation set and print its ROC-AUC score."""
        # `logs` is not used here; a None default avoids a shared mutable dict.
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch + 1, score))

In [0]:
class AttentionWeightedAverage(Layer):
    """
    Computes a weighted average of the different channels across timesteps.
    Uses 1 parameter per channel to compute the attention value for a single timestep.

    Args:
        return_attention: when True, `call` returns `[result, att_weights]`
            instead of only `result`.
    """

    def __init__(self, return_attention=False, **kwargs):
        # Initializer used for the attention weight created in `build`.
        self.init = initializers.get('uniform')
        # NOTE(review): these attributes are assigned before the base Layer
        # initialiser runs — confirm the base class does not reset them.
        self.supports_masking = True
        self.return_attention = return_attention
        super(AttentionWeightedAverage, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the `(input_shape[2], 1)` attention weight; input must be 3-D."""
        self.input_spec = [InputSpec(ndim=3)]
        assert len(input_shape) == 3

        self.W = self.add_weight(shape=(input_shape[2], 1),
                                 name='{}_W'.format(self.name),
                                 initializer=self.init)
        # NOTE(review): assigns `trainable_weights` directly; this presumably
        # relies on the installed Keras allowing that attribute to be set.
        self.trainable_weights = [self.W]
        super(AttentionWeightedAverage, self).build(input_shape)

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over axis 1.

        When `return_attention` is set, returns `[result, att_weights]`.
        """
        # computes a probability distribution over the timesteps
        # uses 'max trick' for numerical stability
        # reshape is done to avoid issue with Tensorflow
        # and 1-dimensional weights
        logits = K.dot(x, self.W)
        x_shape = K.shape(x)
        logits = K.reshape(logits, (x_shape[0], x_shape[1]))
        ai = K.exp(logits - K.max(logits, axis=-1, keepdims=True))

        # masked timesteps have zero weight
        if mask is not None:
            mask = K.cast(mask, K.floatx())
            ai = ai * mask
        # Normalise over axis 1; epsilon guards against a zero denominator.
        att_weights = ai / (K.sum(ai, axis=1, keepdims=True) + K.epsilon())
        weighted_input = x * K.expand_dims(att_weights)
        result = K.sum(weighted_input, axis=1)
        if self.return_attention:
            return [result, att_weights]
        return result

    def get_output_shape_for(self, input_shape):
        """Alias that forwards to `compute_output_shape`."""
        return self.compute_output_shape(input_shape)

    def compute_output_shape(self, input_shape):
        """Return `(input_shape[0], input_shape[2])`, plus the attention shape
        `(input_shape[0], input_shape[1])` when `return_attention` is set."""
        output_len = input_shape[2]
        if self.return_attention:
            return [(input_shape[0], output_len), (input_shape[0], input_shape[1])]
        return (input_shape[0], output_len)

    def compute_mask(self, input, input_mask=None):
        """Return no mask: None, or a list of None for a list of masks."""
        if isinstance(input_mask, list):
            return [None] * len(input_mask)
        else:
            return None


def pair_loss(y_true, y_pred):
    """Mean of `sigmoid(negative - positive)` over negative/positive pairs.

    `y_pred` is split into two partitions by the integer-cast `y_true`
    (partition 0 = negatives, partition 1 = positives); the two partitions
    are broadcast against each other to form every pairwise difference.

    NOTE(review): this relies on a module-level `tf`, which is not imported
    anywhere in the visible part of this file.
    """
    labels = tf.cast(y_true, tf.int32)
    negatives, positives = tf.dynamic_partition(y_pred, labels, 2)
    pos_row = tf.expand_dims(positives, 0)
    neg_col = tf.expand_dims(negatives, -1)
    return K.mean(K.sigmoid(neg_col - pos_row))

In [0]:
def clean_corpus(comment, tokenize = None):
    """Lower-case a comment, spell out symbols, digits and contractions, re-tokenise.

    Args:
        comment: raw comment text.
        tokenize: callable mapping a string to a list of tokens. Defaults to
            `nltk.word_tokenize`, which is what the notebook uses.

    Returns:
        The cleaned comment with its tokens joined by single spaces.
    """
    # Applied in order. The last entry was "'r", which turned "you're" into
    # "you are e"; it now matches the whole "'re" suffix.
    replacements = (
        ('&', ' and '),
        ('0', ' zero '),
        ('1', ' one '),
        ('2', ' two '),
        ('3', ' three '),
        ('4', ' four '),
        ('5', ' five '),
        ('6', ' six '),
        ('7', ' seven '),
        ('8', ' eight '),
        ('9', ' nine '),
        ('\'ve', ' have '),
        ('\'d', ' would '),
        ('\'m', ' am '),
        ('n\'t', ' not '),
        ('\'s', ' is '),
        ('\'re', ' are '),
    )
    if tokenize is None:
        tokenize = nltk.word_tokenize

    comment = comment.lower()
    for old, new in replacements:
        comment = comment.replace(old, new)
    return " ".join(tokenize(comment))

In [0]:
# Target label columns of the dataset.
category = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Replace missing comments with a placeholder, then normalise every comment.
# The result is assigned back to the column: `fillna(..., inplace=True)` on a
# column selection acts on an intermediate object and is not guaranteed to
# update the frame under pandas copy-on-write.
train["comment_text"] = train["comment_text"].fillna("no comment").apply(clean_corpus)
test["comment_text"] = test["comment_text"].fillna("no comment").apply(clean_corpus)

In [0]:
# Row count and label matrix of the full training frame.
nrow_train = train.shape[0]
Y_train = train[category].values

# True: keep the whole training set for K-fold CV.
# False: carve out a 10% hold-out set for tuning.
k_fold = True

# Parts shared by both modes: test text and an (unfitted) tokenizer.
raw_text_test = test["comment_text"].str.lower()
tk = Tokenizer(num_words = max_features, lower = True)

if k_fold:
  print("Doing K fold!")
  raw_text_train = train["comment_text"].str.lower()
  tk.fit_on_texts(raw_text_train)

  train["comment_seq"] = tk.texts_to_sequences(raw_text_train.str.lower())
  X_train = pad_sequences(train.comment_seq, maxlen = max_text_len)

else:
  print("Tuning model!")
  X_train, X_valid, Y_train, Y_valid = train_test_split(train, Y_train, test_size = 0.1)
  raw_text_train = X_train["comment_text"].str.lower()
  raw_text_valid = X_valid["comment_text"].str.lower()
  # The vocabulary is fitted on the training split only.
  tk.fit_on_texts(raw_text_train)

  X_train["comment_seq"] = tk.texts_to_sequences(raw_text_train.str.lower())
  X_valid["comment_seq"] = tk.texts_to_sequences(raw_text_valid.str.lower())

  X_train = pad_sequences(X_train.comment_seq, maxlen = max_text_len)
  X_valid = pad_sequences(X_valid.comment_seq, maxlen = max_text_len)

# The test set is encoded with the tokenizer fitted above.
test["comment_seq"] = tk.texts_to_sequences(raw_text_test.str.lower())
X_test = pad_sequences(test.comment_seq, maxlen = max_text_len)

In [0]:
def get_coefs(word, *arr):
    """Return `(word, vector)`, with `arr` converted to a float32 array."""
    vector = np.asarray(arr, dtype = "float32")
    return word, vector
def _load_embeddings(path, dim):
    """Read a text word-vector file into a `{word: float32 vector}` dict.

    The file is opened as UTF-8 and closed when reading finishes. Each line
    is split from the right into at most `dim` numeric fields, so a token
    that itself contains spaces stays in one piece. Lines without exactly
    `dim` values are skipped, e.g. a "<count> <dim>" header line.
    """
    index = {}
    with open(path, encoding = "utf-8") as handle:
        for line in handle:
            parts = line.rstrip().rsplit(" ", dim)
            if len(parts) != dim + 1:
                continue
            word, vector = get_coefs(*parts)
            index[word] = vector
    return index

embeddings_index = _load_embeddings(EMBEDDING_FILE, embed_size)

In [0]:
# all_embs = np.stack(embeddings_index.values())
# emb_mean,emb_std = all_embs.mean(), all_embs.std()

In [0]:
word_index = tk.word_index
# Keras Tokenizer indices start at 1 (0 is reserved), so a vocabulary smaller
# than `max_features` needs len(word_index) + 1 rows; without the +1 the
# highest word index would fall outside the matrix.
nb_words = min(max_features, len(word_index) + 1)
# Words without a pre-trained vector keep an all-zero row.
# (Alternative kept from the original: np.random.normal(emb_mean, emb_std, ...).)
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words: continue
    embedding_vector = embeddings_index.get(word)
    # The length check ignores malformed entries, such as a vector file's
    # header line, that would otherwise be broadcast across the whole row.
    if embedding_vector is not None and len(embedding_vector) == embed_size:
        embedding_matrix[i] = embedding_vector

In [0]:
def build_LSTM_model(units = 0, dr = 0.0, lr_i = 0.0, lr_f = 0.0):
  """Build and compile the BiLSTM model with an attention and a CNN branch.

  Args:
    units: hidden size of each CuDNNLSTM direction.
    dr: SpatialDropout1D rate applied to the embeddings.
    lr_i: Adam learning rate.
    lr_f: Adam decay.

  Returns:
    A compiled Keras Model with six sigmoid outputs.
  """
  inp = Input(shape = (max_text_len,))
  # Frozen embedding layer initialised from the pre-trained matrix.
  embedded = Embedding(nb_words, embed_size, weights = [embedding_matrix],
                       input_length = max_text_len, trainable = False)(inp)

  sequence = SpatialDropout1D(dr)(embedded)
  sequence = Bidirectional(CuDNNLSTM(units, return_sequences = True))(sequence)

  # Branch 1: attention summary; branch 2: Conv1D with avg + max pooling.
  atten = AttentionWeightedAverage()(sequence)
  cnn = Conv1D(64, kernel_size = 3, padding = "same",
               kernel_initializer = "he_uniform", activation = "elu")(sequence)
  avg_pool = GlobalAveragePooling1D()(cnn)
  max_pool = GlobalMaxPooling1D()(cnn)
  merged = concatenate([atten, avg_pool, max_pool])

  out_put = Dense(6, activation = "sigmoid")(merged)
  model = Model(inputs = inp, outputs = out_put)
  optimizer = Adam(lr = lr_i, decay = lr_f)
  model.compile(loss = "binary_crossentropy", optimizer = optimizer, metrics = ['accuracy'])
  return model

In [0]:
# K-fold training of the LSTM model: every fold trains a fresh model, reloads
# its best checkpoint, then adds test predictions and fills the OOF rows.
early_stop = EarlyStopping(monitor = "val_loss", mode = "min", patience = 3)

batch_sizes, epochs = 128, 10
units, dr = 128, 0.3
lr_i, lr_f = 1e-3, 0
n_folds = 10
lstm_pred = 0
lstm_oof_pred = np.zeros((nrow_train, 6))
val_losses = []

kfold = KFold(n_splits = n_folds, shuffle = True, random_state = 32)
for i, (train_idx, valid_idx) in enumerate(kfold.split(X_train)):
  print("\nRunning fold {}/{}".format(i + 1, n_folds))
  model = build_LSTM_model(units = units, dr = dr, lr_i = lr_i, lr_f = lr_f)

  x_train, x_valid = X_train[train_idx], X_train[valid_idx]
  y_train, y_valid = Y_train[train_idx], Y_train[valid_idx]

  ra_val = RocAucEvaluation(validation_data = (x_valid, y_valid), interval = 1)

  # Only the epoch with the lowest val_loss is kept on disk.
  file_path = "fold " + str(i+1) + " best_model.hdf5"
  check_point = ModelCheckpoint(file_path, monitor = "val_loss", verbose = 1,
                                save_best_only = True, mode = "min")

  hist = model.fit(x_train, y_train,
                   batch_size = batch_sizes, epochs = epochs,
                   validation_data = (x_valid, y_valid), verbose = 2,
                   callbacks = [check_point, early_stop, ra_val])
  val_losses.append(min(hist.history["val_loss"]))

  # Predict with the checkpointed weights rather than the last epoch's.
  model = load_model(file_path, custom_objects = {"AttentionWeightedAverage": AttentionWeightedAverage})
  lstm_pred += model.predict(X_test, batch_size = batch_sizes, verbose = 2)
  lstm_oof_pred[valid_idx] = model.predict(x_valid, batch_size = batch_sizes, verbose = 2)

  del file_path, model
  gc.collect()

# Average the summed test predictions over the folds.
lstm_pred = lstm_pred/n_folds
print("\noof score of lstm is {}.".format(roc_auc_score(Y_train, lstm_oof_pred)))
print("\nAverage val loss of {} folds is {}.".format(n_folds, np.mean(val_losses)))

In [0]:
# Submission table: "id" plus one column per label, filled with the
# fold-averaged LSTM test predictions, written to CSV and downloaded.
lstm_submission = pd.DataFrame().reindex(columns = ["id"] + category)
lstm_submission["id"] = test[["id"]]
lstm_submission[category[0:6]] = lstm_pred
lstm_submission.to_csv("lstm_submission.csv", index = False)
download_file("lstm_submission.csv")

In [0]:
# Preview the first rows of the LSTM submission table.
lstm_submission.head()

In [0]:
# Out-of-fold table: training ids plus the LSTM predictions each row received
# while it was in the validation fold; written to CSV and downloaded.
lstm_oof_prediction = pd.DataFrame().reindex(columns = ["id"] + category)
lstm_oof_prediction["id"] = train[["id"]]
lstm_oof_prediction[category[0:6]] = pd.DataFrame(lstm_oof_pred)
lstm_oof_prediction.to_csv("lstm_oof_prediction.csv", index = False)
download_file("lstm_oof_prediction.csv")

In [0]:
def build_GRU_model(units = 0, dr = 0.0):
  """Build and compile the BiGRU + attention model.

  Args:
    units: hidden size of each CuDNNGRU direction.
    dr: SpatialDropout1D rate applied to the embeddings.

  Returns:
    A compiled Keras Model with six sigmoid outputs (Adam, lr 1e-3).
  """
  inp = Input(shape = (max_text_len,))
  # Frozen embedding layer initialised from the pre-trained matrix.
  embedded = Embedding(nb_words, embed_size, weights = [embedding_matrix],
                       input_length = max_text_len, trainable = False)(inp)

  hidden = SpatialDropout1D(dr)(embedded)
  hidden = Bidirectional(CuDNNGRU(units, return_sequences = True))(hidden)
  hidden = AttentionWeightedAverage()(hidden)
  hidden = Dense(64, activation = "relu")(hidden)

  out_put = Dense(6, activation = "sigmoid")(hidden)
  model = Model(inputs = inp, outputs = out_put)

  optimizer = Adam(lr = 1e-3)
  model.compile(loss = "binary_crossentropy", optimizer = optimizer, metrics = ['accuracy'])
  return model

# K-fold training of the GRU model: every fold trains a fresh model, reloads
# its best checkpoint, then adds test predictions and fills the OOF rows.
early_stop = EarlyStopping(monitor = "val_loss", mode = "min", patience = 3)

batch_sizes, epochs = 128, 10
units, dr = 128, 0.2
n_folds = 10
gru_pred = 0
gru_oof_pred = np.zeros((nrow_train, 6))
val_losses = []

kfold = KFold(n_splits = n_folds, shuffle = True, random_state = 32)
for i, (train_idx, valid_idx) in enumerate(kfold.split(X_train)):
  print("\nRunning fold {}/{}".format(i + 1, n_folds))
  model = build_GRU_model(units = units, dr = dr)

  x_train, x_valid = X_train[train_idx], X_train[valid_idx]
  y_train, y_valid = Y_train[train_idx], Y_train[valid_idx]

  ra_val = RocAucEvaluation(validation_data = (x_valid, y_valid), interval = 1)

  # Only the epoch with the lowest val_loss is kept on disk.
  file_path = "fold " + str(i+1) + " best_model.hdf5"
  check_point = ModelCheckpoint(file_path, monitor = "val_loss", verbose = 1,
                                save_best_only = True, mode = "min")

  hist = model.fit(x_train, y_train,
                   batch_size = batch_sizes, epochs = epochs,
                   validation_data = (x_valid, y_valid), verbose = 2,
                   callbacks = [check_point, early_stop, ra_val])

  # Predict with the checkpointed weights rather than the last epoch's.
  model = load_model(file_path, custom_objects = {"AttentionWeightedAverage": AttentionWeightedAverage})
  val_losses.append(min(hist.history["val_loss"]))
  gru_pred += model.predict(X_test, batch_size = batch_sizes, verbose = 2)
  gru_oof_pred[valid_idx] = model.predict(x_valid, batch_size = batch_sizes, verbose = 2)

  del file_path, model
  gc.collect()

# Average the summed test predictions over the folds.
gru_pred = gru_pred/n_folds
print("\noof score of gru is {}.".format(roc_auc_score(Y_train, gru_oof_pred)))
print("\nAverage val loss of {} folds is {}.".format(n_folds, np.mean(val_losses)))

In [0]:
# Submission table: "id" plus one column per label, filled with the
# fold-averaged GRU test predictions, written to CSV and downloaded.
gru_submission = pd.DataFrame().reindex(columns = ["id"] + category)
gru_submission["id"] = test[["id"]]
gru_submission[category[0:6]] = gru_pred
gru_submission.to_csv("gru_submission.csv", index = False)
download_file("gru_submission.csv")

In [0]:
# Preview the first rows of the GRU submission table.
gru_submission.head()

In [0]:
# Out-of-fold table: training ids plus the GRU predictions each row received
# while it was in the validation fold; written to CSV and downloaded.
gru_oof_prediction = pd.DataFrame().reindex(columns = ["id"] + category)
gru_oof_prediction["id"] = train[["id"]]
gru_oof_prediction[category[0:6]] = pd.DataFrame(gru_oof_pred)
gru_oof_prediction.to_csv("gru_oof_prediction.csv", index = False)
download_file("gru_oof_prediction.csv")

In [0]:
# Preview the first rows of the GRU out-of-fold prediction table.
gru_oof_prediction.head()

In [0]:
def build_rcnn_model(units = 0, dr = 0.0, lr_i = 0.0, lr_f = 0.0):
  """Build and compile the BiGRU -> Conv1D model with avg/max pooling.

  Args:
    units: hidden size of each CuDNNGRU direction.
    dr: SpatialDropout1D rate applied to the embeddings.
    lr_i: Adam learning rate.
    lr_f: Adam decay.

  Returns:
    A compiled Keras Model with six sigmoid outputs.
  """
  inp = Input(shape = (max_text_len,))
  # Frozen embedding layer initialised from the pre-trained matrix.
  embedded = Embedding(nb_words, embed_size, weights = [embedding_matrix],
                       input_length = max_text_len, trainable = False)(inp)

  sequence = SpatialDropout1D(dr)(embedded)
  sequence = Bidirectional(CuDNNGRU(units, return_sequences = True))(sequence)
  cnn = Conv1D(64, kernel_size = 3, padding = "same",
               kernel_initializer = "he_uniform", activation = "elu")(sequence)
  avg_pool = GlobalAveragePooling1D()(cnn)
  max_pool = GlobalMaxPooling1D()(cnn)
  pooled = concatenate([avg_pool, max_pool])

  out_put = Dense(6, activation = "sigmoid")(pooled)
  model = Model(inputs = inp, outputs = out_put)
  optimizer = Adam(lr = lr_i, decay = lr_f)
  model.compile(loss = "binary_crossentropy", optimizer = optimizer, metrics = ['accuracy'])
  return model

In [0]:
# K-fold training of the RCNN model: every fold trains a fresh model, reloads
# its best checkpoint, then adds test predictions and fills the OOF rows.
early_stop = EarlyStopping(monitor = "val_loss", mode = "min", patience = 3)

batch_sizes, epochs = 128, 10
units, dr = 128, 0.3
lr_i, lr_f = 1e-3, 0
n_folds = 10
rcnn_pred = 0
rcnn_oof_pred = np.zeros((nrow_train, 6))
val_losses = []

kfold = KFold(n_splits = n_folds, shuffle = True, random_state = 32)
for i, (train_idx, valid_idx) in enumerate(kfold.split(X_train)):
  print("\nRunning fold {}/{}".format(i + 1, n_folds))
  model = build_rcnn_model(units = units, dr = dr, lr_i = lr_i, lr_f = lr_f)

  x_train, x_valid = X_train[train_idx], X_train[valid_idx]
  y_train, y_valid = Y_train[train_idx], Y_train[valid_idx]

  ra_val = RocAucEvaluation(validation_data = (x_valid, y_valid), interval = 1)

  # Only the epoch with the lowest val_loss is kept on disk.
  file_path = "fold " + str(i+1) + " best_model.hdf5"
  check_point = ModelCheckpoint(file_path, monitor = "val_loss", verbose = 1,
                                save_best_only = True, mode = "min")

  hist = model.fit(x_train, y_train,
                   batch_size = batch_sizes, epochs = epochs,
                   validation_data = (x_valid, y_valid), verbose = 2,
                   callbacks = [check_point, early_stop, ra_val])
  val_losses.append(min(hist.history["val_loss"]))

  # Predict with the checkpointed weights rather than the last epoch's.
  model = load_model(file_path, custom_objects = {"AttentionWeightedAverage": AttentionWeightedAverage})
  rcnn_pred += model.predict(X_test, batch_size = batch_sizes, verbose = 2)
  rcnn_oof_pred[valid_idx] = model.predict(x_valid, batch_size = batch_sizes, verbose = 2)

  del file_path, model
  gc.collect()

# Average the summed test predictions over the folds.
rcnn_pred = rcnn_pred/n_folds
print("\noof score of rcnn is {}.".format(roc_auc_score(Y_train, rcnn_oof_pred)))
print("\nAverage val loss of {} folds is {}.".format(n_folds, np.mean(val_losses)))

In [0]:
# Submission table: "id" plus one column per label, filled with the
# fold-averaged RCNN test predictions, written to CSV and downloaded.
rcnn_submission = pd.DataFrame().reindex(columns = ["id"] + category)
rcnn_submission["id"] = test[["id"]]
rcnn_submission[category[0:6]] = rcnn_pred
rcnn_submission.to_csv("rcnn_submission.csv", index = False)
download_file("rcnn_submission.csv")

In [0]:
# Preview the first rows of the RCNN submission table.
rcnn_submission.head()

In [0]:
# Out-of-fold table: training ids plus the RCNN predictions each row received
# while it was in the validation fold; written to CSV and downloaded.
rcnn_oof_prediction = pd.DataFrame().reindex(columns = ["id"] + category)
rcnn_oof_prediction["id"] = train[["id"]]
rcnn_oof_prediction[category[0:6]] = pd.DataFrame(rcnn_oof_pred)
rcnn_oof_prediction.to_csv("rcnn_oof_prediction.csv", index = False)
download_file("rcnn_oof_prediction.csv")

In [0]:
# Preview the first rows of the RCNN out-of-fold prediction table.
rcnn_oof_prediction.head()

In [0]:
act, pad, kernel_ini = "linear", "same", "he_uniform"
def build_dpcnn_model(units = 0, k = 0, num_block = 0, lr = 0.0, dr = 0.0):
    """Build and compile the DPCNN-style residual convolutional model.

    Reads the module-level `act`, `pad` and `kernel_ini` settings for every
    convolution and for the hidden dense layers.

    Args:
        units: number of filters in every Conv1D layer.
        k: kernel size of the convolutions inside the residual blocks.
        num_block: number of down-sampling residual blocks; 0 leaves only the
            final residual block.
        lr: Nadam learning rate.
        dr: SpatialDropout1D rate applied to the embeddings.

    Returns:
        A compiled Keras Model with six sigmoid outputs.
    """
    def conv_pair(tensor, kernel_size):
        # Pre-activation unit applied twice: PReLU followed by Conv1D.
        for _ in range(2):
            tensor = PReLU()(tensor)
            tensor = Conv1D(units, kernel_size = kernel_size, padding = pad, activation = act,
                            kernel_initializer = kernel_ini)(tensor)
        return tensor

    inp = Input(shape = (max_text_len, ))
    emb = Embedding(nb_words, embed_size, weights = [embedding_matrix],
                    input_length = max_text_len, trainable = False)(inp)
    emb = SpatialDropout1D(dr)(emb)

    # 1x1 convolutions give the embedding `units` channels, so it can be
    # added to the first block's output as its shortcut.
    short_cut = conv_pair(emb, 1)
    block = emb

    # Main blocks: residual conv pair, then a stride-2 max-pool. Starting from
    # the embedding before the loop keeps num_block == 0 from raising NameError.
    for _ in range(num_block):
        block = add([short_cut, conv_pair(block, k)])
        block = MaxPooling1D(pool_size = 3, strides = 2, padding = pad)(block)
        short_cut = block

    # Final block: residual conv pair without pooling, then global pooling.
    block = add([short_cut, conv_pair(block, k)])
    max_pool = GlobalMaxPooling1D()(block)
    avg_pool = GlobalAveragePooling1D()(block)
    block = concatenate([max_pool, avg_pool])

    # Output block: two Dense + PReLU stages. The second Dense was fed `block`
    # again, which left the first stage disconnected from the output.
    out_put = Dense(64, activation = act)(block)
    out_put = PReLU()(out_put)
    out_put = Dense(64, activation = act)(out_put)
    out_put = PReLU()(out_put)

    out_put = Dense(6, activation = "sigmoid")(out_put)
    model = Model(inputs = inp, outputs = out_put)
    model.compile(loss = "binary_crossentropy", optimizer = Nadam(lr = lr), metrics = ["accuracy"])
    return model

In [0]:
# K-fold training of the DPCNN model: every fold trains a fresh model, reloads
# its best checkpoint, then adds test predictions and fills the OOF rows.
n_folds = 10
dpcnn_pred = 0
dpcnn_oof_pred = np.zeros((nrow_train, 6))
val_losses = []

batch_sizes = 128
units, k, num_block = 256, 3, 5
lr, dr, epochs = 1e-3, 0.3, 10

early_stop = EarlyStopping(monitor = "val_loss", mode = "min", patience = 3)

kfold = KFold(n_splits = n_folds, shuffle = True, random_state = 32)
for i, (train_idx, valid_idx) in enumerate(kfold.split(X_train)):
  print("\nRunning fold {}/{}".format(i + 1, n_folds))
  model = build_dpcnn_model(units = units, k = k, num_block = num_block, lr = lr, dr = dr)

  x_train, x_valid = X_train[train_idx], X_train[valid_idx]
  y_train, y_valid = Y_train[train_idx], Y_train[valid_idx]

  ra_val = RocAucEvaluation(validation_data = (x_valid, y_valid), interval = 1)

  # Only the epoch with the lowest val_loss is kept on disk.
  file_path = "fold " + str(i+1) + " best_model.hdf5"
  check_point = ModelCheckpoint(file_path, monitor = "val_loss", verbose = 1,
                                save_best_only = True, mode = "min")

  hist = model.fit(x_train, y_train,
                   batch_size = batch_sizes, epochs = epochs,
                   validation_data = (x_valid, y_valid), verbose = 2,
                   callbacks = [check_point, early_stop, ra_val])

  # Predict with the checkpointed weights rather than the last epoch's.
  model = load_model(file_path)
  val_losses.append(min(hist.history["val_loss"]))
  dpcnn_pred += model.predict(X_test, batch_size = batch_sizes, verbose = 2)
  dpcnn_oof_pred[valid_idx] = model.predict(x_valid, batch_size = batch_sizes, verbose = 2)

  del file_path, model
  gc.collect()

# Average the summed test predictions over the folds.
dpcnn_pred = dpcnn_pred/n_folds
print("\noof score of dpcnn is {}.".format(roc_auc_score(Y_train, dpcnn_oof_pred)))
print("\nAverage val loss of {} folds is {}.".format(n_folds, np.mean(val_losses)))

In [0]:
# Submission table: "id" plus one column per label, filled with the
# fold-averaged DPCNN test predictions, written to CSV and downloaded.
# The reindex used to be called on `cnn_submission`, a name that is never
# defined in this notebook, so the cell raised NameError.
dpcnn_submission = pd.DataFrame()
dpcnn_submission = dpcnn_submission.reindex(columns = ["id"] + category)
dpcnn_submission["id"]= test[["id"]]
dpcnn_submission[category[0:6]] = dpcnn_pred
dpcnn_submission.to_csv("dpcnn_submission.csv", index = False)
download_file("dpcnn_submission.csv")

In [0]:
# Preview the first rows of the DPCNN submission table.
dpcnn_submission.head()

In [0]:
# Out-of-fold table: training ids plus the DPCNN predictions each row received
# while it was in the validation fold; written to CSV and downloaded.
dpcnn_oof_prediction = pd.DataFrame().reindex(columns = ["id"] + category)
dpcnn_oof_prediction["id"] = train[["id"]]
dpcnn_oof_prediction[category[0:6]] = pd.DataFrame(dpcnn_oof_pred)
dpcnn_oof_prediction.to_csv("dpcnn_oof_prediction.csv", index = False)
download_file("dpcnn_oof_prediction.csv")

In [0]:
def build_d_lstm_model(lr = 0.0, units = 0, dr = 0.0):
    """Build and compile the two-layer stacked BiLSTM model.

    Args:
        lr: Adam learning rate.
        units: hidden size of each CuDNNLSTM direction.
        dr: SpatialDropout1D rate applied to the embeddings.

    Returns:
        A compiled Keras Model with six sigmoid outputs.
    """
    inp = Input(shape = (max_text_len, ))
    # Frozen embedding layer initialised from the pre-trained matrix.
    emb = Embedding(nb_words, embed_size, weights = [embedding_matrix],
                    input_length = max_text_len, trainable = False)(inp)
    hidden = SpatialDropout1D(dr)(emb)
    # The first layer returns the full sequence; the second only its final output.
    hidden = Bidirectional(CuDNNLSTM(units, return_sequences = True))(hidden)
    hidden = Bidirectional(CuDNNLSTM(units))(hidden)
    out_put = Dense(6, activation = "sigmoid")(hidden)
    model = Model(inputs = inp, outputs = out_put)
    optimizer = Adam(lr = lr)
    model.compile(loss = "binary_crossentropy", optimizer = optimizer, metrics = ['accuracy'])
    return model

In [0]:
# K-fold training of the double LSTM model: every fold trains a fresh model,
# reloads its best checkpoint, then adds test predictions and fills the OOF rows.
n_folds = 10
d_lstm_pred = 0
d_lstm_oof_pred = np.zeros((nrow_train, 6))
val_losses = []

batch_sizes = 128
units, lr = 128, 1e-3
dr, epochs = 0.3, 10

early_stop = EarlyStopping(monitor = "val_loss", mode = "min", patience = 3)

kfold = KFold(n_splits = n_folds, shuffle = True, random_state = 32)
for i, (train_idx, valid_idx) in enumerate(kfold.split(X_train)):
  print("\nRunning fold {}/{}".format(i + 1, n_folds))
  model = build_d_lstm_model(lr = lr, units = units, dr = dr)

  x_train, x_valid = X_train[train_idx], X_train[valid_idx]
  y_train, y_valid = Y_train[train_idx], Y_train[valid_idx]

  ra_val = RocAucEvaluation(validation_data = (x_valid, y_valid), interval = 1)

  # Only the epoch with the lowest val_loss is kept on disk.
  file_path = "fold " + str(i+1) + " best_model.hdf5"
  check_point = ModelCheckpoint(file_path, monitor = "val_loss", verbose = 1,
                                save_best_only = True, mode = "min")

  hist = model.fit(x_train, y_train,
                   batch_size = batch_sizes, epochs = epochs,
                   validation_data = (x_valid, y_valid), verbose = 2,
                   callbacks = [check_point, early_stop, ra_val])

  # Predict with the checkpointed weights rather than the last epoch's.
  model = load_model(file_path)
  val_losses.append(min(hist.history["val_loss"]))
  d_lstm_pred += model.predict(X_test, batch_size = batch_sizes, verbose = 2)
  d_lstm_oof_pred[valid_idx] = model.predict(x_valid, batch_size = batch_sizes, verbose = 2)

  del file_path, model
  gc.collect()

# Average the summed test predictions over the folds.
d_lstm_pred = d_lstm_pred/n_folds
print("\noof score of double lstm is {}.".format(roc_auc_score(Y_train, d_lstm_oof_pred)))
print("\nAverage val loss of {} folds is {}.".format(n_folds, np.mean(val_losses)))

In [0]:
# Submission table: "id" plus one column per label, filled with the
# fold-averaged double-LSTM test predictions, written to CSV and downloaded.
d_lstm_submission = pd.DataFrame().reindex(columns = ["id"] + category)
d_lstm_submission["id"] = test[["id"]]
d_lstm_submission[category[0:6]] = d_lstm_pred
d_lstm_submission.to_csv("d_lstm_submission.csv", index = False)
download_file("d_lstm_submission.csv")

In [0]:
# Preview the first rows of the double-LSTM submission table.
d_lstm_submission.head()

In [0]:
# Out-of-fold table: training ids plus the double-LSTM predictions each row
# received while it was in the validation fold; written to CSV and downloaded.
d_lstm_oof_prediction = pd.DataFrame().reindex(columns = ["id"] + category)
d_lstm_oof_prediction["id"] = train[["id"]]
d_lstm_oof_prediction[category[0:6]] = pd.DataFrame(d_lstm_oof_pred)
d_lstm_oof_prediction.to_csv("d_lstm_oof_prediction.csv", index = False)
download_file("d_lstm_oof_prediction.csv")

In [0]:
from scipy.stats import ks_2samp

def corr(first_file, second_file):
    """Print per-label agreement statistics between two prediction frames.

    For every label in the module-level `category` list this prints the
    Pearson, Kendall and Spearman correlations of the two label columns,
    followed by the two-sample Kolmogorov-Smirnov statistic and p-value.

    Args:
        first_file: DataFrame with one column per label.
        second_file: DataFrame with the same label columns.
    """
    score_lines = (
        (" Pearson\'s correlation score: %0.6f", "pearson"),
        (" Kendall\'s correlation score: %0.6f", "kendall"),
        (" Spearman\'s correlation score: %0.6f", "spearman"),
    )

    for cat in category:
        left, right = first_file[cat], second_file[cat]

        # all correlations
        print("\n Class: %s" % cat)
        for template, method in score_lines:
            print(template % left.corr(right, method = method))

        ks_stat, p_value = ks_2samp(left.values, right.values)
        print(" Kolmogorov-Smirnov test:    KS-stat = %.6f    p-value = %.6e\n"
              % (ks_stat, p_value))
# Pairwise comparison of the LSTM, GRU and DPCNN test-set predictions.
corr(lstm_submission, gru_submission)
corr(lstm_submission, dpcnn_submission)
corr(gru_submission, dpcnn_submission)