In [1]:
import pickle
# NOTE(security): pickle.load can execute arbitrary code from the file it reads;
# only load pickles that come from a trusted source.
# Context managers make sure both file handles are closed as soon as loading ends.
with open("data/train_data.p", "rb") as train_file:
    # The pickle holds a 4-tuple: train inputs, validation inputs, train labels, validation labels.
    X_tr, X_va, y_tr, y_va = pickle.load(train_file)
with open("data/fast_test_emb.p", "rb") as emb_file:
    # Pre-computed embedding matrix; get_model() passes it as the Embedding layer's initial weights.
    embedding_matrix = pickle.load(emb_file)

# Length of each input sequence; used as the model's Input shape in get_model().
maxlen = 256

In [2]:
import tensorflow as tf

def roc_auc_score(y_true, y_pred):
    """Surrogate loss for ROC AUC (a quantity to minimise, not the AUC itself).

    Follows the Wilcoxon-Mann-Whitney based approximation described in
    Yan, L., Dodier, R., Mozer, M. C., & Wolniewicz, R. (2003),
    "Optimizing Classifier Performance via an Approximation to the
    Wilcoxon-Mann-Whitney Statistic".

    Each (positive, negative) pair of predictions whose difference
    ``pos - neg`` is below ``gamma`` adds ``(gamma - (pos - neg)) ** p``
    to the result; pairs separated by at least ``gamma`` add nothing.

    Arguments:
        y_true: `Tensor`. Targets; cast to bool, so non-zero entries are
            treated as positives and zero entries as negatives.
        y_pred: `Tensor`. Predicted values.

    Returns:
        Scalar `Tensor` holding the summed penalty over all violating pairs.
    """
    # original paper suggests performance is robust to exact parameter choice
    gamma = 0.2
    p = 3

    with tf.name_scope("RocAucScore"):
        is_positive = tf.cast(y_true, tf.bool)

        # Predictions for positives laid out along axis 1, negatives along
        # axis 0, so broadcasting below enumerates every (neg, pos) pair.
        pos_scores = tf.expand_dims(tf.boolean_mask(y_pred, is_positive), 0)
        neg_scores = tf.expand_dims(tf.boolean_mask(y_pred, ~is_positive), 1)

        pairwise_shape = tf.zeros_like(pos_scores * neg_scores)
        margin = pairwise_shape + pos_scores - neg_scores - gamma

        # Only pairs that miss the required margin are penalised.
        violations = tf.boolean_mask(margin, margin < 0.0)

        return tf.reduce_sum(tf.pow(-violations, p))

  from ._conv import register_converters as _register_converters


In [None]:
from keras.models import Model
from keras.layers import Dense, Embedding, Input, Flatten, Activation
from keras.layers import Add,Conv1D, MaxPooling1D, Average, Lambda, RepeatVector, LSTM, Bidirectional, GlobalMaxPool1D, Dropout, GRU, Conv1D, Reshape, MaxPooling1D, Concatenate, TimeDistributed
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.regularizers import l2
from keras.constraints import non_neg, unit_norm

# Dropout rate shared by every Dropout layer created in get_model().
DROPOUT = 0.5

def get_model():
    """Build and compile the classifier used in this notebook.

    Architecture: frozen pre-trained embedding -> four per-timestep
    Dense(256, relu) layers, each followed by dropout -> global max pooling
    over the sequence axis -> Dense(256, relu) + dropout -> 6 sigmoid outputs.

    Reads the module-level names `maxlen`, `embedding_matrix`, `DROPOUT`
    and `roc_auc_score` (used as the training loss).

    Returns:
        A compiled Keras `Model`.
    """
    tokens = Input(shape=(maxlen, ))

    # Embedding weights come from the pre-computed matrix and are not trained.
    vocab_size, emb_dim = embedding_matrix.shape[0], embedding_matrix.shape[1]
    hidden = Embedding(
        vocab_size,
        emb_dim,
        weights=[embedding_matrix],
        trainable=False,
    )(tokens)

    # Four identical per-timestep blocks.
    # Experiment note kept from an earlier three-block variant: "roc: 984 val: 425"
    # (meaning of the figures is not recorded here).
    for _ in range(4):
        hidden = TimeDistributed(Dense(256, activation='relu'))(hidden)
        hidden = Dropout(DROPOUT)(hidden)

    # Collapse the sequence axis (a Flatten() alternative was tried and disabled).
    pooled = GlobalMaxPool1D()(hidden)

    head = Dense(256, activation='relu')(pooled)
    head = Dropout(DROPOUT)(head)
    outputs = Dense(6, activation='sigmoid')(head)

    model = Model(inputs=tokens, outputs=outputs)
    # Optimizer notes kept from earlier runs: 'rmsprop' -> 9882, 'adam' -> 9888.
    model.compile(
        loss=roc_auc_score,
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model

# Build the compiled network and print its layer-by-layer summary.
model = get_model()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 256)               0         
_________________________________________________________________
embedding_7 (Embedding)      (None, 256, 300)          118436400 
_________________________________________________________________
time_distributed_22 (TimeDis (None, 256, 256)          77056     
_________________________________________________________________
dropout_19 (Dropout)         (None, 256, 256)          0         
_________________________________________________________________
time_distributed_23 (TimeDis (None, 256, 256)          65792     
_________________________________________________________________
dropout_20 (Dropout)         (None, 256, 256)          0         
_________________________________________________________________
time_distributed_24 (TimeDis (None, 256, 256)          65792     
__________

In [None]:
from keras.callbacks import Callback
from sklearn import metrics
import numpy as np
import keras

# Seed NumPy's global RNG. Only NumPy is seeded here; no TensorFlow seed is set
# in this cell.
np.random.seed(777)

# Training hyper-parameters: mini-batch size and the upper bound on epochs
# (early stopping, configured below, normally ends training sooner).
batch_size, epochs = 1024, 2000

class roc_callback(Callback):
    """Keras callback that prints the validation ROC AUC after every epoch.

    Arguments:
        training_data: `(x, y)` pair. Stored as `self.x` / `self.y`; not
            used for scoring (the training-set AUC computation is disabled).
        validation_data: `(x_val, y_val)` pair that is scored with
            `sklearn.metrics.roc_auc_score` at the end of each epoch.
        batch_size: batch size for the validation `predict` call. It only
            affects speed/memory, not the predicted values.
    """

    def __init__(self, training_data, validation_data, batch_size=1024):
        # Let the Keras base class initialise its own attributes first;
        # skipping this leaves the callback without state the framework expects.
        super().__init__()

        self.x = training_data[0]
        self.y = training_data[1]
        self.x_val = validation_data[0]
        self.y_val = validation_data[1]
        self.batch_size = batch_size

    # The remaining hooks (train/epoch/batch begin and end) are inherited
    # unchanged from Callback, where they are already no-ops.

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation set and print its ROC AUC.

        Arguments:
            epoch: index of the epoch that just finished (unused).
            logs: metrics dict supplied by Keras (unused).
        """
        y_pred_val = self.model.predict(self.x_val, batch_size=self.batch_size)
        roc_val = metrics.roc_auc_score(self.y_val, y_pred_val)

        # The line is padded with 100 spaces before the newline, presumably to
        # blank out leftover progress-bar text on the same console line.
        print('roc-auc_val: %s' % str(round(roc_val,4)),end=100*' '+'\n')

#from keras.models import load_model
#model = load_model('./weights_base.best.hdf5', custom_objects={'roc_auc_score': roc_auc_score})
# Destination for the best weights seen during training.
file_path = "weights_base.best.hdf5"

# Save a checkpoint only when val_loss improves on the best value so far.
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', save_best_only=True)
# Stop training after 20 epochs without a lower val_loss.
early = EarlyStopping(monitor="val_loss", mode="min", patience=20, verbose=1)

# roc_callback prints the validation ROC AUC each epoch; use
# [checkpoint, early] alone to train without that printout.
callbacks_list = [
    checkpoint,
    early,
    roc_callback((X_tr, y_tr), (X_va, y_va)),
]

model.fit(
    X_tr,
    y_tr,
    batch_size=batch_size,
    epochs=epochs,
    shuffle=True,
    class_weight=None,
    validation_data=(X_va, y_va),
    callbacks=callbacks_list,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 127657 samples, validate on 31914 samples
Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000

In [1]:
from keras.models import load_model

import tensorflow as tf

def roc_auc_score(y_true, y_pred):
    """Surrogate loss for ROC AUC (a quantity to minimise, not the AUC itself).

    Follows the Wilcoxon-Mann-Whitney based approximation described in
    Yan, L., Dodier, R., Mozer, M. C., & Wolniewicz, R. (2003),
    "Optimizing Classifier Performance via an Approximation to the
    Wilcoxon-Mann-Whitney Statistic".

    Each (positive, negative) pair of predictions whose difference
    ``pos - neg`` is below ``gamma`` adds ``(gamma - (pos - neg)) ** p``
    to the result; pairs separated by at least ``gamma`` add nothing.

    This must be available under the name `roc_auc_score` when a model
    compiled with it is restored via `load_model(..., custom_objects=...)`.

    Arguments:
        y_true: `Tensor`. Targets; cast to bool, so non-zero entries are
            treated as positives and zero entries as negatives.
        y_pred: `Tensor`. Predicted values.

    Returns:
        Scalar `Tensor` holding the summed penalty over all violating pairs.
    """
    # original paper suggests performance is robust to exact parameter choice
    gamma = 0.2
    p = 3

    with tf.name_scope("RocAucScore"):
        is_positive = tf.cast(y_true, tf.bool)

        # Predictions for positives laid out along axis 1, negatives along
        # axis 0, so broadcasting below enumerates every (neg, pos) pair.
        pos_scores = tf.expand_dims(tf.boolean_mask(y_pred, is_positive), 0)
        neg_scores = tf.expand_dims(tf.boolean_mask(y_pred, ~is_positive), 1)

        pairwise_shape = tf.zeros_like(pos_scores * neg_scores)
        margin = pairwise_shape + pos_scores - neg_scores - gamma

        # Only pairs that miss the required margin are penalised.
        violations = tf.boolean_mask(margin, margin < 0.0)

        return tf.reduce_sum(tf.pow(-violations, p))

# This cell is numbered In [1], i.e. it starts a fresh kernel session, so pickle
# has to be imported here rather than relying on the first cell of the notebook.
import pickle

# Restore the best checkpoint; the custom loss must be supplied by name so the
# compiled model can be deserialised.
model = load_model('./weights_base.best.hdf5', custom_objects={'roc_auc_score': roc_auc_score})

# NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
# The context manager closes the file handle once the test inputs are loaded.
with open("test_data.p", "rb") as test_file:
    X_te = pickle.load(test_file)

ModuleNotFoundError: ignored

In [None]:
# Run the restored model over the test inputs in batches of 1024, showing progress.
y_test = model.predict([X_te], batch_size=1024, verbose=True)



In [None]:
import pandas as pd
# Label columns of the submission file; the model emits six outputs per row,
# presumably in this same order — TODO confirm against the training labels.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Start from the provided template so all other columns are kept as-is.
sample_submission = pd.read_csv("sample_submission.csv")
# Overwrite the label columns with the predictions; assumes the rows of y_test
# are in the same order as the rows of the template — TODO confirm.
sample_submission[list_classes] = y_test
sample_submission.to_csv("dicnn_9863.csv", index=False)

In [None]:
# Sanity check: confirm the submission file was written and inspect its size.
!ls -laht dicnn_9863.csv

-rw-r--r-- 1 root root 20M Feb 19 10:25 dicnn_9863.csv


In [None]:
# Upload the submission file with the `kg` command-line tool
# (presumably kaggle-cli, given the .kaggle-cli directory listed further down — confirm).
!kg submit dicnn_9863.csv

0.9818


total 36K
drwx------ 4 root root 4.0K Feb 24 01:34 .cache
drwxr-xr-x 1 root root 4.0K Feb 24 01:34 .
drwxr-xr-x 3 root root 4.0K Feb 24 01:34 .config
drwxr-xr-x 5 root root 4.0K Feb 24 01:34 .ipython
drwx------ 3 root root 4.0K Feb 24 01:30 .local
drwxr-xr-x 4 root root 4.0K Feb 24 01:30 .forever
-rw------- 1 root root 1.0K Feb 24 01:30 .rnd
drwxr-xr-x 1 root root 4.0K Feb 24 01:30 ..
drwxr-xr-x 1 root root 4.0K Feb 24 01:29 datalab


In [16]:
# List the working directory (newest first, human-readable sizes) to see which
# data files and checkpoints are present.
!ls -laht

total 9.5G
-rw-r--r--  1 root root 457M Feb 24 17:03 weights_base.best.hdf5
drwxr-xr-x  1 root root 4.0K Feb 24 12:58 .
drwx------  3 root root 4.0K Feb 24 12:58 .nv
drwxr-xr-x  2 root root 4.0K Feb 24 12:58 .keras
drwx------  2 root root 4.0K Feb 24 12:57 .kaggle-cli
drwx------  4 root root 4.0K Feb 24 12:57 .local
drwxr-xr-x 10 root root 4.0K Feb 24 12:57 fastText
-rw-r--r--  1 root root 7.4G Feb 24 12:56 wiki.en.bin.gz
-rw-r--r--  1 root root 164M Feb 24 12:55 train_data.p
-rw-r--r--  1 root root 150M Feb 24 12:55 test_data.p
-rw-r--r--  1 root root 904M Feb 24 12:55 fast_test_emb.p
-rw-r--r--  1 root root 460M Feb 24 12:55 379.h5
drwxr-xr-x  3 root root 4.0K Feb 24 12:55 .gsutil
drwxr-xr-x  1 root root 4.0K Feb 24 12:55 datalab
drwx------  4 root root 4.0K Feb 24 12:54 .cache
drwxr-xr-x  3 root root 4.0K Feb 24 12:54 .config
drwxr-xr-x  5 root root 4.0K Feb 24 12:54 .ipython
drwxr-xr-x  4 root root 4.0K Feb 24 12:20 .forever
-rw-------  1 root root 1.0K Feb 24 12:20 .rnd
drwxr-xr-x