In [1]:
import glob
import os

import fastText
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Seed NumPy's global random number generator so the notebook's random draws
# start from a fixed state.
np.random.seed(123)

In [2]:
# Read the train and test tables; the first CSV column becomes the index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("data/train.csv", "data/test.csv")
)

In [3]:
# Force the comment column of both frames to Python str.
for frame in (train, test):
    frame['comment_text'] = frame['comment_text'].astype(str)

In [4]:
# Load the pretrained fastText binary model, then read the embedding width off
# the vector returned for a probe word.
fasttext_skipgram_emb = fastText.load_model('data/wiki.en.bin')
EMBEDDING_DIM = fasttext_skipgram_emb.get_word_vector('test').shape[0]

In [5]:
def sample(frac=0.8, random_state=None):
    """Split the global ``train`` frame into training and validation subsets.

    Arguments:
        frac: fraction of ``train`` rows drawn (without replacement) for the
            training subset; every remaining row goes to the validation subset.
        random_state: forwarded to ``DataFrame.sample``. The default ``None``
            keeps the original behaviour of this function.

    Returns:
        ``(train_set, val_set, class_weight)``. Both frames have their index
        reset to 0..n-1. ``class_weight`` maps label 0 and label 1 of the
        ``toxic`` column to ``len(train) / 2 / count(label)``, computed over the
        whole ``train`` frame (not only over the training subset).
    """
    train_set = train.sample(frac=frac, random_state=random_state)
    # Rows are selected by index label, so everything not drawn above is kept.
    val_set = train.drop(train_set.index)

    n_rows = len(train)
    # Vectorised column sum instead of iterating the Series with builtin sum().
    n_toxic = train['toxic'].sum()
    class_weight = {0: n_rows / 2 / (n_rows - n_toxic), 1: n_rows / 2 / n_toxic}

    train_set = train_set.reset_index(drop=True)
    val_set = val_set.reset_index(drop=True)

    # Summary statistics of the `toxic` label in each subset.
    print(train_set['toxic'].describe())
    print(val_set['toxic'].describe())

    return train_set, val_set, class_weight

In [6]:
# Draw the train/validation split; sample() also prints describe() of the
# `toxic` column for each subset.
train_set, val_set, class_weight = sample()

count    127657.000000
mean          0.095772
std           0.294280
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: toxic, dtype: float64
count    31914.000000
mean         0.096133
std          0.294779
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: toxic, dtype: float64


In [7]:
from keras.preprocessing import text, sequence

# Every comment is padded/truncated to this many token ids further below.
maxlen = 256
tokenizer = text.Tokenizer()
# Build the vocabulary from the train and test comments together.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
tokenizer.fit_on_texts(pd.concat([train['comment_text'], test['comment_text']]))

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [8]:
def _pad_comments(comments):
    """Turn raw comments into token-id sequences padded to ``maxlen``."""
    return sequence.pad_sequences(tokenizer.texts_to_sequences(comments), maxlen=maxlen)


# Model inputs for the training, validation and test comments.
X_tr, X_va, X_te = (
    _pad_comments(frame['comment_text']) for frame in (train_set, val_set, test)
)

# Label columns predicted by the model, in output order.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

y_tr, y_va = train_set[list_classes], val_set[list_classes]

In [9]:
# One row per tokenizer index, plus row 0 which no word fills and so stays zero.
word_index = tokenizer.word_index
vocab_rows = len(word_index) + 1
embedding_matrix = np.zeros((vocab_rows, EMBEDDING_DIM))
for token, row in word_index.items():
    # Store each word's fastText vector at the row given by its tokenizer index.
    embedding_matrix[row] = fasttext_skipgram_emb.get_word_vector(token)

In [10]:
import tensorflow as tf

def roc_auc_score(y_true, y_pred):
    """Differentiable ROC-AUC surrogate loss (lower is better).

    Based on the Wilcoxon-Mann-Whitney approximation from
    Yan, L., Dodier, R., Mozer, M. C., & Wolniewicz, R. (2003),
    "Optimizing Classifier Performance via an Approximation to the
    Wilcoxon-Mann-Whitney Statistic".

    Every prediction whose target is truthy is paired with every prediction
    whose target is falsy. Each pair whose margin ``pos - neg`` is below
    ``gamma`` contributes ``(gamma - (pos - neg)) ** p``; the sum of those
    penalties is returned. Note that this is a penalty, not an AUC value.

    Arguments:
        y_true: `Tensor`. Targets; cast to bool to separate positives from negatives.
        y_pred: `Tensor`. Predicted values, masked with ``y_true``.

    Returns:
        Scalar `Tensor` holding the summed penalty.
    """
    # original paper suggests performance is robust to exact parameter choice
    gamma = 0.2
    p = 3

    with tf.name_scope("RocAucScore"):
        is_positive = tf.cast(y_true, tf.bool)

        # Positives as a row vector, negatives as a column vector, so the
        # arithmetic below broadcasts to one entry per (negative, positive) pair.
        pos_scores = tf.expand_dims(tf.boolean_mask(y_pred, is_positive), 0)
        neg_scores = tf.expand_dims(tf.boolean_mask(y_pred, ~is_positive), 1)

        # zeros_like(pos * neg) supplies the full pairwise shape explicitly.
        margin = tf.zeros_like(pos_scores * neg_scores) + pos_scores - neg_scores - gamma

        # Only pairs that miss the required margin are penalised.
        violations = tf.boolean_mask(margin, margin < 0.0)

        return tf.reduce_sum(tf.pow(-violations, p))

In [11]:
from keras.models import Model
from keras.layers import Dense, Embedding, Input, Flatten, Activation
from keras.layers import Add,Conv1D, MaxPooling1D, Average, Lambda, RepeatVector, LSTM, Bidirectional, GlobalMaxPool1D, Dropout, GRU, Conv1D, Reshape, MaxPooling1D, Concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.regularizers import l2
from keras.constraints import non_neg, unit_norm
from sru import SRU

# Dropout rate shared by DPCNN() and get_model() below.
DROPOUT=0.5

def mixing_layer(pred, emb):
    """Concatenate ``emb`` with a diagonal-masked, repeated copy of ``pred``.

    ``pred`` is repeated ``len(list_classes)`` times and multiplied elementwise
    by a (classes x classes) matrix that is 1 everywhere except 0 on the
    diagonal, so entry k of repetition k is zeroed. The result is joined to
    ``emb`` with a default ``Concatenate`` layer.

    NOTE(review): presumably ``pred`` is (batch, len(list_classes)) and ``emb``
    is (batch, len(list_classes), features) -- confirm against the caller.
    """
    n_classes = len(list_classes)
    off_diagonal = np.ones([n_classes, n_classes]) - np.eye(n_classes)

    repeated = RepeatVector(n_classes)(pred)
    masked = Lambda(lambda t: t * off_diagonal)(repeated)
    return Concatenate()([emb, masked])


def split_dense(emb_layer, activation):
    """Apply a separate single-unit Dense head to each class slice of ``emb_layer``.

    Arguments:
        emb_layer: tensor indexed as ``[:, i, :]`` for each ``i`` in
            ``range(len(list_classes))``.
        activation: activation passed to every ``Dense(1)`` head.

    Returns:
        The per-class head outputs joined by a default ``Concatenate`` layer.
    """
    slices = [
        # Bind the index as a default argument. A bare `lambda x: x[:, i, :]`
        # looks `i` up when the function runs, so any invocation after the
        # loop has finished would slice the last index for every layer.
        Lambda(lambda t, idx=i: t[:, idx, :])(emb_layer)
        for i in range(len(list_classes))
    ]
    heads = [Dense(1, activation=activation)(s) for s in slices]
    return Concatenate()(heads)

def DPCNN(last_layer):
    """One residual convolution block followed by 2x max pooling and SELU.

    The input goes through a width-3, 'same'-padded Conv1D with EMBEDDING_DIM
    filters and dropout, is added back to the unmodified input, pooled with
    ``MaxPooling1D(2)`` and finally passed through a SELU activation.

    NOTE(review): the residual Add presumably requires ``last_layer`` to have
    EMBEDDING_DIM channels -- confirm at the call site.
    """
    branch = Conv1D(EMBEDDING_DIM, 3, padding='same')(last_layer)
    branch = Dropout(DROPOUT)(branch)
    #conv = Conv1D(EMBEDDING_DIM, 3, padding='same')(conv)
    #conv = Dropout(DROPOUT)(conv)
    residual = Add()([branch, last_layer])
    pooled = MaxPooling1D(2)(residual)
    return Activation('selu')(pooled)

# NOTE(review): not referenced anywhere else in the visible notebook.
FREE_EMB_SIZE=8

def get_model():
    """Build and compile the stacked bidirectional-GRU classifier.

    Architecture: frozen pretrained embedding -> two Bidirectional GRU(128)
    layers returning sequences -> global max pooling over time ->
    Dense(256, selu) -> dropout -> one sigmoid unit per entry of
    ``list_classes``.

    The model is compiled with the Adam optimizer and the custom
    ``roc_auc_score`` loss defined earlier in this notebook (not
    ``sklearn.metrics.roc_auc_score``).

    Returns:
        The compiled Keras ``Model``.
    """
    inp = Input(shape=(maxlen, ))

    # Embedding weights come from embedding_matrix and are not trained.
    features = Embedding(
        embedding_matrix.shape[0],
        embedding_matrix.shape[1],
        weights=[embedding_matrix],
        trainable=False,
    )(inp)
    for _ in range(2):
        features = Bidirectional(
            GRU(128, recurrent_dropout=DROPOUT, dropout=DROPOUT, return_sequences=True)
        )(features)
    features = GlobalMaxPool1D()(features)

    hidden = Dense(256, activation='selu')(features)
    hidden = Dropout(DROPOUT)(hidden)
    # Output width follows the label list instead of a hard-coded 6, so the
    # two cannot drift apart.
    final = Dense(len(list_classes), activation='sigmoid')(hidden)

    model = Model(inputs=inp, outputs=final)
    model.compile(loss=roc_auc_score,
                  optimizer='adam')

    return model

# Build the compiled model and print its layer table.
model = get_model()

model.summary()

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 256)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 256, 300)          118436400 
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256, 256)          329472    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 256, 256)          295680    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 256)               0         
_______________________________________________________

In [None]:
from keras.callbacks import Callback
from sklearn import metrics

# Re-seed NumPy's global random number generator before training.
np.random.seed(777)
# Used both by model.fit and by roc_callback's validation predictions.
batch_size = 1024
# Upper bound passed to model.fit; the EarlyStopping callback below is commented out.
epochs = 2000

class roc_callback(Callback):
    """Keras callback that prints the validation ROC AUC after every epoch.

    Arguments:
        training_data: ``(inputs, targets)`` pair, stored as ``self.x`` /
            ``self.y``. It is kept for reference only; ``on_epoch_end`` does
            not use it.
        validation_data: ``(inputs, targets)`` pair, stored as ``self.x_val`` /
            ``self.y_val`` and scored at the end of each epoch.

    Only ``on_epoch_end`` is overridden. The remaining hooks are inherited from
    ``Callback``, whose defaults do nothing, so the former explicit no-op
    overrides are not needed.
    """

    def __init__(self, training_data, validation_data):
        # Let the base class initialise its own state first.
        super().__init__()

        self.x = training_data[0]
        self.y = training_data[1]
        self.x_val = validation_data[0]
        self.y_val = validation_data[1]

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation inputs and print sklearn's ROC AUC.

        ``logs`` is accepted for interface compatibility and is not modified.
        The prediction batch size is read from the notebook-level ``batch_size``.
        """
        y_pred_val = self.model.predict(self.x_val, batch_size=batch_size)
        roc_val = metrics.roc_auc_score(self.y_val, y_pred_val)

        # The trailing run of spaces pads the line before the newline.
        print('roc-auc_val: %s' % str(round(roc_val,4)),end=100*' '+'\n')

# Checkpoint filename template; the {epoch} and {val_loss} fields are filled in
# each time a checkpoint is written.
file_path = "weights.{epoch:02d}-{val_loss:.2f}..hdf5"
checkpoint = ModelCheckpoint(filepath=file_path, monitor='val_loss', period=3)
#early = EarlyStopping(monitor="val_loss", mode="min", patience=20)

validation_pair = (X_va, y_va)
callbacks_list = [checkpoint, roc_callback((X_tr, y_tr), validation_pair)]
#callbacks_list = [checkpoint, early]

# Train with periodic checkpoints and a per-epoch validation ROC AUC printout.
model.fit(
    x=X_tr,
    y=y_tr,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=callbacks_list,
    validation_data=validation_pair,
    shuffle=True,
    class_weight=None,
)

Train on 127657 samples, validate on 31914 samples
Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000


Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000


Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000

In [None]:
# `file_path` is the checkpoint filename *template* (it still contains the
# {epoch}/{val_loss} placeholders), so no file with that literal name exists.
# Resolve the checkpoints that were actually written and load the newest one.
saved_checkpoints = glob.glob("weights.*.hdf5")
if not saved_checkpoints:
    raise FileNotFoundError("no 'weights.*.hdf5' checkpoint found; run the training cell first")
model.load_weights(max(saved_checkpoints, key=os.path.getmtime))

# Validation probabilities and their 0.5-thresholded decisions.
pred = model.predict(X_va, batch_size=128)
int_pred = pred >= 0.5

In [None]:
from sklearn import metrics

# Per-label validation report: scalar metrics, confusion counts, then the
# precision-recall and ROC curves.
for i, c in enumerate(list_classes):
    y_true = val_set[c]
    y_score = pred[:, i]      # predicted probability for this label
    y_hard = int_pred[:, i]   # decision at the 0.5 threshold

    print(c)
    print("roc:\t\t%.5f" % metrics.roc_auc_score(y_true, y_score))
    print("f1:\t\t%.5f" % metrics.f1_score(y_true, y_hard))
    print("precision:\t%.5f" % metrics.precision_score(y_true, y_hard))
    print("recall:\t\t%.5f" % metrics.recall_score(y_true, y_hard))
    # Previously "%.f", which rounded the log loss to a whole number.
    print("log loss:\t%.5f" % metrics.log_loss(y_true, y_score))

    # Rows are true labels, columns are predicted labels.
    m = metrics.confusion_matrix(y_true, y_hard)
    tp = m[1,1]
    fp = m[0,1]
    tn = m[0,0]
    fn = m[1,0]
    print("tp:\t\t%d"%tp)
    print("fp:\t\t%d"%fp)
    print("tn:\t\t%d"%tn)
    print("fn:\t\t%d"%fn)

    print("tpr:\t\t%.3f"%(tp / (tp+fn)))
    print("fpr:\t\t%.3f"%(fp / (fp+tn)))

    precision, recall, threshold = metrics.precision_recall_curve(y_true, y_score)
    plt.figure(0)
    plt.step(recall, precision, color='b', alpha=0.2,
         where='post')
    plt.fill_between(recall, precision, step='post', alpha=0.2,
         color='b')

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])
    plt.title('2-class Precision-Recall curve:')
    plt.show()

    fpr, tpr, _ = metrics.roc_curve(y_true, y_score)
    plt.figure(1)
    # Both lines carry labels so that plt.legend() has entries to show.
    plt.plot([0, 1], [0, 1], 'k--', label='chance')
    plt.plot(fpr, tpr, label=c)
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    plt.show()

In [None]:
# Validation rows labelled toxic whose first output column scored at most 0.1.
confident_misses = val_set[(pred[:, 0] <= 0.1) & (val_set['toxic'] == 1)]

if confident_misses.empty:
    # .sample(1) raises on an empty frame, so report the empty case instead.
    print("no toxic validation comment scored <= 0.1")
else:
    # Show one such row at random: the comment text, then the full record.
    for i, row in confident_misses.sample(1).iterrows():
        print(row['comment_text'])
        print(row)

In [None]:
# `file_path` is the checkpoint filename *template* (it still contains the
# {epoch}/{val_loss} placeholders), so no file with that literal name exists.
# Resolve the checkpoints that were actually written and load the newest one.
saved_checkpoints = glob.glob("weights.*.hdf5")
if not saved_checkpoints:
    raise FileNotFoundError("no 'weights.*.hdf5' checkpoint found; run the training cell first")
model.load_weights(max(saved_checkpoints, key=os.path.getmtime))

# This cell used y_test without computing it, so it failed on a fresh
# top-to-bottom run; predict on the test inputs here.
y_test = model.predict([X_te], batch_size=2048)

sample_submission = pd.read_csv("./data/sample_submission.csv")

sample_submission[list_classes] = y_test

sample_submission.to_csv("baseline.csv", index=False)

In [None]:
# `file_path` is the checkpoint filename *template* (it still contains the
# {epoch}/{val_loss} placeholders), so no file with that literal name exists.
# Resolve the checkpoints that were actually written and load the newest one.
saved_checkpoints = glob.glob("weights.*.hdf5")
if not saved_checkpoints:
    raise FileNotFoundError("no 'weights.*.hdf5' checkpoint found; run the training cell first")
model.load_weights(max(saved_checkpoints, key=os.path.getmtime))

# Predict on the test inputs.
y_test = model.predict([X_te], batch_size=2048)

# Fill the label columns of the sample submission and write it out.
sample_submission = pd.read_csv("./data/sample_submission.csv")
sample_submission[list_classes] = y_test
sample_submission.to_csv("baseline.csv", index=False)

In [None]:
# Only a cell's last expression is displayed, so print the single lookup
# explicitly (0 if the word never occurred).
print(tokenizer.word_counts.get('pagan', 0))

# Show the 20 most frequent words instead of dumping the whole vocabulary.
sorted(tokenizer.word_counts.items(), key=lambda kv: kv[1], reverse=True)[:20]