In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Seed NumPy's global RNG; the train/validation split further down samples
# without an explicit random_state, so it draws from this global state.
np.random.seed(2)

In [2]:
# Read the train and test CSVs, using the first column of each file as the index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("data/train.csv", "data/test.csv")
)

In [3]:
# Coerce the comment column to str in both frames so the tokenizer only ever sees text.
for frame in (train, test):
    frame['comment_text'] = frame['comment_text'].astype(str)

In [4]:
def sample2(data=None, frac=0.8, random_state=None):
    """Split a labelled frame into train/validation parts and compute class weights.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame containing a binary ``toxic`` column. Defaults to the
        notebook-level ``train`` frame, so a bare ``sample2()`` behaves as before.
    frac : float, default 0.8
        Fraction of rows sampled (without replacement) into the training part;
        the remaining rows form the validation part.
    random_state : int or numpy.random.RandomState, optional
        Forwarded to ``DataFrame.sample``. ``None`` keeps the previous behaviour
        of drawing from NumPy's global RNG.

    Returns
    -------
    tuple
        ``(train_set, val_set, class_weight)``. Both frames are re-indexed
        0..n-1. ``class_weight`` maps label 0 / 1 to ``n / 2 / count(label)``,
        computed over the whole input frame (not just the training part).
    """
    if data is None:
        data = train

    train_set = data.sample(frac=frac, random_state=random_state)
    # Everything that was not sampled becomes the validation part.
    val_set = data.drop(train_set.index)

    n_rows = len(data)
    # Vectorised sum; the builtin sum() iterated the Series element by element.
    n_toxic = data['toxic'].sum()
    class_weight = {
        0: n_rows / 2 / (n_rows - n_toxic),
        1: n_rows / 2 / n_toxic,
    }

    train_set = train_set.reset_index(drop=True)
    val_set = val_set.reset_index(drop=True)

    # Print label statistics of both parts to check that the split is balanced.
    print(train_set['toxic'].describe())
    print(val_set['toxic'].describe())

    return train_set, val_set, class_weight

In [5]:
# Materialise the 80/20 split. class_weight is kept for reference only: the
# model.fit call in the training cell passes class_weight=None.
train_set, val_set, class_weight = sample2()

count    127657.000000
mean          0.096133
std           0.294774
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: toxic, dtype: float64
count    31914.000000
mean         0.094692
std          0.292794
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: toxic, dtype: float64


In [6]:
from keras.preprocessing import text, sequence

# Vocabulary cap passed to the tokenizer, and the fixed length used when padding sequences.
max_features = 50000
maxlen = 256

tokenizer = text.Tokenizer(num_words=max_features)
# Series.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# builds the same combined corpus on every pandas version.
# NOTE: the vocabulary is fitted on the train *and* test comments.
tokenizer.fit_on_texts(pd.concat([train['comment_text'], test['comment_text']]))

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [7]:
# Inverse of tokenizer.word_index: maps each integer index back to its word.
r_word_index = dict(zip(tokenizer.word_index.values(), tokenizer.word_index.keys()))

In [8]:
def _encode(comments):
    """Convert comments to index sequences with the fitted tokenizer and pad them to maxlen."""
    return sequence.pad_sequences(tokenizer.texts_to_sequences(comments), maxlen=maxlen)


# Same encoding for the train, validation and test comments.
X_tr = _encode(train_set['comment_text'])
X_va = _encode(val_set['comment_text'])
X_te = _encode(test['comment_text'])

# Target columns, in the order the model outputs them; `other` is every label except 'toxic'.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
other = ["severe_toxic", "obscene", "threat", "insult", "identity_hate"]

y_tr, y_va = train_set[list_classes], val_set[list_classes]

In [17]:
from keras.models import Model
from keras.layers import Dense, Embedding, Input, Flatten, Activation
from keras.layers import Lambda, RepeatVector, LSTM, Bidirectional, GlobalMaxPool1D, Dropout, GRU, Conv1D, Reshape, MaxPooling1D, Concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.regularizers import l2
from keras.constraints import non_neg, unit_norm

# Dropout rate used by get_model_mix_layer (get_model does not apply dropout).
DROPOUT=0.1

def mixing_layer(pred, max_pool):
    """Concatenate each class's features with the predictions of the other classes.

    `pred` is repeated len(list_classes) times, multiplied by a
    (ones - identity) mask that zeroes the diagonal, and the result is
    concatenated onto `max_pool`.
    """
    def _zero_diagonal(t):
        # Built inside the function (globals only) so it carries no closure state.
        n = len(list_classes)
        return t * (np.ones([n, n]) - np.eye(n))

    repeated = RepeatVector(len(list_classes))(pred)
    masked = Lambda(_zero_diagonal)(repeated)
    return Concatenate()([max_pool, masked])

def get_model_mix_layer(): #0.0453
    """Build and compile the two-stage "mixing" classifier.

    Pipeline, in order: Embedding with 64 dimensions per class ->
    global max-pool -> selu -> dropout -> reshape into one feature row per
    class -> first Dense(1, sigmoid) prediction -> mixing_layer (features plus
    the other classes' first-stage predictions) -> final Dense(1, sigmoid),
    flattened to the model output.

    The `#0.0453` on the def line is presumably a recorded validation loss
    for this architecture -- TODO confirm.

    Returns:
        A Keras Model compiled with binary cross-entropy loss, the adam
        optimizer, and accuracy / binary cross-entropy metrics.
    """
    # 64 embedding dimensions are reserved for each class so the pooled vector
    # can later be reshaped into len(list_classes) equal rows.
    embed_size = len(list_classes) * 64
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    x = GlobalMaxPool1D()(x)
    x = Activation('selu')(x)
    
    x = Dropout(DROPOUT)(x)    
    # One row per class; -1 lets Keras infer the per-class width (64 here).
    max_pool = Reshape([len(list_classes), -1])(x)

    
    # First-stage prediction from each class's own feature row.
    pred = Dense(1, activation="sigmoid")(max_pool)
    pred = Flatten()(pred)
    
    # Append the other classes' first-stage predictions to each class row.
    mix = mixing_layer(pred, max_pool)
    
    # Second-stage prediction, flattened to form the model output.
    x = Dense(1, activation='sigmoid')(mix)
    x = Flatten()(x)

    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy', 'binary_crossentropy'])

    return model


def get_model(): #0.0463
    """Build and compile the baseline classifier.

    Architecture: Embedding(max_features, 300) -> selu -> global max-pool
    over the sequence -> Dense(6, sigmoid). Compiled with binary
    cross-entropy loss and the adam optimizer.

    Returns:
        The compiled Keras Model.
    """
    embedding_dim = 300
    tokens_in = Input(shape=(maxlen, ))
    embedded = Embedding(max_features, embedding_dim)(tokens_in)
    activated = Activation('selu')(embedded)
    pooled = GlobalMaxPool1D()(activated)

    # Disabled hidden layer kept from the original notebook:
    #x = Dense(300, activation='selu')(emb)
    class_probs = Dense(6, activation='sigmoid')(pooled)

    model = Model(inputs=tokens_in, outputs=class_probs)
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy', 'binary_crossentropy'],
    )
    return model

In [19]:
# Re-seed NumPy's global RNG right before the model is built.
np.random.seed(777)

model = get_model()
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
model.summary()

batch_size = 1024
# Upper bound only: EarlyStopping below ends training once val_loss has not
# improved for `patience` epochs.
epochs = 2000

# Best-so-far model (by val_loss) is written here and reloaded by later cells.
file_path="weights_base.best.hdf5"
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', save_best_only=True)
early = EarlyStopping(monitor="val_loss", mode="min", patience=5)
callbacks_list = [checkpoint, early]
model.fit(
    X_tr, 
    y_tr, 
    class_weight=None, 
    validation_data=(X_va, y_va), 
    shuffle=True, 
    batch_size=batch_size, 
    epochs=epochs, 
    callbacks=callbacks_list
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         (None, 256)               0         
_________________________________________________________________
embedding_6 (Embedding)      (None, 256, 300)          15000000  
_________________________________________________________________
activation_6 (Activation)    (None, 256, 300)          0         
_________________________________________________________________
global_max_pooling1d_6 (Glob (None, 300)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 6)                 1806      
Total params: 15,001,806
Trainable params: 15,001,806
Non-trainable params: 0
_________________________________________________________________
None
Train on 127657 samples, validate on 31914 samples
Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2

<keras.callbacks.History at 0x7f1f500705f8>

In [None]:
# Restore the checkpointed (best val_loss) weights before scoring the validation set.
model.load_weights(file_path)

pred = model.predict(X_va, batch_size=128)
# Hard 0/1 decisions: threshold every predicted score at 0.5.
int_pred = np.greater_equal(pred, 0.5)

In [None]:
from sklearn import metrics

# Per-class validation report: scalar metrics, confusion-matrix counts, and
# precision-recall / ROC plots. Column i of `pred` / `int_pred` corresponds to
# list_classes[i].
for i, c in enumerate(list_classes):
    y_true = val_set[c]
    y_score = pred[:, i]
    y_hat = int_pred[:, i]

    print(c)
    print("roc:\t\t%.3f" % metrics.roc_auc_score(y_true, y_score))
    print("f1:\t\t%.3f" % metrics.f1_score(y_true, y_hat))
    print("precision:\t%.3f" % metrics.precision_score(y_true, y_hat))
    print("recall:\t\t%.3f" %metrics.recall_score(y_true, y_hat))
    print("log loss:\t%.3f" %metrics.log_loss(y_true, y_score))

    # confusion_matrix rows are true labels, columns are predicted labels.
    m = metrics.confusion_matrix(y_true, y_hat)
    tp = m[1,1]
    fp = m[0,1]
    tn = m[0,0]
    fn = m[1,0]
    print("tp:\t\t%d"%tp)
    print("fp:\t\t%d"%fp)
    print("tn:\t\t%d"%tn)
    print("fn:\t\t%d"%fn)

    print("tpr:\t\t%.3f"%(tp / (tp+fn)))
    print("fpr:\t\t%.3f"%(fp / (fp+tn)))

    precision, recall, threshold = metrics.precision_recall_curve(y_true, y_score)
    plt.figure(0)
    plt.step(recall, precision, color='b', alpha=0.2,
         where='post')
    plt.fill_between(recall, precision, step='post', alpha=0.2,
         color='b')

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])
    plt.title('2-class Precision-Recall curve:')
    plt.show()

    fpr, tpr, _ = metrics.roc_curve(y_true, y_score)
    plt.figure(1)
    plt.plot([0, 1], [0, 1], 'k--')
    # Label the curve so plt.legend() has a handle to show; without any
    # labelled artist it only emitted a warning and drew nothing.
    plt.plot(fpr, tpr, label=c)
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    plt.show()

In [None]:
# Show one random confident miss: a validation comment labelled toxic that the
# model scored <= 0.1 on the 'toxic' output (column 0 of `pred`).
missed_toxic = val_set[(pred[:, 0] <= 0.1) & (val_set['toxic'] == 1)]
if missed_toxic.empty:
    # DataFrame.sample(1) raises ValueError on an empty frame, so report instead.
    print("no toxic validation comment was scored <= 0.1")
else:
    for i, row in missed_toxic.sample(1).iterrows():
        print(row['comment_text'])
        print(row)

In [None]:
# Element-wise mismatch between the true labels and the thresholded predictions.
# The original compared against `pred_va`, which is never defined in this
# notebook (NameError on a fresh kernel); `int_pred` holds the 0.5-thresholded
# validation predictions -- NOTE(review): confirm this is the intended operand.
y_va != int_pred

In [None]:
model.load_weights(file_path)
# Compute the test predictions here: this cell previously read `y_test` before
# any cell had defined it, which fails under Restart & Run All.
y_test = model.predict(X_te, batch_size=2048)

sample_submission = pd.read_csv("./data/sample_submission.csv")

# Overwrite the label columns of the sample file with the model's predictions.
sample_submission[list_classes] = y_test

sample_submission.to_csv("baseline.csv", index=False)

In [None]:
# Reload the checkpointed weights, predict on the test set and write the submission file.
model.load_weights(file_path)

y_test = model.predict(X_te, batch_size=2048)

sample_submission = pd.read_csv("./data/sample_submission.csv")
# Use list_classes directly (same columns as ['toxic'] + other) so the column
# order is tied to the target order the model was trained on.
sample_submission[list_classes] = y_test
sample_submission.to_csv("baseline.csv", index=False)