In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import fastText

# Seed NumPy's global random state so NumPy-driven randomness in later cells is repeatable.
np.random.seed(123)

In [2]:
# Load the training and test tables; the first CSV column becomes the index.
TRAIN_CSV = "data/train.csv"
TEST_CSV = "data/test.csv"

train = pd.read_csv(TRAIN_CSV, index_col=0)
test = pd.read_csv(TEST_CSV, index_col=0)

In [3]:
# Force the comment column to str in both frames, since the tokenizer
# further down is fitted on and applied to this column.
for frame in (train, test):
    frame['comment_text'] = frame['comment_text'].astype(str)

In [4]:
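# Optional: train skip-gram embeddings on the train + test comments (disabled).
# NOTE: the calls below use the lowercase `fasttext` module, whereas this notebook imports `fastText`.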
#EMBEDDING_DIM = 300
#train['comment_text'].append(test['comment_text']).to_csv('train_text.csv', index=False, header=False)
#model = fasttext.skipgram('train_text.csv', 'model_skipgram', dim=EMBEDDING_DIM, thread=32)

In [5]:
# Pretrained fastText binary used as the embedding source.
FASTTEXT_MODEL_PATH = 'data/wiki.en.bin'
fasttext_skipgram_emb = fastText.load_model(FASTTEXT_MODEL_PATH)
#fasttext_skipgram_emb = fasttext.load_model('model_skipgram.bin')

# Infer the vector width from an arbitrary lookup rather than hard-coding it.
probe_vector = fasttext_skipgram_emb.get_word_vector('test')
EMBEDDING_DIM = len(probe_vector)

In [6]:
def sample(frac=0.8, random_state=None, data=None):
    """Split the labelled data into random training and validation subsets.

    Parameters
    ----------
    frac : float, default 0.8
        Fraction of rows drawn (without replacement) for the training subset;
        all remaining rows form the validation subset.
    random_state : int or None, default None
        Forwarded to ``DataFrame.sample``. ``None`` keeps the original
        behaviour of drawing from NumPy's global random state.
    data : pandas.DataFrame or None, default None
        Frame to split; it must contain a ``toxic`` column. ``None`` falls
        back to the notebook-level ``train`` frame.

    Returns
    -------
    tuple
        ``(train_set, val_set, class_weight)``. Both subsets are re-indexed
        from 0. ``class_weight`` maps the ``toxic`` labels 0 and 1 to
        ``n / 2 / count(label)``, computed over all of ``data`` (not only the
        training subset).

    Side effects
    ------------
    Prints ``describe()`` of the ``toxic`` column for each subset.
    """
    if data is None:
        data = train

    train_set = data.sample(frac=frac, random_state=random_state)
    # Validation rows are exactly the index labels that were not sampled.
    val_set = data.drop(train_set.index)

    n_rows = len(data)
    # Vectorised Series.sum(); the builtin sum() iterates element by element.
    n_toxic = data['toxic'].sum()
    class_weight = {
        0: n_rows / 2 / (n_rows - n_toxic),
        1: n_rows / 2 / n_toxic,
    }

    train_set = train_set.reset_index(drop=True)
    val_set = val_set.reset_index(drop=True)

    print(train_set['toxic'].describe())
    print(val_set['toxic'].describe())

    return train_set, val_set, class_weight

In [7]:
# Draw the train/validation split; sample() also prints describe() of each subset's 'toxic' column.
train_set, val_set, class_weight = sample()

count    127657.000000
mean          0.095772
std           0.294280
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: toxic, dtype: float64
count    31914.000000
mean         0.096133
std          0.294779
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: toxic, dtype: float64


In [40]:
from keras.preprocessing import text, sequence

max_features = 20000  # provisional; reassigned from the fitted vocabulary size in the next cell
maxlen = 128          # sequence length passed to pad_sequences below

#tokenizer = text.Tokenizer(num_words=max_features)
tokenizer = text.Tokenizer()
# Fit the vocabulary on the train and test comments together.
# pd.concat replaces Series.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; the concatenated values are the same.
tokenizer.fit_on_texts(pd.concat([train['comment_text'], test['comment_text']]))

In [41]:
# Inverse lookup: tokenizer index -> word.
r_word_index = dict((idx, word) for word, idx in tokenizer.word_index.items())

# Vocabulary size plus one extra slot (index 0 is not among the word indices).
max_features = len(r_word_index) + 1
print(max_features)

394787


In [42]:
def _encode_comments(frame):
    """Convert a frame's comment_text column into a padded matrix of token ids."""
    token_ids = tokenizer.texts_to_sequences(frame['comment_text'])
    return sequence.pad_sequences(token_ids, maxlen=maxlen)


X_tr = _encode_comments(train_set)
X_va = _encode_comments(val_set)
X_te = _encode_comments(test)

# Target columns, in the order the model's outputs are interpreted later on.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
#other = ["severe_toxic", "obscene", "threat", "insult", "identity_hate"]

y_tr = train_set.loc[:, list_classes]
y_va = val_set.loc[:, list_classes]

In [43]:
# Disabled GloVe loader. It is wrapped in a bare string literal, so none of it
# executes; as the cell's last expression the string itself is what the
# notebook displays. The embeddings_index dict it would build is only
# referenced by the disabled lookup in the embedding-matrix cell.
"""
EMBEDDING_DIM = 300

from tqdm import tqdm

embeddings_index = {}
f = open("data/glove.6B.300d.txt".format(EMBEDDING_DIM))
for line in tqdm(f):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))
"""

'\nEMBEDDING_DIM = 300\n\nfrom tqdm import tqdm\n\nembeddings_index = {}\nf = open("data/glove.6B.300d.txt".format(EMBEDDING_DIM))\nfor line in tqdm(f):\n    values = line.split()\n    word = values[0]\n    coefs = np.asarray(values[1:], dtype=\'float32\')\n    embeddings_index[word] = coefs\nf.close()\n\nprint(\'Found %s word vectors.\' % len(embeddings_index))\n'

In [44]:
word_index = tokenizer.word_index

# Row i holds the fastText vector of the word whose tokenizer index is i.
# The matrix has one more row than there are words; any row the loop does not
# assign stays all-zero.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_matrix[i] = fasttext_skipgram_emb.get_word_vector(word)
    # Alternative lookup against a preloaded ``embeddings_index`` dict
    # (disabled; previously kept as a bare string statement in the loop body):
    #   embedding_vector = embeddings_index.get(word)
    #   if embedding_vector is not None:
    #       # words not found in embedding index will be all-zeros.
    #       embedding_matrix[i] = embedding_vector

In [55]:
from keras.models import Model
from keras.layers import Dense, Embedding, Input, Flatten, Activation
from keras.layers import Conv1D, MaxPooling1D, Average, Lambda, RepeatVector, LSTM, Bidirectional, GlobalMaxPool1D, Dropout, GRU, Conv1D, Reshape, MaxPooling1D, Concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.regularizers import l2
from keras.constraints import non_neg, unit_norm

# Dropout rate used by get_model_mix_layer (and by its disabled GRU variants).
DROPOUT=0.5

def mixing_layer(pred, emb):
    """Concatenate ``emb`` with a masked, repeated copy of ``pred``.

    ``pred`` is repeated ``len(list_classes)`` times and multiplied by a square
    mask that is 1 everywhere except 0 on the diagonal, so row ``k`` of the
    result has entry ``k`` zeroed. The masked tensor is then concatenated
    with ``emb``.

    NOTE(review): assumes ``pred`` is (batch, n_classes) and that ``emb`` has
    a shape compatible with the repeated tensor -- confirm against callers.
    """
    repeated = RepeatVector(len(list_classes))(pred)
    # 1 - I is the same off-diagonal mask as ones - eye.
    masked = Lambda(
        lambda t: t * (1.0 - np.eye(len(list_classes)))
    )(repeated)
    return Concatenate()([emb, masked])


def split_dense(emb_layer, activation):
    """Apply an independent one-unit Dense head to each class slice.

    Parameters
    ----------
    emb_layer : tensor
        Indexed as ``x[:, i, :]``, so it must be 3-D with one slice per entry
        of ``list_classes`` along axis 1.
    activation : str or callable
        Activation passed to every ``Dense(1)`` head.

    Returns
    -------
    tensor
        The per-class head outputs concatenated by ``Concatenate()``.
    """
    # Bind the loop index as a default argument. A plain ``lambda x: x[:, i, :]``
    # closes over the variable ``i`` itself, so any later re-invocation of the
    # layer (or a serialised copy of it) would use the final index for every slice.
    slices = [
        Lambda(lambda x, i=i: x[:, i, :])(emb_layer)
        for i in range(len(list_classes))
    ]
    heads = [Dense(1, activation=activation)(s) for s in slices]
    return Concatenate()(heads)


def get_model_mix_layer():
    """Build and compile the convolutional multi-label classifier.

    Layer stack: frozen Embedding initialised from ``embedding_matrix`` ->
    Conv1D(64, 7, selu) -> MaxPooling1D(2) -> Conv1D(64, 7, selu) ->
    GlobalMaxPool1D -> Dropout(DROPOUT) -> Dense(64, selu) -> Dense(6, sigmoid).

    Despite the function name, the ``split_dense`` / ``mixing_layer`` head is
    not part of this model. That head, a third conv/pool stage and
    bidirectional GRU encoders were all disabled experiments in the original.

    Returns
    -------
    keras.models.Model
        Compiled with binary cross-entropy loss, the Adam optimizer, and
        accuracy plus binary cross-entropy as metrics.
    """
    tokens = Input(shape=(maxlen, ))

    # Pretrained vectors are loaded as fixed (non-trainable) weights.
    vocab_size, emb_dim = embedding_matrix.shape
    embedded = Embedding(
        vocab_size,
        emb_dim,
        weights=[embedding_matrix],
        trainable=False,
    )(tokens)

    # Two width-7 convolutions with a stride-2 max-pool between them.
    features = Conv1D(64, 7, activation='selu')(embedded)
    features = MaxPooling1D(2)(features)
    features = Conv1D(64, 7, activation='selu')(features)

    # Collapse the time axis, then regularise.
    features = GlobalMaxPool1D()(features)
    features = Dropout(DROPOUT)(features)

    hidden = Dense(64, activation='selu')(features)
    # Six independent sigmoid outputs (list_classes has six labels).
    scores = Dense(6, activation='sigmoid')(hidden)

    model = Model(inputs=tokens, outputs=scores)
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy', 'binary_crossentropy'],
    )
    return model

# Instantiate the network and print its layer-by-layer summary.
model = get_model_mix_layer()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_24 (InputLayer)        (None, 128)               0         
_________________________________________________________________
embedding_23 (Embedding)     (None, 128, 300)          118436400 
_________________________________________________________________
conv1d_41 (Conv1D)           (None, 122, 64)           134464    
_________________________________________________________________
max_pooling1d_27 (MaxPooling (None, 61, 64)            0         
_________________________________________________________________
conv1d_42 (Conv1D)           (None, 55, 64)            28736     
_________________________________________________________________
global_max_pooling1d_16 (Glo (None, 64)                0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 64)                0         
__________

In [56]:
np.random.seed(777)

# Training schedule. `epochs` is only a ceiling, because the EarlyStopping
# callback below has patience=5 on val_loss.
batch_size = 1024
epochs = 2000

# Keep only the weights with the best val_loss seen so far.
file_path = "weights_base.best.hdf5"
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', save_best_only=True)
early = EarlyStopping(monitor="val_loss", mode="min", patience=5)
callbacks_list = [checkpoint, early] #early

fit_options = dict(
    validation_data=(X_va, y_va),
    batch_size=batch_size,
    epochs=epochs,
    shuffle=True,
    class_weight=None,
    callbacks=callbacks_list,
)
model.fit(X_tr, y_tr, **fit_options)

Train on 127657 samples, validate on 31914 samples
Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
 15360/127657 [==>...........................] - ETA: 28s - loss: 0.0369 - acc: 0.9857 - binary_crossentropy: 0.0369

KeyboardInterrupt: 

In [None]:
# Restore the checkpointed weights before scoring.
model.load_weights(file_path)

# Per-class scores for the validation rows, then hard decisions at 0.5.
pred = model.predict(X_va, batch_size=128)
int_pred = np.greater_equal(pred, 0.5)

In [None]:
from sklearn import metrics

# Per-class validation report: scalar metrics, confusion-matrix counts,
# a precision-recall curve and a ROC curve.
for i, c in enumerate(list_classes):
    # Ground truth, predicted score and thresholded prediction for this class.
    y_true = val_set[c]
    y_prob = pred[:, i]
    y_hat = int_pred[:, i]

    print(c)
    print("roc:\t\t%.3f" % metrics.roc_auc_score(y_true, y_prob))
    print("f1:\t\t%.3f" % metrics.f1_score(y_true, y_hat))
    print("precision:\t%.3f" % metrics.precision_score(y_true, y_hat))
    print("recall:\t\t%.3f" % metrics.recall_score(y_true, y_hat))
    print("log loss:\t%.3f" % metrics.log_loss(y_true, y_prob))

    # Indexed as m[true, predicted].
    m = metrics.confusion_matrix(y_true, y_hat)
    tp = m[1,1]
    fp = m[0,1]
    tn = m[0,0]
    fn = m[1,0]
    print("tp:\t\t%d"%tp)
    print("fp:\t\t%d"%fp)
    print("tn:\t\t%d"%tn)
    print("fn:\t\t%d"%fn)

    print("tpr:\t\t%.3f"%(tp / (tp+fn)))
    print("fpr:\t\t%.3f"%(fp / (fp+tn)))

    # The thresholds returned alongside the curve are not needed for plotting.
    precision, recall, _pr_thresholds = metrics.precision_recall_curve(y_true, y_prob)
    plt.figure(0)
    plt.step(recall, precision, color='b', alpha=0.2,
         where='post')
    plt.fill_between(recall, precision, step='post', alpha=0.2,
         color='b')

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])
    plt.title('2-class Precision-Recall curve:')
    plt.show()

    fpr, tpr, _roc_thresholds = metrics.roc_curve(y_true, y_prob)
    plt.figure(1)
    # Both lines carry labels so plt.legend() has entries to show; without
    # labelled artists it draws an empty legend and emits a warning.
    plt.plot([0, 1], [0, 1], 'k--', label='chance')
    plt.plot(fpr, tpr, label=c)
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    plt.show()

In [None]:
# Print one randomly chosen validation comment labelled toxic that the model
# scored at 0.1 or below in output column 0 ('toxic' is first in list_classes).
confident_miss = (pred[:, 0] <= 0.1) & (val_set['toxic'] == 1)
for i, row in val_set[confident_miss].sample(1).iterrows():
    print(row['comment_text'])
    print(row)

In [None]:
# Restore the best checkpoint and score the test set in this cell, so that
# y_test is defined before it is used (it was previously read here without
# any earlier cell assigning it).
model.load_weights(file_path)
y_test = model.predict(X_te, batch_size=2048)

sample_submission = pd.read_csv("./data/sample_submission.csv")

# One prediction column per label, in list_classes order.
sample_submission[list_classes] = y_test

sample_submission.to_csv("baseline.csv", index=False)

In [None]:
model.load_weights(file_path)

# The model built by get_model_mix_layer() has a single Input of shape
# (maxlen,), so it is fed X_te alone. The previous call also passed
# test[manual_features], but manual_features is not defined in the visible
# notebook and the model has no second input to receive it.
y_test = model.predict(X_te, batch_size=2048)

sample_submission = pd.read_csv("./data/sample_submission.csv")
# One prediction column per label, in list_classes order.
sample_submission[list_classes] = y_test
sample_submission.to_csv("baseline.csv", index=False)

In [None]:
# A notebook cell only displays its last expression, so print the single-word
# lookup explicitly instead of discarding it.
print(tokenizer.word_counts['pagan'])

# Show the 20 most frequent words rather than dumping every entry of the
# word-count mapping into the cell output.
sorted(tokenizer.word_counts.items(), key=lambda kv: kv[1], reverse=True)[:20]