In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

np.random.seed(2)

In [2]:
train = pd.read_csv("data/train.csv", index_col=0)
test = pd.read_csv("data/test.csv", index_col=0)

In [3]:
train['comment_text'] = train['comment_text'].astype(str)
test['comment_text'] = test['comment_text'].astype(str)

In [4]:
def log_col(df, cols):
    for col in cols:
        print(col)
        df['log_'+col] = np.log(df[col]+0.1)
        scaler = StandardScaler()
        df['log_'+col] = scaler.fit_transform(df['log_'+col].values.reshape(-1, 1))
        df['log_'+col].hist()
        plt.show()
    return df

In [5]:
manual_features = ['comment_length', 'num_?', 'num_!', 'num_*', 'num_upper', 'num_you']

In [6]:
train = log_col(train, manual_features)
test = log_col(test, manual_features)

comment_length


KeyError: 'comment_length'

In [7]:
def sample1():
    train_pos = train[train["toxic"] == 1]
    train_neg = train[train["toxic"] == 0]

    print(train_pos.shape)
    print(train_neg.shape)

    train_set = train_pos.sample(frac=0.8)
    train_set = train_set.append(train_neg.sample(n=train_set.shape[0]))

    val_set = train.drop(train_set.index)

    train_set.reset_index(drop=True)
    val_set.reset_index(drop=True)

    print(train_set['toxic'].describe())
    print(val_set['toxic'].describe())
    
    return train_set, val_set, {0: 1, 1:1}

In [4]:
def sample2():
    train_set = train.sample(frac=0.95)
    val_set = train.drop(train_set.index)
    
    class_weight = {0: len(train) / 2 / (len(train) - sum(train['toxic'])), 1: len(train) / 2 / sum(train['toxic']) }
    
    train_set = train_set.reset_index(drop=True)
    val_set = val_set.reset_index(drop=True)
    
    print(train_set['toxic'].describe())
    print(val_set['toxic'].describe())
    
    return train_set, val_set, class_weight

In [5]:
train_set, val_set, class_weight = sample2()

count    151592.000000
mean          0.095955
std           0.294530
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: toxic, dtype: float64
count    7979.000000
mean        0.093746
std         0.291493
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: toxic, dtype: float64


In [10]:
from keras.preprocessing import text, sequence

max_features = 20000
maxlen = 256

tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_set['comment_text'])

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [11]:
r_word_index = { v:k for k, v in tokenizer.word_index.items()}

In [12]:
X_tr = sequence.pad_sequences(tokenizer.texts_to_sequences(train_set['comment_text']), maxlen=maxlen)
X_va = sequence.pad_sequences(tokenizer.texts_to_sequences(val_set['comment_text']), maxlen=maxlen)
X_te = sequence.pad_sequences(tokenizer.texts_to_sequences(test['comment_text']), maxlen=maxlen)

list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
other = ["severe_toxic", "obscene", "threat", "insult", "identity_hate"]

y_tr = train_set[list_classes]
y_va = val_set[list_classes]

In [13]:
train_set[manual_features].shape

KeyError: "['comment_length' 'num_?' 'num_!' 'num_*' 'num_upper' 'num_you'] not in index"

In [14]:
EMBEDDING_DIM = 300


from tqdm import tqdm

embeddings_index = {}
f = open("data/glove.6B.300d.txt".format(EMBEDDING_DIM))
for line in tqdm(f):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

FileNotFoundError: [Errno 2] No such file or directory: 'data/glove.6B.300d.txt'

In [15]:
word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [16]:
embedding_matrix.shape

(183824, 300)

In [17]:
from keras.models import Model
from keras.layers import Dense, Embedding, Input, Flatten, Activation
from keras.layers import Lambda, RepeatVector, LSTM, Bidirectional, GlobalMaxPool1D, Dropout, GRU, Conv1D, Reshape, MaxPooling1D, Concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.regularizers import l2
from keras.constraints import non_neg, unit_norm

CONV_SIZE = EMBEDDING_DIM
LSTM_SIZE = 128
DROPOUT=0.1

def encoder(inp):
    
    glove2 = Embedding(max_features, 2048)(inp)
    enc2 = GlobalMaxPool1D()(glove2)

    """
    glove = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            trainable=False)(inp)
    enc = Bidirectional(GRU(LSTM_SIZE, dropout=0.1, recurrent_dropout=0.1, activation='selu'))(glove)
    """
    
    """
    enc = Conv1D(CONV_SIZE * 2, 2, strides=1, padding='valid', activation='relu')(glove)
    enc = MaxPooling1D(4)(enc)
    enc = Conv1D(CONV_SIZE * 4, 2, strides=1, padding='valid', activation='relu')(enc)
    enc = GlobalMaxPool1D()(enc)
    
    enc = Flatten()(enc)
    """
    #enc = Concatenate()([enc2, enc])
    return enc2


def mixing_layer(pred, max_pool):
    x = RepeatVector(len(list_classes))(pred)
    x = Lambda(lambda x: x * (np.ones([len(list_classes), len(list_classes)]) - np.eye(len(list_classes))))(x)
    c = Concatenate()([max_pool, x])
    return c

def get_model_mix_layer():
    embed_size = len(list_classes) * 64
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    x = GlobalMaxPool1D()(x)
    x = Activation('selu')(x)
    
    x = Dropout(DROPOUT)(x)    
    max_pool = Reshape([len(list_classes), -1])(x)

    
    pred = Dense(1, activation="sigmoid")(max_pool)
    pred = Flatten()(pred)
    
    mix = mixing_layer(pred, max_pool)
    
    x = Dense(1, activation='sigmoid')(mix)
    x = Flatten()(x)

    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy', 'binary_crossentropy'])

    return model


def get_model_orig():
    embed_size = 32
    inp = Input(shape=(maxlen, ))
    #other_inp = Input(shape=(len(other), ))
    feat_inp = Input(shape=(len(manual_features), ))
    
    """

    x = Embedding(max_features, embed_size)(inp)
    """
    enc = encoder(inp)

    """
    g = Bidirectional(GRU(LSTM_SIZE, return_sequences=True))(g)
    g = Bidirectional(GRU(LSTM_SIZE))(g)

    g = Conv1D(CONV_SIZE, 2, strides=1, padding='valid', activation='selu')(g)
    g = MaxPooling1D(2)(g)
    g = Conv1D(CONV_SIZE, 2, strides=1, padding='valid', activation='selu')(g)
    g = MaxPooling1D(2)(g)
    g = Conv1D(CONV_SIZE, 2, strides=1, padding='valid', activation='selu')(g)
    g = MaxPooling1D(2)(g)
    g = Conv1D(CONV_SIZE, 2, strides=1, padding='valid', activation='selu')(g)
    g = MaxPooling1D(2)(g)    
    glove_emb_max = GlobalMaxPool1D()(g)
    """
    
    #x = GlobalMaxPool1D()(x)
    #others = Dense(10, activation="selu")(other_inp)
    
    #x = Concatenate()([enc, feat_inp])
    x=enc
    #x = Dense(100, activation="selu")(x)
    
    pred = Dense(len(other)+1, activation="sigmoid"
                 #bias_constraint=non_neg(), 
                 #kernel_constraint=non_neg()
                )(x)
    
    model = Model(inputs=[inp, feat_inp], outputs=pred)
    
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy', 'binary_crossentropy'])

    return model


def get_model():
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, EMB_SIZE, embeddings_regularizer=l2(1e-3))(inp)
    #x = Reshape((maxlen, embed_size, 1))(x)
    #x = Bidirectional(LSTM(LSTM_SIZE, return_sequences=True))(x)
    x = Conv1D(CONV_SIZE, 2, strides=1, padding='valid', activation='selu')(x)
    x = MaxPooling1D(2)(x)
    x = Conv1D(CONV_SIZE, 2, strides=1, padding='valid', activation='selu')(x)
    x = MaxPooling1D(2)(x)
    x = Conv1D(CONV_SIZE, 2, strides=1, padding='valid', activation='selu')(x)
    x = MaxPooling1D(2)(x)
    x = Conv1D(CONV_SIZE, 2, strides=1, padding='valid', activation='selu')(x)
    x = MaxPooling1D(2)(x)
    x = Bidirectional(LSTM(LSTM_SIZE))(x)
    #x = Dense(LSTM_SIZE, activation="selu")(x)
    x = Dense(len(list_classes), activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy', 'binary_crossentropy'])

    return model

model = get_model_mix_layer()
model.summary()

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 256)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 256, 384)     7680000     input_1[0][0]                    
__________________________________________________________________________________________________
global_max_pooling1d_1 (GlobalM (None, 384)          0           embedding_1[0][0]                
________________________________________________________________________________________

In [18]:
batch_size = 32
epochs = 2

file_path="weights_base.best.hdf5"
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', save_best_only=True)
early = EarlyStopping(monitor="val_loss", mode="min", patience=5)
callbacks_list = [checkpoint, early] #early
model.fit(
    X_tr, 
    y_tr, 
    class_weight=None, 
    validation_data=(X_va, y_va), 
    shuffle=True, 
    batch_size=batch_size, 
    epochs=epochs, 
    callbacks=callbacks_list
)

Train on 127657 samples, validate on 31914 samples
Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
 15168/127657 [==>...........................] - ETA: 3:18 - loss: 0.0322 - acc: 0.9872 - binary_crossentropy: 0.0322

KeyboardInterrupt: 

In [None]:
model.load_weights(file_path)


pred = model.predict(X_va, batch_size=128)
int_pred = pred >= 0.5

In [None]:
from sklearn import metrics

for i, c in enumerate(list_classes):
    print(c)
    print("roc:\t\t%.3f" % metrics.roc_auc_score(val_set[c], pred[:,i]))
    print("f1:\t\t%.3f" % metrics.f1_score(val_set[c], int_pred[:,i]))
    print("precision:\t%.3f" % metrics.precision_score(val_set[c], int_pred[:,i]))
    print("recall:\t\t%.3f" %metrics.recall_score(val_set[c], int_pred[:,i]))
    print("log loss:\t%.3f" %metrics.log_loss(val_set[c], pred[:,i]))

    m = metrics.confusion_matrix(val_set[c], int_pred[:,i])
    tp = m[1,1]
    fp = m[0,1]
    tn = m[0,0]
    fn = m[1,0]
    print("tp:\t\t%d"%tp)
    print("fp:\t\t%d"%fp)
    print("tn:\t\t%d"%tn)
    print("fn:\t\t%d"%fn)
    
    print("tpr:\t\t%.3f"%(tp / (tp+fn)))
    print("fpr:\t\t%.3f"%(fp / (fp+tn)))

    
    precision, recall, threshold = metrics.precision_recall_curve(val_set[c], pred[:,i])
    plt.figure(0)
    plt.step(recall, precision, color='b', alpha=0.2,
         where='post')
    plt.fill_between(recall, precision, step='post', alpha=0.2,
         color='b')

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])
    plt.title('2-class Precision-Recall curve:')
    plt.show()
    
    fpr, tpr, _ = metrics.roc_curve(val_set[c], pred[:,i])
    plt.figure(1)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr, tpr)
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    plt.show()

In [None]:
for i, row in val_set[(pred[:, 0] <= 0.1) & (val_set['toxic'] == 1)].sample(1).iterrows():
    print(row['comment_text'])
    print(row)

In [None]:
y_va != pred_va

In [None]:
model.load_weights(file_path)
sample_submission = pd.read_csv("./data/sample_submission.csv")

sample_submission[list_classes] = y_test

sample_submission.to_csv("baseline.csv", index=False)

In [19]:
model.load_weights(file_path)

y_test = model.predict(X_te, batch_size=2048)

sample_submission = pd.read_csv("./data/sample_submission.csv")
sample_submission[['toxic'] + other] = y_test
sample_submission.to_csv("baseline.csv", index=False)