In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Seed NumPy's global random generator so random draws made after this point
# repeat from run to run.
np.random.seed(2)

In [2]:
# Read the train and test CSVs, using the first column of each file as the index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("data/train.csv", "data/test.csv")
)

In [3]:
def sample2(frac=0.8, random_state=None):
    """Split the module-level ``train`` frame into training and validation parts.

    A random ``frac`` share of the rows becomes the training split and the
    remaining rows become the validation split. Both splits get a fresh
    0..n-1 index, and summary statistics of their ``toxic`` column are printed.

    Parameters
    ----------
    frac : float, default 0.8
        Fraction of the rows of ``train`` sampled into the training split.
    random_state : optional
        Forwarded to ``DataFrame.sample``. The default ``None`` passes no
        explicit seed, which is what the function did before this parameter
        existed.

    Returns
    -------
    tuple
        ``(train_set, val_set, class_weight)``. ``class_weight`` is a dict
        ``{0: w0, 1: w1}`` where each weight is
        ``len(train) / 2 / <number of rows of that class>``. The counts are
        taken on the full ``train`` frame, not on the sampled split.
    """
    train_set = train.sample(frac=frac, random_state=random_state)
    # Validation rows are selected by index label.
    # NOTE(review): assumes the labels in train.index are unique - confirm.
    val_set = train.drop(train_set.index)

    n_rows = len(train)
    # Vectorised count of positive rows, computed once instead of iterating
    # the whole column twice with the built-in sum().
    n_toxic = train['toxic'].sum()
    class_weight = {
        0: n_rows / 2 / (n_rows - n_toxic),
        1: n_rows / 2 / n_toxic,
    }

    train_set = train_set.reset_index(drop=True)
    val_set = val_set.reset_index(drop=True)

    print(train_set['toxic'].describe())
    print(val_set['toxic'].describe())

    return train_set, val_set, class_weight

In [4]:
# Draw the train/validation split; sample2 also prints summary statistics of
# the `toxic` column for both parts.
train_set, val_set, class_weight = sample2()

count    127657.000000
mean          0.096133
std           0.294774
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: toxic, dtype: float64
count    31914.000000
mean         0.094692
std          0.292794
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: toxic, dtype: float64


In [5]:
from keras.preprocessing import text, sequence

# Passed to the tokenizer as num_words and used as the Embedding input size
# by the model builders further down.
max_features = 20000
# Passed as maxlen to pad_sequences and used as the model Input length.
maxlen = 256

# The tokenizer is fitted on the training split only (not on val_set or test).
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_set['comment_text'])

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [6]:
# Invert tokenizer.word_index so that its values become the lookup keys.
r_word_index = {token_id: word for word, token_id in tokenizer.word_index.items()}

In [7]:
# Tokenise and pad the comment text of each frame (train split, validation
# split, test set - in that order) with the fitted tokenizer.
X_tr, X_va, X_te = (
    sequence.pad_sequences(
        tokenizer.texts_to_sequences(frame['comment_text']),
        maxlen=maxlen,
    )
    for frame in (train_set, val_set, test)
)

# Label columns; `other` is every label except the leading "toxic".
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
other = list_classes[1:]

# Label frames for the two splits.
y_tr, y_va = train_set[list_classes], val_set[list_classes]

In [33]:
from keras.models import Model
from keras.layers import Dense, Embedding, Input, Flatten, Activation
from keras.layers import Lambda, RepeatVector, LSTM, Bidirectional, GlobalMaxPool1D, Dropout, GRU, Conv1D, Reshape, MaxPooling1D, Concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.regularizers import l2
from keras.constraints import non_neg, unit_norm

# Dropout rate used by get_model_mix_layer.
DROPOUT=0.1

def encoder(inp, embed_dim=2048):
    """Encode a token-id tensor as the global max over a learned embedding.

    Parameters
    ----------
    inp : Keras tensor
        Tensor the ``Embedding`` layer is applied to; ``get_model_orig`` passes
        an ``Input(shape=(maxlen,))``.
    embed_dim : int, default 2048
        Output dimension of the embedding. The default is the value that used
        to be hard-coded here.

    Returns
    -------
    Keras tensor
        Output of ``GlobalMaxPool1D`` applied to the embedded sequence.

    Notes
    -----
    Two alternative encoders that were disabled inside string literals (a
    frozen pretrained embedding feeding a bidirectional GRU, and a two-layer
    Conv1D stack) have been removed. They referenced names (``word_index``,
    ``EMBEDDING_DIM``, ``embedding_matrix``, ``LSTM_SIZE``, ``CONV_SIZE``)
    that are not defined in the visible notebook cells.
    """
    embedded = Embedding(max_features, embed_dim)(inp)
    return GlobalMaxPool1D()(embedded)


def mixing_layer(pred, max_pool):
    """Append to every class row the first-pass predictions of the other classes.

    ``pred`` is repeated ``len(list_classes)`` times, the diagonal of the
    resulting square block is multiplied by zero (so a row does not receive
    its own class's prediction), and the result is concatenated with
    ``max_pool``.

    NOTE(review): assumes ``pred`` holds one value per entry of
    ``list_classes``, as it does when called from ``get_model_mix_layer``.
    """
    def _mask_own_class(repeated):
        # Reads only module globals (no captured locals), like the lambda it replaces.
        n_classes = len(list_classes)
        off_diagonal = np.ones([n_classes, n_classes]) - np.eye(n_classes)
        return repeated * off_diagonal

    others_pred = RepeatVector(len(list_classes))(pred)
    others_pred = Lambda(_mask_own_class)(others_pred)
    return Concatenate()([max_pool, others_pred])

def get_model_mix_layer():
    """Build and compile the two-pass "mixing" classifier.

    Pipeline: embedding of width ``len(list_classes) * 64`` -> global max
    pooling -> SELU -> dropout -> reshape into one feature row per class ->
    per-row sigmoid (first-pass prediction) -> ``mixing_layer`` -> per-row
    sigmoid (final prediction). The model is compiled with binary
    cross-entropy loss and the Adam optimizer.

    Returns
    -------
    keras Model
        Compiled model taking ``(maxlen,)`` token ids as input.
    """
    n_classes = len(list_classes)
    embedding_width = n_classes * 64

    tokens = Input(shape=(maxlen, ))
    pooled = Embedding(max_features, embedding_width)(tokens)
    pooled = GlobalMaxPool1D()(pooled)
    pooled = Activation('selu')(pooled)
    pooled = Dropout(DROPOUT)(pooled)

    # Split the pooled vector into one row of features per class.
    per_class = Reshape([n_classes, -1])(pooled)

    # First-pass prediction: a single sigmoid unit applied to each class row.
    first_pass = Dense(1, activation="sigmoid")(per_class)
    first_pass = Flatten()(first_pass)

    mixed = mixing_layer(first_pass, per_class)

    # Final prediction, made after the class rows have been mixed.
    final = Dense(1, activation='sigmoid')(mixed)
    final = Flatten()(final)

    model = Model(inputs=tokens, outputs=final)
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy', 'binary_crossentropy'],
    )
    return model


def get_model_orig():
    """Build and compile a multi-label classifier on top of ``encoder``.

    The model declares two inputs, the padded token ids and a vector of
    ``len(manual_features)`` hand-made features, and outputs
    ``len(other) + 1`` sigmoid scores. It is compiled with binary
    cross-entropy loss and the Adam optimizer.

    NOTE(review): ``manual_features`` is not defined in the visible notebook
    cells; it must exist at module level before this function is called.

    NOTE(review): ``feat_inp`` is listed as a model input, but the
    ``Concatenate`` that joined it to the encoder output is disabled, so it
    is not connected to the prediction. It is kept so that callers feeding
    two inputs keep working.

    Returns
    -------
    keras Model
        Compiled model with inputs ``[inp, feat_inp]``.
    """
    inp = Input(shape=(maxlen, ))
    feat_inp = Input(shape=(len(manual_features), ))

    x = encoder(inp)

    pred = Dense(len(other) + 1, activation="sigmoid")(x)

    model = Model(inputs=[inp, feat_inp], outputs=pred)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy', 'binary_crossentropy'])

    return model


def get_model():
    """Build and compile a Conv1D + bidirectional LSTM multi-label classifier.

    Pipeline: L2-regularised embedding -> four identical
    ``Conv1D(kernel 2, SELU)`` / ``MaxPooling1D(2)`` stages -> bidirectional
    LSTM -> one sigmoid unit per entry of ``list_classes``.

    NOTE(review): reads ``EMB_SIZE``, ``CONV_SIZE`` and ``LSTM_SIZE`` from
    module scope; they are not defined in the visible notebook cells.

    Returns
    -------
    keras Model
        Model compiled with binary cross-entropy loss and the Adam optimizer.
    """
    tokens = Input(shape=(maxlen, ))
    features = Embedding(max_features, EMB_SIZE, embeddings_regularizer=l2(1e-3))(tokens)

    # Four convolution/pooling stages with the same hyper-parameters.
    for _ in range(4):
        features = Conv1D(CONV_SIZE, 2, strides=1, padding='valid', activation='selu')(features)
        features = MaxPooling1D(2)(features)

    features = Bidirectional(LSTM(LSTM_SIZE))(features)
    scores = Dense(len(list_classes), activation="sigmoid")(features)

    model = Model(inputs=tokens, outputs=scores)
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy', 'binary_crossentropy'],
    )
    return model

# Instantiate the mixing-layer variant (the other builders defined in this cell
# are not called here) and print its layer table.
model = get_model_mix_layer()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            (None, 256)          0                                            
__________________________________________________________________________________________________
embedding_8 (Embedding)         (None, 256, 384)     7680000     input_8[0][0]                    
__________________________________________________________________________________________________
global_max_pooling1d_8 (GlobalM (None, 384)          0           embedding_8[0][0]                
__________________________________________________________________________________________________
activation_8 (Activation)       (None, 384)          0           global_max_pooling1d_8[0][0]     
__________________________________________________________________________________________________
dropout_7 

In [34]:
np.random.seed(123)
batch_size = 40
epochs = 2

file_path="weights_base.best.hdf5"
# Pick the "best" weights on the held-out validation split rather than on the
# training loss: the weights saved here are the ones reloaded for prediction.
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', mode='min', save_best_only=True)
# With patience=5 and epochs=2, early stopping cannot trigger; it only takes
# effect once `epochs` is raised above the patience.
early = EarlyStopping(monitor="val_loss", mode="min", patience=5)
callbacks_list = [checkpoint, early]
model.fit(
    X_tr,
    y_tr,
    class_weight=None,
    # Required so that `val_loss` exists for the two callbacks above.
    validation_data=(X_va, y_va),
    shuffle=True,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=callbacks_list
)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7ff630e97e48>

In [35]:
# Restore the weights written to file_path by the ModelCheckpoint callback
# before predicting.
model.load_weights(file_path)

y_test = model.predict(X_te, batch_size=2048)

# Write the predictions into the label columns of the sample submission.
# NOTE(review): assumes the rows of sample_submission.csv are in the same order
# as the rows of data/test.csv - confirm.
sample_submission = pd.read_csv("./data/sample_submission.csv")
sample_submission[['toxic'] + other] = y_test
sample_submission.to_csv("baseline.csv", index=False)

In [8]:
# Predictions on the training split. Needs `model` from the model-building and
# training cells; the NameError recorded for this cell (execution count 8)
# suggests it was run in a kernel where those cells had not been executed yet.
y_train = model.predict(X_tr, batch_size=2048)

NameError: name 'model' is not defined