In [1]:
import numpy as np
import pandas as pd
from lib import load_train, load_test
from sklearn.preprocessing import minmax_scale

In [2]:
# Load the training frame used by every later cell.
# NOTE(review): `nrows=100000` presumably caps how many rows `load_train` reads — confirm in `lib`.
train = load_train(nrows=100000)

In [3]:
# Categorical feature columns. Later cells group by these columns and read
# their `.cat.codes`, so they are expected to have pandas `category` dtype.
cat_cols = ['ip', 'app', 'device', 'os', 'channel']

In [4]:
# Per-column frequency features: for every categorical column, attach
# `log_<col>_count`, the min-max scaled log of how often each value occurs.
for col in cat_cols:
    log_col = 'log_%s_count'%col
    # Drop any result from a previous run of this cell; otherwise the merge
    # below would emit suffixed duplicates (`_x` / `_y`) instead of `log_col`.
    train = train.drop(columns=[log_col], errors='ignore')
    value_counts = train.groupby(col).size()
    # Grouping a categorical column can report unused categories with size 0;
    # log(0) is -inf, which minmax_scale rejects, so keep observed values only.
    value_counts = value_counts[value_counts > 0]
    col_count = value_counts.to_frame(log_col).reset_index()
    col_count[log_col] = minmax_scale(np.log(col_count[log_col]))
    # Inner join on the column value broadcasts the per-value feature to rows.
    train = pd.merge(train, col_count, on=col, how='inner')

In [5]:
# Per-ip diversity features: for every non-ip categorical column, attach
# `log_<col>_distinct_count_by_ip`, the min-max scaled log of the number of
# distinct values of that column seen for the row's ip.
for col in cat_cols[1:]:
    log_col = 'log_%s_distinct_count_by_ip'%col
    # Drop any result from a previous run of this cell; otherwise the merge
    # below would emit suffixed duplicates (`_x` / `_y`) instead of `log_col`.
    train = train.drop(columns=[log_col], errors='ignore')
    distinct_per_ip = train.groupby('ip')[col].nunique()
    # An ip group with no (non-null) values reports 0 distinct values, and
    # log(0) is -inf, which minmax_scale rejects. Such groups are excluded
    # here; any rows belonging to them are then dropped by the inner merge.
    distinct_per_ip = distinct_per_ip[distinct_per_ip > 0]
    col_count = distinct_per_ip.to_frame(log_col).reset_index()
    col_count[log_col] = minmax_scale(np.log(col_count[log_col]))
    train = pd.merge(train, col_count, on='ip', how='inner')

In [6]:
# Per categorical column, collect the category codes of the most frequently
# occurring values: rows whose `log_<col>_count` is at or above the
# (1 - EMBEDDING_TOP_QUANTILE) row-wise quantile. These code sets are handed to
# MappedEmbedding in get_model(); values outside the set presumably share a
# default embedding there (confirm in mlkit).
EMBEDDING_TOP_QUANTILE = .2
embedding_set = {}
for col in cat_cols:
    log_col = 'log_%s_count'%col
    threshold = train[log_col].quantile(1-EMBEDDING_TOP_QUANTILE)
    is_frequent = train[log_col] >= threshold
    embedding_set[col] = set(train.loc[is_frequent, col].cat.codes.unique())

In [7]:
import tensorflow as tf

def roc_auc_score(y_true, y_pred):
    """Differentiable ROC-AUC surrogate loss (lower is better).

    Based on the Wilcoxon-Mann-Whitney approximation from
    Yan, L., Dodier, R., Mozer, M. C., & Wolniewicz, R. (2003),
    "Optimizing Classifier Performance via an Approximation to the
    Wilcoxon-Mann-Whitney Statistic".

    Every (positive, negative) pair of predictions whose margin
    ``pos - neg`` falls short of ``gamma`` contributes
    ``(gamma - (pos - neg)) ** p`` to the returned sum; pairs that are
    separated by at least ``gamma`` contribute nothing.

    Arguments:
        y_true: `Tensor`. Targets; cast to bool to split positives from
            negatives.
        y_pred: `Tensor`. Predicted values, same shape as `y_true`.

    Returns:
        Scalar `Tensor`: the summed penalty over all violating pairs.
    """
    with tf.name_scope("RocAucScore"):
        is_positive = tf.cast(y_true, tf.bool)

        # Row vector of positive scores against a column vector of negative
        # scores, so arithmetic between them broadcasts to every pair.
        pos_scores = tf.expand_dims(tf.boolean_mask(y_pred, is_positive), 0)
        neg_scores = tf.expand_dims(tf.boolean_mask(y_pred, ~is_positive), 1)

        # original paper suggests performance is robust to exact parameter choice
        gamma = 0.2
        p     = 3

        pairwise_margin = tf.zeros_like(pos_scores * neg_scores) + pos_scores - neg_scores - gamma

        # Only pairs that miss the margin are penalised.
        violations = tf.boolean_mask(pairwise_margin, pairwise_margin < 0.0)

        return tf.reduce_sum(tf.pow(-violations, p))

  from ._conv import register_converters as _register_converters


In [8]:
from keras.models import Model
from keras.layers import Dense, Embedding, Input, Flatten, Activation
from keras.layers import Add,Conv1D, MaxPooling1D, Average, Lambda, RepeatVector, LSTM, Bidirectional, GlobalMaxPool1D, Dropout, GRU, Conv1D, Reshape, MaxPooling1D, Concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.regularizers import l2
from keras.constraints import non_neg, unit_norm
from mlkit.keras import MappedEmbedding

# NOTE(review): DROPOUT is not referenced by any visible cell of this notebook.
DROPOUT=0.5

# Second argument passed to MappedEmbedding in get_model()
# (presumably the embedding width — confirm in mlkit).
FREE_EMB_SIZE=8

# Names of the numeric feature columns fed to the model as extra scalar inputs:
# one `log_<col>_count` per categorical column, plus one
# `log_<col>_distinct_count_by_ip` per categorical column other than the first.
zero_one_signals = ['log_%s_count'%col for col in cat_cols] + ['log_%s_distinct_count_by_ip'%col for col in cat_cols[1:]]

def get_model():
    """Build and compile the network used throughout this notebook.

    Inputs, in order: one int32 input of shape (1,) per column in `cat_cols`,
    followed by one input of shape (1,) per name in `zero_one_signals`.
    Each categorical input goes through a `MappedEmbedding` built from
    `embedding_set[col]` and `FREE_EMB_SIZE` and is flattened; the flattened
    embeddings and the numeric inputs are concatenated, passed through a
    32-unit SELU dense layer and a single sigmoid output unit.

    Returns:
        A compiled `Model` using `roc_auc_score` as loss, the Adam optimizer,
        and accuracy / binary cross-entropy as metrics.
    """
    cat_inputs = []
    cat_embeddings = []
    for col in cat_cols:
        cat_input = Input(shape=(1, ), dtype='int32')
        flattened = Flatten()(MappedEmbedding(embedding_set[col], FREE_EMB_SIZE)(cat_input))
        cat_inputs.append(cat_input)
        cat_embeddings.append(flattened)

    # One scalar input per numeric signal; the names only fix how many we need.
    numeric_inputs = [Input(shape=(1, )) for _ in zero_one_signals]

    merged = Concatenate()(cat_embeddings + numeric_inputs)
    hidden = Dense(32, activation='selu')(merged)
    prediction = Dense(1, activation='sigmoid')(hidden)

    model = Model(inputs=cat_inputs + numeric_inputs, outputs=prediction)
    model.compile(
        loss=roc_auc_score,
        optimizer='adam',
        metrics=['accuracy', 'binary_crossentropy'],
    )
    return model

# Build the model once and print its layer-by-layer summary as a sanity check.
model = get_model()
model.summary()

Using TensorFlow backend.


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_5 (I

In [14]:
from keras.callbacks import Callback
from sklearn import metrics
import numpy as np
import keras

# Seed NumPy's global RNG.
# NOTE(review): only NumPy is seeded here; whether this also makes the Keras /
# TensorFlow weight initialisation reproducible is not visible from this notebook.
np.random.seed(777)
# Batch size shared by model.fit / model.predict calls and by ROC_Callback.
batch_size = 1024
# Upper bound on training epochs passed to model.fit.
epochs = 2000

class ROC_Callback(Callback):
    """Keras callback that prints the validation ROC AUC after every epoch.

    Arguments:
        validation_data: pair ``(x_val, y_val)``; ``x_val`` is passed to
            ``model.predict`` and ``y_val`` to ``sklearn.metrics.roc_auc_score``.

    Only ``on_epoch_end`` is overridden; every other hook keeps the no-op
    behaviour inherited from ``Callback``.
    """

    def __init__(self, validation_data):
        super(ROC_Callback, self).__init__()
        self.x_val = validation_data[0]
        self.y_val = validation_data[1]

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the held-out data and print its ROC AUC.

        `logs` defaults to None rather than a shared mutable dict; it is not
        read or modified here.
        """
        # Uses the notebook-level `batch_size` so prediction matches training.
        y_pred_val = self.model.predict(self.x_val, batch_size=batch_size)
        roc_val = metrics.roc_auc_score(self.y_val, y_pred_val)
        # The trailing spaces overwrite whatever the progress bar left on the line.
        print('roc-auc_val: %s' % str(round(roc_val,4)),end=100*' '+'\n')

In [15]:
from sklearn.model_selection import TimeSeriesSplit
from keras import backend as K
import os


def build_model_inputs(frame):
    """Return the list of per-input arrays that get_model() expects for `frame`.

    The order matches the model's inputs: the category codes of every column
    in `cat_cols`, followed by every column in `zero_one_signals`.
    """
    return [frame[col].cat.codes for col in cat_cols] + \
           [frame[col] for col in zero_one_signals]


# ModelCheckpoint writes into this directory; create it up front so the first
# checkpoint save does not fail when the directory is missing.
os.makedirs("model", exist_ok=True)

# Expanding-window cross-validation: each fold trains on an initial slice of
# `train` and validates on the slice that follows it.
tscv = TimeSeriesSplit(n_splits=5)

scores = []
for fold, (train_index, test_index) in enumerate(tscv.split(train), start=1):
    # Reset the backend state so every fold starts from a fresh model.
    K.clear_session()

    # Slice each fold once instead of re-slicing `train` for every input column.
    fold_train = train.iloc[train_index]
    fold_val = train.iloc[test_index]

    val_data = (build_model_inputs(fold_val), fold_val['is_attributed'])

    model = get_model()
    file_path="model/weights_base_%d.best.hdf5"%fold
    checkpoint = ModelCheckpoint(file_path, monitor='val_loss', save_best_only=True)
    early = EarlyStopping(monitor="val_loss", mode="min", patience=3, verbose=1)

    callbacks_list = [checkpoint, early, ROC_Callback(val_data)]

    model.fit(
        build_model_inputs(fold_train),
        fold_train['is_attributed'],
        validation_data=val_data,
        shuffle=False,
        batch_size=batch_size,
        epochs=epochs,
        callbacks=callbacks_list
    )

    # Score the fold with the checkpointed weights (best val_loss), not with
    # whatever weights the last epoch before early stopping left behind.
    model = keras.models.load_model(file_path, custom_objects={'roc_auc_score': roc_auc_score, 'MappedEmbedding': MappedEmbedding})
    pred = model.predict(val_data[0], batch_size=batch_size)
    score = metrics.roc_auc_score(val_data[1], pred)
    print(score)
    scores.append(score)

print(scores)
print(np.mean(scores))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 16670 samples, validate on 16666 samples
Epoch 1/2000
roc-auc_val: 0.5513                                                                                                    
Epoch 2/2000
roc-auc_val: 0.5452                                                                                                    
Epoch 3/2000
roc-auc_val: 0.5415                                                                                                    
Epoch 4/2000
roc-auc_val: 0.5335                                                                                                    
Epoch 5/2000
roc-auc_val: 0.5414                                                                                                    
Epoch 00005: early stopping
0.5452185136270861
Train on 33336 samples, validate on 16666 samples
Epoch 1/2000
roc-auc_val: 0.633                                                                                                    
Epoch 2/2000
roc-auc_val: 0.6628                        

In [11]:
def get_model_2():
    """Build and compile a minimal single-input variant of the model.

    A single int32 input of shape (1,) goes through a `MappedEmbedding`
    constructed with the set {0, 1, 2} and size 10, is flattened, and then
    passes through a 32-unit SELU dense layer and a sigmoid output unit.

    Returns:
        A compiled `Model` using `roc_auc_score` as loss, the Adam optimizer,
        and accuracy / binary cross-entropy as metrics.
    """
    code_input = Input(shape=(1, ), dtype='int32')
    flattened = Flatten()(MappedEmbedding({1,2,0}, 10)(code_input))
    hidden = Dense(32, activation='selu')(flattened)
    prediction = Dense(1, activation='sigmoid')(hidden)

    model = Model(inputs=code_input, outputs=prediction)
    model.compile(
        loss=roc_auc_score,
        optimizer='adam',
        metrics=['accuracy', 'binary_crossentropy'],
    )
    return model

In [12]:
# Rebuild a fresh model (replacing whatever `model` held before) and print its
# summary; the cells below use it for a save / load round-trip check.
model = get_model()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_15 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_16 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_17 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_18 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
input_19 (

In [13]:
# Fit on a single row so the model has been through one training step before
# its weights are inspected and saved.
test_index = [0]
fit_rows = train.iloc[test_index]
model.fit(
    [fit_rows[col].cat.codes for col in cat_cols] + [fit_rows[col] for col in zero_one_signals],
    fit_rows['is_attributed'],
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/1


<keras.callbacks.History at 0x11d7454a8>

In [128]:
# Inspect the weights of the layer at index 6 of the in-memory model
# (the saved output below shows a single 3x8 float32 array).
model.layers[6].get_weights()

[array([[-0.03214715, -0.16870683,  0.10500881, -0.10001985,  0.03555877,
          0.15037759, -0.17230658,  0.14564137],
        [-0.04172715,  0.13678333, -0.06963095,  0.00411195,  0.009346  ,
         -0.0918017 ,  0.08801676, -0.08637956],
        [-0.03043789,  0.04082359,  0.00169233, -0.02834171, -0.00818788,
          0.07432535, -0.01728122,  0.02455434]], dtype=float32)]

In [129]:
# Reference predictions from the in-memory model on six rows.
test_index = [1,2,3,4,5,6]
probe_rows = train.iloc[test_index]
model.predict(
    [probe_rows[col].cat.codes for col in cat_cols] + [probe_rows[col] for col in zero_one_signals]
)

array([[0.31607026],
       [0.31892636],
       [0.4487195 ],
       [0.32271594],
       [0.29100475],
       [0.33080772]], dtype=float32)

In [130]:
# Persist the in-memory model to `test.h5` in the working directory.
model.save('test.h5')

In [131]:
# Load the file written by `model.save('test.h5')` into a second model object.
# The custom loss and the custom embedding layer are passed via `custom_objects`.
model_2 = keras.models.load_model('test.h5',  custom_objects={'roc_auc_score': roc_auc_score, 'MappedEmbedding': MappedEmbedding})

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [132]:
# Inspect the weights of the second-to-last layer of the reloaded model
# (presumably the 32-unit Dense layer built in get_model — confirm via model_2.summary()).
model_2.layers[-2].get_weights()

[array([[-0.23144867, -0.12154941, -0.27586138, ...,  0.16279975,
         -0.0250041 , -0.16909423],
        [ 0.01860329,  0.00501241, -0.12694931, ...,  0.14690122,
         -0.20517546, -0.01793425],
        [-0.07709707,  0.0790906 ,  0.07163449, ..., -0.11180533,
         -0.18836391,  0.32591605],
        ...,
        [-0.26686233,  0.04572774, -0.00395839, ..., -0.02381474,
          0.09236376, -0.26723504],
        [ 0.10473236, -0.12107859,  0.06303404, ..., -0.17733975,
         -0.23299211,  0.11016463],
        [-0.18442461, -0.01593718, -0.13219152, ..., -0.06198743,
          0.291708  ,  0.23921354]], dtype=float32),
 array([-0.06696922,  0.04911892, -0.06250626,  0.03865436,  0.08189637,
         0.05301873,  0.04343357,  0.05812351,  0.01407997,  0.09384292,
        -0.0411795 , -0.05950984,  0.0006912 , -0.09105127,  0.15746282,
        -0.11641625, -0.03445261,  0.01963986, -0.0724998 ,  0.06706317,
        -0.03925785, -0.08805388, -0.00085283,  0.09721431, -0.048

In [133]:
# Predict with the reloaded model on the rows selected by `test_index`, for
# comparison against the in-memory model's predictions on the same rows.
reloaded_rows = train.iloc[test_index]
model_2.predict(
    [reloaded_rows[col].cat.codes for col in cat_cols] + [reloaded_rows[col] for col in zero_one_signals]
)

array([[0.31607026],
       [0.31892636],
       [0.4487195 ],
       [0.32271594],
       [0.29100475],
       [0.33080772]], dtype=float32)