## Full Modeling Pipeline - RNN

Model from https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/56262

Pipeline includes:
- Data loading
- Data processing
- Feature engineering
- Model training
- Prediction on test data

In [1]:
import os
import gc
import time
import psutil
import pickle
import warnings
import numpy as np
import pandas as pd
import lightgbm as lgb
import keras.backend as K
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam
from keras.layers import (Dense, Input, GRU, Bidirectional, Embedding,
                          Dropout, BatchNormalization, Lambda, PReLU,
                          GaussianDropout, Reshape, concatenate)
from keras.models import Model
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint, LearningRateScheduler
from keras.optimizers import Adam

warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [2]:
from contextlib import contextmanager

@contextmanager
def timer(name):
    """Context manager that reports how long the enclosed block took.

    Credit: lopuhin (https://www.kaggle.com/lopuhin/).

    ``name`` is the label placed in the printed summary line. The line is
    printed only when the block completes without raising; an exception
    propagates to the caller and nothing is printed.
    """
    began = time.time()
    yield
    elapsed = time.time() - began
    print('[{} done in {:.3f} s.]'.format(name, elapsed))
    
def reduce_memory_usage(df):
    """Downcast the columns of ``df`` in place to smaller dtypes.

    Based on a helper by gemartin (https://www.kaggle.com/gemartin).

    Rules applied per column:

    * ``object`` columns become ``category``.
    * Signed and unsigned integer columns become the smallest signed integer
      type (int8/int16/int32/int64) whose inclusive range holds the column's
      min and max. Integer columns are never converted to floats, so integer
      ids keep their exact values.
    * Float columns become float16 if min/max lie strictly inside the
      float16 range, else float32 under the same rule, else float64.
      NOTE(review): float16 keeps roughly 3 significant decimal digits, so
      large values are rounded - confirm that is acceptable for the features.
    * Anything else (bool, datetime, categorical, nullable extension dtypes)
      and columns whose min/max is NaN (empty or all-missing) are left as is.

    Args:
        df: DataFrame to shrink; it is modified in place.

    Returns:
        Tuple ``(df, dtypes)`` where ``dtypes`` maps each column name to the
        dtype it has after the call.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    dtypes = {}
    int_candidates = (np.int8, np.int16, np.int32, np.int64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            dtypes[col] = 'category'
            continue

        # Only plain numpy integer / unsigned / float columns are downcast.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            dtypes[col] = col_type
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if pd.isna(c_min) or pd.isna(c_max):
            # Empty or all-NaN column: there is no value range to fit.
            dtypes[col] = col_type
            continue

        if col_type.kind in 'iu':
            # Compare as Python ints so unsigned scalars cannot wrap or
            # overflow when tested against negative bounds.
            lo, hi = int(c_min), int(c_max)
            new_type = next(
                (t for t in int_candidates
                 if np.iinfo(t).min <= lo and hi <= np.iinfo(t).max),
                None)
            if new_type is None:
                # e.g. uint64 values above the int64 maximum.
                dtypes[col] = col_type
                continue
        elif c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
            new_type = np.float16
        elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
            new_type = np.float32
        else:
            new_type = np.float64

        df[col] = df[col].astype(new_type)
        dtypes[col] = new_type

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.2f}%'.format(saved_pct))
    return df, dtypes

In [3]:
def extract_datetime_components(df, cols, drop_raw=False):
    """Parse timestamp columns and add day-of-month and hour features.

    For every column ``c`` named in ``cols`` the column is converted with
    ``pd.to_datetime`` (overwriting the original values) and two int8
    columns, ``c_day`` and ``c_hour``, are added. ``df`` is modified in
    place and also returned.

    Args:
        df: DataFrame holding the timestamp columns.
        cols: A column name or a list of column names.
        drop_raw: When true, the parsed source columns are dropped afterwards.

    Returns:
        The same ``df`` object.
    """
    names = [cols] if isinstance(cols, str) else cols

    for name in names:
        parsed = pd.to_datetime(df[name])
        df[name] = parsed
        df[name + '_day'] = parsed.dt.day.astype(np.int8)
        df[name + '_hour'] = parsed.dt.hour.astype(np.int8)

    if drop_raw:
        df.drop(columns=names, inplace=True)

    return df

def add_group_stats(df, cols, value, method):
    """Attach group-level statistics of ``value`` to every row of ``df``.

    ``df`` is grouped by ``cols``, ``value`` is aggregated with each entry of
    ``method`` and the result is left-merged back onto ``df``. New columns are
    named ``<cols joined by _>_<value>_<method>``, except for ``count`` which
    is named ``<cols joined by _>_count``.

    Integer-valued statistics (e.g. count, nunique) are downcast to the
    smallest signed integer dtype that holds them. Float statistics (mean,
    std, ...) are left as floats: casting them to an integer type would
    truncate the values and would fail on NaN.

    Args:
        df: Source DataFrame.
        cols: Group-by column name or list of names.
        value: Name of the column to aggregate (must be a string).
        method: Aggregation name or list of names; see ``method_options``.

    Returns:
        Tuple ``(df, new_cols)``: the merged DataFrame (a new object) and the
        list of added column names.

    Raises:
        NotImplementedError: if ``value`` is not a string.
        AttributeError: if any requested method is not supported.
    """
    if not isinstance(value, str):
        raise NotImplementedError('Only support value to be string format (column name)')

    methods = [method] if isinstance(method, str) else list(method)

    method_options = ['nunique', 'count', 'mean', 'median', 'std',
                      'var', 'max', 'min', 'sum', 'skew', 'kurtosis']
    if any(m not in method_options for m in methods):
        raise AttributeError('Only support method in {}.'.format(method_options))

    cols = [cols] if isinstance(cols, str) else list(cols)

    new_cols = ['_'.join([*cols, m]) if m == 'count' else '_'.join([*cols, value, m])
                for m in methods]

    df_feats = df.groupby(cols)[value].agg(methods).reset_index()
    df_feats.columns = cols + new_cols
    df = df.merge(df_feats, on=cols, how='left')

    # Save memory, but only where the downcast is lossless.
    for col in new_cols:
        if pd.api.types.is_integer_dtype(df[col].dtype):
            df[col] = pd.to_numeric(df[col], downcast='integer')

    return df, new_cols

def process_data(df, groupby_extraction_setting, groupby_extraction_for_timedetla):
    """Run the feature-engineering steps on a raw click DataFrame.

    Steps, in order:

    1. Parse ``click_time`` and add ``click_time_day`` / ``click_time_hour``.
    2. For every ``(group_cols, [(value, method), ...])`` entry of
       ``groupby_extraction_setting`` add the group statistic as a column;
       missing values in a new column are replaced by that column's mean.
    3. For every key list in ``groupby_extraction_for_timedetla`` add
       ``<keys>_click_time_prev1_diff`` and ``<keys>_click_time_prev2_diff``:
       the elapsed seconds between a click and the previous / second-previous
       click of the same group. Rows without such a predecessor receive the
       column maximum.
    4. Drop ``click_time`` and downcast dtypes with ``reduce_memory_usage``.

    Args:
        df: DataFrame with a ``click_time`` column plus every column named in
            the two settings. It may be modified in place.
        groupby_extraction_setting: list of ``(group_cols, [(value, method)])``.
        groupby_extraction_for_timedetla: list of group-key lists.

    Returns:
        Tuple ``(df, dtypes)`` as returned by ``reduce_memory_usage``.
    """
    # datetime
    df = extract_datetime_components(df, cols=['click_time'])

    # group by on categorical
    for setting in groupby_extraction_setting:
        cols = setting[0]
        for value, method in setting[1]:
            df, new_cols = add_group_stats(df, cols, value, method)
            # Assign the filled column back: fillna(inplace=True) on the
            # df[new_cols] selection only changes a temporary copy.
            has_missing = df[new_cols].isna().any()
            for col in has_missing.index[has_missing]:
                df[col] = df[col].fillna(df[col].mean())

    # timedelta
    for groupby_cols in groupby_extraction_for_timedetla:
        click_time = df[groupby_cols + ['click_time']].sort_values('click_time')
        prefix = '_'.join(groupby_cols)
        previous_clicks = click_time.groupby(groupby_cols)['click_time']
        for lag in (1, 2):
            diff_col = f'{prefix}_click_time_prev{lag}_diff'
            # total_seconds() gives the full elapsed time; .dt.seconds would
            # return only the seconds component and drop whole days.
            elapsed = (click_time['click_time'] - previous_clicks.shift(lag)).dt.total_seconds()
            # `elapsed` is in time order; assignment realigns it on df's index.
            df[diff_col] = elapsed
            df[diff_col] = df[diff_col].fillna(df[diff_col].max())

    df.drop(columns=['click_time'], inplace=True)
    df, dtypes = reduce_memory_usage(df)
    return df, dtypes

In [4]:
class RocAucEvaluation(Callback):
    """Keras callback that prints validation ROC-AUC at the end of epochs.

    Args:
        validation_data: ``(X_val, y_val)`` pair; ``X_val`` is passed to
            ``model.predict`` unchanged and ``y_val`` to ``roc_auc_score``.
            The empty-tuple default cannot be unpacked, so a pair must be
            supplied.
        interval: evaluate when the 0-based epoch index is a multiple of
            this value.
    """

    def __init__(self, validation_data=(), interval=1):
        # super(Callback, self) would skip Callback.__init__ and call the
        # grandparent's initialiser instead.
        super().__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Compute and print ROC-AUC on the held-out data."""
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch + 1, score))
            
def build_model(X_train_numerical, X_valid_numerical,
                X_train_categorical, X_valid_categorical,
                y_train, y_valid):
    """Build, compile and fit the embedding + GRU binary classifier.

    Each categorical column gets its own Embedding. The embeddings are
    concatenated with the numerical features into a sequence of length 1,
    which goes through a GRU and two Dense/PReLU/BatchNorm/Dropout stages
    to a sigmoid output.

    NOTE: reads the module-level lists ``categorical_features`` and
    ``numerical_features``; they must be defined before this is called and
    must match the column order of the arrays passed in.

    Args:
        X_train_numerical, X_valid_numerical: numerical feature arrays, one
            column per entry of ``numerical_features``.
        X_train_categorical, X_valid_categorical: integer-coded categorical
            arrays, one column per entry of ``categorical_features``.
        y_train, y_valid: binary targets.

    Returns:
        The fitted Keras ``Model`` (weights as of the last epoch).
    """
    embedding_size = [16, 16, 16, 8, 8]
    # (Embedding input_dim, Embedding output_dim) per categorical column.
    # NOTE(review): the input_dim values are hard-coded - presumably
    # (largest id + 1) for this dataset; confirm against the data.
    categorical_num = {
        'app'            : (769,  embedding_size[0]),
        'device'         : (4228, embedding_size[1]),
        'os'             : (957,  embedding_size[2]),
        'channel'        : (501,  embedding_size[3]),
        'click_time_hour': (24,   embedding_size[4]),
    }
    categorial_inp = Input(shape=(len(categorical_features), ))
    cat_embeds = []
    for idx, col in enumerate(categorical_features):
        # Bind idx as a default argument: a plain closure looks idx up when
        # the lambda runs, so any later re-execution of the layer would
        # slice the last column for every embedding.
        x = Lambda(lambda t, i=idx: t[:, i, None])(categorial_inp)
        x = Embedding(categorical_num[col][0], categorical_num[col][1], input_length=1)(x)
        cat_embeds.append(x)
    embeds = concatenate(cat_embeds, axis=2)
    embeds = GaussianDropout(0.2)(embeds)

    numerical_inp = Input(shape=(len(numerical_features),))
    # Add a length-1 time axis so the numerical block can be concatenated
    # with the embeddings along the feature axis.
    cx = Reshape([1, len(numerical_features)])(numerical_inp)

    x = concatenate([embeds, cx], axis=2)
    x = GRU(128)(x)
    x = BatchNormalization()(x)
    x = Dropout(0.20)(x)
    x = Dense(64)(x)
    x = PReLU()(x)
    x = BatchNormalization()(x)
    x = Dropout(0.20)(x)
    x = Dense(32)(x)
    x = PReLU()(x)
    x = BatchNormalization()(x)
    x = Dropout(0.05)(x)

    outp = Dense(1, activation='sigmoid')(x)

    # `outputs` is the documented keyword; `output` is a legacy spelling.
    model = Model(inputs=[categorial_inp, numerical_inp], outputs=outp)
    model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.01), metrics=['accuracy'])

    X_train = [X_train_categorical, X_train_numerical]
    X_valid = [X_valid_categorical, X_valid_numerical]
    roc_auc_eval = RocAucEvaluation(validation_data=(X_valid, y_valid), interval=1)

    model.fit(X_train, y_train, batch_size=10000, epochs=10,
              validation_data=(X_valid, y_valid), verbose=2, callbacks=[roc_auc_eval])
    return model

Visualize structure of the model

    
<img src="network_structure.jpg">

In [5]:
with timer('Load full training data'):
    data_path = '../input/talkingdata-adtracking-fraud-detection/'

    # Row offsets into train.csv, counted in data rows (header excluded).
    # NOTE(review): judging by the names these mark where each day / hour
    # starts and ends - confirm against the click_time column.
    nov_7_start = 9308568
    nov_7_end   = 68941877

    nov_8_start = 68941878
    nov_8_end   = 131886952

    nov_9_start = 131886953
    nov_9_4     = 144708152
    nov_9_15    = 181878211

    read_dtypes = {
        'ip':            'uint32',
        'app':           'uint16',
        'device':        'uint16',
        'os':            'uint16',
        'channel':       'uint16',
        'click_id':      'uint32'
    }

    train_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed']
    test_cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time']

    # Training rows: from offset nov_7_start up to (not including) nov_9_4.
    train = pd.read_csv(os.path.join(data_path, 'train.csv'),
                        dtype=read_dtypes,
                        skiprows=range(1, nov_7_start + 1),
                        nrows=nov_9_4 - nov_7_start,
                        usecols=train_cols)

    # Keep every positive and 5% of the negatives (saves memory). The
    # sampling is seeded like the other sample() calls so a re-run yields
    # the same training set.
    train_pos_samples = train.loc[train['is_attributed'] == 1]
    train_neg_samples = train.loc[train['is_attributed'] == 0].sample(frac=0.05, random_state=2020)
    del train
    train = pd.concat([train_pos_samples, train_neg_samples], ignore_index=True)
    train = train.sample(frac=1.0, replace=False, random_state=2020)
    # https://www.kaggle.com/c/talkingdata-adtracking-fraud-detection/discussion/56279
    train = train.loc[train['device'] != 3032]
    print(f'Training data size: {train.shape}')

    # Validation rows: from offset nov_9_4 up to (not including) nov_9_15,
    # i.e. directly after the training range with no overlap.
    valid = pd.read_csv(os.path.join(data_path, 'train.csv'),
                        dtype=read_dtypes,
                        skiprows=range(1, nov_9_4 + 1),
                        nrows=nov_9_15 - nov_9_4,
                        usecols=train_cols)
    valid = valid.loc[valid['device'] != 3032]
    valid = valid.sample(frac=0.05, replace=False, random_state=2020)
    print(f'Validation data size: {valid.shape}')

Training data size: (7061491, 7)
Validation data size: (1858503, 7)
[Load full training data done in 273.187 s.]


In [6]:
with timer('Process data'):
    # Each entry is (group-by keys, [(aggregated column, method), ...]).
    # The order of the entries fixes the order of the generated columns.
    groupby_extraction_setting = [
        # Per-IP statistics.
        (['ip'], [('os', 'count')] + [(column, 'nunique')
                                      for column in ('os', 'app', 'device', 'channel')]),

        # Row counts per single categorical value.
        *[([key], [('ip', 'count')])
          for key in ('os', 'app', 'device', 'channel', 'click_time_hour')],

        # Row counts per key combination, written as
        # ('space separated keys', counted column).
        # Combinations deliberately left out:
        #   ip/os/app, ip/os/channel, ip/app/channel, ip/app/click_time_hour,
        #   ip/os/app/channel, ip/os/device/channel, ip/app/device/channel,
        #   ip/app/device/click_time_hour
        *[(keys.split(), [(counted, 'count')]) for keys, counted in (
            # two keys
            ('ip os', 'app'),
            ('ip app', 'os'),
            ('ip device', 'os'),
            ('ip channel', 'os'),
            ('ip click_time_hour', 'os'),
            # three keys
            ('ip os device', 'app'),
            ('ip os click_time_hour', 'app'),
            ('ip app device', 'os'),
            ('ip device channel', 'os'),
            ('ip device click_time_hour', 'os'),
            ('ip channel click_time_hour', 'os'),
            # four keys
            ('ip os app device', 'channel'),
            ('ip os app click_time_hour', 'device'),
            ('ip os device click_time_hour', 'app'),
            ('ip os channel click_time_hour', 'app'),
            ('ip app channel click_time_hour', 'os'),
            ('ip device channel click_time_hour', 'os'),
        )],
    ]

    # Key sets for the "time since previous click" features.
    groupby_extraction_for_timedetla = [keys.split() for keys in (
        'ip device os',
        'ip app channel',
        'ip device os app',
        'ip device os channel',
    )]

    train, train_dtypes = process_data(
        train, groupby_extraction_setting, groupby_extraction_for_timedetla)
    valid, valid_dtypes = process_data(
        valid, groupby_extraction_setting, groupby_extraction_for_timedetla)
    gc.collect()

Memory usage of dataframe is 1070.76 MB
Memory usage after optimization is: 808.12 MB
Decreased by 24.53%
Memory usage of dataframe is 269.41 MB
Memory usage after optimization is: 171.92 MB
Decreased by 36.18%
[Process data done in 272.922 s.]


In [7]:
with timer('Get feature names'):
    target = 'is_attributed'
    categorical_features = ['app', 'device', 'os', 'channel', 'click_time_hour']

    # Every remaining column except the raw ip, the label and the day
    # component is treated as a numerical model input.
    columns = list(train.columns)
    for excluded in ('ip', 'is_attributed', 'click_time_day'):
        columns.remove(excluded)

    numerical_features = [name for name in columns if name not in categorical_features]
    features = categorical_features + numerical_features

    gc.collect()

[Get feature names done in 0.110 s.]


In [8]:
with timer('Run RNN'):
    # Standardise the numerical inputs; the statistics come from the
    # training rows only and are reused for the validation rows.
    scaler = StandardScaler().fit(train[numerical_features].values)
    model = build_model(
        X_train_numerical=scaler.transform(train[numerical_features].values),
        X_valid_numerical=scaler.transform(valid[numerical_features].values),
        X_train_categorical=train[categorical_features].values,
        X_valid_categorical=valid[categorical_features].values,
        y_train=train[target],
        y_valid=valid[target],
    )

Train on 7061491 samples, validate on 1858503 samples
Epoch 1/10
 - 132s - loss: 0.0664 - accuracy: 0.9810 - val_loss: 0.0298 - val_accuracy: 0.9927

 ROC-AUC - epoch: 1 - score: 0.957216
Epoch 2/10
 - 126s - loss: 0.0502 - accuracy: 0.9854 - val_loss: 0.0356 - val_accuracy: 0.9883

 ROC-AUC - epoch: 2 - score: 0.958921
Epoch 3/10
 - 128s - loss: 0.0496 - accuracy: 0.9856 - val_loss: 0.0480 - val_accuracy: 0.9835

 ROC-AUC - epoch: 3 - score: 0.956777
Epoch 4/10
 - 128s - loss: 0.0491 - accuracy: 0.9857 - val_loss: 0.0346 - val_accuracy: 0.9885

 ROC-AUC - epoch: 4 - score: 0.958427
Epoch 5/10
 - 126s - loss: 0.0489 - accuracy: 0.9858 - val_loss: 0.0575 - val_accuracy: 0.9820

 ROC-AUC - epoch: 5 - score: 0.952561
Epoch 6/10
 - 125s - loss: 0.0487 - accuracy: 0.9858 - val_loss: 0.0485 - val_accuracy: 0.9825

 ROC-AUC - epoch: 6 - score: 0.954564
Epoch 7/10
 - 134s - loss: 0.0485 - accuracy: 0.9859 - val_loss: 0.0375 - val_accuracy: 0.9879

 ROC-AUC - epoch: 7 - score: 0.954973
Epoch 8/

In [9]:
with timer('Make predictions'):
    # read test data
    test = pd.read_csv(os.path.join(data_path, 'test.csv'),
                       dtype=read_dtypes,
                       usecols=test_cols)
    print(f'Test data size: {test.shape}')

    test, test_dtypes = process_data(test, groupby_extraction_setting, groupby_extraction_for_timedetla)
    gc.collect()

    submission = pd.read_csv(os.path.join(data_path, 'sample_submission.csv'))
    # The network was trained on standardised numerical inputs, so the test
    # features must go through the same fitted scaler before predicting.
    test_pred = model.predict(
        [test[categorical_features].values,
         scaler.transform(test[numerical_features].values)],
        batch_size=10000)  # same batch size as training; the default is far smaller
    # predict() returns shape (n, 1); flatten it for the single column.
    # NOTE(review): assumes sample_submission.csv rows are in the same order
    # as test.csv - confirm via click_id.
    submission['is_attributed'] = test_pred.ravel()
    submission.to_csv('submission.csv', index=False)

Test data size: (18790469, 6)
Memory usage of dataframe is 2903.04 MB
Memory usage after optimization is: 2042.88 MB
Decreased by 29.63%
[Make predictions done in 1473.902 s.]
