In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import sys
import pathlib
from pathlib import Path

import tensorflow as tf
import tensorflow_addons as tfa

import operator
import seaborn as sns

import sklearn
from sklearn import model_selection
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args

from tqdm import tqdm
import random
from random import choices

from collections import Counter, defaultdict


import warnings
warnings.filterwarnings("ignore")
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# tf setup
print("Tensorflow version " + tf.__version__)
AUTO = tf.data.experimental.AUTOTUNE

MIXED_PRECISION = False
XLA_ACCELERATE = True

if MIXED_PRECISION:
    from tensorflow.keras.mixed_precision import experimental as mixed_precision
    if tpu: policy = tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16')
    else: policy = tf.keras.mixed_precision.experimental.Policy('mixed_float16')
    mixed_precision.set_policy(policy)
    print('Mixed precision enabled')

if XLA_ACCELERATE:
    tf.config.optimizer.set_jit(True)
    print('Accelerated Linear Algebra enabled')

Tensorflow version 2.3.1
Accelerated Linear Algebra enabled


# Config

In [3]:
SEED = 2021
START_DATE = 86
FOLDS = 5
DEBUG = False

# CV Strategy

## PurgedGroupTimeSeriesSplit
Click the code button to see. 

In [4]:
# modified code for group gaps; source
# https://github.com/getgaurav2/scikit-learn/blob/d4a3af5cc9da3a76f0266932644b884c99724c57/sklearn/model_selection/_split.py#L2243
class PurgedGroupTimeSeriesSplit(_BaseKFold):
    """Time Series cross-validator variant with non-overlapping groups.
    Allows for a gap in groups to avoid potentially leaking info from
    train into test if the model has windowed or lag features.
    Provides train/test indices to split time series data samples
    that are observed at fixed time intervals according to a
    third-party provided group.
    In each split, test indices must be higher than before, and thus shuffling
    in cross validator is inappropriate.
    This cross-validation object is a variation of :class:`KFold`.
    In the kth split, it returns first k folds as train set and the
    (k+1)th fold as test set.
    The same group will not appear in two different folds (the number of
    distinct groups has to be at least equal to the number of folds).
    Note that unlike standard cross-validation methods, successive
    training sets are supersets of those that come before them.
    Read more in the :ref:`User Guide <cross_validation>`.
    Parameters
    ----------
    n_splits : int, default=5
        Number of splits. Must be at least 2.
    max_train_group_size : int, default=Inf
        Maximum group size for a single training set.
    group_gap : int, default=None
        Gap between train and test
    max_test_group_size : int, default=Inf
        We discard this number of groups from the end of each train split
    """

    @_deprecate_positional_args
    def __init__(self,
                 n_splits=5,
                 *,
                 max_train_group_size=np.inf,
                 max_test_group_size=np.inf,
                 group_gap=None,
                 verbose=False
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_group_size = max_train_group_size
        self.group_gap = group_gap
        self.max_test_group_size = max_test_group_size
        self.verbose = verbose

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/test set.
        Yields
        ------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        group_gap = self.group_gap
        max_test_group_size = self.max_test_group_size
        max_train_group_size = self.max_train_group_size
        n_folds = n_splits + 1
        group_dict = {}
        u, ind = np.unique(groups, return_index=True)
        unique_groups = u[np.argsort(ind)]
        n_samples = _num_samples(X)
        n_groups = _num_samples(unique_groups)
        for idx in np.arange(n_samples):
            if (groups[idx] in group_dict):
                group_dict[groups[idx]].append(idx)
            else:
                group_dict[groups[idx]] = [idx]
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))

        group_test_size = min(n_groups // n_folds, max_test_group_size)
        group_test_starts = range(n_groups - n_splits * group_test_size,
                                  n_groups, group_test_size)
        for group_test_start in group_test_starts:
            train_array = []
            test_array = []

            group_st = max(0, group_test_start - group_gap - max_train_group_size)
            for train_group_idx in unique_groups[group_st:(group_test_start - group_gap)]:
                train_array_tmp = group_dict[train_group_idx]
                
                train_array = np.sort(np.unique(
                                      np.concatenate((train_array,
                                                      train_array_tmp)),
                                      axis=None), axis=None)

            train_end = train_array.size
 
            for test_group_idx in unique_groups[group_test_start:
                                                group_test_start +
                                                group_test_size]:
                test_array_tmp = group_dict[test_group_idx]
                test_array = np.sort(np.unique(
                                              np.concatenate((test_array,
                                                              test_array_tmp)),
                                     axis=None), axis=None)

            test_array  = test_array[group_gap:]
            
            
            if self.verbose > 0:
                    pass
                    
            yield [int(i) for i in train_array], [int(i) for i in test_array]

## GroupKFold, StratifiedGroupKFold

In [5]:
# ---- GroupKFold ----
class GroupKFold(object):
    """
    GroupKFold with random shuffle with a sklearn-like structure
    """

    def __init__(self, n_splits=4, shuffle=True, random_state=42):
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state

    def get_n_splits(self, X=None, y=None, group=None):
        return self.n_splits

    def split(self, X, y, group):
        kf = model_selection.KFold(n_splits=self.n_splits, shuffle=self.shuffle, random_state=self.random_state)
        unique_ids = X[group].unique()
        for fold, (tr_group_idx, va_group_idx) in enumerate(kf.split(unique_ids)):
            # split group
            tr_group, va_group = unique_ids[tr_group_idx], unique_ids[va_group_idx]
            train_idx = np.where(X[group].isin(tr_group))[0]
            val_idx = np.where(X[group].isin(va_group))[0]
            yield train_idx, val_idx

# ---- StratifiedGroupKFold ----
class StratifiedGroupKFold(object):
    """
    StratifiedGroupKFold with random shuffle with a sklearn-like structure
    """

    def __init__(self, n_splits=4, shuffle=True, random_state=42):
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state

    def get_n_splits(self, X=None, y=None, group=None):
        return self.n_splits

    def split(self, X, y, group):
        labels_num = np.max(y) + 1
        y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
        y_distr = Counter()
        groups = X[group].values
        for label, g in zip(y, groups):
            y_counts_per_group[g][label] += 1
            y_distr[label] += 1

        y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
        groups_per_fold = defaultdict(set)

        def eval_y_counts_per_fold(y_counts, fold):
            y_counts_per_fold[fold] += y_counts
            std_per_label = []
            for label in range(labels_num):
                label_std = np.std([y_counts_per_fold[i][label] / y_distr[label] for i in range(self.n_splits)])
                std_per_label.append(label_std)
            y_counts_per_fold[fold] -= y_counts
            return np.mean(std_per_label)
        
        groups_and_y_counts = list(y_counts_per_group.items())
        random.Random(self.random_state).shuffle(groups_and_y_counts)

        for g, y_counts in sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])):
            best_fold = None
            min_eval = None
            for i in range(self.n_splits):
                fold_eval = eval_y_counts_per_fold(y_counts, i)
                if min_eval is None or fold_eval < min_eval:
                    min_eval = fold_eval
                    best_fold = i
            y_counts_per_fold[best_fold] += y_counts
            groups_per_fold[best_fold].add(g)

        all_groups = set(groups)
        for i in range(self.n_splits):
            train_groups = all_groups - groups_per_fold[i]
            test_groups = groups_per_fold[i]

            train_idx = [i for i, g in enumerate(groups) if g in train_groups]
            test_idx = [i for i, g in enumerate(groups) if g in test_groups]

            yield train_idx, test_idx

### Loading the training data

In [6]:
train = pd.read_csv('../input/jane-street-market-prediction/train.csv') 

In [7]:
def reduce_memory_usage(df):
    
    start_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe is {start_memory} MB")
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != 'object':
            c_min = df[col].min()
            c_max = df[col].max()
            
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            
            else:
                """if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)"""
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    pass
        else:
            df[col] = df[col].astype('category')
    
    end_memory = df.memory_usage().sum() / 1024**2
    print(f"Memory usage of dataframe after reduction {end_memory} MB")
    print(f"Reduced by {100 * (start_memory - end_memory) / start_memory} % ")
    return df

In [8]:
train = reduce_memory_usage(train)

Memory usage of dataframe is 2516.843978881836 MB
Memory usage of dataframe after reduction 1247.0233011245728 MB
Reduced by 50.45289610369131 % 


In [9]:
#train = pd.read_feather('../input/janestreet-save-as-feather/train.feather') # faster data load
train = train.query(f'date >= {START_DATE}').reset_index(drop = True) 
train = train.astype({c: np.float32 for c in train.select_dtypes(include='float64').columns}) #limit memory use
train.fillna(train.mean(),inplace=True)
train = train.query('weight > 0').reset_index(drop = True)
# train = train.query('weight != 0').reset_index(drop = True)
#train['action'] = (train['resp'] > 0).astype('int')
train['action'] =  (  (train['resp_1'] > 0 ) & (train['resp_2'] > 0 ) & (train['resp_3'] > 0 ) & (train['resp_4'] > 0 ) &  (train['resp'] > 0 )   ).astype('int')
features = [c for c in train.columns if 'feature' in c]

resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4']

In [10]:
if DEBUG:
    train = train.sample(10000, random_state=SEED)

### Creating Autoencoder

In [11]:
def create_autoencoder(input_dim,output_dim,noise=0.05):
    i = tf.keras.layers.Input(input_dim)
    encoded = tf.keras.layers.BatchNormalization()(i)
    encoded = tf.keras.layers.GaussianNoise(noise)(encoded)
    encoded = tf.keras.layers.Dense(64,activation='relu')(encoded)
    decoded = tf.keras.layers.Dropout(0.2)(encoded)
    decoded = tf.keras.layers.Dense(input_dim,name='decoded')(decoded)
    x = tf.keras.layers.Dense(32,activation='relu')(decoded)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Dropout(0.2)(x)
    x = tf.keras.layers.Dense(output_dim,activation='sigmoid',name='label_output')(x)
    
    encoder = tf.keras.models.Model(inputs=i,outputs=encoded)
    autoencoder = tf.keras.models.Model(inputs=i,outputs=[decoded,x])
    
    autoencoder.compile(optimizer=tf.keras.optimizers.Adam(0.001), 
                        loss={'decoded':'mse','label_output':'binary_crossentropy'})
    return autoencoder, encoder

### PCA

In [12]:
p = features
p.append('resp')
len(p)

131

In [13]:
x = train[p].corr()
del p

In [14]:
x = x.abs()
upper = x.where(np.triu(np.ones(x.shape), k=1).astype(np.bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
print(to_drop)
del x, upper

['feature_25', 'feature_35', 'feature_48', 'feature_61', 'feature_63', 'feature_66', 'feature_68', 'feature_101', 'feature_107', 'feature_108', 'feature_113', 'feature_114', 'feature_119', 'feature_122', 'feature_126', 'feature_127', 'feature_128', 'feature_129']


In [15]:
train.drop(to_drop, 1, inplace=True)
del to_drop

In [16]:
features = [c for c in train.columns if 'feature' in c]

X = train[features].values
y = train['action']

f_mean = np.mean(train[features[1:]].values,axis=0)

# Next, we hold out part of the training data to form the hold-out validation set
train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size=0.2)
del valid_x, valid_y

## Make a predictor with XGBoost using treelite

In [17]:
import treelite
import treelite_runtime
import xgboost as xgb

In [18]:
# We create the XGboost-specific DMatrix data format from the numpy array. 
# This data structure is optimised for memory efficiency and training speed
dtrain = xgb.DMatrix(train_x, label=train_y)

In [19]:
params = {
        'n_estimators': 435,
        'max_depth': 24,
        'learning_rate': 0.09905592273886195,
        'subsample': 0.8704369112806065,
        'colsample_bytree': 0.9932309296458037,
        'objective': 'binary:logistic',
        'gamma': 7,
        "eval_metric" : 'logloss',
        'seed': 2021,
        'tree_method': 'gpu_hist'
        }
bst = xgb.train(params, dtrain, 100, [(dtrain, 'train')])

[0]	train-logloss:0.655629
[1]	train-logloss:0.623005
[2]	train-logloss:0.594485
[3]	train-logloss:0.567377
[4]	train-logloss:0.543478
[5]	train-logloss:0.521352
[6]	train-logloss:0.500838
[7]	train-logloss:0.482105
[8]	train-logloss:0.464513
[9]	train-logloss:0.448376
[10]	train-logloss:0.433093
[11]	train-logloss:0.418456
[12]	train-logloss:0.405401
[13]	train-logloss:0.391796
[14]	train-logloss:0.378452
[15]	train-logloss:0.366547
[16]	train-logloss:0.356094
[17]	train-logloss:0.345561
[18]	train-logloss:0.336285
[19]	train-logloss:0.327395
[20]	train-logloss:0.319892
[21]	train-logloss:0.312036
[22]	train-logloss:0.304651
[23]	train-logloss:0.298485
[24]	train-logloss:0.291532
[25]	train-logloss:0.285631
[26]	train-logloss:0.279113
[27]	train-logloss:0.274815
[28]	train-logloss:0.268346
[29]	train-logloss:0.263694
[30]	train-logloss:0.259645
[31]	train-logloss:0.254514
[32]	train-logloss:0.24937
[33]	train-logloss:0.245505
[34]	train-logloss:0.241948
[35]	train-logloss:0.237897
[36

In [20]:
# pass to treelite
model1 = treelite.Model.from_xgboost(bst)

In [21]:
# generate shared library
toolchain = 'gcc'
model1.export_lib(toolchain=toolchain, libpath='./mymodel.so',compiler='failsafe',
                     params={'parallel_comp': 32}, verbose=True)

[04:29:00] ../src/compiler/failsafe.cc:245: Using FailSafeCompiler
[04:29:08] ../src/c_api/c_api.cc:286: Code generation finished. Writing code to files...
[04:29:08] ../src/c_api/c_api.cc:291: Writing file recipe.json...
[04:29:08] ../src/c_api/c_api.cc:291: Writing file header.h...
[04:29:08] ../src/c_api/c_api.cc:291: Writing file main.c...
[04:29:08] ../src/c_api/c_api.cc:291: Writing file arrays.c...

[04:29:09] /opt/conda/lib/python3.7/site-packages/treelite/contrib/util.py:104: Compiling sources files in directory ./tmp4fhicd97 into object files (*.o)...
[04:30:29] /opt/conda/lib/python3.7/site-packages/treelite/contrib/util.py:133: Generating dynamic shared library ./tmp4fhicd97/predictor.so...
[04:30:30] /opt/conda/lib/python3.7/site-packages/treelite/contrib/__init__.py:278: Generated shared library in 81.53 seconds


In [22]:
# predictor from treelite
predictor = treelite_runtime.Predictor('./mymodel.so', verbose=True)

[04:30:30] ../src/predictor/predictor.cc:262: Dynamic shared library `/kaggle/working/mymodel.so' does not contain valid get_pred_transform() function
[04:30:30] ../src/predictor/predictor.cc:276: Dynamic shared library `/kaggle/working/mymodel.so' does not contain valid get_sigmoid_alpha() function
[04:30:30] ../src/predictor/predictor.cc:288: Dynamic shared library `/kaggle/working/mymodel.so' does not contain valid get_global_bias() function
[04:30:30] /opt/conda/lib/python3.7/site-packages/treelite_runtime/predictor.py:311: Dynamic shared library /kaggle/working/mymodel.so has been successfully loaded into memory


In [23]:
del y, train_x, train_y, dtrain

### Fit Autoencoder

In [24]:
y = np.stack([(train[c] > 0).astype('int') for c in resp_cols]).T #Multitarget
f_mean = np.mean(train[features[1:]].values,axis=0)

In [25]:
autoencoder, encoder = create_autoencoder(X.shape[-1],y.shape[-1],noise=0.1)
autoencoder.fit(X,(X,y),
                epochs=1000,
                batch_size=2048, 
                validation_split=0.1,
                callbacks=[tf.keras.callbacks.EarlyStopping('val_loss',patience=10,restore_best_weights=True)])
encoder.save_weights('./encoder.hdf5')
encoder.trainable = False

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000


# NN

In [26]:
def create_1dcnn(input_dim, output_dim, encoder):
    # input
    inputs = tf.keras.layers.Input(input_dim)
    
    x = encoder(inputs)
    x = tf.keras.layers.Concatenate()([x,inputs]) #use both raw and encoded features
    
    # normalize
    x = tf.keras.layers.BatchNormalization()(x)
    
    # 1dcnn
    x = tf.keras.layers.Dense(4096, activation='relu')(x)
    x = tf.keras.layers.Reshape((256, 16))(x)
    x = tf.keras.layers.Conv1D(filters=16,
                      kernel_size=7,
                      strides=1,
                      activation='relu')(x)
    x = tf.keras.layers.MaxPooling1D(pool_size=2)(x)
    x = tf.keras.layers.Flatten()(x)
    
    # ffn
    for i in range(2):
        x = tf.keras.layers.Dense(256 // (2 ** i), activation='relu')(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.GaussianNoise(0.01)(x)
        x = tf.keras.layers.Dropout(0.2)(x)
    x = tf.keras.layers.Dense(output_dim, activation='sigmoid')(x)
    
    model = tf.keras.models.Model(inputs=inputs,outputs=x)
    
    # compile
    opt = tfa.optimizers.RectifiedAdam(learning_rate=1e-03)
    opt = tfa.optimizers.SWA(opt)
    loss = tf.keras.losses.BinaryCrossentropy(label_smoothing=1e-02)
    model.compile(optimizer=opt, 
                  loss=loss, 
                  metrics=[tf.keras.metrics.AUC(name = 'auc')])
    return model

In [27]:
def create_resnet(n_features, n_labels, encoder, label_smoothing = 0.0005):    
    input_1 = tf.keras.layers.Input(shape = (n_features,))
    input_2 = encoder(input_1)

    head_1 = tf.keras.Sequential([
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(512, activation="elu"), 
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.4),
        tf.keras.layers.Dense(256, activation = "elu")
        ],name='Head1') 

    input_3 = head_1(input_1)
    input_3_concat = tf.keras.layers.Concatenate()([input_2, input_3])

    head_2 = tf.keras.Sequential([
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(512, "relu"),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(512, "elu"),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(256, "relu"),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(256, "elu")
        ],name='Head2')

    input_4 = head_2(input_3_concat)
    input_4_avg = tf.keras.layers.Average()([input_3, input_4]) 

    head_3 = tf.keras.Sequential([
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(256, kernel_initializer='lecun_normal', activation='selu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(128, kernel_initializer='lecun_normal', activation='selu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Lambda(lambda x: tf.math.l2_normalize(x, axis=1), name='l2_norm'),
        tf.keras.layers.Dense(n_labels, activation="sigmoid")
        ],name='Head3')

    output = head_3(input_4_avg)

    model = tf.keras.models.Model(inputs = [input_1, ], outputs = output)
    opt = tfa.optimizers.RectifiedAdam(learning_rate=1e-03)
    opt = tfa.optimizers.SWA(opt)
    model.compile(optimizer=opt, 
                  loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing), 
                  metrics=['AUC'])
    
    return model

In [28]:
def create_mlp(
    num_columns, num_labels, hidden_units, dropout_rates, label_smoothing, learning_rate
):

    inp = tf.keras.layers.Input(shape=(num_columns,))
    x = tf.keras.layers.BatchNormalization()(inp)
    x = tf.keras.layers.Dropout(dropout_rates[0])(x)
    for i in range(len(hidden_units)):
        x = tf.keras.layers.Dense(hidden_units[i])(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Activation(tf.keras.activations.swish)(x)
        x = tf.keras.layers.Dropout(dropout_rates[i + 1])(x)
    
    x = tf.keras.layers.Dense(num_labels)(x)
    out = tf.keras.layers.Activation("sigmoid")(x)

    model = tf.keras.models.Model(inputs=inp, outputs=out)
    model.compile(
        optimizer=tfa.optimizers.RectifiedAdam(learning_rate=learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing),
        metrics=tf.keras.metrics.AUC(name="AUC"),
    )

    return model

In [29]:
CV_STRATEGY = 'StratifiedGroupKFold' # StratifiedGroupKFold, GroupKFold, PurgedGroupTimeSeriesSplit**
NN_NAME = 'mlp' # 1dcnn, resnet, mlp

# NN

### Creating 1dcnn, Resnet, MLP
Just put 1dcnn before fead-forward network

In [30]:
X_train = train.loc[:, train.columns.str.contains('feature')]

In [31]:
epochs = 200
batch_size = 4096
hidden_units = [160, 160, 160]
dropout_rates = [0.2, 0.2, 0.2, 0.2]
label_smoothing = 1e-2
learning_rate = 1e-3

tf.keras.backend.clear_session()
tf.random.set_seed(SEED)
clf = create_mlp(
    len(features), 5, hidden_units, dropout_rates, label_smoothing, learning_rate
    )

clf.fit(X_train, y, epochs=epochs, batch_size=batch_size, verbose=2)

# save model
clf.save(f'model.h5')

Epoch 1/200
384/384 - 3s - loss: 0.7172 - AUC: 0.5121
Epoch 2/200
384/384 - 5s - loss: 0.6946 - AUC: 0.5272
Epoch 3/200
384/384 - 3s - loss: 0.6914 - AUC: 0.5343
Epoch 4/200
384/384 - 3s - loss: 0.6904 - AUC: 0.5380
Epoch 5/200
384/384 - 3s - loss: 0.6900 - AUC: 0.5409
Epoch 6/200
384/384 - 3s - loss: 0.6897 - AUC: 0.5427
Epoch 7/200
384/384 - 4s - loss: 0.6894 - AUC: 0.5445
Epoch 8/200
384/384 - 3s - loss: 0.6892 - AUC: 0.5457
Epoch 9/200
384/384 - 3s - loss: 0.6890 - AUC: 0.5467
Epoch 10/200
384/384 - 3s - loss: 0.6889 - AUC: 0.5476
Epoch 11/200
384/384 - 3s - loss: 0.6887 - AUC: 0.5484
Epoch 12/200
384/384 - 3s - loss: 0.6886 - AUC: 0.5492
Epoch 13/200
384/384 - 3s - loss: 0.6884 - AUC: 0.5501
Epoch 14/200
384/384 - 3s - loss: 0.6883 - AUC: 0.5505
Epoch 15/200
384/384 - 3s - loss: 0.6881 - AUC: 0.5509
Epoch 16/200
384/384 - 4s - loss: 0.6880 - AUC: 0.5515
Epoch 17/200
384/384 - 3s - loss: 0.6879 - AUC: 0.5519
Epoch 18/200
384/384 - 4s - loss: 0.6878 - AUC: 0.5525
Epoch 19/200
384/38

In [32]:
del X_train

In [33]:
%%time


if CV_STRATEGY == 'PurgedGroupTimeSeriesSplit':
    gkf = PurgedGroupTimeSeriesSplit(n_splits=FOLDS, group_gap=20)
    splits = list(gkf.split(y, groups=train['date'].values))    
    
elif CV_STRATEGY == "GroupKFold":
    cv = GroupKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
    splits = cv.split(train, train['resp'].values.astype(int), 'date')

elif CV_STRATEGY ==  "StratifiedGroupKFold":
    cv = StratifiedGroupKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
    splits = cv.split(train, train['resp'].values.astype(int), 'date')

models = []
for fold, (train_indices, test_indices) in tqdm(enumerate(splits)):
    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]
    
    # model
    tf.keras.backend.clear_session()
    model = clf
    
    # callbacks
    er = tf.keras.callbacks.EarlyStopping(patience=8, restore_best_weights=True, monitor='val_loss')
    ReduceLR = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=8, verbose=1, mode='min')
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=f'./model_{SEED}_{fold}.hdf5', save_weights_only=True, verbose=0, monitor='val_loss', save_best_only=True)
    nn_callbacks = [er, ReduceLR, model_checkpoint_callback]
    
    # fit
    model.fit(X_train, y_train, validation_data=(X_test,y_test), 
              epochs=192, batch_size=2048, callbacks=nn_callbacks)
    models.append(model)

0it [00:00, ?it/s]

Epoch 1/192
Epoch 2/192
Epoch 3/192
Epoch 4/192
Epoch 5/192
Epoch 6/192
Epoch 7/192
Epoch 8/192
Epoch 9/192
Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.


1it [01:01, 61.05s/it]

Epoch 1/192
Epoch 2/192
Epoch 3/192
Epoch 4/192
Epoch 5/192
Epoch 6/192
Epoch 7/192
Epoch 8/192
Epoch 9/192
Epoch 00009: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 10/192


2it [01:59, 60.33s/it]

Epoch 1/192
Epoch 2/192
Epoch 3/192
Epoch 4/192
Epoch 5/192
Epoch 6/192
Epoch 7/192
Epoch 8/192
Epoch 9/192
Epoch 00009: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.


3it [02:53, 58.35s/it]

Epoch 1/192
Epoch 2/192
Epoch 3/192
Epoch 4/192
Epoch 5/192
Epoch 6/192
Epoch 7/192
Epoch 8/192
Epoch 9/192
Epoch 10/192
Epoch 11/192
Epoch 12/192
Epoch 13/192
Epoch 00013: ReduceLROnPlateau reducing learning rate to 1.0000001111620805e-07.


4it [04:08, 63.42s/it]

Epoch 1/192
Epoch 2/192
Epoch 3/192
Epoch 4/192
Epoch 5/192
Epoch 6/192
Epoch 7/192
Epoch 8/192
Epoch 9/192
Epoch 00009: ReduceLROnPlateau reducing learning rate to 1.000000082740371e-08.
Epoch 10/192
Epoch 11/192
Epoch 12/192
Epoch 13/192
Epoch 14/192
Epoch 15/192
Epoch 16/192
Epoch 17/192
Epoch 00017: ReduceLROnPlateau reducing learning rate to 1.000000082740371e-09.


5it [05:47, 69.44s/it]

CPU times: user 5min 18s, sys: 22.7 s, total: 5min 41s
Wall time: 5min 47s





In [34]:
%%time

if NN_NAME == '1dcnn':
    models = []

    for fold in range(FOLDS):
        # 1dcnn
        tf.keras.backend.clear_session()
        model = create_1dcnn(X.shape[-1], y.shape[-1], encoder)
        model.load_weights(pathlib.Path(f'/kaggle/working/model_{SEED}_{fold}.hdf5'))
        models.append(model)
        
    models = [models[-1]]

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 8.82 µs


In [35]:
%%time

if NN_NAME == 'resnet':
    models = []

    for fold in range(FOLDS):
        tf.keras.backend.clear_session()
        model = create_resnet(X.shape[-1], y.shape[-1], encoder)
        model.load_weights(pathlib.Path(f'/kaggle/working/model_{SEED}_{fold}.hdf5'))
        models.append(model)
        
    models = [models[-1]]

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 8.11 µs


In [36]:
%%time

if NN_NAME == 'mlp':
    model = tf.keras.models.load_model('./model.h5')
    models = [model]

CPU times: user 276 ms, sys: 72.1 ms, total: 348 ms
Wall time: 348 ms


In [37]:
f = np.median
th = 0.500

import janestreet
env = janestreet.make_env()
for (test_df, pred_df) in tqdm(env.iter_test()):
    if test_df['weight'].item() > 0:
        x_tt = test_df.loc[:, features].values
        
        # GBDT inference with treelite
        batch = treelite_runtime.Batch.from_npy2d(x_tt)
        xgb_pred = predictor.predict(batch)
    
        # NN inference
        if np.isnan(x_tt[:, 1:].sum()):
            x_tt[:, 1:] = np.nan_to_num(x_tt[:, 1:]) + np.isnan(x_tt[:, 1:]) * f_mean
        
        pred = np.mean([model(x_tt, training = False).numpy() for model in models],axis=0)
        pred = f(pred)
        
        # ensemble
        pred_df.action = np.where(0.9*pred + 0.1*xgb_pred >= th, 1, 0).astype(int)
    else:
        pred_df.action = 0
    env.predict(pred_df)

15219it [04:18, 58.93it/s]
