In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import sys
import pathlib

from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Config

In [2]:
# Earliest value of the `date` column kept for training: the loading cell
# filters the frame with `date >= START_DATE`, so earlier rows are dropped.
START_DATE = 86

### Loading the training data

In [3]:
# Read the pre-converted feather file (faster to load) and keep only the rows
# whose `date` is at or after START_DATE.
train = (
    pd.read_feather('../input/jane-street-save-as-feather/train.feather')
    .query(f'date >= {START_DATE}')
    .reset_index(drop=True)
)

# Downcast every float64 column to float32 to limit memory use.
wide_float_cols = train.select_dtypes(include='float64').columns
train = train.astype(dict.fromkeys(wide_float_cols, np.float32))

# Impute missing values with the per-column mean. The means are computed before
# the weight filter below, so zero-weight rows still contribute to them.
train = train.fillna(train.mean())
train = train.query('weight > 0').reset_index(drop=True)

# Binary target: 1 only when all five response columns are strictly positive.
resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4']
train['action'] = train[resp_cols].gt(0).all(axis=1).astype('int')

# Model inputs: every column whose name contains the substring 'feature'.
features = [name for name in train.columns if 'feature' in name]

### PCA (not performed: the cells below only hold disabled correlation-based feature pruning)

In [4]:
# Disabled cell: the code is wrapped in a string literal, so nothing here runs
# and the cell merely echoes the string.
# NOTE(review): if re-enabled, `p = features` aliases the same list, so
# `p.append('resp')` would also add 'resp' to `features`.
"""p = features
p.append('resp')
len(p)"""

"p = features\np.append('resp')\nlen(p)"

In [5]:
# Disabled cell (string literal, not executed): would compute the pairwise
# correlation matrix of the `train` columns listed in `p`, then delete `p`.
# `p` is only defined inside another disabled cell, so it does not exist at runtime.
"""x = train[p].corr()
del p"""

'x = train[p].corr()\ndel p'

In [6]:
# Disabled cell (string literal, not executed): would take absolute correlations,
# keep only the entries strictly above the diagonal, and collect every column
# that has a value greater than 0.95 in that upper triangle into `to_drop`.
# NOTE(review): `np.bool` is a deprecated alias that newer NumPy releases no
# longer provide; use the builtin `bool` if this is re-enabled - confirm
# against the installed NumPy version.
"""x = x.abs()
upper = x.where(np.triu(np.ones(x.shape), k=1).astype(np.bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
print(to_drop)
del x, upper"""

'x = x.abs()\nupper = x.where(np.triu(np.ones(x.shape), k=1).astype(np.bool))\nto_drop = [column for column in upper.columns if any(upper[column] > 0.95)]\nprint(to_drop)\ndel x, upper'

In [7]:
# Disabled cell (string literal, not executed): would drop the `to_drop` columns
# from `train` in place.
# NOTE(review): `features` is built before this cell and is not refreshed here,
# so a later `train[features]` would reference dropped columns if this were
# re-enabled. The positional axis argument `1` may also be rejected by newer
# pandas versions - confirm and prefer `columns=to_drop`.
"""train.drop(to_drop, 1, inplace=True)
del to_drop"""

'train.drop(to_drop, 1, inplace=True)\ndel to_drop'

In [8]:
import sklearn
from sklearn.model_selection import train_test_split

In [9]:
# Feature matrix (NumPy array) and binary target taken from the filtered frame.
X = train[features].values
y = train['action']

# Per-column mean of every feature except the first entry of `features`.
# It is not referenced by any other executed cell visible in this notebook.
f_mean = np.mean(train[features[1:]].values,axis=0)

# Randomly keep 80% of the rows for fitting. The split is seeded so that
# Restart & Run All selects the same training rows every time; without a
# random_state the fitted model changes from run to run.
train_x, valid_x, train_y, valid_y = train_test_split(
    X, y, test_size=0.2, random_state=2021
)
# The 20% hold-out part is discarded immediately and never used for evaluation.
del valid_x, valid_y

## Make a predictor with XGBoost using treelite

In [10]:
# Install the treelite 0.93 wheel from the attached input dataset (a local file path, not a package index).
!pip --quiet install ../input/treelite/treelite-0.93-py3-none-manylinux2010_x86_64.whl

In [11]:
# Install the matching treelite_runtime 0.93 wheel from the same input dataset;
# it provides the `treelite_runtime` module used to load the compiled model.
!pip --quiet install ../input/treelite/treelite_runtime-0.93-py3-none-manylinux2010_x86_64.whl

In [12]:
import treelite
import treelite_runtime
import xgboost as xgb

In [13]:
# Wrap the training split in XGBoost's own DMatrix container:
# `train_x` is the NumPy feature array and `train_y` the matching `action` labels.
# This is the data object handed to xgb.train in the next cell.
dtrain = xgb.DMatrix(train_x, label=train_y)

In [14]:
# Number of boosting rounds. With the native xgb.train API this argument, not
# an 'n_estimators' entry in the params dict, decides how many trees are built.
NUM_BOOST_ROUND = 100

# Booster parameters for xgb.train. The former 'n_estimators': 435 entry was
# removed: it is an argument of the scikit-learn wrapper, it did not set the
# number of rounds here, and it only made the configuration misleading.
params = {
        'max_depth': 24,
        'learning_rate': 0.09905592273886195,
        'subsample': 0.8704369112806065,
        'colsample_bytree': 0.9932309296458037,
        'objective': 'binary:logistic',
        'gamma': 7,
        "eval_metric" : 'logloss',
        'seed': 2021,
        'tree_method': 'gpu_hist'  # GPU histogram algorithm; needs a GPU-enabled session
        }

# The only watched set is the training data itself, so the logged logloss
# measures fit on the training rows, not generalisation.
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=NUM_BOOST_ROUND,
    evals=[(dtrain, 'train')],
)

[0]	train-logloss:0.655752
[1]	train-logloss:0.622782
[2]	train-logloss:0.593733
[3]	train-logloss:0.567687
[4]	train-logloss:0.543443
[5]	train-logloss:0.521045
[6]	train-logloss:0.501288
[7]	train-logloss:0.481838
[8]	train-logloss:0.463446
[9]	train-logloss:0.446941
[10]	train-logloss:0.43146
[11]	train-logloss:0.416752
[12]	train-logloss:0.403764
[13]	train-logloss:0.38978
[14]	train-logloss:0.377676
[15]	train-logloss:0.366772
[16]	train-logloss:0.355307
[17]	train-logloss:0.346144
[18]	train-logloss:0.337368
[19]	train-logloss:0.329091
[20]	train-logloss:0.320345
[21]	train-logloss:0.312596
[22]	train-logloss:0.30561
[23]	train-logloss:0.299382
[24]	train-logloss:0.292468
[25]	train-logloss:0.286765
[26]	train-logloss:0.282191
[27]	train-logloss:0.277062
[28]	train-logloss:0.272558
[29]	train-logloss:0.26749
[30]	train-logloss:0.261801
[31]	train-logloss:0.257442
[32]	train-logloss:0.253523
[33]	train-logloss:0.249871
[34]	train-logloss:0.24629
[35]	train-logloss:0.243225
[36]	tr

In [15]:
# Convert the trained XGBoost booster `bst` into a treelite model object,
# which the next cell compiles into a shared library.
model = treelite.Model.from_xgboost(bst)

In [16]:
# Compile the treelite model into a shared library at ./mymodel.so using gcc
# and treelite's 'failsafe' compiler (the recorded run took about 75 seconds).
# NOTE(review): the recorded log lists only main.c and arrays.c as generated
# sources, so the 'parallel_comp': 32 setting may not be applied by the
# failsafe compiler - confirm against the treelite documentation.
toolchain = 'gcc'
model.export_lib(toolchain=toolchain, libpath='./mymodel.so',compiler='failsafe',
                     params={'parallel_comp': 32}, verbose=True)

[08:29:02] ../src/compiler/failsafe.cc:245: Using FailSafeCompiler
[08:29:10] ../src/c_api/c_api.cc:286: Code generation finished. Writing code to files...
[08:29:10] ../src/c_api/c_api.cc:291: Writing file recipe.json...
[08:29:10] ../src/c_api/c_api.cc:291: Writing file header.h...
[08:29:10] ../src/c_api/c_api.cc:291: Writing file main.c...
[08:29:10] ../src/c_api/c_api.cc:291: Writing file arrays.c...

[08:29:10] /opt/conda/lib/python3.7/site-packages/treelite/contrib/util.py:104: Compiling sources files in directory ./tmpl9s3vfdq into object files (*.o)...
[08:30:25] /opt/conda/lib/python3.7/site-packages/treelite/contrib/util.py:133: Generating dynamic shared library ./tmpl9s3vfdq/predictor.so...
[08:30:26] /opt/conda/lib/python3.7/site-packages/treelite/contrib/__init__.py:278: Generated shared library in 75.43 seconds


In [17]:
# Load the compiled shared library into a treelite_runtime predictor.
# The recorded log reports three missing optional functions
# (get_pred_transform, get_sigmoid_alpha, get_global_bias) and then a successful load.
predictor = treelite_runtime.Predictor('./mymodel.so', verbose=True)

[08:30:26] ../src/predictor/predictor.cc:262: Dynamic shared library `/kaggle/working/mymodel.so' does not contain valid get_pred_transform() function
[08:30:26] ../src/predictor/predictor.cc:276: Dynamic shared library `/kaggle/working/mymodel.so' does not contain valid get_sigmoid_alpha() function
[08:30:26] ../src/predictor/predictor.cc:288: Dynamic shared library `/kaggle/working/mymodel.so' does not contain valid get_global_bias() function
[08:30:26] /opt/conda/lib/python3.7/site-packages/treelite_runtime/predictor.py:311: Dynamic shared library /kaggle/working/mymodel.so has been successfully loaded into memory


In [18]:
# Disabled cell (string literal, not executed): would import the competition's
# `janestreet` module, create its environment and obtain the test-set iterator.
"""import janestreet
env = janestreet.make_env() # initialize the environment
iter_test = env.iter_test() # an iterator which loops over the test set"""

'import janestreet\nenv = janestreet.make_env() # initialize the environment\niter_test = env.iter_test() # an iterator which loops over the test set'

In [19]:
# Disabled cell (string literal, not executed): would define the cut-off that
# the disabled inference loop compares each prediction against.
"""TRADING_THRESHOLD = 0.50"""

'TRADING_THRESHOLD = 0.50'

In [20]:
# Disabled cell (string literal, not executed): submission loop.
# For each test row with positive weight it would run the treelite predictor on
# the `features` columns and set action to 1 when the prediction exceeds
# TRADING_THRESHOLD; rows with non-positive weight get action 0. Each result is
# sent back through env.predict.
# `env`, `iter_test` and `TRADING_THRESHOLD` are only defined inside other
# disabled cells, so those must be re-enabled together with this one.
# NOTE(review): training data had missing values filled with column means, but
# this loop passes the raw test features to the predictor - confirm whether
# the same imputation is needed here.
"""for (test_df, pred_df) in tqdm(iter_test):
    if test_df['weight'].item() > 0:
        # inference with treelite
        batch = treelite_runtime.Batch.from_npy2d(test_df[features].values)
        pred_df.action = (predictor.predict(batch) > TRADING_THRESHOLD).astype('int')
    else:
        pred_df.action = 0
    env.predict(pred_df)"""

"for (test_df, pred_df) in tqdm(iter_test):\n    if test_df['weight'].item() > 0:\n        # inference with treelite\n        batch = treelite_runtime.Batch.from_npy2d(test_df[features].values)\n        pred_df.action = (predictor.predict(batch) > TRADING_THRESHOLD).astype('int')\n    else:\n        pred_df.action = 0\n    env.predict(pred_df)"