In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/janestreet/data.parquet


This notebook's approach is using LSTM for times-series method.

* Ver 1: XGB
* Ver 2: PCA + XGB


In [2]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings ("ignore")
import gc  
import xgboost as xgb
import optuna
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
SEED = 1111
tf.random.set_seed(SEED)
np.random.seed(SEED)

In [3]:
# this data is already excluded day <= 85

data = pd.read_parquet('../input/janestreet/data.parquet')

# Select trade with weight !=  0:
data = data[data['weight'] != 0]
# # limit memory use: we change datatype from float64 to float32
data = data.astype({c: np.float32 for c in data.select_dtypes(include='float64').columns}) 

# create target variable
data['action'] = (data['resp'] > 0)*1

# fill null values with mean of each feature
data.fillna(data.mean(),inplace=True)

#create fetures list
features = [c for c in data.columns if 'feature' in c]

In [4]:
# 20 / 80 split
df_trainvalid, df_test = np.split(data, [int(.8*len(data))])

df_train, df_valid = np.split(df_trainvalid, [int(.8*len(df_trainvalid))])

X_train = df_train[features]
y_train = df_train['action']

X_valid = df_valid[features]
y_valid = df_valid['action']

X_test = df_test[features]
y_test = df_test['action']


In [5]:
NORMALIZE_NONE = 0
NORMALIZE_MIN_MAX = 1
NORMALIZE_MEAN = 2

def normalize_data(df):
    if NORMALIZE_TYPE == NORMALIZE_MIN_MAX:
        return (df-df.min())/(df.max()-df.min())
    elif NORMALIZE_TYPE == NORMALIZE_MEAN:
        return (df-df.mean())/df.std()
    else:
        return df;
    
NORMALIZE_TYPE = NORMALIZE_MEAN

X_train = normalize_data(X_train)
X_valid = normalize_data(X_valid)
X_test = normalize_data(X_test)

X_trainvalid = normalize_data(df_trainvalid[features])
y_trainvalid = df_trainvalid['action']

In [6]:

from sklearn.decomposition import PCA

pca = PCA(n_components=50).fit(X_train)
X_train = pca.transform(X_train)
X_valid = pca.transform(X_valid)
X_test = pca.transform(X_test)

X_trainvalid = pca.transform(X_trainvalid)

In [7]:
# We create the XGboost-specific DMatrix data format from the numpy array. 
# This data structure is optimised for memory efficiency and training speed
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)

In [8]:
del data, df_train, df_valid, df_trainvalid
gc.collect()

165

In [9]:
# The objective function is passed an Optuna specific argument of trial
def objective(trial):
    
# params specifies the XGBoost hyperparameters to be tuned
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 200, 600),
        'max_depth': trial.suggest_int('max_depth', 10, 25),
        'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.1),
        'subsample': trial.suggest_uniform('subsample', 0.50, 1),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.50, 1),
        'gamma': trial.suggest_int('gamma', 0, 10),
        'tree_method': 'gpu_hist',  
        'objective': 'binary:logistic'
    }
    
    bst = xgb.train(params, dtrain)
    preds = bst.predict(dvalid)
    pred_labels = np.rint(preds)
# trials will be evaluated based on their accuracy on the test set
    accuracy = accuracy_score(y_valid, pred_labels)
    return accuracy



In [10]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=25, timeout=600)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2021-03-11 12:35:08,368][0m A new study created in memory with name: no-name-3faf5fe9-173d-4d14-a069-4b4d2c1e369b[0m


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-03-11 12:35:11,355][0m Trial 0 finished with value: 0.5063815739757465 and parameters: {'n_estimators': 211, 'max_depth': 15, 'learning_rate': 0.02452998267512899, 'subsample': 0.504538123835391, 'colsample_bytree': 0.77820350802674, 'gamma': 8}. Best is trial 0 with value: 0.5063815739757465.[0m


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-03-11 12:35:26,175][0m Trial 1 finished with value: 0.504651449526105 and parameters: {'n_estimators': 388, 'max_depth': 18, 'learning_rate': 0.040304793699077225, 'subsample': 0.6395035247129126, 'colsample_bytree': 0.6618539275128126, 'gamma': 1}. Best is trial 0 with value: 0.5063815739757465.[0m


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-03-11 12:35:27,085][0m Trial 2 finished with value: 0.5082628357336324 and parameters: {'n_estimators': 564, 'max_depth': 11, 'learning_rate': 0.07746034282162846, 'subsample': 0.8606702302304305, 'colsample_bytree': 0.9835078200035376, 'gamma': 10}. Best is trial 2 with value: 0.5082628357336324.[0m


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-03-11 12:36:17,100][0m Trial 3 finished with value: 0.5032912137519041 and parameters: {'n_estimators': 397, 'max_depth': 24, 'learning_rate': 0.08492507653468309, 'subsample': 0.9826057901566732, 'colsample_bytree': 0.6047665722335045, 'gamma': 2}. Best is trial 2 with value: 0.5082628357336324.[0m


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-03-11 12:36:31,921][0m Trial 4 finished with value: 0.5058247523137929 and parameters: {'n_estimators': 527, 'max_depth': 22, 'learning_rate': 0.026656033590940167, 'subsample': 0.9304770497248106, 'colsample_bytree': 0.8135561397335315, 'gamma': 6}. Best is trial 2 with value: 0.5082628357336324.[0m


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-03-11 12:36:35,332][0m Trial 5 finished with value: 0.5066281664260401 and parameters: {'n_estimators': 399, 'max_depth': 25, 'learning_rate': 0.06997614779510761, 'subsample': 0.5815197869807773, 'colsample_bytree': 0.7198439806310271, 'gamma': 10}. Best is trial 2 with value: 0.5082628357336324.[0m


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-03-11 12:36:36,528][0m Trial 6 finished with value: 0.5080759027471194 and parameters: {'n_estimators': 389, 'max_depth': 11, 'learning_rate': 0.09814022157778256, 'subsample': 0.6555624453592859, 'colsample_bytree': 0.557994168026312, 'gamma': 1}. Best is trial 2 with value: 0.5082628357336324.[0m


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-03-11 12:36:39,206][0m Trial 7 finished with value: 0.5052241803784001 and parameters: {'n_estimators': 409, 'max_depth': 16, 'learning_rate': 0.0933440328205724, 'subsample': 0.8346053148297473, 'colsample_bytree': 0.5262333664712917, 'gamma': 7}. Best is trial 2 with value: 0.5082628357336324.[0m


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-03-11 12:36:43,242][0m Trial 8 finished with value: 0.5065685069622594 and parameters: {'n_estimators': 203, 'max_depth': 23, 'learning_rate': 0.05856389845659468, 'subsample': 0.7064349840106293, 'colsample_bytree': 0.9735777146954059, 'gamma': 9}. Best is trial 2 with value: 0.5082628357336324.[0m


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-03-11 12:36:51,072][0m Trial 9 finished with value: 0.5056736150055483 and parameters: {'n_estimators': 203, 'max_depth': 16, 'learning_rate': 0.05663846783572531, 'subsample': 0.7381475879407631, 'colsample_bytree': 0.7896841920190132, 'gamma': 2}. Best is trial 2 with value: 0.5082628357336324.[0m


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-03-11 12:36:52,302][0m Trial 10 finished with value: 0.5087082930631953 and parameters: {'n_estimators': 600, 'max_depth': 11, 'learning_rate': 0.07755534788843921, 'subsample': 0.8595852736665608, 'colsample_bytree': 0.9999778055169534, 'gamma': 4}. Best is trial 10 with value: 0.5087082930631953.[0m


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-03-11 12:36:53,113][0m Trial 11 finished with value: 0.5071969199807499 and parameters: {'n_estimators': 600, 'max_depth': 10, 'learning_rate': 0.0760900591009763, 'subsample': 0.8517146023618044, 'colsample_bytree': 0.9994834727071495, 'gamma': 5}. Best is trial 10 with value: 0.5087082930631953.[0m


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-03-11 12:36:55,017][0m Trial 12 finished with value: 0.5070298734821638 and parameters: {'n_estimators': 596, 'max_depth': 12, 'learning_rate': 0.07368733254080126, 'subsample': 0.8429437626921459, 'colsample_bytree': 0.9131564665821954, 'gamma': 4}. Best is trial 10 with value: 0.5087082930631953.[0m


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-03-11 12:36:57,543][0m Trial 13 finished with value: 0.5093168195937589 and parameters: {'n_estimators': 513, 'max_depth': 13, 'learning_rate': 0.08361444315773094, 'subsample': 0.9116325741982502, 'colsample_bytree': 0.889605369619971, 'gamma': 4}. Best is trial 13 with value: 0.5093168195937589.[0m


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-03-11 12:37:00,195][0m Trial 14 finished with value: 0.5085531784573654 and parameters: {'n_estimators': 499, 'max_depth': 13, 'learning_rate': 0.09991103316252992, 'subsample': 0.990476253729074, 'colsample_bytree': 0.8944797294962171, 'gamma': 4}. Best is trial 13 with value: 0.5093168195937589.[0m


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-03-11 12:37:17,410][0m Trial 15 finished with value: 0.5035258743094417 and parameters: {'n_estimators': 470, 'max_depth': 19, 'learning_rate': 0.08673688282107343, 'subsample': 0.922362813466417, 'colsample_bytree': 0.8759437637126043, 'gamma': 3}. Best is trial 13 with value: 0.5093168195937589.[0m


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-03-11 12:37:20,647][0m Trial 16 finished with value: 0.507157147004896 and parameters: {'n_estimators': 319, 'max_depth': 14, 'learning_rate': 0.04221477876602928, 'subsample': 0.7747397439458433, 'colsample_bytree': 0.9369934008222547, 'gamma': 5}. Best is trial 13 with value: 0.5093168195937589.[0m


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-03-11 12:37:21,547][0m Trial 17 finished with value: 0.5095753439368087 and parameters: {'n_estimators': 550, 'max_depth': 10, 'learning_rate': 0.06778869537300215, 'subsample': 0.9211624171838538, 'colsample_bytree': 0.8557756580958944, 'gamma': 0}. Best is trial 17 with value: 0.5095753439368087.[0m


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-03-11 12:37:49,822][0m Trial 18 finished with value: 0.504786677644008 and parameters: {'n_estimators': 462, 'max_depth': 20, 'learning_rate': 0.0641372783945584, 'subsample': 0.9296767683544079, 'colsample_bytree': 0.8546274663575877, 'gamma': 0}. Best is trial 17 with value: 0.5095753439368087.[0m


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-03-11 12:37:52,811][0m Trial 19 finished with value: 0.5093764790575396 and parameters: {'n_estimators': 539, 'max_depth': 13, 'learning_rate': 0.04899994747685207, 'subsample': 0.7919797276718263, 'colsample_bytree': 0.7282250177198949, 'gamma': 0}. Best is trial 17 with value: 0.5095753439368087.[0m


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-03-11 12:37:53,682][0m Trial 20 finished with value: 0.5072168064686768 and parameters: {'n_estimators': 553, 'max_depth': 10, 'learning_rate': 0.04521524696088189, 'subsample': 0.7806658918037004, 'colsample_bytree': 0.7068194430557915, 'gamma': 0}. Best is trial 17 with value: 0.5095753439368087.[0m


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-03-11 12:37:56,722][0m Trial 21 finished with value: 0.5065883934501864 and parameters: {'n_estimators': 520, 'max_depth': 13, 'learning_rate': 0.049462787821596604, 'subsample': 0.9062277364579486, 'colsample_bytree': 0.8375494770132891, 'gamma': 0}. Best is trial 17 with value: 0.5095753439368087.[0m


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-03-11 12:38:01,077][0m Trial 22 finished with value: 0.5085929514332191 and parameters: {'n_estimators': 560, 'max_depth': 14, 'learning_rate': 0.062306793832496536, 'subsample': 0.9572012524614159, 'colsample_bytree': 0.7252381269269151, 'gamma': 2}. Best is trial 17 with value: 0.5095753439368087.[0m


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-03-11 12:38:03,037][0m Trial 23 finished with value: 0.5078849924630211 and parameters: {'n_estimators': 455, 'max_depth': 12, 'learning_rate': 0.03348268409324293, 'subsample': 0.7941644209447025, 'colsample_bytree': 0.6766636649761969, 'gamma': 1}. Best is trial 17 with value: 0.5095753439368087.[0m


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-03-11 12:38:10,576][0m Trial 24 finished with value: 0.506210550179575 and parameters: {'n_estimators': 525, 'max_depth': 16, 'learning_rate': 0.05039179316980943, 'subsample': 0.8754518282380466, 'colsample_bytree': 0.7632230179714273, 'gamma': 3}. Best is trial 17 with value: 0.5095753439368087.[0m


Number of finished trials:  25
Best trial:
  Value: 0.5095753439368087
  Params: 
    n_estimators: 550
    max_depth: 10
    learning_rate: 0.06778869537300215
    subsample: 0.9211624171838538
    colsample_bytree: 0.8557756580958944
    gamma: 0


In [11]:
best_params = trial.params
best_params['tree_method'] = 'gpu_hist' 
best_params['objective'] = 'binary:logistic'

# Fit the XGBoost classifier with optimal hyperparameters
optimal_clf = xgb.XGBClassifier(**best_params)
                                
optimal_clf.fit(X_trainvalid, y_trainvalid)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8557756580958944, gamma=0,
              gpu_id=0, importance_type='gain', interaction_constraints='',
              learning_rate=0.06778869537300215, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=550, n_jobs=2, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=0.9211624171838538, tree_method='gpu_hist',
              validate_parameters=1, verbosity=None)

In [12]:
prediction = optimal_clf.predict(X_test)
result_df = pd.DataFrame({'Date': df_test['date'], 'Weight': df_test['weight'],
                          'Resp': df_test['resp'], 'Action': prediction})

result_df['P'] = result_df['Weight']*result_df['Resp']*result_df['Action']
result_groupby_days = result_df[['Date', 'P']].groupby('Date').sum().reset_index()

p = result_groupby_days['P'].values

t = (np.sum(p)/(np.sqrt(np.sum(p**2))))*np.sqrt(250/len(p))

u = min(max(t, 0), 6) * np.sum(p)

print(f"Utility score is: {u:.3f}")

Utility score is: 1447.022
