In [2]:
import os
import numpy as np
import pandas as pd

import xgboost as xgb

import optuna

from src.helpers import get_project_dir
from src.functions_and_methods import grid_search, custom_score

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Model inputs: location, calendar fields and time-of-day / day-type flags.
feats = [
    'lat', 'lon',
    'hour', 'is_working_day', 'is_day', 'weekday',
    'is_evening', 'is_weekend', 'is_night',
    'year', 'month', 'day',
    'is_morning',
]
# Column the regressors are trained to predict.
target = 'count'

In [4]:
# Read the train / validation / test CSVs (in that order) from the project's data/d2 folder.
df, df_v, df_t = (
    pd.read_csv(os.path.join(get_project_dir(), relative_path))
    for relative_path in (
        'data/d2/train_data3.csv',
        'data/d2/val_data3.csv',
        'data/d2/test_data3.csv',
    )
)

# Grid Search

In [9]:
# `model` is the estimator class itself (not an instance); `scorer` is the project's metric function.
# Both are handed to grid_search in the cells below.
model, scorer = xgb.XGBRegressor, custom_score

In [9]:
# Coarse 4 x 5 x 5 grid: learning rate plus the two regularisation terms, all log-spaced.
params = {
    'learning_rate': np.logspace(-3, 0, 4),  # 1e-3 ... 1
    'lambda': np.logspace(-3, 1, 5),         # 1e-3 ... 10
    'alpha': np.logspace(-3, 1, 5),          # 1e-3 ... 10
}
# Fixed settings handed to grid_search as `default_params`.
default_params = dict(
    random_state=42,
    max_depth=8,
    n_estimators=800,
    objective='reg:tweedie',
    n_jobs=-1,
)

In [10]:
# Run the project's grid_search helper on the train/validation frames with the grid defined above.
best_set, best_score = grid_search(
    df, df_v, feats, target,
    model, params, scorer, default_params,
)

  0%|          | 0/100 [00:00<?, ?it/s]

Iteration: 0, Set: {'learning_rate': 0.001, 'lambda': 0.001, 'alpha': 0.001} Score: 2.4467649908195925
Iteration: 1, Set: {'learning_rate': 0.001, 'lambda': 0.001, 'alpha': 0.01} Score: 2.4464663999771723
Iteration: 2, Set: {'learning_rate': 0.001, 'lambda': 0.001, 'alpha': 0.09999999999999999} Score: 2.446796944798728
Iteration: 3, Set: {'learning_rate': 0.001, 'lambda': 0.001, 'alpha': 1.0} Score: 2.446893766108442
Iteration: 4, Set: {'learning_rate': 0.001, 'lambda': 0.001, 'alpha': 10.0} Score: 2.4481982139082157
Iteration: 5, Set: {'learning_rate': 0.001, 'lambda': 0.01, 'alpha': 0.001} Score: 2.446821140085776
Iteration: 6, Set: {'learning_rate': 0.001, 'lambda': 0.01, 'alpha': 0.01} Score: 2.446464221380068
Iteration: 7, Set: {'learning_rate': 0.001, 'lambda': 0.01, 'alpha': 0.09999999999999999} Score: 2.4467812005924627
Iteration: 8, Set: {'learning_rate': 0.001, 'lambda': 0.01, 'alpha': 1.0} Score: 2.4468967236094117
Iteration: 9, Set: {'learning_rate': 0.001, 'lambda': 0.01, 

KeyboardInterrupt: 

In [11]:
# 5 x 5 grid over the two regularisation terms only; learning rate is now pinned at 0.1.
params = {
    'lambda': np.logspace(-3, 1, 5),  # 1e-3 ... 10
    'alpha': np.logspace(-3, 1, 5),   # 1e-3 ... 10
}
# Fixed settings handed to grid_search as `default_params`.
default_params = dict(
    random_state=42,
    learning_rate=0.1,
    max_depth=8,
    n_estimators=800,
    objective='reg:tweedie',
    n_jobs=-1,
)

In [12]:
# Regularisation-only search (lambda x alpha) using the grid from the previous cell.
best_set, best_score = grid_search(
    df, df_v, feats, target,
    model, params, scorer, default_params,
)

  0%|          | 0/25 [00:00<?, ?it/s]

Iteration: 0, Set: {'lambda': 0.001, 'alpha': 0.001} Score: 0.885058786993387
Iteration: 1, Set: {'lambda': 0.001, 'alpha': 0.01} Score: 0.8820793091810172
Iteration: 2, Set: {'lambda': 0.001, 'alpha': 0.09999999999999999} Score: 0.8889606606168059
Iteration: 3, Set: {'lambda': 0.001, 'alpha': 1.0} Score: 0.8789803625954187
Iteration: 4, Set: {'lambda': 0.001, 'alpha': 10.0} Score: 0.8818385359654969
Iteration: 5, Set: {'lambda': 0.01, 'alpha': 0.001} Score: 0.8817584677771687
Iteration: 6, Set: {'lambda': 0.01, 'alpha': 0.01} Score: 0.8788786014789255
Iteration: 7, Set: {'lambda': 0.01, 'alpha': 0.09999999999999999} Score: 0.8726588225648726
Iteration: 8, Set: {'lambda': 0.01, 'alpha': 1.0} Score: 0.8756968209489838
Iteration: 9, Set: {'lambda': 0.01, 'alpha': 10.0} Score: 0.8903275388462425
Iteration: 10, Set: {'lambda': 0.09999999999999999, 'alpha': 0.001} Score: 0.8852602032622098
Iteration: 11, Set: {'lambda': 0.09999999999999999, 'alpha': 0.01} Score: 0.8733559782514623
Iteration

In [13]:
# One-dimensional sweep: ten log-spaced learning rates between 0.1 and 1.
params = dict(learning_rate=np.logspace(-1, 0, 10))
# Fixed settings handed to grid_search as `default_params` (regularisation pinned here).
default_params = {
    'random_state': 42,
    'max_depth': 8,
    'lambda': 0.1,
    'alpha': 10,
    'n_estimators': 800,
    'objective': 'reg:tweedie',
    'n_jobs': -1,
}

In [15]:
# Learning-rate sweep using the grid from the previous cell.
best_set, best_score = grid_search(
    df, df_v, feats, target,
    model, params, scorer, default_params,
)

  0%|          | 0/10 [00:00<?, ?it/s]

Iteration: 0, Set: {'learning_rate': 0.09999999999999999} Score: 0.8704881860614873
Iteration: 1, Set: {'learning_rate': 0.1291549665014884} Score: 0.8771360457747719
Iteration: 2, Set: {'learning_rate': 0.16681005372000587} Score: 0.8749206918308847
Iteration: 3, Set: {'learning_rate': 0.21544346900318834} Score: 0.8696758260512574
Iteration: 4, Set: {'learning_rate': 0.2782559402207124} Score: 0.8876076740649376
Iteration: 5, Set: {'learning_rate': 0.35938136638046275} Score: 0.8753931820422433
Iteration: 6, Set: {'learning_rate': 0.46415888336127786} Score: 0.9091403551853268
Iteration: 7, Set: {'learning_rate': 0.5994842503189408} Score: 0.9130063449663645
Iteration: 8, Set: {'learning_rate': 0.7742636826811269} Score: 0.9062947372639336
Iteration: 9, Set: {'learning_rate': 1.0} Score: 0.9442826058238072


In [18]:
# One-dimensional sweep over tree depth.
params = dict(max_depth=[5, 8, 11, 14])
# Fixed settings handed to grid_search as `default_params`; the learning rate is one of the
# values from the preceding learning-rate sweep.
default_params = {
    'random_state': 42,
    'learning_rate': 0.21544346900318834,
    'lambda': 0.1,
    'alpha': 10,
    'n_estimators': 800,
    'objective': 'reg:tweedie',
    'n_jobs': -1,
}

In [19]:
# Tree-depth sweep using the grid from the previous cell.
best_set, best_score = grid_search(
    df, df_v, feats, target,
    model, params, scorer, default_params,
)

  0%|          | 0/4 [00:00<?, ?it/s]

Iteration: 0, Set: {'max_depth': 5} Score: 0.9102294903532607
Iteration: 1, Set: {'max_depth': 8} Score: 0.8696758260512574
Iteration: 2, Set: {'max_depth': 11} Score: 0.9066257138916893
Iteration: 3, Set: {'max_depth': 14} Score: 0.9749579543097046


In [20]:
# One-dimensional sweep over the number of boosting rounds.
params = dict(n_estimators=[600, 800, 1000, 1200])
# Fixed settings handed to grid_search as `default_params`.
default_params = {
    'random_state': 42,
    'learning_rate': 0.21544346900318834,
    'max_depth': 8,
    'lambda': 0.1,
    'alpha': 10,
    'objective': 'reg:tweedie',
    'n_jobs': -1,
}

In [21]:
# Boosting-round sweep using the grid from the previous cell.
best_set, best_score = grid_search(
    df, df_v, feats, target,
    model, params, scorer, default_params,
)

  0%|          | 0/4 [00:00<?, ?it/s]

Iteration: 0, Set: {'n_estimators': 600} Score: 0.871337205491467
Iteration: 1, Set: {'n_estimators': 800} Score: 0.8696758260512574
Iteration: 2, Set: {'n_estimators': 1000} Score: 0.8708315666064591
Iteration: 3, Set: {'n_estimators': 1200} Score: 0.8780253715298195


In [22]:
# 3 x 3 grid over the row and column sampling fractions.
params = dict(
    subsample=[0.4, 0.7, 1],
    colsample_bytree=[0.4, 0.7, 1],
)
# Fixed settings handed to grid_search as `default_params`.
default_params = {
    'random_state': 42,
    'max_depth': 8,
    'learning_rate': 0.21544346900318834,
    'lambda': 0.1,
    'alpha': 10,
    'n_estimators': 800,
    'objective': 'reg:tweedie',
    'n_jobs': -1,
}

In [23]:
# Sampling-fraction search (subsample x colsample_bytree) using the grid from the previous cell.
best_set, best_score = grid_search(
    df, df_v, feats, target,
    model, params, scorer, default_params,
)

  0%|          | 0/9 [00:00<?, ?it/s]

Iteration: 0, Set: {'subsample': 0.4, 'colsample_bytree': 0.4} Score: 1.0040733313175487
Iteration: 1, Set: {'subsample': 0.4, 'colsample_bytree': 0.7} Score: 0.9182448531914005
Iteration: 2, Set: {'subsample': 0.4, 'colsample_bytree': 1} Score: 0.879812773523244
Iteration: 3, Set: {'subsample': 0.7, 'colsample_bytree': 0.4} Score: 0.9921334427123495
Iteration: 4, Set: {'subsample': 0.7, 'colsample_bytree': 0.7} Score: 0.8838269866611287
Iteration: 5, Set: {'subsample': 0.7, 'colsample_bytree': 1} Score: 0.8878534202092804
Iteration: 6, Set: {'subsample': 1, 'colsample_bytree': 0.4} Score: 0.9740223426372583
Iteration: 7, Set: {'subsample': 1, 'colsample_bytree': 0.7} Score: 0.8842068931232451
Iteration: 8, Set: {'subsample': 1, 'colsample_bytree': 1} Score: 0.8696758260512574


In [None]:
# Hand-recorded outcome of the one-parameter-at-a-time searches above, so it survives a
# kernel restart without re-running them.
best_set = {
    'random_state': 42,
    'max_depth': 8,
    'learning_rate': 0.21544346900318834,
    'lambda': 0.1,
    'alpha': 10,
    'n_estimators': 800,
    'objective': 'reg:tweedie',
    'n_jobs': -1,
    'subsample': 1,
    'colsample_bytree': 1,
}
# Score logged for this configuration in the sweeps above.
best_score = 0.8696758260512574

### -----------------------------------------------

In [28]:
# Joint search: two candidate values for each of six hyperparameters (2**6 = 64 combinations).
params = {
    'lambda': [0.1, 1],
    'alpha': [0.1, 1],
    'max_depth': [5, 10],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.5, 1],
    'colsample_bytree': [0.5, 1],
}
# Fixed settings handed to grid_search as `default_params`.
default_params = dict(
    objective='reg:tweedie',
    random_state=42,
    n_estimators=800,
    n_jobs=-1,
)

In [29]:
# Joint six-parameter search using the grid from the previous cell.
best_set, best_score = grid_search(
    df, df_v, feats, target,
    model, params, scorer, default_params,
)

  0%|          | 0/64 [00:00<?, ?it/s]

Iteration: 0, Set: {'lambda': 0.1, 'alpha': 0.1, 'max_depth': 5, 'learning_rate': 0.01, 'subsample': 0.5, 'colsample_bytree': 0.5} Score: 1.8569281166525788
Iteration: 1, Set: {'lambda': 0.1, 'alpha': 0.1, 'max_depth': 5, 'learning_rate': 0.01, 'subsample': 0.5, 'colsample_bytree': 1} Score: 1.667772768940135
Iteration: 2, Set: {'lambda': 0.1, 'alpha': 0.1, 'max_depth': 5, 'learning_rate': 0.01, 'subsample': 1, 'colsample_bytree': 0.5} Score: 1.8596524623361324
Iteration: 3, Set: {'lambda': 0.1, 'alpha': 0.1, 'max_depth': 5, 'learning_rate': 0.01, 'subsample': 1, 'colsample_bytree': 1} Score: 1.693142165688945
Iteration: 4, Set: {'lambda': 0.1, 'alpha': 0.1, 'max_depth': 5, 'learning_rate': 0.1, 'subsample': 0.5, 'colsample_bytree': 0.5} Score: 1.2213820461113187
Iteration: 5, Set: {'lambda': 0.1, 'alpha': 0.1, 'max_depth': 5, 'learning_rate': 0.1, 'subsample': 0.5, 'colsample_bytree': 1} Score: 0.9958029594848267
Iteration: 6, Set: {'lambda': 0.1, 'alpha': 0.1, 'max_depth': 5, 'learni

In [None]:
# Hand-recorded configuration and score, kept so they are available without re-running
# the joint search (presumably its result; the log above is truncated, so verify).
best_set = {
    'lambda': 1,
    'alpha': 1,
    'max_depth': 10,
    'learning_rate': 0.1,
    'subsample': 1,
    'colsample_bytree': 1,
    'random_state': 42,
    'n_estimators': 800,
    'objective': 'reg:tweedie',
    'n_jobs': -1,
}
best_score = 0.8612055245999325

In [7]:
# Narrower joint search (2 * 2 * 2 * 2 * 3 = 48 combinations).
params = {
    'lambda': [0.8, 1],
    'alpha': [0.8, 1],
    'max_depth': [10, 15],
    'n_estimators': [800, 1200],
    'learning_rate': [0.05, 0.1, 0.15],
}
# Fixed settings handed to grid_search as `default_params`; both sampling fractions stay at 1.
default_params = dict(
    objective='reg:tweedie',
    random_state=42,
    subsample=1,
    colsample_bytree=1,
    n_jobs=-1,
)

In [10]:
# Narrow joint search using the grid from the previous cell, then report what it returned.
best_set, best_score = grid_search(
    df, df_v, feats, target,
    model, params, scorer, default_params,
)
print(best_set, best_score)

  0%|          | 0/48 [00:00<?, ?it/s]

Iteration: 0, Set: {'lambda': 0.8, 'alpha': 0.8, 'max_depth': 10, 'n_estimators': 800, 'learning_rate': 0.05} Score: 0.8833095923589918
Iteration: 1, Set: {'lambda': 0.8, 'alpha': 0.8, 'max_depth': 10, 'n_estimators': 800, 'learning_rate': 0.1} Score: 0.872963428802489
Iteration: 2, Set: {'lambda': 0.8, 'alpha': 0.8, 'max_depth': 10, 'n_estimators': 800, 'learning_rate': 0.15} Score: 0.8823865454420218
Iteration: 3, Set: {'lambda': 0.8, 'alpha': 0.8, 'max_depth': 10, 'n_estimators': 1200, 'learning_rate': 0.05} Score: 0.8828508059988214
Iteration: 4, Set: {'lambda': 0.8, 'alpha': 0.8, 'max_depth': 10, 'n_estimators': 1200, 'learning_rate': 0.1} Score: 0.8890081151248543
Iteration: 5, Set: {'lambda': 0.8, 'alpha': 0.8, 'max_depth': 10, 'n_estimators': 1200, 'learning_rate': 0.15} Score: 0.9001021148185319
Iteration: 6, Set: {'lambda': 0.8, 'alpha': 0.8, 'max_depth': 15, 'n_estimators': 800, 'learning_rate': 0.05} Score: 0.9664257975200216
Iteration: 7, Set: {'lambda': 0.8, 'alpha': 0.8,

# Optuna

In [14]:
def objective(trial, X_train=df[feats], y_train=df[target], X_eval=df_v[feats], y_eval=df_v[target]):
    """Optuna objective: fit an XGBoost Tweedie regressor and score it on the evaluation split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.
    X_train, y_train
        Training features and target. The defaults are evaluated once, when this
        cell is executed, from the notebook-level ``df``, ``feats`` and ``target``;
        re-run the cell after changing any of them.
    X_eval, y_eval
        Evaluation features and target; the defaults come from ``df_v`` in the same way.

    Returns
    -------
    Scalar ``mean(|y_eval - y_pred| / y_pred)``; the study minimises it.
    """
    param = {
        # `suggest_loguniform` is deprecated in Optuna; `suggest_float(..., log=True)`
        # samples from the same log-uniform range.
        'lambda': trial.suggest_float('lambda', 1e-2, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-2, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        # Categorical choices are passed as plain Python numbers: Optuna only supports
        # None/bool/int/float/str choices and warns on NumPy integer scalars.
        'learning_rate': trial.suggest_categorical('learning_rate', np.logspace(-3, 0, 10).tolist()),
        'n_estimators': 800,
        'max_depth': trial.suggest_categorical('max_depth', [7,9,11,13,15,17]),
        # Single-choice categorical: keeps the seed recorded in the trial's params.
        'random_state': trial.suggest_categorical('random_state', [42]),
        'min_child_weight': trial.suggest_categorical('min_child_weight', [1] + list(range(30, 301, 30))),
    }
    model = xgb.XGBRegressor(**param, objective='reg:tweedie')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_eval)
    # NOTE(review): the absolute error is normalised by the prediction, not by the
    # observed value -- confirm this matches the metric used by `custom_score`.
    score = np.mean(np.abs(y_eval - y_pred)/y_pred)
    return score

In [16]:
# Seed the sampler so the sequence of suggested hyperparameters is repeatable between runs;
# the models already use random_state=42, but an unseeded sampler still made every search differ.
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=1000)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[I 2023-10-18 05:17:08,725] A new study created in memory with name: no-name-8ff22dcb-b9f8-4c93-8c57-c2ba0f3c0ca5
[I 2023-10-18 05:17:42,331] Trial 0 finished with value: 0.9686010611039348 and parameters: {'lambda': 0.017469857924244384, 'alpha': 0.021020507006867438, 'colsample_bytree': 0.3, 'subsample': 1.0, 'learning_rate': 0.4641588833612777, 'max_depth': 11, 'random_state': 42, 'min_child_weight': 210}. Best is trial 0 with value: 0.9686010611039348.
[I 2023-10-18 05:18:17,167] Trial 1 finished with value: 0.9533797828224584 and parameters: {'lambda': 2.565022938549281, 'alpha': 2.37771971887279, 'colsample_bytree': 0.7, 'subsample': 0.8, 'learning_rate': 0.046415888336127774, 'max_depth': 9, 'random_state': 42, 'min_child_weight': 60}. Best is trial 1 with value: 0.9533797828224584.
[I 2023-10-18 05:18:51,420] Trial 2 finished with value: 1.1284353114540635 and parameters: {'lambda': 0.11288822786456534, 'alpha': 0.33409310200618453, 'colsample_bytree': 0.6, 'subsample': 0.7, 'l

KeyboardInterrupt: 

In [17]:
def objective(trial, X_train=df[feats], y_train=df[target], X_eval=df_v[feats], y_eval=df_v[target]):
    """Optuna objective over a fully discrete search space (Tweedie XGBoost regressor).

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.
    X_train, y_train
        Training features and target. The defaults are evaluated once, when this
        cell is executed, from the notebook-level ``df``, ``feats`` and ``target``;
        re-run the cell after changing any of them.
    X_eval, y_eval
        Evaluation features and target; the defaults come from ``df_v`` in the same way.

    Returns
    -------
    Scalar ``mean(|y_eval - y_pred| / y_pred)``; the study minimises it.
    """
    # Categorical choices are passed as plain Python numbers: Optuna only supports
    # None/bool/int/float/str choices and warns on NumPy integer scalars.
    reg_grid = np.logspace(-1, 1, 10).tolist()
    param = {
        'lambda': trial.suggest_categorical('lambda', reg_grid),
        'alpha': trial.suggest_categorical('alpha', reg_grid),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.4,0.6,0.8,1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.6,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', np.logspace(-2, 0, 10).tolist()),
        'n_estimators': trial.suggest_categorical('n_estimators', [800, 1000]),
        'max_depth': trial.suggest_categorical('max_depth', [7,9,11,13,15,17]),
        # Single-choice categorical: keeps the seed recorded in the trial's params.
        'random_state': trial.suggest_categorical('random_state', [42]),
        'min_child_weight': trial.suggest_categorical('min_child_weight', [1] + list(range(30, 301, 30))),
    }
    model = xgb.XGBRegressor(**param, objective='reg:tweedie')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_eval)
    # NOTE(review): the absolute error is normalised by the prediction, not by the
    # observed value -- confirm this matches the metric used by `custom_score`.
    score = np.mean(np.abs(y_eval - y_pred)/y_pred)
    return score

In [18]:
# Seed the sampler so the sequence of suggested hyperparameters is repeatable between runs;
# the models already use random_state=42, but an unseeded sampler still made every search differ.
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=1000)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[I 2023-10-18 06:11:28,939] A new study created in memory with name: no-name-d304af7d-4b4c-4592-b594-e8407a2ef2ad
[I 2023-10-18 06:12:28,021] Trial 0 finished with value: 0.912793400724224 and parameters: {'lambda': 10.0, 'alpha': 0.2782559402207124, 'colsample_bytree': 1.0, 'subsample': 0.4, 'learning_rate': 0.01668100537200059, 'n_estimators': 800, 'max_depth': 13, 'random_state': 42, 'min_child_weight': 300}. Best is trial 0 with value: 0.912793400724224.
[I 2023-10-18 06:13:25,123] Trial 1 finished with value: 1.3760503797968728 and parameters: {'lambda': 3.5938136638046254, 'alpha': 0.2782559402207124, 'colsample_bytree': 0.4, 'subsample': 0.4, 'learning_rate': 0.01, 'n_estimators': 800, 'max_depth': 13, 'random_state': 42, 'min_child_weight': 30}. Best is trial 0 with value: 0.912793400724224.
[I 2023-10-18 06:14:37,422] Trial 2 finished with value: 1.0601769843284732 and parameters: {'lambda': 10.0, 'alpha': 2.1544346900318834, 'colsample_bytree': 0.4, 'subsample': 0.6, 'learnin

KeyboardInterrupt: 

In [19]:
def objective(trial, X_train=df[feats], y_train=df[target], X_eval=df_v[feats], y_eval=df_v[target]):
    """Optuna objective with depth fixed at 18 and 1000 boosting rounds (Tweedie XGBoost regressor).

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.
    X_train, y_train
        Training features and target. The defaults are evaluated once, when this
        cell is executed, from the notebook-level ``df``, ``feats`` and ``target``;
        re-run the cell after changing any of them.
    X_eval, y_eval
        Evaluation features and target; the defaults come from ``df_v`` in the same way.

    Returns
    -------
    Scalar ``mean(|y_eval - y_pred| / y_pred)``; the study minimises it.
    """
    # Categorical choices are passed as plain Python numbers: Optuna only supports
    # None/bool/int/float/str choices and warns on NumPy integer scalars.
    reg_grid = np.logspace(-1, 1, 10).tolist()
    param = {
        'lambda': trial.suggest_categorical('lambda', reg_grid),
        'alpha': trial.suggest_categorical('alpha', reg_grid),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.4,0.6,0.8,1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.6,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', np.logspace(-2, 0, 10).tolist()),
        # Single-choice categoricals: the value is fixed but still recorded in the trial's params.
        'n_estimators': trial.suggest_categorical('n_estimators', [1000]),
        'max_depth': trial.suggest_categorical('max_depth', [18]),
        'random_state': trial.suggest_categorical('random_state', [42]),
        'min_child_weight': trial.suggest_categorical('min_child_weight', [1] + list(range(30, 301, 30))),
    }
    model = xgb.XGBRegressor(**param, objective='reg:tweedie')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_eval)
    # NOTE(review): the absolute error is normalised by the prediction, not by the
    # observed value -- confirm this matches the metric used by `custom_score`.
    score = np.mean(np.abs(y_eval - y_pred)/y_pred)
    return score

In [None]:
# Seed the sampler so the sequence of suggested hyperparameters is repeatable between runs;
# the models already use random_state=42, but an unseeded sampler still made every search differ.
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=1000)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[I 2023-10-18 08:40:07,869] A new study created in memory with name: no-name-7f7e0fd1-d606-439c-ba4e-7200296de90d
[I 2023-10-18 08:42:46,953] Trial 0 finished with value: 1.1158838739427233 and parameters: {'lambda': 0.16681005372000587, 'alpha': 0.16681005372000587, 'colsample_bytree': 0.8, 'subsample': 0.4, 'learning_rate': 0.0774263682681127, 'n_estimators': 1000, 'max_depth': 18, 'random_state': 42, 'min_child_weight': 30}. Best is trial 0 with value: 1.1158838739427233.
[I 2023-10-18 08:45:04,015] Trial 1 finished with value: 1.7828701532742253 and parameters: {'lambda': 5.994842503189409, 'alpha': 0.46415888336127786, 'colsample_bytree': 1.0, 'subsample': 0.4, 'learning_rate': 0.5994842503189408, 'n_estimators': 1000, 'max_depth': 18, 'random_state': 42, 'min_child_weight': 60}. Best is trial 0 with value: 1.1158838739427233.
[I 2023-10-18 08:46:21,428] Trial 2 finished with value: 1.1550416717821463 and parameters: {'lambda': 1.2915496650148839, 'alpha': 2.1544346900318834, 'col