# Simple Modeling

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from itertools import product

import xgboost
from xgboost import train as xgboost_train
from xgboost import cv as xgboost_cv

import pickle

In [2]:
x_train = pd.read_csv('data/train_preprocessed.csv', engine='pyarrow')
x_train = x_train.loc[x_train.dist!=0, :]
x_train.head()

Unnamed: 0,ari_co,ari_po,ship_type_category,dist,breadth,built,depth,draught,gt,u_wind,v_wind,air_temperature,bn,ata_lt,port_size,ci_hour,month,wind_speed,deadweight_group
0,0,0,0,32.590869,40.0,28,20.0,20.0,86100,-0.363571,-1.574429,16.995714,4.061694,21,0.000113,161.218056,9,1.615862,1
1,0,0,0,35.575496,30.0,20,20.0,10.0,29400,-0.363571,-1.574429,16.995714,4.061694,11,0.000113,95.7675,9,1.615862,1
2,0,0,0,40.909139,40.0,13,20.0,10.0,48200,-0.363571,-1.574429,16.995714,4.061694,11,0.000113,35.445556,9,1.615862,1
3,0,0,0,45.939559,40.0,11,20.0,10.0,58600,-0.363571,-1.574429,16.995714,4.061694,11,0.000113,95.507222,9,1.615862,1
4,0,0,0,15.606497,30.0,11,20.0,10.0,44300,-0.363571,-1.574429,16.995714,4.061694,11,0.000113,99.873056,9,1.615862,1


In [3]:
y_train = x_train.ci_hour.copy()
y_train = np.sqrt(y_train).values
x_train = x_train.drop(columns=['ci_hour']).values

In [4]:
def grid_search_xgb(params_default, dtrain):
    param_grid = {
        'max_depth': [2, 3, 4, 5],
        'colsample_bylevel': [0.9, 1.0],
        'colsample_bynode': [0.9, 1.0],
        'gamma': [0, 0.01, 0.001],
        'alpha': [0, 0.01, 0.001],
        'lambda': [0, 0.01, 0.001],
    }

    params_names = param_grid.keys()
    best_score_ = np.inf

    for params_train in tqdm(list(product(*param_grid.values()))):
        params_train = dict(zip(params_names, params_train))
        params = dict(params_default, **params_train)
        cv_log = xgboost_cv(params=params, dtrain=dtrain, nfold=5, num_boost_round=200, early_stopping_rounds=10).mean(axis=0)
        cv_score = cv_log['test-mae-mean']

        if cv_score < best_score_:
            best_score_ = cv_score
            best_params_ = params
    
    return best_params_, best_score_

In [5]:
params_default = {
    'tree_method': 'hist',
    'device': 'cuda',
    'eval_metric': 'mae',
    'objective': 'reg:absoluteerror',
    'verbosity': 0,
}
dtrain = xgboost.DMatrix(x_train, y_train)

best_params_, best_score_ = grid_search_xgb(params_default, dtrain)
print(best_params_)

100%|██████████| 432/432 [47:26<00:00,  6.59s/it]

{'tree_method': 'hist', 'device': 'cuda', 'eval_metric': 'mae', 'objective': 'reg:absoluteerror', 'verbosity': 0, 'max_depth': 5, 'colsample_bylevel': 1.0, 'colsample_bynode': 1.0, 'gamma': 0, 'alpha': 0, 'lambda': 0}





In [6]:
evals = [(dtrain, 'train')]

best_model = xgboost_train(
    params=best_params_,
    dtrain=dtrain,
    num_boost_round=200,
    evals=evals,
    early_stopping_rounds=10,
)

with open('checkpoints/best_model.xgb', 'wb') as f:
    pickle.dump(best_model, f)
    print('saving process complete.')

[0]	train-mae:4.07584
[1]	train-mae:3.93144
[2]	train-mae:3.84538
[3]	train-mae:3.78287
[4]	train-mae:3.74763
[5]	train-mae:3.72672
[6]	train-mae:3.70887
[7]	train-mae:3.68964
[8]	train-mae:3.67732
[9]	train-mae:3.66987
[10]	train-mae:3.66000
[11]	train-mae:3.65400
[12]	train-mae:3.64887
[13]	train-mae:3.64438
[14]	train-mae:3.64092
[15]	train-mae:3.63178
[16]	train-mae:3.62889
[17]	train-mae:3.62404
[18]	train-mae:3.61941
[19]	train-mae:3.61451
[20]	train-mae:3.60963
[21]	train-mae:3.60681
[22]	train-mae:3.60467
[23]	train-mae:3.60156
[24]	train-mae:3.59802
[25]	train-mae:3.59632
[26]	train-mae:3.59331
[27]	train-mae:3.58894
[28]	train-mae:3.58205
[29]	train-mae:3.57642
[30]	train-mae:3.57291
[31]	train-mae:3.57040
[32]	train-mae:3.56791
[33]	train-mae:3.56542
[34]	train-mae:3.56109
[35]	train-mae:3.55864
[36]	train-mae:3.55779
[37]	train-mae:3.55606
[38]	train-mae:3.55491
[39]	train-mae:3.55360
[40]	train-mae:3.55114
[41]	train-mae:3.55044
[42]	train-mae:3.54902
[43]	train-mae:3.5430