# Simple Modeling

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, accuracy_score, roc_auc_score
from xgboost import XGBRegressor

In [2]:
x_train = pd.read_csv('data/train_preprocessed.csv')
x_train = x_train.loc[x_train.dist!=0, :]
x_train.head()

Unnamed: 0,ari_co,ari_po,ship_type_category,dist,breadth,built,depth,draught,gt,u_wind,...,air_temperature,bn,ata_lt,dubai,bdi_adj,port_size,ci_hour,month,wind_speed,deadweight_group
0,0,0,0,32.590869,40.0,28,20.0,20.0,86100,-0.256667,...,17.050794,4.127843,21,98.07,1152.45836,0.000113,161.218056,9,1.591468,1
1,0,0,0,35.575496,30.0,20,20.0,10.0,29400,-0.256667,...,17.050794,4.127843,11,99.03,1141.586111,0.000113,95.7675,9,1.591468,1
2,0,0,0,40.909139,40.0,13,20.0,10.0,48200,-0.256667,...,17.050794,4.127843,11,100.39,1135.655794,0.000113,35.445556,9,1.591468,1
3,0,0,0,45.939559,40.0,11,20.0,10.0,58600,-0.256667,...,17.050794,4.127843,11,99.03,1141.586111,0.000113,95.507222,9,1.591468,1
4,0,0,0,15.606497,30.0,11,20.0,10.0,44300,-0.256667,...,17.050794,4.127843,11,99.03,1141.586111,0.000113,99.873056,9,1.591468,1


In [3]:
y_train = x_train.ci_hour.copy()
y_train = np.sqrt(y_train)
x_train = x_train.drop(columns=['ci_hour'])

In [4]:
param_grid = {
    'n_estimators': [100, 150, 200],
    'max_depth': [3, 4, 5, 6],
    'colsample_bylevel': [0.8, 0.9, 1.0],
    'colsample_bynode': [0.8, 0.9, 1.0],
    'gamma': [0, 0.2, 0.4],
    'alpha': [0, 0.1, 0.01],
    'lambda': [0, 0.1, 0.01],
}
model = XGBRegressor(tree_method='gpu_hist', objective='reg:absoluteerror')
grid_search = GridSearchCV(model, param_grid=param_grid, scoring='neg_mean_absolute_error', verbose=3, cv=5)

In [5]:
grid_search.fit(x_train, y_train)

Fitting 5 folds for each of 2916 candidates, totalling 14580 fits
[CV 1/5] END alpha=0, colsample_bylevel=0.8, colsample_bynode=0.8, gamma=0, lambda=0, max_depth=3, n_estimators=100;, score=-4.972 total time=   0.7s
[CV 2/5] END alpha=0, colsample_bylevel=0.8, colsample_bynode=0.8, gamma=0, lambda=0, max_depth=3, n_estimators=100;, score=-2.884 total time=   0.5s
[CV 3/5] END alpha=0, colsample_bylevel=0.8, colsample_bynode=0.8, gamma=0, lambda=0, max_depth=3, n_estimators=100;, score=-3.360 total time=   0.5s
[CV 4/5] END alpha=0, colsample_bylevel=0.8, colsample_bynode=0.8, gamma=0, lambda=0, max_depth=3, n_estimators=100;, score=-4.545 total time=   0.5s
[CV 5/5] END alpha=0, colsample_bylevel=0.8, colsample_bynode=0.8, gamma=0, lambda=0, max_depth=3, n_estimators=100;, score=-5.547 total time=   0.5s
[CV 1/5] END alpha=0, colsample_bylevel=0.8, colsample_bynode=0.8, gamma=0, lambda=0, max_depth=3, n_estimators=150;, score=-5.166 total time=   0.7s
[CV 2/5] END alpha=0, colsample_by

In [6]:
grid_search.best_params_

{'alpha': 0,
 'colsample_bylevel': 1.0,
 'colsample_bynode': 0.9,
 'gamma': 0,
 'lambda': 0.01,
 'max_depth': 3,
 'n_estimators': 100}

In [7]:
grid_search.best_score_

-4.034751583137149

In [8]:
grid_log = pd.DataFrame(grid_search.cv_results_)
grid_log.to_csv('checkpoints/grid_log.csv', index=False, encoding='UTF-8')

In [9]:
import pickle

best_model = XGBRegressor(
    tree_method='gpu_hist',
    objective='reg:absoluteerror',
    **grid_search.best_params_
)

best_model.fit(x_train, y_train)

with open('checkpoints/best_model.xgb', 'wb') as f:
    pickle.dump(best_model, f)

In [10]:
with open('checkpoints/best_model.xgb', 'rb') as f:
    fuck = pickle.load(f)