# Simple Modeling

In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, accuracy_score, roc_auc_score
from xgboost import XGBRegressor

In [2]:
# Load the preprocessed train/test tables in one pass; both go through the same reader.
x_train, x_test = map(
    pd.read_csv,
    ('data/train_preprocessed.csv', 'data/test_preprocessed.csv'),
)

# Preview the first rows of the training table.
x_train.head()

Unnamed: 0,SHIP_TYPE_CATEGORY,DIST,ATA,BREADTH,BUILT,DEADWEIGHT,DEPTH,DRAUGHT,SHIPMANAGER,FLAG,...,V_WIND,AIR_TEMPERATURE,BN,ATA_LT,DUBAI,BDI_ADJ,PORT_SIZE,CI_HOUR,ARI,deadweight_group
0,0,30.736578,2020-10-15 04:03:00,30.0,28,73100,20.0,10.0,CHMT36,"China, People's Republic Of",...,3.77,15.9,2.730798,12,42.01,1407.66833,0.00166,3.048333,CNEKP8,0
1,2,63.220425,2019-09-17 02:55:00,30.0,15,37900,20.0,10.0,CUFV52,Singapore,...,-6.72,24.5,4.289058,10,67.53,2089.046774,0.001614,17.138611,CNEUC8,0
2,2,90.427421,2019-02-23 06:43:00,50.0,7,115000,20.0,10.0,ISIG88,Liberia,...,0.0,9.4,0.0,14,65.3,603.193047,0.001743,98.8275,CNNGG6,1
3,1,0.0,2020-09-18 22:06:00,10.0,33,1490,10.0,0.0,MWFU27,Nauru,...,-7.31,22.1,4.693735,7,43.02,1169.853455,6.9e-05,0.0,JPTMR7,0
4,2,8.813725,2022-08-13 12:57:00,30.0,10,27600,10.0,10.0,OXZY41,Panama,...,2.31,22.8,2.345875,14,90.45,1107.944894,0.000197,96.030556,RUNNC2,0


In [3]:
# Columns removed from both tables before modeling.
drop_columns = ['ATA', 'FLAG', 'SHIPMANAGER', 'U_WIND', 'V_WIND', 'AIR_TEMPERATURE', 'BN', 'ARI']

# Rebind to the reduced frames instead of mutating in place; the resulting tables are the same.
x_train = x_train.drop(drop_columns, axis='columns')
x_test = x_test.drop(drop_columns, axis='columns')

# Export the reduced test table (row index omitted).
x_test.to_csv('data/test_4xgb.csv', index=False, encoding='UTF-8')

In [4]:
# Exclude rows whose DIST is exactly 0 from training.
# NOTE(review): the rationale for this filter is not recorded in the notebook - confirm and document it.
train_rows = x_train.loc[x_train.DIST != 0]

# Target is CI_HOUR on a log scale. log1p(y) == log(1 + y) but is more accurate for values
# near zero; predictions must be mapped back with np.expm1.
y_train = np.log1p(train_rows['CI_HOUR'])

# Build the feature table with a non-mutating drop: an in-place drop on a frame derived
# from a boolean filter is the classic SettingWithCopyWarning pattern.
x_train = train_rows.drop(columns=['CI_HOUR'])

# Sanity check: train and test must expose the same features in the same order
# (comparing only the column count would not catch a misnamed or reordered column).
list(x_train.columns) == list(x_test.columns)

True

In [5]:
# Value lists shared by more than one hyper-parameter.
colsample_choices = [0.8, 0.9, 1.0]
penalty_choices = [0, 0.1, 0.01]

# Exhaustive search space: 2 * 3 * 3 * 3 * 3 * 3 * 3 = 1458 combinations.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 6, 7],
    'colsample_bylevel': colsample_choices,
    'colsample_bynode': colsample_choices,
    'gamma': [0, 0.2, 0.4],
    'alpha': penalty_choices,
    'lambda': penalty_choices,
}

# Base estimator: absolute-error objective, trained with the 'gpu_hist' tree method.
model = XGBRegressor(objective='reg:absoluteerror', tree_method='gpu_hist')

# 5-fold CV scored by negated MAE (higher is better); verbose=3 logs every single fit.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
    verbose=3,
)

In [6]:
# Run the exhaustive search over param_grid on the training features and log-scale target.
grid_search.fit(X=x_train, y=y_train)

Fitting 5 folds for each of 1458 candidates, totalling 7290 fits
[CV 1/5] END alpha=0, colsample_bylevel=0.8, colsample_bynode=0.8, gamma=0, lambda=0, max_depth=5, n_estimators=100;, score=-0.916 total time=   0.8s
[CV 2/5] END alpha=0, colsample_bylevel=0.8, colsample_bynode=0.8, gamma=0, lambda=0, max_depth=5, n_estimators=100;, score=-0.917 total time=   0.6s
[CV 3/5] END alpha=0, colsample_bylevel=0.8, colsample_bynode=0.8, gamma=0, lambda=0, max_depth=5, n_estimators=100;, score=-0.921 total time=   0.6s
[CV 4/5] END alpha=0, colsample_bylevel=0.8, colsample_bynode=0.8, gamma=0, lambda=0, max_depth=5, n_estimators=100;, score=-0.912 total time=   0.6s
[CV 5/5] END alpha=0, colsample_bylevel=0.8, colsample_bynode=0.8, gamma=0, lambda=0, max_depth=5, n_estimators=100;, score=-0.914 total time=   0.6s
[CV 1/5] END alpha=0, colsample_bylevel=0.8, colsample_bynode=0.8, gamma=0, lambda=0, max_depth=5, n_estimators=200;, score=-0.912 total time=   1.2s
[CV 2/5] END alpha=0, colsample_byl

In [7]:
# Display the hyper-parameter combination that obtained the best cross-validated score.
grid_search.best_params_

{'alpha': 0,
 'colsample_bylevel': 0.8,
 'colsample_bynode': 1.0,
 'gamma': 0,
 'lambda': 0,
 'max_depth': 5,
 'n_estimators': 200}

In [8]:
# Display the best mean cross-validated score. Scoring is 'neg_mean_absolute_error', so the
# value is a negated MAE, and it is measured on the log-transformed target, not in raw CI_HOUR units.
grid_search.best_score_

-0.9116761366646765

In [9]:
# Make sure the output directory exists so a fresh checkout does not fail on the write below.
os.makedirs('checkpoints', exist_ok=True)

# Persist the complete cross-validation results table (one row per candidate) for later inspection.
grid_log = pd.DataFrame(grid_search.cv_results_)
grid_log.to_csv('checkpoints/grid_log.csv', index=False, encoding='UTF-8')

In [10]:
# GridSearchCV was created with its default refit=True, so after fit() `best_estimator_`
# is already a copy of the base estimator with `best_params_` applied and retrained on the
# whole of x_train / y_train. Reusing it avoids a redundant training run and avoids
# re-stating the estimator settings here, where they could drift from the search setup.
best_model = grid_search.best_estimator_

# Make sure the output directory exists before writing the checkpoint.
os.makedirs('checkpoints', exist_ok=True)

# NOTE(review): pickle ties this file to the installed Python/xgboost versions; consider
# XGBoost's own save_model() if the checkpoint needs to outlive this environment.
with open('checkpoints/best_model.xgb', 'wb') as f:
    pickle.dump(best_model, f)

In [11]:
# Round-trip check: read the pickled model back from the checkpoint written by this notebook.
# Only unpickle files you produced yourself - pickle.load can execute arbitrary code.
# NOTE(review): give this variable a descriptive, professional name (e.g. `loaded_model`)
# once any later cells that reference it are updated together.
with open('checkpoints/best_model.xgb', mode='rb') as checkpoint_file:
    fuck = pickle.load(checkpoint_file)