# Simple Modeling

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, accuracy_score, roc_auc_score
from xgboost import XGBRegressor

In [None]:
# Read the preprocessed training table, then keep only the rows whose
# `dist` value is different from zero.
x_train = pd.read_csv('data/train_preprocessed.csv')
x_train = x_train.loc[x_train['dist'].ne(0)]
# Show the first rows of the filtered frame as the cell output.
x_train.head()

In [None]:
# The regression target is the square root of `ci_hour`. np.sqrt returns a
# new Series, so the column inside x_train is not modified here.
# NOTE(review): a model trained on this target predicts on the sqrt scale;
# predictions presumably need squaring to get back to `ci_hour` units.
y_train = np.sqrt(x_train['ci_hour'])
# Remove the target column so that only the features remain in x_train.
x_train = x_train.drop(columns='ci_hour')

In [None]:
# Candidate hyper-parameter values. The search below is exhaustive, so this
# grid yields 3 * 4 * 3 * 3 * 3 * 3 * 3 = 2916 combinations, each evaluated
# with 5-fold cross-validation (14,580 model fits).
param_grid = {
    'n_estimators': [100, 150, 200],       # number of boosting rounds
    'max_depth': [3, 4, 5, 6],             # maximum depth of each tree
    'colsample_bylevel': [0.8, 0.9, 1.0],  # column subsample ratio per tree level
    'colsample_bynode': [0.8, 0.9, 1.0],   # column subsample ratio per split
    'gamma': [0, 0.2, 0.4],                # minimum loss reduction required to split
    'alpha': [0, 0.1, 0.01],               # L1 regularisation on leaf weights
    'lambda': [0, 0.1, 0.01],              # L2 regularisation on leaf weights
}

# Base estimator: GPU histogram tree builder trained with an absolute-error
# objective, matching the MAE-based scoring used by the search.
# NOTE(review): 'gpu_hist' is deprecated in newer XGBoost releases in favour
# of tree_method='hist' with device='cuda' -- confirm against the installed
# version before changing.
model = XGBRegressor(
    objective='reg:absoluteerror',
    tree_method='gpu_hist',
)

# Exhaustive grid search scored by negated mean absolute error.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
    verbose=3,
)

In [None]:
# Run the search: every combination in param_grid is cross-validated on
# (x_train, y_train). This is the expensive step of the notebook.
grid_search.fit(x_train, y_train)

In [None]:
# Hyper-parameter combination that achieved the best mean cross-validated score.
grid_search.best_params_

In [None]:
# Mean cross-validated score of the best combination. The search uses
# scoring='neg_mean_absolute_error', so this is the negated MAE (closer to
# zero is better), measured on the square-root-transformed target.
grid_search.best_score_

In [None]:
import os

# Keep the complete cross-validation table (one row per hyper-parameter
# combination) so the search can be inspected later without re-running it.
grid_log = pd.DataFrame(grid_search.cv_results_)

# to_csv does not create missing parent directories; make sure the output
# folder exists so the results of the long search are not lost at write time.
os.makedirs('checkpoints', exist_ok=True)
grid_log.to_csv('checkpoints/grid_log.csv', index=False, encoding='UTF-8')

In [None]:
import pickle

import os

# Build a fresh estimator with the winning hyper-parameters from the grid
# search and train it on the full training set.
# NOTE(review): GridSearchCV refits the best configuration by default, so
# grid_search.best_estimator_ should already hold an equivalent model --
# confirm before removing this explicit refit.
best_model = XGBRegressor(
    tree_method='gpu_hist',
    objective='reg:absoluteerror',
    **grid_search.best_params_
)

best_model.fit(x_train, y_train)

# open(..., 'wb') fails if the directory is missing, so create it first.
os.makedirs('checkpoints', exist_ok=True)

# NOTE: despite the .xgb extension this file is a Python pickle, not
# XGBoost's native model format; it has to be read back with pickle.load.
with open('checkpoints/best_model.xgb', 'wb') as f:
    pickle.dump(best_model, f)

In [None]:
# Read the pickled object back from 'checkpoints/best_model.xgb'.
# SECURITY: pickle.load can execute arbitrary code while unpickling; only
# load checkpoint files that were produced by this notebook.
# NOTE(review): the target variable name is a throwaway placeholder; rename
# it to something descriptive once it is confirmed no later cell uses it.
with open('checkpoints/best_model.xgb', 'rb') as f:
    fuck = pickle.load(f)