### Import libraries and load datasets

In [1]:
import pandas as pd

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV, train_test_split

from xgboost import XGBRegressor

In [2]:
x_train = pd.read_csv('feature_drop_na_dengue.csv').drop('Unnamed: 0', axis = 1)
y_train = pd.read_csv('label_drop_na_dengue.csv')['total_cases']

correlated_col = list(x_train.columns)

train_df = x_train.join(y_train)

train_df

Unnamed: 0,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,...,04,05,06,07,08,09,10,11,12,total_cases
0,0.122600,0.103725,0.198483,0.177617,12.42,297.572857,297.742857,292.414286,299.8,295.9,...,True,False,False,False,False,False,False,False,False,4
1,0.169900,0.142175,0.162357,0.155486,22.82,298.211429,298.442857,293.951429,300.9,296.4,...,False,True,False,False,False,False,False,False,False,5
2,0.032250,0.172967,0.157200,0.170843,34.54,298.781429,298.878571,295.434286,300.5,297.3,...,False,True,False,False,False,False,False,False,False,4
3,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,299.228571,295.310000,301.4,297.0,...,False,True,False,False,False,False,False,False,False,3
4,0.196200,0.262200,0.251200,0.247340,7.52,299.518571,299.664286,295.821429,301.9,297.5,...,False,True,False,False,False,False,False,False,False,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1194,0.342750,0.318900,0.256343,0.292514,55.30,299.334286,300.771429,296.825714,309.7,294.5,...,False,True,False,False,False,False,False,False,False,5
1195,0.160157,0.160371,0.136043,0.225657,86.47,298.330000,299.392857,296.452857,308.5,291.9,...,False,False,True,False,False,False,False,False,False,8
1196,0.247057,0.146057,0.250357,0.233714,58.94,296.598571,297.592857,295.501429,305.5,292.4,...,False,False,True,False,False,False,False,False,False,1
1197,0.333914,0.245771,0.278886,0.325486,59.67,296.345714,297.521429,295.324286,306.1,291.9,...,False,False,True,False,False,False,False,False,False,1


### Split train set and test set

In [3]:
X_train, X_test, y_train, y_test = train_test_split(x_train, y_train, test_size = 0.3, random_state = 13)

X_train

Unnamed: 0,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,...,03,04,05,06,07,08,09,10,11,12
199,0.089550,0.079700,0.181714,0.155986,20.16,299.828571,299.900000,295.892857,301.4,298.4,...,False,False,False,True,False,False,False,False,False,False
737,0.295586,0.295683,0.312214,0.265929,23.12,300.802857,301.935714,290.635714,312.8,291.5,...,False,False,False,False,False,False,True,False,False,False
978,0.307257,0.278214,0.354829,0.286957,73.97,299.578571,301.192857,295.768571,310.5,291.9,...,False,False,False,False,False,False,False,True,False,False
286,0.008933,0.046675,0.141100,0.158571,57.65,299.840000,299.992857,296.572857,301.9,298.1,...,False,False,False,False,False,True,False,False,False,False
294,-0.015150,0.083100,0.201867,0.210733,99.33,299.478571,299.435714,296.435714,302.9,297.6,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
866,0.245429,0.184700,0.261457,0.268886,84.16,294.922857,295.242857,294.567143,301.5,290.1,...,False,False,True,False,False,False,False,False,False,False
742,0.223533,0.133914,0.349800,0.100917,31.10,298.474286,300.285714,294.592857,309.2,291.9,...,False,False,False,False,False,False,False,True,False,False
74,0.053050,0.084050,0.141771,0.134514,0.00,297.947143,298.150000,293.708571,300.4,295.5,...,True,False,False,False,False,False,False,False,False,False
176,0.183000,0.265800,0.196729,0.193900,2.79,299.977143,300.128571,295.237143,301.5,298.5,...,False,False,False,False,True,False,False,False,False,False


In [4]:
pd.DataFrame(y_train)

Unnamed: 0,total_cases
199,7
737,1
978,3
286,10
294,56
...,...
866,1
742,1
74,37
176,80


### XGBoost Training - Param_grid

In [5]:
model = XGBRegressor(objective = 'reg:squarederror')

param_grid = {
    'max_depth': [6, 7, 8],
    'learning_rate': [0.1, 0.05, 0.025],
    'n_estimators': [50, 55, 60],
    'subsample': [0.9], 
    'colsample_bytree': [0.6], 
    'colsample_bylevel': [0.1], 
    'min_child_weight': [5], 
    'reg_alpha': [0.1]
}

grid_search = GridSearchCV(model, 
                           param_grid,
                           scoring = 'neg_mean_absolute_error',
                           cv = 5)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

best_params

{'colsample_bylevel': 0.1,
 'colsample_bytree': 0.6,
 'learning_rate': 0.05,
 'max_depth': 8,
 'min_child_weight': 5,
 'n_estimators': 60,
 'reg_alpha': 0.1,
 'subsample': 0.9}

In [9]:
best_model = XGBRegressor(n_estimators = 55,
                          max_depth = 7,
                          learning_rate = 0.05 ,
                          subsample = 0.9, 
                          colsample_bytree = 0.6, 
                          colsample_bylevel = 0.1, 
                          min_child_weight = 5, 
                          reg_alpha = 0.1) 

best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)

mae

13.990789610147477

'''
{'colsample_bylevel': 0.1,
 'colsample_bytree': 0.6,
 'learning_rate': 0.1,
 'max_depth': 5,
 'min_child_weight': 5,
 'n_estimators': 50,
 'reg_alpha': 0.1,
 'subsample': 0.9}

 mae: 14.144283301300472
'''

'''
{'colsample_bylevel': 0.1,
 'colsample_bytree': 0.6,
 'learning_rate': 0.05,
 'max_depth': 7,
 'min_child_weight': 5,
 'n_estimators': 55,
 'reg_alpha': 0.1,
 'subsample': 0.9}

 mae: 13.990789610147477
'''

'''
{'colsample_bylevel': 0.1,
 'colsample_bytree': 0.6,
 'learning_rate': 0.05,
 'max_depth': 8,
 'min_child_weight': 5,
 'n_estimators': 60,
 'reg_alpha': 0.1,
 'subsample': 0.9}

mae: 14.500434876812829
'''