# Xgboost implementation

In [89]:
import pandas as pd
from sklearn.experimental import enable_halving_search_cv 
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit, HalvingGridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
import numpy as np
import matplotlib.pyplot as plt

In [90]:
df = pd.read_csv('../clean_data.csv')

In [91]:
df.head(5)

Unnamed: 0,cluster_nl_encode,che_perc_gdp,cluster_nl,date,insurance_perc_che,price_month,price_unit,public_perc_che,target
0,0.543405,1.665879,BRAND_354E_COUNTRY_88A3,2014-06-01,1.893333,1.006444,1.013784,1.835821,1.000784
1,0.278369,1.689348,BRAND_626D_COUNTRY_8B47,2014-06-01,1.495874,1.120724,1.626677,1.779263,1.0
2,0.424518,1.665879,BRAND_45D9_COUNTRY_88A3,2014-06-01,1.893333,1.120724,3.144874,1.835821,1.002258
3,0.844776,2.05177,BRAND_D724_COUNTRY_445D,2014-06-01,1.0,1.120724,1.213446,1.80597,1.068761
4,0.004719,2.05913,BRAND_4887_COUNTRY_D8B0,2014-06-01,2.013333,1.018589,1.008708,1.880597,1.036312


In [92]:
df = df.drop(['cluster_nl'], axis=1)

In [93]:
cutoff_date = pd.to_datetime('2018-01-01')
df['date'] = pd.to_datetime(df['date'])
df['month'] = df['date'].dt.month
cols = df.columns.tolist()
cols = cols[-1:] + cols[:-1]
df = df[cols]
# cutoff_date = pd.to_datetime('2022-12-01')
train_df = df[df['date'] < cutoff_date]
test_df = df[df['date'] >= cutoff_date]

In [95]:
test_df

Unnamed: 0,month,cluster_nl_encode,che_perc_gdp,date,insurance_perc_che,price_month,price_unit,public_perc_che,target
24100,1,0.796690,1.873889,2018-01-01,1.000000,1.002274,1.006640,1.373134,1.020811
24101,1,0.189641,1.808255,2018-01-01,1.495874,2.284708,1.499938,2.000000,1.019474
24102,1,0.045014,1.923538,2018-01-01,2.000000,1.942202,1.290017,2.000000,1.446992
24103,1,0.637324,1.652744,2018-01-01,1.495874,1.928734,1.002809,1.850746,1.266316
24104,1,0.273154,1.878850,2018-01-01,1.495874,1.139277,1.259008,2.014925,1.318998
...,...,...,...,...,...,...,...,...,...
118826,12,0.684667,2.058055,2022-12-01,1.495874,1.054007,1.100336,2.029851,1.203657
118827,12,0.254677,1.819485,2022-12-01,1.173333,1.008317,1.029630,1.955224,1.109272
118828,12,0.096464,1.491552,2022-12-01,1.826667,1.017259,1.018310,1.926795,1.343341
118829,12,0.785380,2.020277,2022-12-01,1.495874,1.960978,2.490911,1.985847,1.266831


In [96]:
## Drop date for now
train_df = train_df.drop('date', axis=1)
test_df = test_df.drop('date', axis=1)

In [97]:
train_df

Unnamed: 0,month,cluster_nl_encode,che_perc_gdp,insurance_perc_che,price_month,price_unit,public_perc_che,target
0,6,0.543405,1.665879,1.893333,1.006444,1.013784,1.835821,1.000784
1,6,0.278369,1.689348,1.495874,1.120724,1.626677,1.779263,1.000000
2,6,0.424518,1.665879,1.893333,1.120724,3.144874,1.835821,1.002258
3,6,0.844776,2.051770,1.000000,1.120724,1.213446,1.805970,1.068761
4,6,0.004719,2.059130,2.013333,1.018589,1.008708,1.880597,1.036312
...,...,...,...,...,...,...,...,...
24095,12,0.661481,1.807886,1.986667,1.120724,2.568833,1.716418,1.345190
24096,12,0.805451,1.875446,2.026667,1.317197,1.212507,1.865672,1.169581
24097,12,0.500614,1.250370,1.360000,1.223164,1.361424,1.507463,1.044899
24098,12,0.708326,1.235157,1.360000,1.023726,1.016905,1.671642,1.004769


In [98]:
train_x = train_df.iloc[:,:-1]
train_y = train_df.iloc[:, -1]

test_x = test_df.iloc[:, :-1]
test_y = test_df.iloc[:, -1]

In [99]:
test_y

24100     1.020811
24101     1.019474
24102     1.446992
24103     1.266316
24104     1.318998
            ...   
118826    1.203657
118827    1.109272
118828    1.343341
118829    1.266831
118830    1.001763
Name: target, Length: 94731, dtype: float64

In [100]:
n_split = 5
tscv = TimeSeriesSplit(n_splits=n_split)

In [101]:
parameters = { 'objective':['reg:pseudohubererror', 'reg:squarederror'],
              'learning_rate': [.01, 0.05, .1], #so called `eta` value
              'max_depth': [3, 5, 6, 7],
              'min_child_weight': [1, 3, 5],
              'subsample': [0.8, 1.0],
              'colsample_bytree': [0.8, 1.0],
              'n_estimators': [500]}

In [102]:
modelcv = XGBRegressor()

In [103]:
model_grid = HalvingGridSearchCV(
    modelcv, 
    parameters, 
    cv = tscv, 
    scoring = 'r2', 
    n_jobs=-1,
    return_train_score=True)

In [104]:
model_grid.fit(train_x, train_y)

  _data = np.array(data, dtype=dtype, copy=copy,


In [105]:
model_grid.best_params_

{'colsample_bytree': 0.8,
 'learning_rate': 0.01,
 'max_depth': 7,
 'min_child_weight': 5,
 'n_estimators': 500,
 'objective': 'reg:squarederror',
 'subsample': 1.0}

In [106]:
best_model = model_grid.best_estimator_

In [107]:
pred = best_model.predict(test_x)

In [110]:
rmse = mean_squared_error(test_y, pred)
r2 = r2_score(test_y, pred)
print("RMSE is %f" %(rmse))
print("R2 score is %f" %(r2))

RMSE is 1.368178
R2 score is 0.136884
