# Model Template

In [1]:
# import necessary packages

import pandas as pd
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from xgboost import XGBRegressor

  from pandas import MultiIndex, Int64Index


In [2]:
# read data

train_df = pd.read_csv('data/preprocessed_data/train.csv')
val_df = pd.read_csv('data/preprocessed_data/validation.csv')
test_df = pd.read_csv('data/preprocessed_data/test.csv')
corona_df = pd.read_csv('data/preprocessed_data/corona.csv')

In [11]:
# split data

X_train = train_df.drop(columns='count')
y_train = train_df['count']

X_val = val_df.drop(columns='count')
y_val = val_df['count']

X_test = test_df.drop(columns='count')
y_test = test_df['count']

X_corona = corona_df.drop(columns='count')
y_corona = corona_df['count']

In [4]:
# hyperparameter tuning

try:
    hyperparameters_df = pd.read_csv('data/hyperparameter_tuning/xgboost.csv')
    
except FileNotFoundError:
    
    # df containing hyperparameters and evaluation metrics of each run
    hyperparameters_df = pd.DataFrame()
    
    # this function is used by optuna to tune the hyperparameters
    def objective(trial):
        # define hyperparameters
        colsample_bynode = trial.suggest_int('colsample_bynode', 0.01, 1)
        max_depth = trial.suggest_int('max_depth', 1, 10)
        subsample = trial.suggest_float('subsample', 0.01, 1)
        n_estimators = trial.suggest_int('n_estimators', 1, 200)
        gamma = trial.suggest_float('gamma', 0.0, 20.0)
        learning_rate = trial.suggest_float('learning_rate', 0.01, 1)
        
        # setup and train model
        xgb_reg = XGBRegressor(
            colsample_bynode=colsample_bynode,            
            max_depth=max_depth,
            subsample=subsample,
            n_estimators=n_estimators,
            gamma=gamma,
            learning_rate=learning_rate,
        )
        xgb_reg.fit(X_train, y_train)
        
        # make predictions
        y_val_pred = xgb_reg.predict(X_val)
        
        # evaluate predictions
        r_squared = r2_score(y_val, y_val_pred)
        rmse = mean_squared_error(y_val, y_val_pred) ** 0.5
        
        # insert results in dataframe
        global hyperparameters_df
        hyperparameters_df = hyperparameters_df.append(
            {'colsample_bynode': colsample_bynode,
             'max_depth': max_depth,
             'subsample': subsample,
             'n_estimators': n_estimators,
             'gamma': gamma,
             'learning_rate': learning_rate,
             'r_squared': r_squared,
             'rmse': rmse},
            ignore_index=True
        )
        
        # return rmse -> optuna will optimize rmse
        return rmse
        
        
    study = optuna.create_study()
    # start optimization
    study.optimize(objective, n_trials=25)
    
    # convert to correct data types
    hyperparameters_df[['n_estimators', 'max_depth']] = hyperparameters_df[['n_estimators', 'max_depth']].astype('int')
    
    # sort hyperparameter tuning results and save file
    hyperparameters_df = hyperparameters_df.sort_values('rmse', ascending=True)
    hyperparameters_df = hyperparameters_df.reset_index(drop=True)
    hyperparameters_df.to_csv('data/hyperparameter_tuning/xgboost.csv', index=False)

[32m[I 2022-11-22 17:48:26,025][0m A new study created in memory with name: no-name-706bd1ce-c52b-4593-9795-58cf5cdd19c0[0m
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  hyperparameters_df = hyperparameters_df.append(
[32m[I 2022-11-22 17:50:29,795][0m Trial 0 finished with value: 14.343647414551873 and parameters: {'colsample_bynode': 1, 'max_depth': 9, 'subsample': 0.4749635281737962, 'n_estimators': 184, 'gamma': 1.9017305764079606, 'learning_rate': 0.5452424918327863}. Best is trial 0 with value: 14.343647414551873.[0m
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  hyperparameters_df = hyperparameters_df.append(
[32m[I 2022-11-22 17:50:44,980][0m Trial 1 finished with value: 23.243219387083872 and parameters: {'colsample_bynode': 0, 'max_depth': 6, 'subsample': 0.1620419107273263, 'n_estimators': 118, 'gamma': 0.8748138452033882, 'learning_rate': 0.011459510952873368}. Best is trial 0 with value: 14.343647414551873.[0m
  elif isinst

In [5]:
hyperparameters_df.head()

Unnamed: 0,colsample_bynode,max_depth,subsample,n_estimators,gamma,learning_rate,r_squared,rmse
0,1.0,8,0.585814,174,10.105027,0.211314,0.747358,13.191494
1,1.0,6,0.297326,194,15.037143,0.224621,0.745122,13.249745
2,1.0,6,0.282574,181,15.041692,0.241144,0.743191,13.299841
3,1.0,6,0.296796,180,10.487348,0.258043,0.738742,13.414544
4,1.0,8,0.587636,159,11.051855,0.337772,0.736759,13.465367


In [6]:
# final model evaluation

# build and train model using the most successful hyperparameters
xgb_reg = XGBRegressor(
    colsample_bynode=hyperparameters_df.loc[0, 'colsample_bynode'],            
    max_depth=hyperparameters_df.loc[0, 'max_depth'],
    subsample=hyperparameters_df.loc[0, 'subsample'],
    n_estimators=hyperparameters_df.loc[0, 'n_estimators'],
    gamma=hyperparameters_df.loc[0, 'gamma'],
    learning_rate=hyperparameters_df.loc[0, 'learning_rate'],
)
xgb_reg.fit(X_train, y_train)

# make predictions
y_test_pred = xgb_reg.predict(X_test)

# evaluate predictions
r_squared = r2_score(y_test, y_test_pred)
rmse = mean_squared_error(y_test, y_test_pred) ** 0.5

print(f'R^2:\t{r_squared}')
print(f'RMSE:\t{rmse}')

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


R^2:	0.659876576921888
RMSE:	15.509422392330048


In [12]:
# predictions on corona year 2020

# make predictions
y_corona_pred = xgb_reg.predict(X_corona)

# evaluate predictions
r_squared_c = r2_score(y_corona, y_corona_pred)
rmse_c = mean_squared_error(y_corona, y_corona_pred) ** 0.5

print(f'R^2:\t{r_squared_c}')
print(f'RMSE:\t{rmse_c}')

R^2:	-0.39035827862116834
RMSE:	18.459343039665292
