# Model Template

In [1]:
# import necessary packages

import pandas as pd
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from xgboost import XGBRegressor

  from pandas import MultiIndex, Int64Index


In [2]:
# read data

train_df = pd.read_csv('data/preprocessed_data/train.csv')
val_df = pd.read_csv('data/preprocessed_data/validation.csv')
test_df = pd.read_csv('data/preprocessed_data/test.csv')
corona_df = pd.read_csv('data/preprocessed_data/corona.csv')

In [3]:
# split data

X_train = train_df.drop(columns='count')
y_train = train_df['count']

X_val = val_df.drop(columns='count')
y_val = val_df['count']

X_test = test_df.drop(columns='count')
y_test = test_df['count']

X_corona = corona_df.drop(columns='count')
y_corona = corona_df['count']

In [4]:
# hyperparameter tuning

try:
    hyperparameters_df = pd.read_csv('data/hyperparameter_tuning/xgboost.csv')
    
except FileNotFoundError:
    
    # df containing hyperparameters and evaluation metrics of each run
    hyperparameters_df = pd.DataFrame()
    
    # this function is used by optuna to tune the hyperparameters
    def objective(trial):
        # define hyperparameters
        colsample_bynode = trial.suggest_float('colsample_bynode', 0.01, 1)
        max_depth = trial.suggest_int('max_depth', 1, 10)
        subsample = trial.suggest_float('subsample', 0.01, 1)
        n_estimators = trial.suggest_int('n_estimators', 1, 200)
        gamma = trial.suggest_float('gamma', 0.0, 20.0)
        learning_rate = trial.suggest_float('learning_rate', 0.01, 1)
        
        # setup and train model
        xgb_reg = XGBRegressor(
            colsample_bynode=colsample_bynode,            
            max_depth=max_depth,
            subsample=subsample,
            n_estimators=n_estimators,
            gamma=gamma,
            learning_rate=learning_rate,
        )
        xgb_reg.fit(X_train, y_train)
        
        # make predictions
        y_val_pred = xgb_reg.predict(X_val)
        
        # evaluate predictions
        r_squared = r2_score(y_val, y_val_pred)
        rmse = mean_squared_error(y_val, y_val_pred) ** 0.5
        
        # insert results in dataframe
        global hyperparameters_df
        hyperparameters_df = hyperparameters_df.append(
            {'colsample_bynode': colsample_bynode,
             'max_depth': max_depth,
             'subsample': subsample,
             'n_estimators': n_estimators,
             'gamma': gamma,
             'learning_rate': learning_rate,
             'r_squared': r_squared,
             'rmse': rmse},
            ignore_index=True
        )
        
        # return rmse -> optuna will optimize rmse
        return rmse
        
        
    study = optuna.create_study()
    # start optimization
    study.optimize(objective, n_trials=25)
    
    # convert to correct data types
    hyperparameters_df[['n_estimators', 'max_depth']] = hyperparameters_df[['n_estimators', 'max_depth']].astype('int')
    
    # sort hyperparameter tuning results and save file
    hyperparameters_df = hyperparameters_df.sort_values('rmse', ascending=True)
    hyperparameters_df = hyperparameters_df.reset_index(drop=True)
    hyperparameters_df.to_csv('data/hyperparameter_tuning/xgboost.csv', index=False)

[32m[I 2022-12-02 16:52:42,098][0m A new study created in memory with name: no-name-349939d7-288c-4c7e-9491-4fa92306d59f[0m
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  hyperparameters_df = hyperparameters_df.append(
[32m[I 2022-12-02 16:52:44,803][0m Trial 0 finished with value: 20.174821647496323 and parameters: {'colsample_bynode': 0.7868170534135648, 'max_depth': 4, 'subsample': 0.5810115307370967, 'n_estimators': 6, 'gamma': 7.422409918349329, 'learning_rate': 0.3990675546102253}. Best is trial 0 with value: 20.174821647496323.[0m
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  hyperparameters_df = hyperparameters_df.append(
[32m[I 2022-12-02 16:52:52,488][0m Trial 1 finished with value: 19.61215565885372 and parameters: {'colsample_bynode': 0.24109849244152065, 'max_depth': 2, 'subsample': 0.06572468505129272, 'n_estimators': 103, 'gamma': 11.964609075650905, 'learning_rate': 0.3763460548938589}. Best is trial 1 with value: 19.6121

In [5]:
hyperparameters_df.head()

Unnamed: 0,colsample_bynode,max_depth,subsample,n_estimators,gamma,learning_rate,r_squared,rmse
0,0.437476,9,0.873376,194,17.070415,0.305039,0.76617,12.690887
1,0.656197,9,0.864334,199,14.695009,0.1654,0.762414,12.792402
2,0.266017,9,0.767387,158,13.87609,0.207275,0.760452,12.845108
3,0.382525,8,0.816739,170,15.633954,0.183202,0.759579,12.868489
4,0.84288,9,0.711444,181,18.12939,0.120066,0.756722,12.944731


In [6]:
# build final model on test and validation data

X_final_train = pd.concat([X_train, X_val], ignore_index=True)
y_final_train = pd.concat([y_train, y_val], ignore_index=True)

In [7]:
# final model evaluation

# build and train model using the most successful hyperparameters
xgb_reg = XGBRegressor(
    colsample_bynode=hyperparameters_df.loc[0, 'colsample_bynode'],            
    max_depth=hyperparameters_df.loc[0, 'max_depth'],
    subsample=hyperparameters_df.loc[0, 'subsample'],
    n_estimators=hyperparameters_df.loc[0, 'n_estimators'],
    gamma=hyperparameters_df.loc[0, 'gamma'],
    learning_rate=hyperparameters_df.loc[0, 'learning_rate'],
)
xgb_reg.fit(X_final_train, y_final_train)

# make predictions
y_test_pred = xgb_reg.predict(X_test)

# evaluate predictions
r_squared = r2_score(y_test, y_test_pred)
rmse = mean_squared_error(y_test, y_test_pred) ** 0.5

print(f'R^2:\t{r_squared}')
print(f'RMSE:\t{rmse}')

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


R^2:	0.7350558292809322
RMSE:	13.688458311399897


In [8]:
# predictions on corona year 2020

# make predictions
y_corona_pred = xgb_reg.predict(X_corona)

# evaluate predictions
r_squared_c = r2_score(y_corona, y_corona_pred)
rmse_c = mean_squared_error(y_corona, y_corona_pred) ** 0.5

print(f'R^2:\t{r_squared_c}')
print(f'RMSE:\t{rmse_c}')

R^2:	-0.5426717375688861
RMSE:	19.444180259435623
