# Model Template

In [1]:
# import necessary packages

import pandas as pd
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from xgboost import XGBRegressor

  from pandas import MultiIndex, Int64Index


In [2]:
# read data

train_df = pd.read_csv('data/preprocessed_data/train.csv')
val_df = pd.read_csv('data/preprocessed_data/validation.csv')
test_df = pd.read_csv('data/preprocessed_data/test.csv')

In [3]:
# split data

X_train = train_df.drop(columns='count')
y_train = train_df['count']

X_val = val_df.drop(columns='count')
y_val = val_df['count']

X_test = test_df.drop(columns='count')
y_test = test_df['count']

In [4]:
# hyperparameter tuning

try:
    hyperparameters_df = pd.read_csv('data/hyperparameter_tuning/xgboost.csv')
    
except FileNotFoundError:
    
    # df containing hyperparameters and evaluation metrics of each run
    hyperparameters_df = pd.DataFrame()
    
    # this function is used by optuna to tune the hyperparameters
    def objective(trial):
        # define hyperparameters
        colsample_bynode = trial.suggest_int('colsample_bynode', 0.01, 1)
        max_depth = trial.suggest_int('max_depth', 1, 10)
        subsample = trial.suggest_float('subsample', 0.01, 1)
        n_estimators = trial.suggest_int('n_estimators', 1, 200)
        gamma = trial.suggest_float('gamma', 0.0, 20.0)
        learning_rate = trial.suggest_float('learning_rate', 0.01, 1)
        
        # setup and train model
        xgb_reg = XGBRegressor(
            colsample_bynode=colsample_bynode,            
            max_depth=max_depth,
            subsample=subsample,
            n_estimators=n_estimators,
            gamma=gamma,
            learning_rate=learning_rate,
        )
        xgb_reg.fit(X_train, y_train)
        
        # make predictions
        y_val_pred = xgb_reg.predict(X_val)
        
        # evaluate predictions
        r_squared = r2_score(y_val, y_val_pred)
        rmse = mean_squared_error(y_val, y_val_pred) ** 0.5
        
        # insert results in dataframe
        global hyperparameters_df
        hyperparameters_df = hyperparameters_df.append(
            {'colsample_bynode': colsample_bynode,
             'max_depth': max_depth,
             'subsample': subsample,
             'n_estimators': n_estimators,
             'gamma': gamma,
             'learning_rate': learning_rate,
             'r_squared': r_squared,
             'rmse': rmse},
            ignore_index=True
        )
        
        # return rmse -> optuna will optimize rmse
        return rmse
        
        
    study = optuna.create_study()
    # start optimization
    study.optimize(objective, n_trials=25)
    
    # convert to correct data types
    hyperparameters_df[['n_estimators', 'max_depth']] = hyperparameters_df[['n_estimators', 'max_depth']].astype('int')
    
    # sort hyperparameter tuning results and save file
    hyperparameters_df = hyperparameters_df.sort_values('rmse', ascending=True)
    hyperparameters_df = hyperparameters_df.reset_index(drop=True)
    hyperparameters_df.to_csv('data/hyperparameter_tuning/xgboost.csv', index=False)

[32m[I 2022-11-17 11:42:05,925][0m A new study created in memory with name: no-name-6a76784f-f9ad-4eb3-882b-d51424e4b997[0m
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  hyperparameters_df = hyperparameters_df.append(
[32m[I 2022-11-17 11:42:16,121][0m Trial 0 finished with value: 16.865554903998444 and parameters: {'colsample_bynode': 0, 'max_depth': 6, 'subsample': 0.23350782801764844, 'n_estimators': 92, 'gamma': 12.892708781040076, 'learning_rate': 0.874319913798563}. Best is trial 0 with value: 16.865554903998444.[0m
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  hyperparameters_df = hyperparameters_df.append(
[32m[I 2022-11-17 11:42:20,625][0m Trial 1 finished with value: 20.959888668729484 and parameters: {'colsample_bynode': 0, 'max_depth': 1, 'subsample': 0.30772546116742644, 'n_estimators': 72, 'gamma': 18.083413185843632, 'learning_rate': 0.7610463219799021}. Best is trial 0 with value: 16.865554903998444.[0m
  elif isinstanc

In [5]:
hyperparameters_df.head()

Unnamed: 0,colsample_bynode,max_depth,subsample,n_estimators,gamma,learning_rate,r_squared,rmse
0,1.0,10,0.161061,90,19.64821,0.155192,0.750186,13.105351
1,1.0,10,0.384755,75,14.24344,0.272126,0.748068,13.160794
2,1.0,10,0.199022,89,18.424769,0.161586,0.746712,13.196164
3,1.0,10,0.403315,66,12.335291,0.275075,0.746098,13.212138
4,1.0,10,0.402416,67,12.165942,0.263252,0.744771,13.246625


In [6]:
# final model evaluation

# build and train model using the most successful hyperparameters
xgb_reg = XGBRegressor(
    colsample_bynode=hyperparameters_df.loc[0, 'colsample_bynode'],            
    max_depth=hyperparameters_df.loc[0, 'max_depth'],
    subsample=hyperparameters_df.loc[0, 'subsample'],
    n_estimators=hyperparameters_df.loc[0, 'n_estimators'],
    gamma=hyperparameters_df.loc[0, 'gamma'],
    learning_rate=hyperparameters_df.loc[0, 'learning_rate'],
)
xgb_reg.fit(X_train, y_train)

# make predictions
y_test_pred = xgb_reg.predict(X_test)

# evaluate predictions
r_squared = r2_score(y_test, y_test_pred)
rmse = mean_squared_error(y_test, y_test_pred) ** 0.5

print(f'R^2:\t{r_squared}')
print(f'RMSE:\t{rmse}')

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


R^2:	0.6132588950586824
RMSE:	16.498942031344924
