In [12]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime 
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [2]:
train_features = pd.read_csv('../data/train_features.csv').drop(['Unnamed: 0'], axis=1)

In [3]:
X = train_features
Y = train_features.severity

X['month'] = pd.DatetimeIndex(X.date).month
X['year'] = pd.DatetimeIndex(X.date).year
X = X.drop(['latitude', 'longitude', 'year', 'uid', 'date', 'split', 'severity', 'density'], axis=1)
X['region'] = X['region'].map({
    'midwest': 0,
    'south': 1,
    'northeast': 2,
    'west': 3
})

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [5]:
gb_param_grid = {
    'n_estimators': [500,1000,3000],
    'learning_rate': [0.05, 0.1, 0.5, 1],
    'max_features': [2,5,7,10,20],
    'min_samples_split': [5,10,15],
    'loss': ['absolute_error', 'squared_error'],
    'subsample': [0.1,0.5,1.0]
}
gb = GradientBoostingRegressor()

In [6]:
gb_grid_search = GridSearchCV(estimator=gb,param_grid=gb_param_grid, n_jobs=-1, verbose=1)

In [7]:
%%time
gb_grid_search.fit(X_train, y_train)

gb_grid_search.best_params_

Fitting 5 folds for each of 1080 candidates, totalling 5400 fits


  (array - array_means[:, np.newaxis]) ** 2, axis=1, weights=weights


CPU times: user 18.3 s, sys: 3.57 s, total: 21.9 s
Wall time: 47min 7s


{'learning_rate': 0.1,
 'loss': 'squared_error',
 'max_features': 10,
 'min_samples_split': 5,
 'n_estimators': 500,
 'subsample': 1.0}

In [8]:
y_pred = gb_grid_search.best_estimator_.predict(X_test)

In [9]:
metrics.root_mean_squared_error(y_test, y_pred)

0.7138047385025224

In [10]:
y_pred = gb_grid_search.best_estimator_.predict(X_test)
y_pred = np.round(y_pred)

In [11]:
metrics.root_mean_squared_error(y_test, y_pred)

0.7634734760784071

In [16]:
gbc_param_grid = {
    'n_estimators': [500,1000,3000],
    'learning_rate': [0.05, 0.1, 0.5, 1],
    'max_features': [2,5,7,10,20],
    'min_samples_split': [5,10,15],
    'loss': ['log_loss', 'exponential'],
    'subsample': [0.1,0.5,1.0]
}
gbc = GradientBoostingClassifier()

In [17]:
gbc_grid_search = GridSearchCV(estimator=gbc,param_grid=gbc_param_grid, n_jobs=-1, verbose=1)

In [18]:
%%time
gbc_grid_search.fit(X_train, y_train)

gbc_grid_search.best_params_

Fitting 5 folds for each of 1080 candidates, totalling 5400 fits


2700 fits failed out of a total of 5400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2700 fits failed with the following error:
Traceback (most recent call last):
  File "/home/misa/Documents/Master/Global Project/iebs-mdsbd-global-project/venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/misa/Documents/Master/Global Project/iebs-mdsbd-global-project/venv/lib/python3.10/site-packages/sklearn/base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/misa/Documents/Master/Global Project/iebs-mdsbd-global-project/venv/lib/python3.10/site-packages/sklearn/ensemble/_gb.py", line 673, in

CPU times: user 26.4 s, sys: 8.02 s, total: 34.4 s
Wall time: 1h 59min 21s


{'learning_rate': 0.05,
 'loss': 'log_loss',
 'max_features': 2,
 'min_samples_split': 15,
 'n_estimators': 500,
 'subsample': 1.0}

In [21]:
y_pred = gbc_grid_search.best_estimator_.predict(X_test)

In [22]:
metrics.root_mean_squared_error(y_test, y_pred)

0.8462411880619172