## Gradient Boosting Regression

In [1]:
import os
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import model_selection
from sklearn import preprocessing
from sklearn import metrics

from sklearn import ensemble

In [2]:
# Load the regression dataset from CSV and separate it into the feature
# matrix X and the target vector y.
feature_columns = ['Latitude', 'Longitude', 'Depth', 'Magnitude', 'Distance']
target_column = 'Days-till-eruption'

data_earthquakes = pd.read_csv('data_for_regression.csv')

X = data_earthquakes[feature_columns]
y = data_earthquakes[target_column]

In [3]:
# Hold out 33% of the samples as the test set. The split is seeded so the
# scores printed further down are reproducible after a kernel restart
# (an unseeded split yields a different partition on every run).
X_train_validation, X_test, y_train_validation, y_test = model_selection.train_test_split(
    X, y, test_size=0.33, random_state=1
)

# Standardization: the scaler is fitted on the train/validation part only and
# then applied unchanged to the test part, so no test statistics are used.
scaler = preprocessing.StandardScaler()
X_train_validation = scaler.fit_transform(X_train_validation)
X_test = scaler.transform(X_test)


Using grid search to find the optimal hyperparameters of the model

In [4]:
# Base estimator for the search; seeded so repeated runs give the same scores.
grid_model = ensemble.GradientBoostingRegressor(random_state=1)

# Hyperparameter grid: 3 x 3 x 3 = 27 candidates with n_estimators fixed at 50.
# The loss is deliberately not listed: the old 'ls' alias was removed in
# scikit-learn 1.2, while the estimator's default is the same squared-error
# loss on both old and new releases, so omitting it works everywhere.
params = {
    'n_estimators': [50],
    'max_depth': [49, 50, 51],
    'min_samples_split': [15, 20, 25],
    'learning_rate': [0.09, 0.1, 0.12],
}

# 10-fold cross-validated search, ranked by the r2 score.
grid = model_selection.GridSearchCV(
    grid_model,
    param_grid=params,
    scoring='r2',
    cv=10,
    return_train_score=True,
)
grid.fit(X_train_validation, y_train_validation)
print('Best r2 score: ', grid.best_score_)

Best r2 score:  0.544340946732603


In [5]:
# Report the winning hyperparameter combination found by the grid search.
# The pieces are passed to print() as separate arguments, so they are joined
# with a single space exactly as in a multi-argument print call.
best = grid.best_params_
report = [
    'Best parameters: \n\tn_estimators: ', best['n_estimators'],
    '\n\tmax_depth: ', best['max_depth'],
    '\n\tmin_samples_split:', best['min_samples_split'],
    '\n\tlearning_rate: ', best['learning_rate'],
]
print(*report)


Best parameters: 
	n_estimators:  50 
	max_depth:  51 
	min_samples_split: 15 
	learning_rate:  0.1


Testing the model with the optimal parameters and a larger `n_estimators`

In [6]:
# Refit with the best grid-search hyperparameters, but with 500 boosting
# stages instead of the 50 used during the search.
# The loss argument is omitted on purpose: the 'ls' alias was removed in
# scikit-learn 1.2 and the default is the same squared-error loss.
model = ensemble.GradientBoostingRegressor(
    n_estimators=500,
    max_depth=grid.best_params_['max_depth'],
    min_samples_split=grid.best_params_['min_samples_split'],
    learning_rate=grid.best_params_['learning_rate'],
    random_state=1,
)

model.fit(X_train_validation, y_train_validation)
y_test_predicted = model.predict(X_test)
y_train_predicted = model.predict(X_train_validation)

# RMSE is computed as sqrt(MSE) rather than via mean_squared_error's
# `squared=False` flag, which newer scikit-learn releases no longer accept.
print('Test results')
print('r2_score: ', metrics.r2_score(y_test, y_test_predicted))
print('RMSE: ', np.sqrt(metrics.mean_squared_error(y_test, y_test_predicted)))

# Train metrics are printed next to the test metrics so the gap between the
# two (a sign of overfitting when it is large) is visible at a glance.
print('\n\nTrain results')
print('r2_score: ', metrics.r2_score(y_train_validation, y_train_predicted))
print('RMSE: ', np.sqrt(metrics.mean_squared_error(y_train_validation, y_train_predicted)))

Test results
r2_score:  0.6178474652129669
RMSE:  8.362943068568423


Train results
r2_score:  0.9999999992397545
RMSE:  0.0003745995035900799


Cross-validation of the model with the optimal parameters

In [7]:
# 3-fold shuffled, seeded cross-validation over the full dataset.
kf = model_selection.KFold(n_splits=3, shuffle=True, random_state=1)

# Same configuration as the evaluated model. The loss argument is omitted
# because the 'ls' alias was removed in scikit-learn 1.2; the default is the
# same squared-error loss.
model = ensemble.GradientBoostingRegressor(
    n_estimators=500,
    max_depth=grid.best_params_['max_depth'],
    min_samples_split=grid.best_params_['min_samples_split'],
    learning_rate=grid.best_params_['learning_rate'],
    random_state=1,
)

# NOTE(review): X is passed here without standardization, unlike the
# train/test experiment. Tree-based splits should be insensitive to per-feature
# scaling, so this is presumably harmless - confirm if the estimator changes.
scores = model_selection.cross_val_score(model, X, y, scoring='r2', cv=kf)
print('Cross validation r_2 score: ', scores.mean())

Cross validation r_2 score:  0.6206063486819171


### Saving model

In [3]:
# Train the final model on the full dataset and persist it together with the
# scaler that must be applied to any input before calling predict().

# Standardization. The scaled features get their own name instead of
# overwriting X: re-running this cell must not fit the scaler on data that has
# already been scaled, otherwise the pickled scaler would no longer match the
# raw inputs the model is meant to be used with.
scaler = preprocessing.StandardScaler()
X_scaled = scaler.fit_transform(X)

# Hyperparameters are hard-coded to the values reported by the grid search
# (max_depth=51, min_samples_split=15, learning_rate=0.1) so this cell can run
# without repeating the search. The loss argument is omitted because the 'ls'
# alias was removed in scikit-learn 1.2; the default is the same squared-error
# loss.
model = ensemble.GradientBoostingRegressor(
    n_estimators=500,
    max_depth=51,
    min_samples_split=15,
    learning_rate=0.1,
    random_state=1,
)
model.fit(X_scaled, y)

# Make sure the output directory exists before writing the artifacts.
os.makedirs('models', exist_ok=True)
with open('models/regression.model', 'wb') as pickle_file:
    pickle.dump(model, pickle_file)
with open('models/regression.scaler', 'wb') as pickle_file:
    pickle.dump(scaler, pickle_file)