# Modeling
## GradientBoostingRegressor

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [2]:
X_train = pd.read_csv('data/tt_sets/cats_X_train.csv')
y_train = pd.read_csv('data/tt_sets/cats_y_train.csv')
X_test = pd.read_csv('data/tt_sets/cats_X_test.csv')
y_test = pd.read_csv('data/tt_sets/cats_y_test.csv')

In [3]:
gb_params = {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 233}
gb = GradientBoostingRegressor(n_estimators=gb_params['n_estimators'], max_depth=gb_params['max_depth'], learning_rate=gb_params['learning_rate'])
gb.fit(X_train, y_train)
y_pred_train = gb.predict(X_train)
score_train = gb.score(X_train, y_train)

  return f(**kwargs)


In [4]:
y_pred = gb.predict(X_test)
score_test = gb.score(X_test, y_test)

In [5]:
print('Score on training set:', round(score_train, 2))
print('Score on test set:', round(score_test, 2))

Score on training set: 0.3
Score on test set: 0.18


In [6]:
comparison = pd.DataFrame(list(zip(y_pred, y_test.values)),
              columns=['Predicted','Actual'])

In [7]:
comparison.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1947 entries, 0 to 1946
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Predicted  1947 non-null   float64
 1   Actual     1947 non-null   object 
dtypes: float64(1), object(1)
memory usage: 30.5+ KB


In [8]:
comparison = comparison.astype('float64')

In [9]:
comparison.describe()

Unnamed: 0,Predicted,Actual
count,1947.0,1947.0
mean,24.748081,24.67891
std,15.540746,35.499379
min,-7.588845,0.004965
25%,14.825157,3.681453
50%,23.511788,10.881238
75%,33.122786,29.40162
max,148.559488,220.761794


In [10]:
comparison.sample(20)

Unnamed: 0,Predicted,Actual
295,15.883155,15.916389
657,10.859235,25.344352
1905,23.082529,15.150313
87,-5.991998,0.278056
1941,17.500912,7.899363
1267,33.468041,5.303171
1376,33.418832,4.199954
1258,16.538357,0.999016
1368,24.989947,19.180891
1007,20.21576,22.231748


In [12]:
gb.get_params()

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'ls',
 'max_depth': 2,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 233,
 'n_iter_no_change': None,
 'presort': 'deprecated',
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [13]:
print('Average predicted time before adoption:', np.round(np.mean(y_pred), 2))
print('Actual average time before adoption:', np.round(np.mean(y_test.duration_as_adoptable), 2))

Average predicted time before adoption: 24.75
Actual average time before adoption: 24.68


## Calculating Margin of Error (Mean Absolute Error)

In [14]:
from sklearn.metrics import mean_absolute_error as mae

In [15]:
mae(y_test, y_pred)

20.24883754859164

## Calculating RMSE

In [16]:
from sklearn.metrics import mean_squared_error

In [17]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE Test Set : % f" %(rmse))

RMSE Test Set :  32.198928
