# Modeling
## XGBoost

In [3]:
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns

import xgboost as xgb

In [4]:
X_train = pd.read_csv('data/tt_sets/dogs_X_train.csv')
y_train = pd.read_csv('data/tt_sets/dogs_y_train.csv')
X_test = pd.read_csv('data/tt_sets/dogs_X_test.csv')
y_test = pd.read_csv('data/tt_sets/dogs_y_test.csv')

In [5]:
xg_params = {'n_estimators': 26, 'objective': 'reg:squarederror'}
xg = xgb.XGBRegressor(objective=xg_params['objective'], n_estimators = xg_params['n_estimators'])
xg.fit(X_train, y_train)
score_train = xg.score(X_train, y_train)



In [6]:
y_pred = xg.predict(X_test)
score_test = xg.score(X_test, y_test)



In [7]:
print('Score on training set:', round(score_train, 2))
print('Score on test set:', round(score_test, 2))

Score on training set: 0.49
Score on test set: 0.21


In [8]:
# comparison = pd.DataFrame({'Predicted': y_pred, 'Actual': y_test})

comparison = pd.DataFrame(list(zip(y_pred, y_test.values)),
              columns=['Predicted','Actual'])

In [9]:
comparison.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2154 entries, 0 to 2153
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Predicted  2154 non-null   float32
 1   Actual     2154 non-null   object 
dtypes: float32(1), object(1)
memory usage: 25.4+ KB


In [10]:
comparison['Actual'] = comparison['Actual'].astype('float64')

In [11]:
comparison.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2154 entries, 0 to 2153
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Predicted  2154 non-null   float32
 1   Actual     2154 non-null   float64
dtypes: float32(1), float64(1)
memory usage: 25.4 KB


In [12]:
comparison.head()

Unnamed: 0,Predicted,Actual
0,18.037331,25.153183
1,19.899158,2.049769
2,24.543398,49.645567
3,19.839594,82.904051
4,22.838552,21.581771


In [13]:
comparison.describe()

Unnamed: 0,Predicted,Actual
count,2154.0,2154.0
mean,32.420681,32.578713
std,23.258932,44.42103
min,-5.787509,0.000231
25%,16.766083,6.077465
50%,26.273042,15.78941
75%,42.425461,38.762115
max,201.54007,276.005


In [14]:
xg.get_params()

{'objective': 'reg:squarederror',
 'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 1,
 'gamma': 0,
 'gpu_id': -1,
 'importance_type': 'gain',
 'interaction_constraints': '',
 'learning_rate': 0.300000012,
 'max_delta_step': 0,
 'max_depth': 6,
 'min_child_weight': 1,
 'missing': nan,
 'monotone_constraints': '()',
 'n_estimators': 26,
 'n_jobs': 8,
 'num_parallel_tree': 1,
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'subsample': 1,
 'tree_method': 'exact',
 'validate_parameters': 1,
 'verbosity': None}

In [15]:
print('Average predicted time before adoption:', np.round(np.mean(y_pred), 2))
print('Actual average time before adoption:', np.round(np.mean(y_test.duration_as_adoptable), 2))

Average predicted time before adoption: 32.42
Actual average time before adoption: 32.58


## Calculating Margin of Error (Mean Absolute Error)

In [16]:
from sklearn.metrics import mean_absolute_error as mae

In [17]:
mae(y_test, y_pred)

24.138907726651144

## Calculating RMSE

In [18]:
from sklearn.metrics import mean_squared_error

In [19]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE Test Set : % f" %(rmse))

RMSE Test Set :  39.428796


### Normalizing RMSE

In [26]:
nrmse = rmse / y_test.std()
print("Normalized RMSE:", nrmse.values[0])

Normalized RMSE: 0.8876155301116694
