# Preprocessing & Training

### Goal:
<p>Create a cleaned development dataset you can use to complete the modeling step of your project.</p>

### Steps:
<ul><li>Create dummy or indicator features for categorical variables</li><li>Standardize the magnitude of numeric features using a scaler</li><li>Split into testing and training datasets</li></ul>
Review the following questions and apply them to your dataset:<ul><li>Does my data set have any categorical data, such as Gender or day of the week?</li><li>Do my features have data values that range from 0 - 100 or 0-1 or both and more?  </li></ul>

In [2]:
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve

from library.sb_utils import save_file

In [3]:
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.precision', 2)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [4]:
adopted = pd.read_csv('data/dogs_trimmed.csv')
adopted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7179 entries, 0 to 7178
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   age                    7179 non-null   object 
 1   gender                 7179 non-null   object 
 2   size                   7179 non-null   object 
 3   coat                   7179 non-null   object 
 4   distance               7179 non-null   float64
 5   spayed_neutered        7179 non-null   bool   
 6   house_trained          7179 non-null   bool   
 7   special_needs          7179 non-null   bool   
 8   shots_current          7179 non-null   bool   
 9   breed_primary          7179 non-null   object 
 10  breed_secondary        7179 non-null   object 
 11  breed_mixed            7179 non-null   bool   
 12  color_primary          7179 non-null   object 
 13  color_secondary        7179 non-null   object 
 14  color_tertiary         7179 non-null   object 
 15  good

## Dummies!
### After converting bools to ints, of course

In [5]:
df = adopted[['gender', 'size', 'coat', 'duration_as_adoptable', 'hasimage', 'hasvideo', 'spayed_neutered', 'house_trained', 'special_needs', 'shots_current', 'goodwith_children', 'goodwith_dogs', 'goodwith_cats']]
df.loc[:, ['hasimage', 'hasvideo', 'spayed_neutered', 'house_trained', 'special_needs', 'shots_current']] = adopted.loc[:, ['hasimage', 'hasvideo', 'spayed_neutered', 'house_trained', 'special_needs', 'shots_current']].astype('int64')
df = pd.get_dummies(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [6]:
# drop one of each of the dummy category columns so those features don't double-weight anything
df.drop(['size_Extra Large', 'gender_Female', 'coat_Hairless', 'goodwith_children_False', 'goodwith_dogs_False', 'goodwith_cats_False'], axis=1, inplace=True)

## Scaling using StandardScaler()

In [7]:
scaler = StandardScaler()
scaled = scaler.fit_transform(df)

In [8]:
scaled_df = pd.DataFrame(scaled, columns=df.columns)
scaled_df.describe()

Unnamed: 0,duration_as_adoptable,hasimage,hasvideo,spayed_neutered,house_trained,special_needs,shots_current,gender_Male,size_Large,size_Medium,size_Small,coat_Curly,coat_Long,coat_Medium,coat_Short,coat_Wire,coat_unknown,goodwith_children_True,goodwith_children_unknown,goodwith_dogs_True,goodwith_dogs_unknown,goodwith_cats_True,goodwith_cats_unknown
count,7179.0,7179.0,7179.0,7179.0,7179.0,7179.0,7179.0,7179.0,7179.0,7179.0,7179.0,7179.0,7179.0,7179.0,7179.0,7179.0,7179.0,7179.0,7179.0,7179.0,7179.0,7179.0,7179.0
mean,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0
std,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007,1.00007
min,-0.7225,-8.24796,-0.20295,-1.77474,-0.84534,-0.17402,-3.4581,-1.05633,-0.59735,-0.72447,-0.77941,-0.12589,-0.25614,-0.54392,-1.05367,-0.14103,-0.41713,-0.85824,-0.98356,-1.26237,-0.73793,-0.62833,-1.23031
25%,-0.58503,0.12124,-0.20295,0.56346,-0.84534,-0.17402,0.28918,-1.05633,-0.59735,-0.72447,-0.77941,-0.12589,-0.25614,-0.54392,-1.05367,-0.14103,-0.41713,-0.85824,-0.98356,-1.26237,-0.73793,-0.62833,-1.23031
50%,-0.37803,0.12124,-0.20295,0.56346,-0.84534,-0.17402,0.28918,0.94668,-0.59735,-0.72447,-0.77941,-0.12589,-0.25614,-0.54392,0.94906,-0.14103,-0.41713,-0.85824,-0.98356,0.79216,-0.73793,-0.62833,0.81281
75%,0.13745,0.12124,-0.20295,0.56346,1.18296,-0.17402,0.28918,0.94668,1.67405,1.38033,1.28302,-0.12589,-0.25614,-0.54392,0.94906,-0.14103,-0.41713,1.16518,1.01672,0.79216,1.35515,1.59153,0.81281
max,5.60134,0.12124,4.92729,0.56346,1.18296,5.74662,0.28918,0.94668,1.67405,1.38033,1.28302,7.94344,3.90411,1.83851,0.94906,7.09074,2.39733,1.16518,1.01672,0.79216,1.35515,1.59153,0.81281


## Split into training and test sets

In [9]:
X_train, X_test, y_train, y_test = train_test_split(scaled_df.drop(columns='duration_as_adoptable'), 
                                                    scaled_df.duration_as_adoptable, test_size=0.3, 
                                                    random_state=192)

In [10]:
X_train.shape, X_test.shape

((5025, 22), (2154, 22))

In [11]:
y_train.shape, y_test.shape

((5025,), (2154,))

In [12]:
# save training and test sets
datapath = 'data/tt_sets'
save_file(X_train, 'dogs_X_train.csv', datapath)
save_file(X_test, 'dogs_X_test.csv', datapath)
save_file(y_train, 'dogs_y_train.csv', datapath)
save_file(y_test, 'dogs_y_test.csv', datapath)

A file already exists with this name.

Do you want to overwrite? (Y/N)n

Please re-run this cell with a new filename.
A file already exists with this name.

Do you want to overwrite? (Y/N)n

Please re-run this cell with a new filename.
A file already exists with this name.

Do you want to overwrite? (Y/N)n

Please re-run this cell with a new filename.
A file already exists with this name.

Do you want to overwrite? (Y/N)n

Please re-run this cell with a new filename.


# Modeling
### Goal: Build two to three different models and identify the best one.
<ul><li>Fit your models with a training dataset</li>
<li>Review model outcomes — Iterate over additional models as needed</li>
<li>Identify the final model that you think is the best model for this project</li></ul>
 Review the following questions and apply them to your analysis: 
<ul><li>Does my data involve a time series or forecasting? If so, am I splitting the train and test data appropriately?</li>
<li>Is my response variable continuous or categorical?</li></ul>

In [23]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error

### RandomForestRegressor

In [66]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
rf.score(X_train, y_train)

0.40289566360909645

In [68]:
rf_cv = cross_validate(rf, X_train, y_train, cv=5)
rf_cv_scores_preopt = rf_cv['test_score']
rf_cv_scores_preopt

array([-0.07487687,  0.03303302,  0.00564385, -0.03949187,  0.00522153])

In [69]:
np.mean(rf_cv_scores_preopt), np.std(rf_cv_scores_preopt)

(-0.014094068350996736, 0.03827071177887789)

In [70]:
rf_pred = rf.predict(X_test)
rmse_rf_preopt = np.sqrt(mean_squared_error(y_test, rf_pred))
print("RMSE : % f" %(rmse_rf_preopt))

RMSE :  1.019257


In [71]:
n_est = [int(n) for n in np.logspace(start=1, stop=3, num=20)]
grid_params = {
        'n_estimators': n_est,
        'max_depth': [1, 2, 3,4,5, 6,7,8,9, 10, None]
}

rf_random_cv = RandomizedSearchCV(rf, param_distributions=grid_params, cv=5, n_jobs=-1)

In [72]:
from sklearn.utils import parallel_backend

with parallel_backend('threading'):
    rf_random_cv.fit(X_train, y_train)

In [73]:
rf_random_cv.best_params_

{'n_estimators': 69, 'max_depth': 7}

In [74]:
rf = RandomForestRegressor(n_estimators=69, max_depth=7)
rf.fit(X_train, y_train)
rf.score(X_train, y_train)

0.22285029296415215

In [75]:
rf_cv = cross_validate(rf, X_train, y_train, cv=5)
rf_cv_scores = rf_cv['test_score']
rf_cv_scores

array([0.05881422, 0.11527452, 0.0923009 , 0.08372498, 0.06403467])

In [76]:
np.mean(rf_cv['test_score']), np.std(rf_cv['test_score'])

(0.0828298577130411, 0.02036255116263096)

In [77]:
rf_pred = rf.predict(X_test)
rmse_rf = np.sqrt(mean_squared_error(y_test, rf_pred))
print("RMSE : % f" %(rmse_rf))

RMSE :  0.957554


### GradientBoostingRegressor

In [78]:
gb = GradientBoostingRegressor()
gb.fit(X_train, y_train)
gb.score(X_train, y_train)

0.16057548120350285

In [79]:
gb_cv = cross_validate(gb, X_train, y_train, cv=5)
gb_cv_scores_preopt = gb_cv['test_score']
gb_cv_scores_preopt

array([0.10367682, 0.12925761, 0.09639947, 0.09612816, 0.07504663])

In [80]:
np.mean(gb_cv_scores_preopt), np.std(gb_cv_scores_preopt)

(0.10010173643521328, 0.01743617339603471)

In [81]:
gb_pred = gb.predict(X_test)
rmse_gb_preopt = np.sqrt(mean_squared_error(y_test, gb_pred))
print("RMSE : % f" %(rmse_gb_preopt))

RMSE :  0.947903


In [96]:
n_est = [int(n) for n in np.logspace(start=1, stop=3, num=20)]
grid_params = {
        'learning_rate': [.001, .01, .1, 1, 10],
        'n_estimators': n_est,
        'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, None]
}

gb_random_cv = RandomizedSearchCV(gb, param_distributions=grid_params, cv=5, n_jobs=-1)

In [97]:
with parallel_backend('threading'):
    gb_random_cv.fit(X_train, y_train)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight *

  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_pre

  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_pre

  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_pre

  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_pre

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight *

  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_predictions.ravel()) ** 2)))
  sample_weight * ((y - raw_pre

In [98]:
gb_random_cv.best_params_

{'n_estimators': 12, 'max_depth': 4, 'learning_rate': 0.1}

In [99]:
gb = GradientBoostingRegressor(n_estimators=12, max_depth=4, learning_rate=0.1)
gb.fit(X_train, y_train)
gb.score(X_train, y_train)

0.10465974626189356

In [100]:
gb_cv = cross_validate(gb, X_train, y_train, cv=5)
gb_cv_scores = gb_cv['test_score']
gb_cv_scores

array([0.07222386, 0.09248403, 0.08768141, 0.07059927, 0.0692522 ])

In [101]:
np.mean(gb_cv_scores), np.std(gb_cv_scores)

(0.0784481543685726, 0.009666138985732113)

In [102]:
gb_pred = gb.predict(X_test)
rmse_gb = np.sqrt(mean_squared_error(y_test, gb_pred))
print("RMSE : % f" %(rmse_gb))

RMSE :  0.962661


### KNeighborsRegressor

In [103]:
kn = KNeighborsRegressor(n_neighbors=25, weights='distance')
kn.fit(X_train, y_train)
kn.score(X_train, y_train)

0.4341712634231919

In [104]:
kn_cv = cross_validate(kn, X_train, y_train, cv=5)
kn_cv_scores_preopt = kn_cv['test_score']
np.mean(kn_cv_scores_preopt), np.std(kn_cv_scores_preopt)

(-0.03266607198484026, 0.05799061146759032)

In [105]:
kn_pred = kn.predict(X_test)
rmse_kn_preopt = np.sqrt(mean_squared_error(y_test, kn_pred))
print("RMSE : % f" %(rmse_kn_preopt))

RMSE :  1.040748


In [106]:
n_est = [int(n) for n in np.logspace(start=1, stop=3, num=20)]
grid_params = {
        'weights': ['uniform', 'distance'],
        'n_neighbors': n_est,
        'p': [1, 2]
}

kn_random_cv = RandomizedSearchCV(kn, param_distributions=grid_params, cv=5, n_jobs=-1)

In [107]:
with parallel_backend('threading'):
    kn_random_cv.fit(X_train, y_train)

In [108]:
kn_random_cv.best_params_

{'weights': 'uniform', 'p': 1, 'n_neighbors': 143}

In [109]:
kn = KNeighborsRegressor(n_neighbors=143, weights='uniform', p=1)
kn.fit(X_train, y_train)
kn.score(X_train, y_train)

0.09116551185529542

In [110]:
kn_cv = cross_validate(kn, X_train, y_train, cv=5)
kn_cv_scores = kn_cv['test_score']
np.mean(kn_cv_scores), np.std(kn_cv_scores)

(0.07581337579892741, 0.013035914247258001)

In [111]:
kn_pred = kn.predict(X_test)
rmse_kn = np.sqrt(mean_squared_error(y_test, kn_pred))
print("RMSE : % f" %(rmse_kn))

RMSE :  0.966950


### XGBoost

In [112]:
xg = xgb.XGBRegressor(objective ='reg:squarederror', n_estimators = 50)
xg.fit(X_train, y_train)
xg.score(X_train, y_train)



0.3427697163946294

In [113]:
xg_cv = cross_validate(xg, X_train, y_train, cv=5)
xg_cv_scores_preopt = xg_cv['test_score']
np.mean(xg_cv_scores_preopt), np.std(xg_cv_scores_preopt)



(-0.001412215580556242, 0.021037048338566924)

In [114]:
xg_pred = xg.predict(X_test)
rmse_xg_preopt = np.sqrt(mean_squared_error(y_test, xg_pred))
print("RMSE : % f" %(rmse_xg_preopt))

RMSE :  0.992280




In [115]:
n_est = [int(n) for n in np.logspace(start=1, stop=3, num=20)]
grid_params = {
        'objective': ['reg:squarederror', 'reg:squaredlogerror', 'reg:logistic'],
        'n_estimators': n_est,
}

xg_random_cv = RandomizedSearchCV(xg, param_distributions=grid_params, cv=5, n_jobs=-1)

In [116]:
xg_random_cv.fit(X_train, y_train)

RandomizedSearchCV(cv=5,
                   estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                          colsample_bylevel=1,
                                          colsample_bynode=1,
                                          colsample_bytree=1, gamma=0,
                                          gpu_id=-1, importance_type='gain',
                                          interaction_constraints='',
                                          learning_rate=0.300000012,
                                          max_delta_step=0, max_depth=6,
                                          min_child_weight=1, missing=nan,
                                          monotone_constraints='()',
                                          n_estimators=50, n_jobs=8,
                                          num_parallel_tree=1, random_state=0,
                                          reg_alpha=0, reg_lambda=1,
                                          scale_pos_w

In [117]:
xg_random_cv.best_params_

{'objective': 'reg:squarederror', 'n_estimators': 12}

In [118]:
xg = xgb.XGBRegressor(objective ='reg:squarederror', n_estimators = 12)
xg.fit(X_train, y_train)
xg.score(X_train, y_train)



0.24393923329017453

In [119]:
xg_cv = cross_validate(xg, X_train, y_train, cv=5)
xg_cv_scores = xg_cv['test_score']
np.mean(xg_cv_scores), np.std(xg_cv_scores)



(0.07447023107612856, 0.025500566109289923)

In [120]:
xg_pred = xg.predict(X_test)
rmse_xg = np.sqrt(mean_squared_error(y_test, xg_pred))
print("RMSE : % f" %(rmse_xg))

RMSE :  0.963860




### Model Comparisons

In [123]:
model_scores = pd.DataFrame({'CV Score': [np.mean(rf_cv_scores), np.mean(gb_cv_scores), np.mean(kn_cv_scores), np.mean(xg_cv_scores)], 'RMSE': [rmse_rf, rmse_gb, rmse_kn, rmse_xg]}, index=['RandomForest', 'GradientBoosting', 'KNNeighbors', 'XGBoost'])
model_scores

Unnamed: 0,CV Score,RMSE
RandomForest,0.08283,0.95755
GradientBoosting,0.07845,0.96266
KNNeighbors,0.07581,0.96695
XGBoost,0.07447,0.96386


In [124]:
model_scores_pos = model_scores[model_scores >= 0]
print("Model with best CV Score:", model_scores_pos['CV Score'].idxmin())
print("Model with best RMSE:", model_scores_pos['RMSE'].idxmin())

Model with best CV Score: XGBoost
Model with best RMSE: RandomForest


I've run through these multiple times and up until this iteration GradientBoosting was consistently ahead. Now all of the scores are very close. I'm going to write a function that does the randomsearch multiple times for each and picks the best params of the best run.