In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.model_selection import cross_validate

In [2]:
# Development-time sanity check (not production-grade): load the pickled model
# and warn when its recorded model version or scikit-learn version differs from
# what this notebook expects. This catches environment mix-ups early.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
expected_model_version = '1.0'
model_path = '../models/ski_resort_pricing_model.pkl'
if not os.path.exists(model_path):
    # Nothing to load; `model` stays undefined in this case.
    print("Expected model not found")
else:
    with open(model_path, 'rb') as model_file:
        model = pickle.load(model_file)
    # Compare against the version this notebook was written for.
    if model.version != expected_model_version:
        print("Expected model version doesn't match version loaded")
    # Compare against the scikit-learn version installed in this environment.
    if model.sklearn_version != sklearn_version:
        print("Warning: model created under different sklearn version")

In [3]:
# Load the CSV at ../data/ski_data_step3_features.csv into a DataFrame.
ski_data = pd.read_csv('../data/ski_data_step3_features.csv')

In [4]:
#Question: why does the model have an 'X_columns' attribute that holds exactly the columns we want to use from ski_data,
#leaving out the Adult Weekend ticket price and the object-dtype columns like name, state and region?
#Did it gain this attribute during training in the Unit 6.4 notebook?

# List every attribute name on the loaded model to start investigating.
dir(model)

['X_columns',
 '__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_fit_params',
 '_check_n_features',
 '_estimator_type',
 '_final_estimator',
 '_fit',
 '_get_param_names',
 '_get_params',
 '_get_tags',
 '_inverse_transform',
 '_iter',
 '_log_message',
 '_more_tags',
 '_pairwise',
 '_replace_estimator',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_set_params',
 '_sk_visual_block_',
 '_transform',
 '_validate_data',
 '_validate_names',
 '_validate_steps',
 'build_datetime',
 'classes_',
 'decision_function',

In [5]:
# Check that the loaded model really exposes an 'X_columns' attribute.
hasattr(model, 'X_columns')

True

In [6]:
# Show the column names stored on the loaded model.
model.X_columns

['summit_elev',
 'vertical_drop',
 'base_elev',
 'trams',
 'fastSixes',
 'fastQuads',
 'quad',
 'triple',
 'double',
 'surface',
 'total_chairs',
 'Runs',
 'TerrainParks',
 'LongestRun_mi',
 'SkiableTerrain_ac',
 'Snow Making_ac',
 'daysOpenLastYear',
 'yearsOpen',
 'averageSnowfall',
 'projectedDaysOpen',
 'NightSkiing_ac',
 'resorts_per_state',
 'resorts_per_100kcapita',
 'resorts_per_100ksq_mile',
 'resort_skiable_area_ac_state_ratio',
 'resort_days_open_state_ratio',
 'resort_terrain_park_state_ratio',
 'resort_night_skiing_state_ratio',
 'total_chairs_runs_ratio',
 'total_chairs_skiable_ratio',
 'fastQuads_runs_ratio',
 'fastQuads_skiable_ratio']

In [7]:
# Row mask keeping every resort except Big Mountain Resort; built once and
# reused so the features and the target are guaranteed to share the same rows.
not_big_mountain = ski_data.Name != "Big Mountain Resort"
# Features are the columns recorded on the model; the target is 'AdultWeekend'.
X = ski_data.loc[not_big_mountain, model.X_columns]
y = ski_data.loc[not_big_mountain, 'AdultWeekend']

In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Pipeline stages, in execution order: median imputation, standard scaling,
# then a random forest regressor with a fixed seed.
imputer_step = SimpleImputer(strategy='median')
scaler_step = StandardScaler()
forest_step = RandomForestRegressor(random_state=47)

RF_pipe = make_pipeline(imputer_step, scaler_step, forest_step)

In [9]:
# 20 log-spaced candidate tree counts between 10**1 and 10**3, truncated to ints.
n_est = list(map(int, np.logspace(start=1, stop=3, num=20)))
# Search space for the grid search. Keys address pipeline steps by name:
# the forest's tree count, whether the scaling step is used (None in place of
# the scaler), and the imputation strategy.
grid_params = dict(
    randomforestregressor__n_estimators=n_est,
    standardscaler=[StandardScaler(), None],
    simpleimputer__strategy=['mean', 'median'],
)
grid_params

{'randomforestregressor__n_estimators': [10,
  12,
  16,
  20,
  26,
  33,
  42,
  54,
  69,
  88,
  112,
  143,
  183,
  233,
  297,
  379,
  483,
  615,
  784,
  1000],
 'standardscaler': [StandardScaler(), None],
 'simpleimputer__strategy': ['mean', 'median']}

In [10]:
from sklearn.model_selection import (
    GridSearchCV,
    cross_validate,
    learning_curve,
    train_test_split,
)

# Grid search over grid_params for the random forest pipeline, scored with
# 5-fold cross-validation; n_jobs=-1 runs the fits on all available cores.
rf_grid_cv = GridSearchCV(
    estimator=RF_pipe,
    param_grid=grid_params,
    n_jobs=-1,
    cv=5,
)

In [11]:
#Names present on the loaded model but missing from rf_grid_cv, which has not been fit yet at this point.
#First reading: this suggests that X_columns, along with a lot of other attributes, is not present on an
#object before it is fit to data.

set(dir(model)) - set(dir(rf_grid_cv))

{'X_columns',
 '__annotations__',
 '__getitem__',
 '__len__',
 '_check_fit_params',
 '_final_estimator',
 '_fit',
 '_get_params',
 '_inverse_transform',
 '_iter',
 '_log_message',
 '_replace_estimator',
 '_set_params',
 '_sk_visual_block_',
 '_transform',
 '_validate_names',
 '_validate_steps',
 'build_datetime',
 'fit_predict',
 'fit_transform',
 'memory',
 'named_steps',
 'numpy_version',
 'pandas_version',
 'score_samples',
 'sklearn_version',
 'steps',
 'version'}

In [12]:
#The reverse difference: names on rf_grid_cv that the loaded model does not have.
#Working guess: the trained model lost these few attributes, maybe because they became irrelevant.

set(dir(rf_grid_cv)) - set(dir(model))

{'_check_is_fitted',
 '_format_results',
 '_run_search',
 'cv',
 'error_score',
 'estimator',
 'iid',
 'n_jobs',
 'param_grid',
 'pre_dispatch',
 'refit',
 'return_train_score',
 'scoring'}

In [13]:
# Attribute names on the grid-search object before it has been fit.
dir(rf_grid_cv)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_is_fitted',
 '_check_n_features',
 '_estimator_type',
 '_format_results',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_pairwise',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_run_search',
 '_validate_data',
 'classes_',
 'cv',
 'decision_function',
 'error_score',
 'estimator',
 'fit',
 'get_params',
 'iid',
 'inverse_transform',
 'n_features_in_',
 'n_jobs',
 'param_grid',
 'pre_dispatch',
 'predict',
 'predict_log_proba',
 'predict_proba',
 'refit',
 'return_train_score',
 'score',
 'scoring'

In [14]:
# Run the grid search on the training features and target. This fits the
# pipeline for each parameter combination in grid_params under 5-fold CV.
rf_grid_cv.fit(X, y)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('simpleimputer',
                                        SimpleImputer(strategy='median')),
                                       ('standardscaler', StandardScaler()),
                                       ('randomforestregressor',
                                        RandomForestRegressor(random_state=47))]),
             n_jobs=-1,
             param_grid={'randomforestregressor__n_estimators': [10, 12, 16, 20,
                                                                 26, 33, 42, 54,
                                                                 69, 88, 112,
                                                                 143, 183, 233,
                                                                 297, 379, 483,
                                                                 615, 784,
                                                                 1000],
                         'simpleimputer__strategy': [

In [15]:
# Attribute names on the grid-search object after fitting, to see what was added.
dir(rf_grid_cv)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_is_fitted',
 '_check_n_features',
 '_estimator_type',
 '_format_results',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_pairwise',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_run_search',
 '_validate_data',
 'best_estimator_',
 'best_index_',
 'best_params_',
 'best_score_',
 'classes_',
 'cv',
 'cv_results_',
 'decision_function',
 'error_score',
 'estimator',
 'fit',
 'get_params',
 'iid',
 'inverse_transform',
 'multimetric_',
 'n_features_in_',
 'n_jobs',
 'n_splits_',
 'param_grid',
 'pre_

In [16]:
# Repeat the comparison now that rf_grid_cv has been fit.
set(dir(model)) - set(dir(rf_grid_cv))

{'X_columns',
 '__annotations__',
 '__getitem__',
 '__len__',
 '_check_fit_params',
 '_final_estimator',
 '_fit',
 '_get_params',
 '_inverse_transform',
 '_iter',
 '_log_message',
 '_replace_estimator',
 '_set_params',
 '_sk_visual_block_',
 '_transform',
 '_validate_names',
 '_validate_steps',
 'build_datetime',
 'fit_predict',
 'fit_transform',
 'memory',
 'named_steps',
 'numpy_version',
 'pandas_version',
 'score_samples',
 'sklearn_version',
 'steps',
 'version'}

In [17]:
#These are the same attributes that differed before fitting, so training is not where the change in attributes came from.
#Maybe the model we saved at the end of Unit 6.4 was actually just one part of this random forest grid search CV object,
#and is therefore its own object with its own attributes.

In [19]:
# 5-fold cross-validation of the best estimator found by the grid search,
# using the same X and y the search was fit on.
rf_best_cv_results = cross_validate(rf_grid_cv.best_estimator_, X, y, cv=5)

In [20]:
# Display the cross-validation results (fit times, score times, test scores).
rf_best_cv_results

{'fit_time': array([0.07195425, 0.06891251, 0.06695938, 0.07142448, 0.07191825]),
 'score_time': array([0.00346828, 0.00347209, 0.00347233, 0.00396848, 0.00347257]),
 'test_score': array([0.75587653, 0.64070027, 0.58611079, 0.59053655, 0.69462271])}

In [21]:
#What we saved was actually the "best estimator" from our random forest grid search CV object;
#maybe that has the attributes we are looking for.

rf_grid_cv.best_estimator_

Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='median')),
                ('standardscaler', StandardScaler()),
                ('randomforestregressor',
                 RandomForestRegressor(n_estimators=33, random_state=47))])

In [22]:
# Attribute names on the best estimator selected by the grid search.
dir(rf_grid_cv.best_estimator_)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_fit_params',
 '_check_n_features',
 '_estimator_type',
 '_final_estimator',
 '_fit',
 '_get_param_names',
 '_get_params',
 '_get_tags',
 '_inverse_transform',
 '_iter',
 '_log_message',
 '_more_tags',
 '_pairwise',
 '_replace_estimator',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_set_params',
 '_sk_visual_block_',
 '_transform',
 '_validate_data',
 '_validate_names',
 '_validate_steps',
 'classes_',
 'decision_function',
 'fit',
 'fit_predict',
 'fit_tr

In [25]:
# Names on the loaded model that the freshly fitted best estimator does not have.
set(dir(model)) - set(dir(rf_grid_cv.best_estimator_))

{'X_columns',
 'build_datetime',
 'numpy_version',
 'pandas_version',
 'sklearn_version',
 'version'}

In [27]:
#Getting closer! It looks like these attributes only exist because we attached them by hand at the end of Unit 6.4.
#Attach X_columns the same way here, then re-run the comparison.

rf_grid_cv.best_estimator_.X_columns = list(X.columns)
set(dir(model)).difference(dir(rf_grid_cv.best_estimator_))

{'build_datetime',
 'numpy_version',
 'pandas_version',
 'sklearn_version',
 'version'}

We figured it out!

Here are some general ways to add attributes to objects in Python:

https://stackoverflow.com/questions/2827623/how-can-i-create-an-object-and-add-attributes-to-it

https://code.activestate.com/recipes/52308-the-simple-but-handy-collector-of-a-bunch-of-named/?in=user-97991

However, I want to ask my Mentor John Sukup what might have happened in this instance: why can we add attributes to our best model without resorting to any tricks?  Is this general sklearn functionality in order to make saving and re-using models easier?

In [1]:
def revenue_increase(price_increase, visitors, per_visitor_factor=5):
    """Return the revenue gained from raising the ticket price.

    Parameters
    ----------
    price_increase : number
        Amount added to the ticket price.
    visitors : number
        Expected number of visitors.
    per_visitor_factor : number, default 5
        Constant multiplier applied per visitor.
        NOTE(review): presumably the number of tickets (days skied) each
        visitor buys -- confirm against the business assumptions.

    Returns
    -------
    number
        ``per_visitor_factor * visitors * price_increase``.
    """
    return per_visitor_factor * visitors * price_increase


expected_visitors = 350_000
# Scenario: ticket price raised by 5.
revenueincrease5dollar = revenue_increase(5, expected_visitors)

In [2]:
# Display the computed revenue increase for the 5-dollar scenario.
revenueincrease5dollar

8750000

In [3]:
# Same calculation as the 5-dollar scenario, with a price increase of 15.
price_increase_dollars = 15
revenueincrease15dollar = 5 * expected_visitors * price_increase_dollars
revenueincrease15dollar

26250000