In [2]:
from DataProcessing import *
from FeatureEngineering import *
from Tuning import *

#### Instantiate data object $data$
- assign ordinal categorical features 
- separate full train data into train, test

#### Vectorize categorical data


In [2]:
ordinalcodes = {'degree':{'NONE':0,
                          'HIGH_SCHOOL':1,
                          'BACHELORS':2,
                          'MASTERS':3,
                          'DOCTORAL':4},
                'jobType':{'JANITOR':0,
                           'JUNIOR':1,
                           'SENIOR':2,
                           'MANAGER':3,
                           'VICE_PRESIDENT':4,
                           'CFO':5,
                           'CTO':6,
                           'CEO':7}
               }


data = Data('train_features.csv','train_salaries.csv',
         ['companyId','degree','industry','jobType','major'],
         ['milesFromMetropolis','yearsExperience'],
         'salary','jobId',ordinalcodes,test_sz=.3)

train_df = data._preprocessData(data.train_df,
                                True,True,
                                ['companyId','industry','major'])
# assign copy of train data in initial vectorized state
vec_train_df = train_df.copy()
display(train_df.head())
train_df.shape

Unnamed: 0,milesFromMetropolis,yearsExperience,jobId,salary,companyId,industry,major,degree,jobType
0,39,3,JOB1362685396294,103,30,2,7,0,3
1,39,23,JOB1362684596757,191,51,3,8,3,7
2,10,19,JOB1362685320583,125,53,0,7,2,6
3,22,24,JOB1362685350076,129,13,3,7,0,6
4,21,22,JOB1362685172779,99,45,1,7,0,0


(699997, 9)

#### Instantiate feature engineering object and compute new response statistics on grouped features    

In [3]:
featureEng = FeatureEng(data) 

train_df = featureEng._compute_new_features(vec_train_df,train_df,
                                            featureEng.data.columns_cat,
                                            featureEng.data.columns_num,4)
display(train_df.head())
train_df.shape

Unnamed: 0,milesFromMetropolis,yearsExperience,jobId,salary,companyId,industry,major,degree,jobType,cat_group_min,...,cat_group_mean,cat_group_max,cat_group_std,milesFromMetropolis_quantile,yearsExperience_quantile,numeric_quantile_min,numeric_quantile_median,numeric_quantile_mean,numeric_quantile_max,numeric_quantile_std
0,39,3,JOB1362685396294,103,30,2,7,0,3,66,...,116.822222,209,34.350658,1,0,23,102,103.019774,245,39.274601
1,39,23,JOB1362684596757,191,51,3,8,3,7,150,...,170.25,191,18.997807,1,3,49,143,145.250973,284,43.013118
2,10,19,JOB1362685320583,125,53,0,7,2,6,108,...,124.285714,140,11.513967,0,3,62,159,160.419643,301,44.190884
3,22,24,JOB1362685350076,129,13,3,7,0,6,82,...,119.727273,213,27.278379,0,3,62,159,160.419643,301,44.190884
4,21,22,JOB1362685172779,99,45,1,7,0,0,23,...,53.45098,102,19.952234,0,3,62,159,160.419643,301,44.190884


(699997, 21)

#### Instantiate $tune$ object which contains linear estimator and hyperparameter tuning methods

In [4]:
model_list = []
loss_dict = {}

tune = Tuning(train_df.drop(data.id,axis=1),
              data.ylabel,5,model_list,
              loss_dict)


## RandomForestRegressor

RandomForestRegressor hyperparameters:
- ***n_estimators*** controls the number of trees in the forest. i.e. the number of resampled, bagged trees to include in the (averaged) forest. Large *n_estimators* is computationally expensive. Random search found best *n_estimators* to be around 58.
- ***max_depth*** controls the maximum depth of the tree. Larger *max_depth* increases variance and reduces bias.
- ***min_samples_split*** The minimum number of samples required to split a node. Lower values increases variance and reduces bias. Random search found best *max_depth* to be around 58.
- ***min_samples_leaf*** controls the minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches. Lower values (float) increases variance and reduces bias. Random search found best *min_samples_leaf* value to be around 92.
- ***min_weight_fraction_leaf*** is minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Lower values increases variance and reduces bias. At the cost of possible loss improvement, *min_weight_fraction_leaf* was left at the default value of 0 in order to save time. 
- ***max_features*** specifies the number of features to consider when looking for the best split. Random search found ***max_fratures*** to be around 19.
***min_impurity_decrease*** controls when a node will be split if this split induces a decrease of the impurity greater than or equal to this value. Lower values increases variance, reduces bias. Random search found best *min_impurity_decrease* to be around 0.017. 

#### Initial RandomSearchCV search for RandomForest hyperparameters  

In [6]:
tune = Tuning(train_df.drop(data.id,axis=1),
              data.ylabel,5,
              model_list,loss_dict)

t0 = time()
rf_pipe = make_pipeline(SimpleImputer(strategy='median'),
                        RandomForestRegressor())
rf_pipe_best = tune._tune_hyperparams(model=rf_pipe,params={
                                            'randomforestregressor__n_estimators':randint(2,100), 
                                            'randomforestregressor__max_depth':randint(2,100),  
                                            'randomforestregressor__min_samples_split':randint(2,100), 
                                            'randomforestregressor__min_samples_leaf':randint(2,100), 
                                            'randomforestregressor__max_features':uniform(0,.99),
                                            'randomforestregressor__max_leaf_nodes':[None], 
                                            'randomforestregressor__min_impurity_decrease':uniform(0,.05), 
                                                   },lossfunc='neg_mean_squared_error',figsz=(10,4),ylabel=data.ylabel,
                                   plot_param=None,
                                   searchRandom=True,n_iter=10,verbose=False)

print('Time to complete:{}'.format((time()-t0)/60))

tuned optimal hyperparmeter:
randomforestregressor__max_depth: 71
randomforestregressor__max_features: 0.865505794816589
randomforestregressor__max_leaf_nodes: None
randomforestregressor__min_impurity_decrease: 0.03631976823954457
randomforestregressor__min_samples_leaf: 84
randomforestregressor__min_samples_split: 7
randomforestregressor__n_estimators: 32

gridsearch best score: 311.1461223626699
Time to complete:25.596480703353883


#### Narrow the range of hyperparameters suggested by initial RandomizedSearch 

In [6]:
tune = Tuning(train_df.drop(data.id,axis=1),
              data.ylabel,5,
              model_list,loss_dict)

t0 = time()
rf_pipe = make_pipeline(SimpleImputer(strategy='median'),RandomForestRegressor())
rf_pipe_best = tune._tune_hyperparams(model=rf_pipe,params={
                                            'randomforestregressor__n_estimators':randint(22,42), 
                                            'randomforestregressor__max_depth':randint(61,81),  
                                            'randomforestregressor__min_samples_split':randint(2,12), 
                                            'randomforestregressor__min_samples_leaf':randint(78,89), 
                                            'randomforestregressor__max_features':randint(16,20),
                                            'randomforestregressor__max_leaf_nodes':[None], 
                                            'randomforestregressor__min_impurity_decrease':uniform(.015,.05), 
                                                   },lossfunc='neg_mean_squared_error',figsz=(10,4),ylabel=data.ylabel,
                                   plot_param=None,
                                   searchRandom=True,n_iter=5,verbose=False)

print('Time to complete:{}'.format((time()-t0)/60))

tuned optimal hyperparmeter:
randomforestregressor__max_depth: 62
randomforestregressor__max_features: 19
randomforestregressor__max_leaf_nodes: None
randomforestregressor__min_impurity_decrease: 0.01706250684255796
randomforestregressor__min_samples_leaf: 88
randomforestregressor__min_samples_split: 11
randomforestregressor__n_estimators: 38

gridsearch best score: 305.613767505715
Time to complete:128.114544403553


#### Final hyperparameter search for RandomForestRegressor 
- MSE improved slightly, so we'll stop here and use the following hyperparameters in RandomForestRegressor:

In [8]:
rf_pipe = make_pipeline(SimpleImputer(strategy='median'),RandomForestRegressor())
t0 = time()
rf_pipe_best = tune._tune_hyperparams(model=rf_pipe,params={
                                            'randomforestregressor__n_estimators':randint(33,43), 
                                            'randomforestregressor__max_depth':randint(57,67),  
                                            'randomforestregressor__min_samples_split':randint(8,14), 
                                            'randomforestregressor__min_samples_leaf':randint(83,93), 
                                            'randomforestregressor__max_features':randint(16,21),
                                            'randomforestregressor__max_leaf_nodes':[None], 
                                            'randomforestregressor__min_impurity_decrease':uniform(.015,.019), 
                                                   },lossfunc='neg_mean_squared_error',figsz=(10,4),ylabel=data.ylabel,
                                   plot_param=None,
                                   searchRandom=True,n_iter=4,verbose=False)

print('Time to complete:{}'.format((time()-t0)/60))

tuned optimal hyperparmeter:
randomforestregressor__max_depth: 58
randomforestregressor__max_features: 19
randomforestregressor__max_leaf_nodes: None
randomforestregressor__min_impurity_decrease: 0.01716494628293873
randomforestregressor__min_samples_leaf: 92
randomforestregressor__min_samples_split: 13
randomforestregressor__n_estimators: 41

gridsearch best score: 305.1139205249845
Time to complete:137.73217782974243


## GradientBoostingRegressor

#### GradientBoostingRegressor hyperparameters:
- ***learning_rate*** shrinks the contribution of each tree by the *learning_rate*. There is a trade-off between *learning_rate* and *n_estimators*. Larger values increase variance and reduce bias. Random search found best *learning_rate* to be around 0.135.
- ***n_estimators*** is the number of boosting stages to perform. Gradient boosting is fairly robust to over-fitting so a large number usually results in better performance. Larger values of *n_estimators* increases model variance and reduces bias. Random search found best *n_estimators* value to be around 268.
- ***subsample*** controls the fraction of samples to be used for fitting the individual base learners. If smaller than 1.0 this results in Stochastic Gradient Boosting. *subsample* interacts with the parameter *n_estimators*. *subsample* values < 1.0 reduces variance and increases bias. Random search found best *subsample* value to be around 0.68 (estimator was regulated for variance).
- ***min_samples_split*** is the minimum number of samples required to split an internal node: If *int*, then consider *min_samples_split* as the minimum number. Larger values increase variance and reduce bias. If *float*, then *min_samples_split* is a fraction and *ceil(min_samples_split * n_samples)* are the minimum number of samples for each split. Random search found best *min_samples_split* to be around 72.
- ***min_samples_leaf***  hyperparameter is the minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least *min_samples_leaf* training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. Smaller values increase variance and reduce bias. Random search found best *min_samples_leaf* to be around 411. 
- ***min_weight_fraction_leaf*** is the minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when *sample_weight* is not provided. Smaller values increase variance and reduce bias. Random search found best *min_weigh_fraction_leaf* to be around .048. 
- ***max_depth*** specifies the maximum depth of the individual regression estimators. The maximum depth limits the number of nodes in the tree. The best value depends on the interaction of the input variables. Larger values increase variance and decrease bias. Random search found best *max_depth* to be around 544.
- ***min_impurity_decrease*** specifies when a node will be split if this split induces a decrease of the impurity greater than or equal to this value. Larger values increase variance and decrease bias. Random search determined the best *min_impurity_decrease* value to be around .0035. 
- ***max_features*** specifies the number of features to consider when looking for the best split. Choosing *max_features* < *n_features* leads to a reduction of variance and an increase in bias. Larger values increase variance and decrease bias. Random search found best *max_features* value to be around 8 (i.e. 8/19 ~= 42% of features were considered for each node split).
- ***max_leaf_nodes*** controls the number of leaves in a tree. Best nodes are defined as relative reduction in impurity. If value is *None* then unlimited number of leaf nodes. Larger values increase variance and decrease bias. Random search found best *max_leaf_nodes* to be around 17.

### GradientBoostingRegressor hyperparameter tuning with RandomizedSearch
- First, search a wide range for each hyperparameter

In [6]:
tune = Tuning(train_df.drop(data.id,axis=1),
              data.ylabel,5,
              model_list,loss_dict)

gb_pipe = make_pipeline(SimpleImputer(strategy='median'),
                        GradientBoostingRegressor())

t0=time()
gb_pipe_best = tune._tune_hyperparams(model=gb_pipe,
                                   params={
                                           'gradientboostingregressor__learning_rate':uniform(.06,.16),
                                           'gradientboostingregressor__n_estimators':randint(100,300),
                                           'gradientboostingregressor__subsample':uniform(.3,.5),
                                           'gradientboostingregressor__min_samples_split':randint(2,100),
                                           'gradientboostingregressor__min_samples_leaf':randint(1,800), 
                                           'gradientboostingregressor__min_weight_fraction_leaf':uniform(0,.1),#max value is .5, 
                                           'gradientboostingregressor__max_depth':randint(2,700), 
                                           'gradientboostingregressor__min_impurity_decrease':uniform(0,.01), 
                                           'gradientboostingregressor__max_features':randint(8,20),  
                                           'gradientboostingregressor__max_leaf_nodes':randint(8,20),  
                                           },lossfunc='neg_mean_squared_error',figsz=(10,4),ylabel=data.ylabel,
                                   plot_param=None,
                                   searchRandom=True,n_iter=5,verbose=False)

t_elapsed = (time()-t0)/60
print('Time to execute:',t_elapsed)

tuned optimal hyperparmeter:
gradientboostingregressor__learning_rate: 0.07793043512142418
gradientboostingregressor__max_depth: 552
gradientboostingregressor__max_features: 10
gradientboostingregressor__max_leaf_nodes: 18
gradientboostingregressor__min_impurity_decrease: 0.002254503549716579
gradientboostingregressor__min_samples_leaf: 413
gradientboostingregressor__min_samples_split: 80
gradientboostingregressor__min_weight_fraction_leaf: 0.04337065230411918
gradientboostingregressor__n_estimators: 268
gradientboostingregressor__subsample: 0.5033336861761807

gridsearch best score: 300.5592771279986
Time to execute: 236.09604076544443


#### Fine-tune GradientBoostingRegressor hyperparameters with RandomizedSearch

In [7]:
tune = Tuning(train_df.drop(data.id,axis=1),
              data.ylabel,5,
              model_list,loss_dict)

gb_pipe = make_pipeline(SimpleImputer(strategy='median'),
                        GradientBoostingRegressor())

t0=time()
gb_pipe_best = tune._tune_hyperparams(model=gb_pipe,
                                   params={
                                           'gradientboostingregressor__learning_rate':uniform(.07,.083),
                                           'gradientboostingregressor__n_estimators':randint(263,273),
                                           'gradientboostingregressor__subsample':uniform(.45,.55),
                                           'gradientboostingregressor__min_samples_split':randint(70,90),
                                           'gradientboostingregressor__min_samples_leaf':randint(393,423), 
                                           'gradientboostingregressor__min_weight_fraction_leaf':uniform(.038,.048),#max value is .5, 
                                           'gradientboostingregressor__max_depth':randint(500,600), 
                                           'gradientboostingregressor__min_impurity_decrease':uniform(0.002,.0025), 
                                           'gradientboostingregressor__max_features':randint(8,12),  
                                           'gradientboostingregressor__max_leaf_nodes':randint(15,22),  
                                           },lossfunc='neg_mean_squared_error',figsz=(10,4),ylabel=data.ylabel,
                                   plot_param=None,
                                   searchRandom=True,n_iter=3,verbose=False)

t_elapsed = (time()-t0)/60
print('Time to execute:',t_elapsed)

tuned optimal hyperparmeter:
gradientboostingregressor__learning_rate: 0.13485569045908746
gradientboostingregressor__max_depth: 544
gradientboostingregressor__max_features: 8
gradientboostingregressor__max_leaf_nodes: 17
gradientboostingregressor__min_impurity_decrease: 0.00354711044501207
gradientboostingregressor__min_samples_leaf: 411
gradientboostingregressor__min_samples_split: 72
gradientboostingregressor__min_weight_fraction_leaf: 0.04833077900157417
gradientboostingregressor__n_estimators: 268
gradientboostingregressor__subsample: 0.6886522918443929

gridsearch best score: 298.984249922744
Time to execute: 175.08813867171605
