In [54]:
import pandas as pd
import math
import numpy as np
import matplotlib.pyplot as mplt

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

In [52]:
def display_scores(scores):
    """Print a summary of a set of per-fold scores.

    Prints the individual scores as a DataFrame, followed by their mean and
    standard deviation as returned by ``scores.mean()`` and ``scores.std()``.

    Parameters
    ----------
    scores : sequence exposing ``mean()`` and ``std()``
        One score per fold.
    """
    fold_count = len(scores)
    per_fold = pd.DataFrame(scores)
    header = "Scores for %2.0f folds:" % fold_count
    print(header, per_fold)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

## Data preparation

In [46]:
# Load the raw dataset and print a per-column summary (dtype, non-null count).
csv_path = 'Dataset_heart rate_retail stores.csv'
data = pd.read_csv(csv_path)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165 entries, 0 to 164
Data columns (total 23 columns):
Id                                165 non-null int64
age                               165 non-null int64
gender                            165 non-null object
BMI                               165 non-null float64
sleeping_time                     165 non-null float64
sporting_activity_(h/week)        165 non-null float64
daytime                           165 non-null object
outdoor_temperature               165 non-null int64
shopping_cart                     165 non-null object
attendants                        165 non-null object
shopping_frequency                165 non-null object
cash_point                        165 non-null object
shopping_amount                   165 non-null object
weekday                           165 non-null object
smoking                           165 non-null int64
ex_max                            165 non-null int64
ex_min                           

In [47]:
# Pull the target variable out as a flat array, then remove it and the
# row identifier from the feature table.
# Note: this cell reassigns `data`, so running it a second time raises a
# KeyError because the columns are already gone.
target_column = 'shopping duration (in minutes)'
target_data = np.array(data[target_column])
data = data.drop(columns=[target_column, 'Id'])

In [48]:
# Keep only the integer and floating-point columns.
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
data_numeriacal = data.select_dtypes(include=numeric_dtypes)

In [49]:
# Names of the categorical (object-dtype) columns.
object_columns = data.select_dtypes(include=['object'])
categorical_values = object_columns.columns.tolist()

In [50]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones, then apply both to the feature table in a single step.
num_pipeline = Pipeline(steps=[
    ('normalize', StandardScaler()),
])

num_attribs = data_numeriacal.columns.tolist()
cat_attribs = categorical_values

column_transformers = [
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
]
full_pipeline = ColumnTransformer(transformers=column_transformers)

data_prepared = full_pipeline.fit_transform(data)

### RANDOM FOREST

In [55]:
# Baseline model: a small random forest with a fixed seed.
forest_reg = RandomForestRegressor(
    n_estimators=10,
    random_state=42,
)
forest_reg.fit(X=data_prepared, y=target_data)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

In [57]:
# In-sample check: the model is scored on the same rows it was fitted on,
# so this number is a training error rather than an estimate of how well
# the model generalises.
predictions = forest_reg.predict(X=data_prepared)
mae = mean_absolute_error(y_true=target_data, y_pred=predictions)
print("Mean absolute error: %2.2f minutes" % mae)

Mean absolute error: 3.00 minutes


#### Cross validation

In [58]:
# 10-fold cross-validation. The scorer returns negated MAE values, so the
# sign is flipped before the scores are displayed.
forest_scores = cross_val_score(
    estimator=forest_reg,
    X=data_prepared,
    y=target_data,
    scoring="neg_mean_absolute_error",
    cv=10,
)
display_scores(-forest_scores)

Scores for 10 folds:            0
0   7.852941
1   9.888235
2   5.311765
3   7.882353
4   5.664706
5   8.881250
6   9.456250
7  12.068750
8   7.000000
9   9.343750
Mean: 8.334999999999999
Standard deviation: 1.9343348602668566


### Do hyperparameter search on random forest

#### Grid Search

In [65]:
from sklearn.model_selection import GridSearchCV

# First sub-grid: 12 (3×4) combinations of hyperparameters.
default_grid = {
    'n_estimators': [3, 10, 30],
    'max_features': [2, 4, 6, 8],
}
# Second sub-grid: 6 (2×3) combinations with bootstrap set to False.
no_bootstrap_grid = {
    'bootstrap': [False],
    'n_estimators': [3, 10],
    'max_features': [2, 3, 4],
}
param_grid = [default_grid, no_bootstrap_grid]
param_grid

[{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
 {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}]

In [67]:
# Exhaustive search over param_grid with 5-fold cross-validation, scored by
# negated mean absolute error; training-fold scores are recorded as well.
forest_reg = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(
    estimator=forest_reg,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
    return_train_score=True,
)
grid_search.fit(X=data_prepared, y=target_data)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=42,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid=[{'max_features': [2, 4, 6, 8],
  

In [68]:
# Best hyperparameter combination found by the grid search.
grid_search.best_params_

{'max_features': 8, 'n_estimators': 30}

In [69]:
# Mean cross-validated score of each hyperparameter combination tested during
# the grid search. The search was scored with 'neg_mean_absolute_error', so
# the sign is flipped to print a positive mean absolute error.
# Each combination is printed once (the loop used to be pasted twice).
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(-mean_score, params)

13.147474747474746 {'max_features': 2, 'n_estimators': 3}
11.153333333333334 {'max_features': 2, 'n_estimators': 10}
10.583636363636364 {'max_features': 2, 'n_estimators': 30}
11.119191919191922 {'max_features': 4, 'n_estimators': 3}
10.975757575757576 {'max_features': 4, 'n_estimators': 10}
9.87737373737374 {'max_features': 4, 'n_estimators': 30}
12.341414141414143 {'max_features': 6, 'n_estimators': 3}
9.955151515151515 {'max_features': 6, 'n_estimators': 10}
9.391919191919193 {'max_features': 6, 'n_estimators': 30}
12.224242424242426 {'max_features': 8, 'n_estimators': 3}
9.852121212121212 {'max_features': 8, 'n_estimators': 10}
9.150303030303032 {'max_features': 8, 'n_estimators': 30}
14.480808080808078 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
11.893333333333334 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
13.054545454545455 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
11.716363636363635 {'bootstrap': False, 'max_features': 3, 'n_estim

#### Random Search

In [60]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Randomized search: draw 10 hyperparameter combinations from the integer
# distributions below and score each with 5-fold cross-validation.
# scipy's randint excludes `high`, so n_estimators is drawn from 1..199 and
# max_features from 1..7.
param_distribs = dict(
    n_estimators=randint(low=1, high=200),
    max_features=randint(low=1, high=8),
)

forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(
    estimator=forest_reg,
    param_distributions=param_distribs,
    n_iter=10,
    scoring='neg_mean_absolute_error',
    cv=5,
    random_state=42,
)
rnd_search.fit(X=data_prepared, y=target_data)

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators='warn',
                                                   n_jobs=None, oob_score=False,
                                                   random_sta...


In [61]:
# Positive mean absolute error for every combination the randomized search
# sampled (the scorer stores the negated value).
cvres = rnd_search.cv_results_
sampled_results = zip(cvres["mean_test_score"], cvres["params"])
for mean_score, params in sampled_results:
    print(-mean_score, params)

8.822828282828283 {'max_features': 7, 'n_estimators': 180}
9.898585858585859 {'max_features': 5, 'n_estimators': 15}
9.76952861952862 {'max_features': 3, 'n_estimators': 72}
9.535930735930737 {'max_features': 5, 'n_estimators': 21}
8.789220069547937 {'max_features': 7, 'n_estimators': 122}
9.846222222222222 {'max_features': 3, 'n_estimators': 75}
9.840564738292011 {'max_features': 3, 'n_estimators': 88}
9.185333333333334 {'max_features': 5, 'n_estimators': 100}
9.847486868686868 {'max_features': 3, 'n_estimators': 150}
13.33030303030303 {'max_features': 5, 'n_estimators': 2}


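It looks like the random hyperparameter search yielded a better result than the grid search: its best cross-validated mean absolute error is about 8.79 minutes, versus about 9.15 minutes for the grid search.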

In [71]:
# Best hyperparameter combination found by the randomized search.
rnd_search.best_params_

{'max_features': 7, 'n_estimators': 122}

#### Feature importance

In [64]:
# Pair every feature importance of the best randomized-search model with a
# readable feature name and list them from most to least important.
feature_importances = rnd_search.best_estimator_.feature_importances_

# full_pipeline lists the "num" transformer before the "cat" transformer, so
# the prepared matrix holds the scaled numeric columns first, followed by the
# one-hot columns of every categorical attribute in cat_attribs order. The
# names are rebuilt in that same order. Taking only categories_[0] would name
# just the first categorical attribute, and zip() would then silently drop the
# importances of all remaining one-hot columns.
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = [
    "%s=%s" % (column, category)
    for column, categories in zip(cat_attribs, cat_encoder.categories_)
    for category in categories
]
attributes = num_attribs + cat_one_hot_attribs

# Fail loudly instead of showing mislabelled or truncated importances.
assert len(attributes) == len(feature_importances), (
    "expected %d feature names, got %d"
    % (len(feature_importances), len(attributes))
)
pd.DataFrame(sorted(zip(feature_importances, attributes), reverse=True))

Unnamed: 0,0,1
0,0.222137,max_time
1,0.13736,min_time
2,0.041206,age
3,0.039114,hr_min
4,0.033888,BMI
5,0.031367,ex_min
6,0.030836,ex_max
7,0.029409,sleeping_time
8,0.028398,hr_max
9,0.023946,outdoor_temperature
