<a href="https://colab.research.google.com/github/rtajeong/Hallym_univ_M34/blob/main/hallym_lab5_hyper_param.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hyperparameter Tuning

In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [35]:
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn import svm
from sklearn.model_selection import cross_val_score, train_test_split

In [36]:
!curl -L https://goo.gl/s8qSL5 -o bike_train.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  643k  100  643k    0     0   899k      0 --:--:-- --:--:-- --:--:--  899k


In [37]:
df = pd.read_csv('bike_train.csv', parse_dates=['datetime'])
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [38]:
df.dtypes

datetime      datetime64[ns]
season                 int64
holiday                int64
workingday             int64
weather                int64
temp                 float64
atemp                float64
humidity               int64
windspeed            float64
casual                 int64
registered             int64
count                  int64
dtype: object

In [39]:
df['datetime'].dt.year[0], df['datetime'].dt.month[0], df['datetime'].dt.day[0]

(2011, 1, 1)

In [40]:
df['datetime'].dt.hour[0], df['datetime'].dt.minute[0], df['datetime'].dt.second[0]

(0, 0, 0)

In [41]:
df['datetime'].dt.dayofweek[0]

5

In [42]:
d_month = df['datetime'].dt.month
d_hour = df['datetime'].dt.hour
d_dayofweek = df['datetime'].dt.dayofweek

In [43]:
df.head(2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40


In [44]:
df['d_month'] = d_month
df['d_hour'] = d_hour
df['d_dayofweek'] = d_dayofweek

In [45]:
df.head(2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,d_month,d_hour,d_dayofweek
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,1,0,5
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,1,1,5


In [46]:
df.columns

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count',
       'd_month', 'd_hour', 'd_dayofweek'],
      dtype='object')

In [47]:
features = ['season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'd_month', 'd_hour', 'd_dayofweek']

In [48]:
X, y = df[features], df['count']

In [49]:
X.shape, y.shape

((10886, 11), (10886,))

In [52]:
# simple prediction model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model = RandomForestRegressor(n_estimators=100)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.8658090792231083

In [66]:
# different models
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = LinearRegression()
model.fit(X_train, y_train)
print("Linear model: ", model.score(X_test, y_test))

model = DecisionTreeRegressor(max_depth=7)
model.fit(X_train, y_train)
print("Decision Tree model: ", model.score(X_test, y_test))

model = RandomForestRegressor(n_estimators=100)
model.fit(X_train, y_train)
print("Random forest: ", model.score(X_test, y_test))

Linear model:  0.3487662558187098
Decision Tree model:  0.7464048218842357
Random forest:  0.8721153715326093


# hyperparameter selection

In [68]:
# Grid search
n_estimators = 30
max_depth_list = [10, 20, 30]
max_feature_list = [0.3, 0.5, 0.9, 1.0]
hp_lists = []

for i in max_depth_list:
  for j in max_feature_list:
    model = RandomForestRegressor(n_estimators=n_estimators,
                                  max_depth = i,
                                  max_features = j,
                                  random_state = 20,
                                  n_jobs = -1)
    score = cross_val_score(model, X_train, y_train, cv=5).mean()
    hp_lists.append({'score:': score,
                     'max_depth': i,
                     'max_features': j})
hp_lists

[{'max_depth': 10, 'max_features': 0.3, 'score:': 0.7407287837629414},
 {'max_depth': 10, 'max_features': 0.5, 'score:': 0.8086033120343947},
 {'max_depth': 10, 'max_features': 0.9, 'score:': 0.84270199823446},
 {'max_depth': 10, 'max_features': 1.0, 'score:': 0.8424014204374164},
 {'max_depth': 20, 'max_features': 0.3, 'score:': 0.8237484118175502},
 {'max_depth': 20, 'max_features': 0.5, 'score:': 0.8583591099810709},
 {'max_depth': 20, 'max_features': 0.9, 'score:': 0.8603796092776956},
 {'max_depth': 20, 'max_features': 1.0, 'score:': 0.8580284299087697},
 {'max_depth': 30, 'max_features': 0.3, 'score:': 0.8186869135339616},
 {'max_depth': 30, 'max_features': 0.5, 'score:': 0.8565057824464016},
 {'max_depth': 30, 'max_features': 0.9, 'score:': 0.8610090393691621},
 {'max_depth': 30, 'max_features': 1.0, 'score:': 0.8584905232582557}]

In [72]:
# random search
hp_lists = []
num_epoch = 20

for i in range(num_epoch):
  max_depth = np.random.randint(low=5, high=50)
  max_feature = np.random.uniform(low=0.3, high=1.0)
  model = RandomForestRegressor(n_estimators=n_estimators,
                                  max_depth = max_depth,
                                  max_features = max_feature,
                                  random_state = 20,
                                  n_jobs = -1)
  score = cross_val_score(model, X_train, y_train, cv=5).mean()
  hp_lists.append({'score': score,
                   'max_depth': max_depth,
                   'max_features': max_feature})
# hp_lists
pd.DataFrame(hp_lists).sort_values(by='score', ascending=False)

[{'max_depth': 25,
  'max_features': 0.5469004321147375,
  'score': 0.8614338888026387},
 {'max_depth': 16,
  'max_features': 0.43766734376182187,
  'score': 0.8436939950309755},
 {'max_depth': 23,
  'max_features': 0.48990144650826684,
  'score': 0.8584957589230001},
 {'max_depth': 16,
  'max_features': 0.3771863358765116,
  'score': 0.8436939950309755},
 {'max_depth': 47,
  'max_features': 0.3162856706652246,
  'score': 0.818490073198971},
 {'max_depth': 44,
  'max_features': 0.6213905717892567,
  'score': 0.8607701567416738},
 {'max_depth': 28,
  'max_features': 0.6522308311776167,
  'score': 0.8635974012644547},
 {'max_depth': 18,
  'max_features': 0.7146595176938666,
  'score': 0.8622705073107767},
 {'max_depth': 8,
  'max_features': 0.44242884952766875,
  'score': 0.7190526227168301},
 {'max_depth': 38,
  'max_features': 0.9494041716539343,
  'score': 0.8603160719088454},
 {'max_depth': 42,
  'max_features': 0.9596755151335907,
  'score': 0.8603160719088454},
 {'max_depth': 13,
 

In [75]:
pd.DataFrame(hp_lists).sort_values(by='score', ascending=False)

Unnamed: 0,score,max_depth,max_features
6,0.863597,28,0.652231
7,0.862271,18,0.71466
18,0.861629,16,0.711848
13,0.861606,18,0.866432
15,0.861479,27,0.904811
0,0.861434,25,0.5469
17,0.860792,32,0.633735
12,0.86077,45,0.625057
5,0.86077,44,0.621391
10,0.860316,42,0.959676


In [76]:
# fine tuning
hp_lists = []
num_epoch = 20

for i in range(num_epoch):
  max_depth = np.random.randint(low=16, high=35)
  max_feature = np.random.uniform(low=0.62, high=0.9)
  model = RandomForestRegressor(n_estimators=n_estimators,
                                  max_depth = max_depth,
                                  max_features = max_feature,
                                  random_state = 20,
                                  n_jobs = -1)
  score = cross_val_score(model, X_train, y_train, cv=5).mean()
  hp_lists.append({'score': score,
                   'max_depth': max_depth,
                   'max_features': max_feature})
# hp_lists
pd.DataFrame(hp_lists).sort_values(by='score', ascending=False)

Unnamed: 0,score,max_depth,max_features
16,0.86362,33,0.771783
17,0.863597,28,0.724091
7,0.863439,30,0.704716
14,0.863379,29,0.704957
9,0.863342,30,0.776552
18,0.863221,21,0.670976
2,0.862879,25,0.745722
1,0.862833,23,0.702155
6,0.862271,18,0.665134
13,0.862032,24,0.709429


In [78]:
# final model decision
model = RandomForestRegressor(n_estimators=30,
                                  max_depth = 33,
                                  max_features = 0.77,
                                  random_state = 20,
                                  n_jobs = -1)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.8713680285545821

In [79]:
# most significant features

model.feature_importances_

array([0.01825707, 0.00235924, 0.05801557, 0.01929531, 0.06786159,
       0.09046196, 0.07313923, 0.02837971, 0.03959193, 0.55228242,
       0.05035598])

In [81]:
imp_df = pd.DataFrame({'features': features, 'importance': model.feature_importances_})
imp_df.sort_values(by='importance', ascending=False)

Unnamed: 0,features,importance
9,d_hour,0.552282
5,atemp,0.090462
6,humidity,0.073139
4,temp,0.067862
2,workingday,0.058016
10,d_dayofweek,0.050356
8,d_month,0.039592
7,windspeed,0.02838
3,weather,0.019295
0,season,0.018257


# Use Library functions

In [82]:
from sklearn.model_selection import GridSearchCV

params = [{"max_depth": [10, 20, 30],
           "max_features": [0.3, 0.5, 0.9, 1.0]}]

clf = GridSearchCV(RandomForestRegressor(), params, cv=5)
clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jo

In [84]:
best_val = clf.best_estimator_
best_score = clf.best_score_
print(best_val,best_score )
print("final score:", clf.score(X_test, y_test))

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=20, max_features=0.9, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False) 0.8655795026067944
final score: 0.8776590968128535


In [85]:
from sklearn.model_selection import RandomizedSearchCV

n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, 11)]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth}
rf = RandomizedSearchCV(RandomForestRegressor(), random_grid, cv=5)
rf.fit(X_train, y_train)

KeyboardInterrupt: ignored

In [None]:
rf.best_params_, rf.best_estimator_, rf.best_score_

In [None]:
print("Final test score:", rf.score(X_test, y_test))