<a href="https://colab.research.google.com/github/Jinops/ml-study/blob/main/week04/pre/ch5-2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
wine = pd.read_csv('https://bit.ly/wine-date')

data = wine[['alcohol','sugar','pH']].to_numpy()
target = wine['class'].to_numpy()

In [3]:
from sklearn.model_selection import train_test_split
train_input, test_input, train_target, test_target = train_test_split(data, target, test_size=0.2, random_state=42)

In [4]:
from sklearn.model_selection import train_test_split
sub_input, val_input, sub_target, val_target = train_test_split(train_input, train_target, test_size=0.2, random_state=42)

In [5]:
print(sub_input.shape, val_input.shape, train_input.shape)

(4157, 3) (1040, 3) (5197, 3)


In [6]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=42)
dt.fit(sub_input, sub_target)
print(dt.score(sub_input, sub_target))
print(dt.score(val_input, val_target))

0.9971133028626413
0.864423076923077


In [7]:
# k-fold cross validation
from sklearn.model_selection import cross_validate
scores = cross_validate(dt, train_input, train_target) # cv=StratifiedKFold가 자동 적용
print(scores)

import numpy as np
print(np.mean(scores['test_score']))

{'fit_time': array([0.05033183, 0.01314259, 0.01169634, 0.01231647, 0.01117682]), 'score_time': array([0.00175977, 0.00170612, 0.00221086, 0.00209737, 0.00235486]), 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}
0.855300214703487


In [8]:
# same as above : cross_validate에서 StratifiedKFold 분할기가 자동으로 적용 
from sklearn.model_selection import StratifiedKFold

splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_validate(dt, train_input, train_target, cv=splitter)
print(np.mean(scores['test_score']))


0.8574181117533719


In [30]:
# grid search
from sklearn.model_selection import GridSearchCV
params = {'min_impurity_decrease': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005]}
gs = GridSearchCV(DecisionTreeClassifier(), params, n_jobs=-1)
gs.fit(train_input, train_target)

dt = gs.best_estimator_
print('params: ', gs.best_params_)
print('score: ', dt.score(train_input, train_target))
print('best validation score: ', np.max(gs.cv_results_['mean_test_score']))

params:  {'min_impurity_decrease': 0.0001}
score:  0.9615162593804117
best validation score:  0.8683858369734212


In [17]:
params = {'min_impurity_decrease': np.arange(0.0001, 0.001, 0.0001),
          'max_depth': range(5,20,1),
          'min_samples_split': range(2,100,10)
          }

gs = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1)
gs.fit(train_input, train_target)

print(gs.best_params_)

{'max_depth': 14, 'min_impurity_decrease': 0.0004, 'min_samples_split': 12}


In [26]:
from scipy.stats import uniform, randint

rgen = randint(0, 10) #범위 지정
rgen.rvs(10) #n개 생성
#np.unique(rgen.rvs(1000), return_counts=True)

ugen = uniform(0,1)
ugen.rvs(10)


array([0.55940948, 0.54200009, 0.17624767, 0.81799785, 0.5936852 ,
       0.69860624, 0.63401684, 0.65354776, 0.48819842, 0.10820849])

In [32]:
from scipy.stats import uniform, randint
params = {'min_impurity_decrease': uniform(0.0001, 0.001),
          'max_depth': randint(20, 50),
          'min_samples_split': randint(2,25),
          'min_samples_leaf': range(1,25), # not random
          }

from sklearn.model_selection import RandomizedSearchCV
gs = RandomizedSearchCV(DecisionTreeClassifier(), params, n_iter=100, n_jobs=-1)
gs.fit(train_input, train_target)

dt = gs.best_estimator_
print('params: ', gs.best_params_)
print('train score: ', dt.score(train_input, train_target))
print('validation score: ', gs.cv_results_['mean_test_score'])
print('test score: ', dt.score(test_input, test_target))

params:  {'max_depth': 31, 'min_impurity_decrease': 0.0005060710854584864, 'min_samples_leaf': 5, 'min_samples_split': 15}
train score:  0.8870502212815086
validation score:  [0.86800104 0.86145961 0.86068983 0.86184478 0.86626971 0.86222773
 0.86222755 0.86396146 0.86222755 0.86415266 0.86126731 0.86473014
 0.86569316 0.86665414 0.86165174 0.86241986 0.86049808 0.86357518
 0.86780873 0.86338473 0.86203487 0.86184275 0.86800122 0.86165044
 0.86184478 0.86222847 0.86338454 0.86088306 0.86338139 0.86434626
 0.86261327 0.86222884 0.865499   0.86280521 0.86126731 0.86299845
 0.86184312 0.86530743 0.86434664 0.86338473 0.86222958 0.86165026
 0.86203524 0.86511494 0.86280484 0.86242097 0.86222884 0.8629977
 0.86126712 0.86357703 0.86145961 0.86780928 0.86203728 0.86781039
 0.86242004 0.86261253 0.86145961 0.86280503 0.86819594 0.86184478
 0.86280484 0.86203709 0.86838658 0.86511513 0.86222773 0.86261309
 0.86434478 0.86299974 0.8668483  0.86222773 0.86800344 0.86030521
 0.86550067 0.86357592