In [1]:
import pandas as pd

In [2]:
# Load the wine dataset from a remote CSV (needs network access).
wine = pd.read_csv('https://bit.ly/wine_csv_data')


In [3]:
# Preview the first rows to check the columns and values.
wine.head()

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0


In [5]:
# Feature matrix: the three measurement columns as a NumPy array.
data = wine[['alcohol', 'sugar', 'pH']].to_numpy()
data

array([[ 9.4 ,  1.9 ,  3.51],
       [ 9.8 ,  2.6 ,  3.2 ],
       [ 9.8 ,  2.3 ,  3.26],
       ...,
       [ 9.4 ,  1.2 ,  2.99],
       [12.8 ,  1.1 ,  3.34],
       [11.8 ,  0.8 ,  3.26]])

In [6]:
# Label vector: the 'class' column as a NumPy array.
target = wine['class'].to_numpy()

In [8]:
# make train_set, test_set and validation_set
from sklearn.model_selection import train_test_split

In [9]:
# Hold out 20% of the samples (features and labels) as the test set;
# the remaining 80% is the training portion.
# The fixed seed only makes the split reproducible between runs.
train_input, test_input, train_target, test_target = train_test_split(
    data, target, test_size=0.2, random_state=42
)

In [12]:
# Carve a validation set out of the training portion:
# 20% of train_input / train_target becomes val_input / val_target,
# and the rest (sub_input / sub_target) is used for fitting.
sub_input, val_input, sub_target, val_target = train_test_split(
    train_input, train_target, test_size=0.2, random_state=42
)

In [13]:
# Sizes of the reduced training set and of the validation set.
print(sub_input.shape, val_input.shape)

(4157, 3) (1040, 3)


In [14]:
# import decision tree
from sklearn.tree import DecisionTreeClassifier

In [19]:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(sub_input, sub_target)

# Scores on the training subset, the validation set and the test set, in that
# order. A near-perfect training score next to clearly lower held-out scores
# indicates overfitting.
# NOTE(review): scoring the test set this early risks tuning against it --
# consider keeping it for the final evaluation only.
print(dt.score(sub_input, sub_target))
print(dt.score(val_input, val_target))
print(dt.score(test_input, test_target))

0.9971133028626413
0.864423076923077
0.8569230769230769


In [20]:
# Cross Validation
from sklearn.model_selection import cross_validate

# `cv` is left at its default, which runs 5-fold cross validation.
scores = cross_validate(
    dt,
    train_input,
    train_target
)

# The returned dict holds one array per key, with one entry per fold:
#   - fit_time:   time taken to fit (train) the model on that fold
#   - score_time: time taken to score the held-out fold
#   - test_score: score obtained on the held-out (validation) fold
# (Kept as comments: a bare string at the end of a cell would be echoed as output.)
print(scores)

{'fit_time': array([0.00946927, 0.0074656 , 0.00803423, 0.00751328, 0.00649118]), 'score_time': array([0.00111103, 0.00238895, 0.00120759, 0.00113368, 0.00107265]), 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}


In [22]:
import numpy as np

# Average the per-fold test scores into a single cross-validation estimate.
print(np.mean(scores.get("test_score")))

0.855300214703487


Cross-validation gives us a more reliable estimate of the score we can expect from the given model than a single validation split does.
#### But,
`cross_validate()` does not shuffle the training set before splitting it into folds.

We need to pass a splitter with shuffling enabled (e.g. `StratifiedKFold(shuffle=True)`) to randomize which samples of the training set go into each fold.

In [24]:
# Splitter class used to control how the folds are built.
from sklearn.model_selection import StratifiedKFold

# Same cross validation as before, but with the splitter passed explicitly.
# StratifiedKFold() is used with its default arguments here.
scores = cross_validate(dt, train_input, train_target, cv=StratifiedKFold())

In [25]:
# Mean of the per-fold test scores obtained with the explicit splitter.
print(np.mean(scores.get("test_score")))

0.855300214703487


In [26]:
# 10-fold cross validation with shuffling.
# When `cv` is None or an int, cross_validate chooses the splitter itself:
#   - StratifiedKFold for classifiers (binary / multiclass targets)
#   - KFold in all other cases (e.g. regression)
# To shuffle the samples and change the number of folds, build the splitter
# explicitly and pass it in.
splitter = StratifiedKFold(
    n_splits=10,  # k in k-fold cross validation
    shuffle=True,  # shuffle the training samples before splitting into folds
    random_state=42
)
# Keep only the per-fold test scores from the result dict.
scores = cross_validate(
    dt,
    train_input,
    train_target,
    cv=splitter
).get("test_score")

In [28]:
# `scores` is already the test_score array here, so average it directly.
print(np.mean(scores))

0.8574181117533719


##### Hyper-Parameter Tuning

In [30]:
from sklearn.model_selection import GridSearchCV

In [39]:
# Candidate values for a single hyper-parameter.
params = {
    'min_impurity_decrease': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005]
}

# n_jobs=-1 uses all available CPU cores
gs = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    params,
    n_jobs = -1
)

# After fitting:
#   - gs.best_estimator_: the model with the best parameter value, trained on
#     the whole train_input and train_target
#   - gs.best_params_: the best combination of parameter values
# (Kept as comments: a bare string at the end of the cell is echoed as output.)
gs.fit(train_input, train_target)

'\n# best_estimator_\n    # store the model with the best parameter value which is trained with the whole train_input and train_target\n# best_params_\n    # store the best combination of each parameter value\n'

In [40]:
# Rebind dt to the best model found by the search (replaces the earlier tree).
dt = gs.best_estimator_

In [41]:
# Score of the selected model on the full training portion.
print(dt.score(train_input, train_target))

0.9615162593804117


In [42]:
# Parameter setting that the grid search selected.
print(gs.best_params_)

{'min_impurity_decrease': 0.0001}


In [45]:
# Mean cross-validated test score of every candidate parameter setting.
print(gs.cv_results_.get("mean_test_score"))

[0.86819297 0.86453617 0.86492226 0.86780891 0.86761605]


In [46]:
# Recover the winning candidate by hand: find the position of the highest
# mean test score and look up the parameter setting stored at that position.
best_index = np.argmax(gs.cv_results_['mean_test_score'])
print(gs.cv_results_['params'][best_index])

{'min_impurity_decrease': 0.0001}


In [49]:
# Multiple Parameters
# Search three hyper-parameters at once. GridSearchCV fits every combination
# of the values below, so the cost grows with the product of the three sizes.
params = {
    'min_impurity_decrease': np.arange(0.0001, 0.001, 0.0001),
    'max_depth': range(5, 20, 1),
    'min_samples_split': range(2, 100, 10),
}

gs = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    params,
    n_jobs=-1,  # use every available CPU core
)

gs.fit(train_input, train_target)

In [50]:
# Best parameter combination found by the multi-parameter grid search.
print(gs.best_params_)

{'max_depth': 14, 'min_impurity_decrease': 0.0004, 'min_samples_split': 12}


In [51]:
# Highest mean cross-validated test score among all candidates.
print(np.max(gs.cv_results_['mean_test_score']))

0.8683865773302731


### Random Search

In [52]:
from scipy.stats import uniform, randint
# uniform distribution

In [54]:
# Frozen discrete uniform distribution over the integers 0..9
# (the upper bound 10 is exclusive); draw 10 samples from it.
rgen = randint(0, 10)
rgen.rvs(10)

array([9, 4, 8, 7, 9, 5, 5, 1, 9, 6])

In [55]:
# Draw 1000 samples and count how often each distinct value occurs.
np.unique(rgen.rvs(1000), return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([ 85, 109,  98, 100,  95,  91, 104,  92, 111, 115]))

In [59]:
# Frozen continuous uniform distribution with loc=0 and scale=1,
# i.e. values between 0 and 1; draw 10 samples from it.
ugen = uniform(0, 1)
ugen.rvs(10)

array([0.43149716, 0.12695694, 0.27745152, 0.40828594, 0.81454785,
       0.86932172, 0.27749653, 0.28447367, 0.76262111, 0.32257441])

In [60]:
# Search space for the randomized search: each entry is a frozen scipy
# distribution that is sampled from, rather than a fixed list of values.
params = {
    # uniform(loc, scale) covers [loc, loc + scale], i.e. 0.0001 .. 0.0011
    'min_impurity_decrease': uniform(0.0001, 0.001),
    # randint(low, high) excludes `high`: integers 20 .. 49
    'max_depth': randint(20, 50),
    # integers 2 .. 24
    'min_samples_split': randint(2, 25),
    # integers 1 .. 24
    'min_samples_leaf': randint(1, 25),
}

In [66]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search over the distributions in `params` (reuses the name `gs`).
gs = RandomizedSearchCV(
    DecisionTreeClassifier(random_state=42),
    params,
    n_iter = 1000, # number of parameter settings sampled (1000, not 100)
    n_jobs = -1,
    random_state = 42 # fixed seed so the sampled settings are reproducible
)

In [67]:
# Run the randomized search on the full training portion.
gs.fit(train_input, train_target)

In [68]:
# Best parameter combination among the sampled settings.
print(gs.best_params_)

{'max_depth': 44, 'min_impurity_decrease': 0.00042838122788508106, 'min_samples_leaf': 1, 'min_samples_split': 13}


In [69]:
# Highest mean cross-validated test score among the sampled settings.
print(np.max(gs.cv_results_['mean_test_score']))

0.8703105796994152


In [70]:
# Final check: score the best model from the search on the held-out test set.
dt = gs.best_estimator_
print(dt.score(test_input, test_target))

0.8630769230769231
