###  import libs

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris

### load iris datasets from sklearn

In [6]:
data = load_iris()

In [7]:
print(type(data))

<class 'sklearn.utils.Bunch'>


### about sklearn.utils.Bunch

In [11]:
print(data.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [12]:
print(data.feature_names)
print(data.target_names)

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
['setosa' 'versicolor' 'virginica']


### bunch to dataframe

In [13]:
df = pd.DataFrame(columns=data.feature_names, data=data.data)
df["label"] = data.target
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


### data split

In [14]:
from sklearn.model_selection import train_test_split

In [25]:
X = df.iloc[:,:-1]
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### classification using decision tree classifier

In [26]:
from sklearn.tree import DecisionTreeClassifier

In [37]:
clf = DecisionTreeClassifier()

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

### metrics and scoring for quantifying the quality of predictions 

In [36]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

In [38]:
print("decision tree score using train : {}".format(clf.score(X_train, y_train)))
print("decision tree score using test : {}".format(clf.score(X_test, y_test)))
print("accuracy : {}".format(accuracy_score(y_test, y_pred)))
print("MSE : {}".format(mean_squared_error(y_test, y_pred)))

decision tree score using train : 1.0
decision tree score using test : 1.0
accuracy : 1.0
MSE : 0.0


### k fold cross validation

In [39]:
from sklearn.model_selection import KFold

In [51]:
kfold = KFold(n_splits=5)
num_iter = 0

cv_acc = []

for train_idx, test_idx in kfold.split(X):
    X_train, X_test = X.iloc[train_idx, :], X.iloc[test_idx, :]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    num_iter = num_iter + 1
    acc = round(accuracy_score(y_test, y_pred), 10)
    
    print("{}번 검증 셋 정확도 : {} ".format(num_iter, acc))
    cv_acc.append(acc)

print("mean of accuracy : {} ".format(round(np.mean(cv_acc),7)))

1번 검증 셋 정확도 : 1.0 
2번 검증 셋 정확도 : 0.9666666667 
3번 검증 셋 정확도 : 0.9 
4번 검증 셋 정확도 : 0.9333333333 
5번 검증 셋 정확도 : 0.7333333333 
mean of accuracy : 0.9066667 


### cross_val_score

In [52]:
from sklearn.model_selection import cross_val_score

In [55]:
scores = cross_val_score(clf, X, y, scoring="accuracy", cv=5)
print(scores)
print(np.mean(scores))

[0.96666667 0.96666667 0.9        1.         1.        ]
0.9666666666666668


### GridSearchCV

In [56]:
from sklearn.model_selection import GridSearchCV

In [57]:
clf = DecisionTreeClassifier()
param = {
    "max_depth":[1, 2, 3, 4, 5],
    "min_samples_split":[2, 3, 4]
}

grid_clf = GridSearchCV(clf, param_grid=param, cv=5, refit=True)
grid_clf.fit(X_train, y_train)

scores_df = pd.DataFrame(grid_clf.cv_results_)
scores_df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001982,1.5e-05,0.001403,0.000497,1,2,"{'max_depth': 1, 'min_samples_split': 2}",0.833333,0.833333,0.833333,0.833333,0.833333,0.833333,0.0,13
1,0.001994,4e-06,0.001202,0.00041,1,3,"{'max_depth': 1, 'min_samples_split': 3}",0.833333,0.833333,0.833333,0.833333,0.833333,0.833333,0.0,13
2,0.002,0.00064,0.001189,0.000405,1,4,"{'max_depth': 1, 'min_samples_split': 4}",0.833333,0.833333,0.833333,0.833333,0.833333,0.833333,0.0,13
3,0.001602,0.000492,0.001197,0.000398,2,2,"{'max_depth': 2, 'min_samples_split': 2}",1.0,0.958333,0.916667,0.916667,0.958333,0.95,0.03118,2
4,0.001998,1.2e-05,0.001193,0.000402,2,3,"{'max_depth': 2, 'min_samples_split': 3}",1.0,0.958333,0.916667,0.916667,0.958333,0.95,0.03118,2


In [60]:
print(type(grid_clf))
scores_df[["params", "mean_test_score", "rank_test_score"]]

<class 'sklearn.model_selection._search.GridSearchCV'>


Unnamed: 0,params,mean_test_score,rank_test_score
0,"{'max_depth': 1, 'min_samples_split': 2}",0.833333,13
1,"{'max_depth': 1, 'min_samples_split': 3}",0.833333,13
2,"{'max_depth': 1, 'min_samples_split': 4}",0.833333,13
3,"{'max_depth': 2, 'min_samples_split': 2}",0.95,2
4,"{'max_depth': 2, 'min_samples_split': 3}",0.95,2
5,"{'max_depth': 2, 'min_samples_split': 4}",0.95,2
6,"{'max_depth': 3, 'min_samples_split': 2}",0.95,2
7,"{'max_depth': 3, 'min_samples_split': 3}",0.941667,10
8,"{'max_depth': 3, 'min_samples_split': 4}",0.95,2
9,"{'max_depth': 4, 'min_samples_split': 2}",0.95,2


In [63]:
print(grid_clf.best_params_)
print(grid_clf.best_score_)

clf = grid_clf.best_estimator_
print(clf.score(X_test, y_test))

{'max_depth': 5, 'min_samples_split': 2}
0.9583333333333334
0.7333333333333333


### label encoding

In [78]:
from sklearn.preprocessing import LabelEncoder

items = ["한국", "미국", "미국", "중국", "러시아", "호주"]

encoder = LabelEncoder()
encoder.fit(items)

lables = encoder.transform(items)
print(lables)
print(encoder.classes_)

[3 1 1 2 0 4]
['러시아' '미국' '중국' '한국' '호주']


### one hot encoding

In [80]:
from sklearn.preprocessing import OneHotEncoder
lables = lables.reshape(-1,1)
print(lables)

oh = OneHotEncoder()
oh.fit(lables)
oh_lables = oh.transform(lables)

print(oh_lables.toarray())

[[3]
 [1]
 [1]
 [2]
 [0]
 [4]]
[[0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1.]]


### Standard Scaler 
- feature transformes to a value with a mean of zero and variance of 1

In [89]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
df_scaled = pd.DataFrame(data=X_scaled, columns=data.feature_names)


print("befor scaling")
print(X.mean())
print(X.var())

print("\nafter scaling")
print(df_scaled.mean())
print(df_scaled.var())

befor scaling
sepal length (cm)    5.843333
sepal width (cm)     3.057333
petal length (cm)    3.758000
petal width (cm)     1.199333
dtype: float64
sepal length (cm)    0.685694
sepal width (cm)     0.189979
petal length (cm)    3.116278
petal width (cm)     0.581006
dtype: float64

after scaling
sepal length (cm)   -1.690315e-15
sepal width (cm)    -1.842970e-15
petal length (cm)   -1.698641e-15
petal width (cm)    -1.409243e-15
dtype: float64
sepal length (cm)    1.006711
sepal width (cm)     1.006711
petal length (cm)    1.006711
petal width (cm)     1.006711
dtype: float64


In [88]:
clf = SVC()
scores = cross_val_score(clf, df_scaled, y, scoring="accuracy", cv=5)
print("cross validation score with scaled data")
print(scores)
print(np.mean(scores))


scores = cross_val_score(clf, X, y, scoring="accuracy", cv=5)
print("cross validation score")
print(scores)
print(np.mean(scores))

cross validation score with scaled data
[0.96666667 0.96666667 0.96666667 0.93333333 1.        ]
0.9666666666666666
cross validation score
[0.96666667 0.96666667 0.96666667 0.93333333 1.        ]
0.9666666666666666


### MinMaxScaler
- transforms data to a value between 0 and 1

(if negative value included, transform between -1 and 1) 

In [90]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

df_scaled = pd.DataFrame(data=X_scaled, columns=data.feature_names)
print(df_scaled.min())
print(df_scaled.max())

sepal length (cm)    0.0
sepal width (cm)     0.0
petal length (cm)    0.0
petal width (cm)     0.0
dtype: float64
sepal length (cm)    1.0
sepal width (cm)     1.0
petal length (cm)    1.0
petal width (cm)     1.0
dtype: float64
