In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# digits image classification
---

In [2]:
# Load scikit-learn's bundled digits dataset and show which fields the
# returned container exposes.
from sklearn.datasets import load_digits

digits = load_digits()
digits_fields = digits.keys()
print(digits_fields)

dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'images', 'DESCR'])


In [3]:
# Pull the feature matrix and the label vector out of the dataset container,
# then print their type, shapes and the set of label values.
digits_data, digits_label = digits.data, digits.target

print(type(digits_data))
print('data shape :', digits_data.shape)
print('label shape', digits_label.shape)
print('label velue : ', digits.target_names)

<class 'numpy.ndarray'>
data shape : (1797, 64)
label shape (1797,)
label velue :  [0 1 2 3 4 5 6 7 8 9]


#### digits dataset은 8x8(64)의 pixel을 가진 1797개 image이다

In [4]:
# Split the digits data into a training set and a test set (20% held out,
# fixed seed so the split is the same on every run).
X_train, X_test, y_train, y_test = train_test_split(
    digits_data,
    digits_label,
    test_size=0.2,
    random_state=7,
)

# Print the shape of each of the four resulting arrays.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(1437, 64)
(360, 64)
(1437,)
(360,)


In [18]:
from sklearn.metrics import accuracy_score

# Fit five classifiers on the digits training split and print each one's
# accuracy on the held-out test split.
#
# Each estimator keeps its own module-level name because later cells refer to
# them directly (for example `svm` is reused for the confusion matrix).
decision_tree = DecisionTreeClassifier(random_state=32)
rf = RandomForestClassifier(random_state=32)
svm = SVC()
# SGDClassifier shuffles the training data by default, so an unseeded instance
# reports a different accuracy on every run. Pin the seed like the tree models.
sgd = SGDClassifier(random_state=32)
# max_iter is set well above the library default; presumably to avoid
# convergence warnings on the unscaled pixel values -- TODO confirm.
LR = LogisticRegression(max_iter=10000)

# (label used in the printed line, estimator). The labels are kept exactly as
# they were printed before so the report format does not change.
digits_models = [
    ('decision_tree', decision_tree),
    ('RandomForestClassifier', rf),
    ('Supprot Vector Machine', svm),
    ('Stochastic Gradient Descent Classifier', sgd),
    ('LogisticRegression', LR),
]

for model_name, model in digits_models:
    model.fit(X_train, y_train)
    # y_pred is overwritten on each pass; after the loop it holds the
    # predictions of the last model in the list (LogisticRegression).
    y_pred = model.predict(X_test)
    print(model_name + ' accuracy score : ', accuracy_score(y_test, y_pred))

decision_tree accuracy score :  0.8555555555555555
RandomForestClassifier accuracy score :  0.9638888888888889
Supprot Vector Machine accuracy score :  0.9888888888888889
Stochastic Gradient Descent Classifier accuracy score :  0.9416666666666667
LogisticRegression accuracy score :  0.9527777777777777


#### 해당 data에 최고 성능을 찾는 model을 학습하는 것이 주 목적이 아니기에
#### 모든 model의 hyperparameter는 default 값으로 학습하였고
#### 이때 predict accuracy가 가장 높은 model은 SVM이었다

In [6]:
# y_pred was overwritten by the other models in the comparison cell, so fit
# the SVM again and regenerate its test-set predictions.
y_pred = svm.fit(X_train, y_train).predict(X_test)

In [7]:
# Rows are the true digit, columns the predicted digit.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[43,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 42,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0, 40,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0, 34,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0, 37,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0, 28,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0, 28,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0, 33,  0,  0],
       [ 0,  2,  0,  0,  0,  1,  0,  0, 40,  0],
       [ 0,  0,  0,  0,  0,  1,  0,  0,  0, 31]])

#### 사실 label별 중요도가 존재하지 않는 multi-class dataset 이기에
#### confusion matrix를 통한 평가지표는 큰 의미가 없다고 생각하지만
#### confusion matrix를 살펴보았을 때
#### 8 의 recall값이 가장 낮았고 (8을 다른 숫자로 예측할 확률이 높은 것)
#### 5 의 precision값이 가장 낮았다 (다른 숫자를 5로 예측할 확률이 높은 것)
#### 아래와 같이 표를 통해 확인할 수 있다

In [8]:
# Per-class precision / recall / f1 for the SVM predictions on the digits test set.
digits_report = classification_report(y_true=y_test, y_pred=y_pred)
print(digits_report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        43
           1       0.95      1.00      0.98        42
           2       1.00      1.00      1.00        40
           3       1.00      1.00      1.00        34
           4       1.00      1.00      1.00        37
           5       0.93      1.00      0.97        28
           6       1.00      1.00      1.00        28
           7       1.00      1.00      1.00        33
           8       1.00      0.93      0.96        43
           9       1.00      0.97      0.98        32

    accuracy                           0.99       360
   macro avg       0.99      0.99      0.99       360
weighted avg       0.99      0.99      0.99       360



# wine dataset classification
---

In [9]:
# Load scikit-learn's bundled wine dataset, keep the feature matrix and the
# label vector under their own names, and display the available fields.
from sklearn.datasets import load_wine

wine = load_wine()
wine_data, wine_label = wine.data, wine.target

wine.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])

In [10]:
# Print, in order: the feature-matrix shape, the feature names, the class names.
for wine_summary in (wine_data.shape, wine.feature_names, wine.target_names):
    print(wine_summary)

(178, 13)
['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']
['class_0' 'class_1' 'class_2']


In [45]:
import collections
import pandas as pd

# Wrap the raw wine feature matrix in a DataFrame labelled with the dataset's
# feature names, then print the per-column non-null counts and dtypes.
df_wine = pd.DataFrame(wine_data, columns=wine.feature_names)
df_wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alcohol                       178 non-null    float64
 1   malic_acid                    178 non-null    float64
 2   ash                           178 non-null    float64
 3   alcalinity_of_ash             178 non-null    float64
 4   magnesium                     178 non-null    float64
 5   total_phenols                 178 non-null    float64
 6   flavanoids                    178 non-null    float64
 7   nonflavanoid_phenols          178 non-null    float64
 8   proanthocyanins               178 non-null    float64
 9   color_intensity               178 non-null    float64
 10  hue                           178 non-null    float64
 11  od280/od315_of_diluted_wines  178 non-null    float64
 12  proline                       178 non-null    float64
dtypes: fl

In [47]:
# For every feature column, print its name followed by the frequency of each
# distinct value. Iterate over (name, column) pairs directly instead of
# indexing by position with range(len(...)) / iloc.
for column_name, column_values in df_wine.items():
    print(column_name, " :")
    print(column_values.value_counts(), '\n')

alcohol  :
12.37    6
13.05    6
12.08    5
12.29    4
12.00    3
        ..
13.34    1
13.69    1
13.90    1
13.84    1
13.75    1
Name: alcohol, Length: 126, dtype: int64 

malic_acid  :
1.73    7
1.81    4
1.67    4
1.68    3
1.61    3
       ..
3.45    1
2.51    1
4.61    1
3.83    1
2.68    1
Name: malic_acid, Length: 133, dtype: int64 

ash  :
2.30    7
2.28    7
2.70    6
2.36    6
2.32    6
       ..
2.16    1
2.78    1
2.53    1
1.71    1
1.95    1
Name: ash, Length: 79, dtype: int64 

alcalinity_of_ash  :
20.0    15
21.0    11
16.0    11
18.0    10
19.0     9
        ..
19.4     1
11.2     1
21.6     1
18.1     1
14.8     1
Name: alcalinity_of_ash, Length: 63, dtype: int64 

magnesium  :
88.0     13
86.0     11
98.0      9
101.0     9
96.0      8
102.0     7
112.0     6
85.0      6
94.0      6
80.0      5
92.0      5
89.0      5
97.0      5
103.0     5
107.0     4
106.0     4
90.0      4
108.0     4
104.0     3
111.0     3
78.0      3
116.0     3
95.0      3
120.0     3
110.0

#### feature name 을 살펴보니 13개의 feature 는 wine의 성분들을 뜻하는 듯 하다
#### 전부 수치데이터라 뭐가뭔진 잘 모르겠다
#### 해당 dataset은 3개의 label을 가지고있으며 178개의 data를 가지고있다

In [11]:
# Split the wine data into a training set and a test set (20% held out,
# fixed seed so the split is the same on every run).
wine_X_train, wine_X_test, wine_y_train, wine_y_test = train_test_split(
    wine_data,
    wine_label,
    test_size=0.2,
    random_state=7,
)

# Print the shape of each of the four resulting arrays.
for wine_split_part in (wine_X_train, wine_X_test, wine_y_train, wine_y_test):
    print(wine_split_part.shape)

(142, 13)
(36, 13)
(142,)
(36,)


In [39]:
# Fit five classifiers on the wine training split and print each one's
# accuracy on the held-out test split.
#
# Each estimator keeps its own module-level name so it can still be inspected
# or reused after this cell has run.
decision_tree = DecisionTreeClassifier(random_state=32)
rf = RandomForestClassifier(random_state=8)
# Support Vector Machine with a linear kernel (not the library default).
svm = SVC(kernel='linear')
# SGDClassifier shuffles the training data by default, so an unseeded instance
# reports a different accuracy on every run. Pin the seed for reproducibility.
sgd = SGDClassifier(random_state=32)
# max_iter is set well above the library default; presumably to avoid
# convergence warnings on the unscaled features -- TODO confirm.
LR = LogisticRegression(max_iter=10000)

# (label used in the printed line, estimator). The labels are kept exactly as
# they were printed before so the report format does not change.
wine_models = [
    ('decision_tree', decision_tree),
    ('RandomForestClassifier', rf),
    ('Supprot Vector Machine', svm),
    ('Stochastic Gradient Descent Classifier', sgd),
    ('LogisticRegression', LR),
]

for model_name, model in wine_models:
    model.fit(wine_X_train, wine_y_train)
    # wine_y_pred is overwritten on each pass; after the loop it holds the
    # predictions of the last model in the list (LogisticRegression).
    wine_y_pred = model.predict(wine_X_test)
    print(model_name + ' accuracy score : ', accuracy_score(wine_y_test, wine_y_pred))

decision_tree accuracy score :  0.9444444444444444
RandomForestClassifier accuracy score :  1.0
Supprot Vector Machine accuracy score :  0.9444444444444444
Stochastic Gradient Descent Classifier accuracy score :  0.5277777777777778
LogisticRegression accuracy score :  0.9722222222222222


#### 해당 dataset은 Decision Tree, Random Forest, Logistic Regression 에 대하여 높은 예측 성능을 보였고 SGD는 낮은 성능을 보인다. SVM은 디폴트 파라미터에선 낮은 성능을 보였지만 파라미터인 kernel을 linear로 설정하여 성능을 높였다.
#### Logistic Regression model을 사용한 예측은 다음과 같은 지표를 보인다

In [22]:
# Per-class metrics for the logistic-regression model on the wine test set.
# Predict from `LR` directly rather than reusing `wine_y_pred`: that variable
# is overwritten by every model in the comparison cell and only holds the
# logistic-regression output because that model happens to be fitted last.
print(classification_report(wine_y_test, LR.predict(wine_X_test)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       0.94      1.00      0.97        17
           2       1.00      0.92      0.96        12

    accuracy                           0.97        36
   macro avg       0.98      0.97      0.98        36
weighted avg       0.97      0.97      0.97        36



# breast cancer data classification
---

In [23]:
# Load scikit-learn's bundled breast-cancer dataset, keep the feature matrix
# and the label vector under their own names, and display the available fields.
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()
cancer_data, cancer_label = cancer.data, cancer.target

cancer.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [24]:
# Print, in order: the feature-matrix shape, the feature names, the class names.
for cancer_summary in (cancer_data.shape, cancer.feature_names, cancer.target_names):
    print(cancer_summary)

(569, 30)
['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
['malignant' 'benign']


#### cancer dataset은 30개의 feature를 갖고있는 569개의 sample로 이루어진 dataset이다
#### malignant(악성)과 benign(양성) 의 label을 가진 binary classification 문제다.

In [26]:
# Split the breast-cancer data into a training set and a test set (20% held
# out, fixed seed so the split is the same on every run).
cancer_X_train, cancer_X_test, cancer_y_train, cancer_y_test = train_test_split(
    cancer_data,
    cancer_label,
    test_size=0.2,
    random_state=7,
)

# Print the shape of each of the four resulting arrays.
for cancer_split_part in (cancer_X_train, cancer_X_test, cancer_y_train, cancer_y_test):
    print(cancer_split_part.shape)

(455, 30)
(114, 30)
(455,)
(114,)


In [42]:
# Fit five classifiers on the breast-cancer training split and print each
# one's accuracy on the held-out test split.
#
# Each estimator keeps its own module-level name so it can still be inspected
# or reused after this cell has run.
decision_tree = DecisionTreeClassifier(random_state=32)
rf = RandomForestClassifier(random_state=32)
# Support Vector Machine with a linear kernel (not the library default).
svm = SVC(kernel='linear')
# SGDClassifier shuffles the training data by default, so an unseeded instance
# reports a different accuracy on every run. Pin the seed for reproducibility.
sgd = SGDClassifier(random_state=32)
# max_iter is set well above the library default; presumably to avoid
# convergence warnings on the unscaled features -- TODO confirm.
LR = LogisticRegression(max_iter=10000)

# (label used in the printed line, estimator). The labels are kept exactly as
# they were printed before so the report format does not change.
cancer_models = [
    ('decision_tree', decision_tree),
    ('RandomForestClassifier', rf),
    ('Supprot Vector Machine', svm),
    ('Stochastic Gradient Descent Classifier', sgd),
    ('LogisticRegression', LR),
]

for model_name, model in cancer_models:
    model.fit(cancer_X_train, cancer_y_train)
    # cancer_y_pred is overwritten on each pass; after the loop it holds the
    # predictions of the last model in the list (LogisticRegression).
    cancer_y_pred = model.predict(cancer_X_test)
    print(model_name + ' accuracy score : ', accuracy_score(cancer_y_test, cancer_y_pred))

decision_tree accuracy score :  0.9122807017543859
RandomForestClassifier accuracy score :  1.0
Supprot Vector Machine accuracy score :  0.9473684210526315
Stochastic Gradient Descent Classifier accuracy score :  0.8245614035087719
LogisticRegression accuracy score :  0.9473684210526315


#### 해당 dataset은 대부분의 모델에서 좋은 성능을 보였으며
#### RandomForestClassifier 모델이 완벽한 성능을 보였다