# 프로젝트_머신러닝으로 데이터 분류하기

## 1. 손글씨 분류하기

### 1.1 필요한 모듈 import하기

In [40]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# plt.style.use('ggplot')
sns.set(font_scale=1.5) # 이 두줄은 본 필자가 항상 쓰는 방법입니다. matplotlib 의 기본 scheme 말고 seaborn scheme 을 세팅하고, 일일이 graph 의 font size 를 지정할 필요 없이 seaborn 의 font_scale 을 사용하면 편합니다.

# ignore warnings
# import warnings
# warnings.filterwarnings('ignore')
%matplotlib inline

# 손글씨 데이터
from sklearn.datasets import load_digits

# 데이터를 나눠준다.
from sklearn.model_selection import train_test_split

# 모델
from sklearn.tree import DecisionTreeClassifier # 의사결정나무 모델
from sklearn.ensemble import RandomForestClassifier # 랜덤 포레스트 모델
from sklearn import svm # 서포트 벡터 머신 모델
from sklearn.linear_model import SGDClassifier # SGD Classifier 모델
from sklearn.linear_model import LogisticRegression # 로지스틱회귀 모델

# 모델 평가
# 정확도, 정밀도, 재현율, f1
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report # 분류 리포트
from sklearn.metrics import confusion_matrix # 오차행렬

### 1.2 데이터 준비

`load_digts` 메서드 사용합니다.

In [3]:
digits = load_digits()
digits.keys()

dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'images', 'DESCR'])

### 1.3 데이터 이해하기

+ Feature Data 지정하기
+ Label Data 지정하기
+ Target Names 출력해 보기
+ 데이터 Describe 해 보기

In [4]:
# feature data 지정하기
digits_data = digits.data
digits_data.shape

(1797, 64)

In [8]:
# label data 지정하기
digits_label = digits.target
digits_label.shape

(1797,)

In [15]:
# feature_names
digits.feature_names[:5]

['pixel_0_0', 'pixel_0_1', 'pixel_0_2', 'pixel_0_3', 'pixel_0_4']

In [9]:
# target_names
digits.target_names

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [11]:
# describe
print(digits.DESCR)

.. _digits_dataset:

Optical recognition of handwritten digits dataset
--------------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 5620
    :Number of Attributes: 64
    :Attribute Information: 8x8 image of integer pixels in the range 0..16.
    :Missing Attribute Values: None
    :Creator: E. Alpaydin (alpaydin '@' boun.edu.tr)
    :Date: July; 1998

This is a copy of the test set of the UCI ML hand-written digits datasets
https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits

The data set contains images of hand-written digits: 10 classes where
each class refers to a digit.

Preprocessing programs made available by NIST were used to extract
normalized bitmaps of handwritten digits from a preprinted form. From a
total of 43 people, 30 contributed to the training set and different 13
to the test set. 32x32 bitmaps are divided into nonoverlapping blocks of
4x4 and the number of on pixels are counted in each blo

In [16]:
# 손글씨 데이터 데이터프레임으로 만들기
digits_df = pd.DataFrame(data=digits_data, columns=digits.feature_names)
digits_df.head()

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_6,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,9.0,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0


In [17]:
# label 컬럼 추가하기
digits_df['label'] = digits_label
digits_df.head()

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7,label
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0,1
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0,2
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0,3
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0,4


In [18]:
# 라벨 컬럼 개수 확인하기
digits_df['label'].value_counts().sort_index()

0    178
1    182
2    177
3    183
4    181
5    182
6    181
7    179
8    174
9    180
Name: label, dtype: int64

### 1.4 train, test 데이터 분리

모델 학습과 테스트용 문제지와 정답지를 준비해 봅시다.

X_train, X_test, y_train, y_test를 생성하는 방법을 참고해 보세요.

In [23]:
X_train, X_test, y_train, y_test = train_test_split(digits_data, # input data(feature)
                                                    digits_label, # ouput data(label or target)
                                                    test_size=0.2, # 테스트셋의 비율(전체 중 20%, 기본값은 0.25)
                                                    stratify=digits.target, # 각 class들을 같은 비율로 나눠준다.
                                                    random_state=8) # random 시드값

print('X_train 개수: ', len(X_train), ', X_test 개수: ', len(X_test))

X_train 개수:  1437 , X_test 개수:  360


In [24]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print()

# 비율 확인하기
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

(1437, 64) (360, 64) (1437,) (360,)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), array([142, 145, 142, 146, 145, 146, 145, 143, 139, 144]))
(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), array([36, 37, 35, 37, 36, 36, 36, 36, 35, 36]))


### 1.5 다양한 모델로 학습시켜보기

학습데이터 X_train, y_train 을 활용해 분류기 모델을 만들어 봅시다. 어떤 모델이 가장 좋은 성능을 보일까요?

+ Decision Tree 사용해 보기
+ Random Forest 사용해 보기
+ SVM 사용해 보기
+ SGD Classifier 사용해 보기
+ Logistic Regression 사용해 보기

#### 1.5.1 Decision Tree 모델

In [88]:
# 모델 생성하기
decision_tree = DecisionTreeClassifier(random_state=32)
print(decision_tree._estimator_type)

# 모델 학습하기
decision_tree.fit(X_train, y_train)

# 검증하기
y_pred = decision_tree.predict(X_test)

# 평가하기
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

classifier
0.8508771929824561
0.8767123287671232
0.8888888888888888
0.8827586206896552
              precision    recall  f1-score   support

           0       0.80      0.79      0.80        42
           1       0.88      0.89      0.88        72

    accuracy                           0.85       114
   macro avg       0.84      0.84      0.84       114
weighted avg       0.85      0.85      0.85       114

[[33  9]
 [ 8 64]]


#### 1.5.2 Random Forest 모델

In [89]:
# 모델 생성하기
random_forest = RandomForestClassifier(random_state=32)
print(random_forest._estimator_type)

# 모델 학습하기
random_forest.fit(X_train, y_train)

# 예측하기
y_pred = random_forest.predict(X_test)

# 평가하기
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

classifier
0.9736842105263158
0.9726027397260274
0.9861111111111112
0.9793103448275863
              precision    recall  f1-score   support

           0       0.98      0.95      0.96        42
           1       0.97      0.99      0.98        72

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

[[40  2]
 [ 1 71]]


#### 1.5.3 Support Vector Machine (SVM) 모델

In [90]:
# 모델 생성하기
svm_model = svm.SVC()
print(svm_model._estimator_type)

# 모델 학습하기
svm_model.fit(X_train, y_train)

# 예측하기
y_pred = random_forest.predict(X_test)

# 평가하기
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

classifier
0.9736842105263158
0.9726027397260274
0.9861111111111112
0.9793103448275863
              precision    recall  f1-score   support

           0       0.98      0.95      0.96        42
           1       0.97      0.99      0.98        72

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

[[40  2]
 [ 1 71]]


#### 1.5.4 Stochastic Gradient Descent Classifier (SGDClassifier) 모델

In [91]:
# 모델 생성하기
sgd_model = SGDClassifier()
print(sgd_model._estimator_type)

# 모델 학습하기
sgd_model.fit(X_train, y_train)

# 예측하기
y_pred = sgd_model.predict(X_test)

# 평가하기
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

classifier
0.8508771929824561
0.9365079365079365
0.8194444444444444
0.874074074074074
              precision    recall  f1-score   support

           0       0.75      0.90      0.82        42
           1       0.94      0.82      0.87        72

    accuracy                           0.85       114
   macro avg       0.84      0.86      0.85       114
weighted avg       0.87      0.85      0.85       114

[[38  4]
 [13 59]]


#### 1.5.5 Logistic Regression

In [92]:
# 모델 생성하기
logistic_model = LogisticRegression()
print(logistic_model._estimator_type)

# 모델 학습하기
logistic_model.fit(X_train, y_train)

# 예측하기
y_pred = logistic_model.predict(X_test)

# 평가하기
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

classifier
0.9385964912280702
0.9577464788732394
0.9444444444444444
0.951048951048951
              precision    recall  f1-score   support

           0       0.91      0.93      0.92        42
           1       0.96      0.94      0.95        72

    accuracy                           0.94       114
   macro avg       0.93      0.94      0.93       114
weighted avg       0.94      0.94      0.94       114

[[39  3]
 [ 4 68]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### 1.6 모델을 평가해보기

학습된 모델들의 테스트데이터 예측 결과를 어떻게 해석해야 할까요? 모델의 성능을 평가하는 지표로는 무엇이 좋을까요? sklearn.metrics 에서 제공하는 평가지표 중 적절한 것을 선택해 보세요. 선택하신 이유도 설명해 주세요.

데이터가 고르게 분포되어 있고, 특별히 오차에 있어서 큰 영향을 받지 않으므로 정확도로 평가했다.

Decision Tree를 제외한 나머지 4개의 모델에서 높은 정확도가 나왔다.

그러나 다중분류에 강한 랜덤포레스트 모델과, 서포트벡터머신 모델이 높은 정확도를 기록했다.

## 2. 와인 분류해보기

이번에는 와인 데이터입니다. 와인의 어떤 특징으로 와인의 종류를 분류해 볼 수 있을까요?

데이터에 어떤 정보가 담겨있는지, feature는 무엇이고 label은 무엇인지 확인해 보면서 진행하는 점, 잊지마세요!

### 2.1 필요한 모듈 import하기

In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# plt.style.use('ggplot')
sns.set(font_scale=1.5) # 이 두줄은 본 필자가 항상 쓰는 방법입니다. matplotlib 의 기본 scheme 말고 seaborn scheme 을 세팅하고, 일일이 graph 의 font size 를 지정할 필요 없이 seaborn 의 font_scale 을 사용하면 편합니다.

# ignore warnings
# import warnings
# warnings.filterwarnings('ignore')
%matplotlib inline

# 와인 데이터
from sklearn.datasets import load_wine

# 데이터를 나눠준다.
from sklearn.model_selection import train_test_split

# 모델
from sklearn.tree import DecisionTreeClassifier # 의사결정나무 모델
from sklearn.ensemble import RandomForestClassifier # 랜덤 포레스트 모델
from sklearn import svm # 서포트 벡터 머신 모델
from sklearn.linear_model import SGDClassifier # SGD Classifier 모델
from sklearn.linear_model import LogisticRegression # 로지스틱회귀 모델

# 모델 평가
# 정확도, 정밀도, 재현율, f1
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report # 분류 리포트
from sklearn.metrics import confusion_matrix # 오차행렬

### 2.2 데이터 준비

`load_wine` 메서드를 사용합니다.

In [45]:
wine = load_wine()
wine.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])

### 2.3 데이터 이해하기

지피지기면 백전불태! 다루어야 할 데이터를 자세히 살펴봅시다.

+ Feature Data 지정하기
+ Label Data 지정하기
+ Target Names 출력해보기
+ 데이터 Describe 해보기

In [46]:
# feature data 지정하기
wine_data = wine.data
wine_data.shape

(178, 13)

In [47]:
# label data 지정하기
wine_label = wine.target
wine_label.shape

(178,)

In [48]:
# feature_names
wine.feature_names[:5]

['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium']

In [49]:
# target_names
wine.target_names

array(['class_0', 'class_1', 'class_2'], dtype='<U7')

In [50]:
# describe
print(wine.DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0

In [69]:
# 와인 데이터 데이터프레임으로 만들기
wine_df = pd.DataFrame(data=wine_data, columns=wine.feature_names)
wine_df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [52]:
# label 컬럼 추가하기
wine_df['label'] = wine_label
wine_df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,label
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [53]:
# 라벨 컬럼 개수 확인하기
wine_df['label'].value_counts().sort_index()

0    59
1    71
2    48
Name: label, dtype: int64

### 2.4 train, test 데이터 분리

모델 학습과 테스트용 문제지와 정답지를 준비해 봅시다.

X_train, X_test, y_train, y_test를 생성하는 방법을 참고해 보세요.

In [55]:
X_train, X_test, y_train, y_test = train_test_split(wine_data, # input data(feature)
                                                    wine_label, # ouput data(label or target)
                                                    test_size=0.2, # 테스트셋의 비율(전체 중 20%, 기본값은 0.25)
                                                    stratify=wine.target, # 각 class들을 같은 비율로 나눠준다.
                                                    random_state=8) # random 시드값

print('X_train 개수: ', len(X_train), ', X_test 개수: ', len(X_test))

X_train 개수:  142 , X_test 개수:  36


In [56]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print()

# 비율 확인하기
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

(142, 13) (36, 13) (142,) (36,)

(array([0, 1, 2]), array([47, 57, 38]))
(array([0, 1, 2]), array([12, 14, 10]))


### 2.5 다양한 모델로 학습시켜보기

학습데이터 X_train, y_train 을 활용해 분류기 모델을 만들어 봅시다. 어떤 모델이 가장 좋은 성능을 보일까요?

+ Decision Tree 사용해보기
+ Random Forest 사용해보기
+ SVM 사용해 보기
+ SGD Classifier 사용해보기
+ Logistic Regression 사용해보기

#### 2.5.1 Decision Tree 모델

In [87]:
# 모델 생성하기
decision_tree = DecisionTreeClassifier(random_state=32)
print(decision_tree._estimator_type)

# 모델 학습하기
decision_tree.fit(X_train, y_train)

# 검증하기
y_pred = decision_tree.predict(X_test)

# 평가하기
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

classifier
0.8508771929824561
0.8767123287671232
0.8888888888888888
0.8827586206896552
              precision    recall  f1-score   support

           0       0.80      0.79      0.80        42
           1       0.88      0.89      0.88        72

    accuracy                           0.85       114
   macro avg       0.84      0.84      0.84       114
weighted avg       0.85      0.85      0.85       114

[[33  9]
 [ 8 64]]


#### 2.5.2 Random Forest 모델

In [93]:
# 모델 생성하기
random_forest = RandomForestClassifier(random_state=32)
print(random_forest._estimator_type)

# 모델 학습하기
random_forest.fit(X_train, y_train)

# 예측하기
y_pred = random_forest.predict(X_test)

# 평가하기
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

classifier
0.9736842105263158
0.9726027397260274
0.9861111111111112
0.9793103448275863
              precision    recall  f1-score   support

           0       0.98      0.95      0.96        42
           1       0.97      0.99      0.98        72

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

[[40  2]
 [ 1 71]]


#### 2.5.3 Support Vector Machine (SVM) 모델

In [94]:
# 모델 생성하기
svm_model = svm.SVC()
print(svm_model._estimator_type)

# 모델 학습하기
svm_model.fit(X_train, y_train)

# 예측하기
y_pred = random_forest.predict(X_test)

# 평가하기
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

classifier
0.9736842105263158
0.9726027397260274
0.9861111111111112
0.9793103448275863
              precision    recall  f1-score   support

           0       0.98      0.95      0.96        42
           1       0.97      0.99      0.98        72

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

[[40  2]
 [ 1 71]]


#### 2.5.4 Stochastic Gradient Descent Classifier (SGDClassifier) 모델

In [95]:
# 모델 생성하기
sgd_model = SGDClassifier()
print(sgd_model._estimator_type)

# 모델 학습하기
sgd_model.fit(X_train, y_train)

# 예측하기
y_pred = sgd_model.predict(X_test)

# 평가하기
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

classifier
0.9122807017543859
0.8974358974358975
0.9722222222222222
0.9333333333333333
              precision    recall  f1-score   support

           0       0.94      0.81      0.87        42
           1       0.90      0.97      0.93        72

    accuracy                           0.91       114
   macro avg       0.92      0.89      0.90       114
weighted avg       0.91      0.91      0.91       114

[[34  8]
 [ 2 70]]


#### 2.5.5 Logistic Regression

In [96]:
# 모델 생성하기
logistic_model = LogisticRegression()
print(logistic_model._estimator_type)

# 모델 학습하기
logistic_model.fit(X_train, y_train)

# 예측하기
y_pred = logistic_model.predict(X_test)

# 평가하기
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

classifier
0.9385964912280702
0.9577464788732394
0.9444444444444444
0.951048951048951
              precision    recall  f1-score   support

           0       0.91      0.93      0.92        42
           1       0.96      0.94      0.95        72

    accuracy                           0.94       114
   macro avg       0.93      0.94      0.93       114
weighted avg       0.94      0.94      0.94       114

[[39  3]
 [ 4 68]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### 2.6 모델 평가해보기

학습된 모델들의 테스트데이터 예측 결과를 어떻게 해석해야 할까요? 모델의 성능을 평가하는 지표로는 무엇이 좋을까요? sklearn.metrics 에서 제공하는 평가지표 중 적절한 것을 선택해 보세요. 선택하신 이유도 설명해 주세요.

손글씨와 마찬가지로 확실히 다항분류에서는 랜덤포레스트와 서포트벡터머신이 좋은 점수가 나온다.

그러나 각 모델별로 점수의 편차가 손글씨에 비해서 커졌다.

## 3. 유방암 여부 진단하기

마지막으로 유방암 여부를 진단해 보겠습니다.
이 데이터 또한 여러 사람의 건강 지표에 대한 데이터가 feature로 들어가있고, 유방암의 여부가 True, False로 label이 됩니다.

주어진 데이터로 환자의 유방암 여부를 분류해 볼 수 있을까요?

### 3.1 필요한 모듈 import하기

In [62]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# plt.style.use('ggplot')
sns.set(font_scale=1.5) # 이 두줄은 본 필자가 항상 쓰는 방법입니다. matplotlib 의 기본 scheme 말고 seaborn scheme 을 세팅하고, 일일이 graph 의 font size 를 지정할 필요 없이 seaborn 의 font_scale 을 사용하면 편합니다.

# ignore warnings
# import warnings
# warnings.filterwarnings('ignore')
%matplotlib inline

# 유방암 데이터
from sklearn.datasets import load_breast_cancer

# 데이터를 나눠준다.
from sklearn.model_selection import train_test_split

# 모델
from sklearn.tree import DecisionTreeClassifier # 의사결정나무 모델
from sklearn.ensemble import RandomForestClassifier # 랜덤 포레스트 모델
from sklearn import svm # 서포트 벡터 머신 모델
from sklearn.linear_model import SGDClassifier # SGD Classifier 모델
from sklearn.linear_model import LogisticRegression # 로지스틱회귀 모델

# 모델 평가
# 정확도, 정밀도, 재현율, f1
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report # 분류 리포트
from sklearn.metrics import confusion_matrix # 오차행렬

### 3.2 데이터 준비

`load_breast_cancer` 메서드를 사용합니다.

In [63]:
breast_cancer = load_breast_cancer()
breast_cancer.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

### 3.3 데이터 이해하기

지피지기면 백전불태! 다루어야 할 데이터를 자세히 살펴봅시다.

+ Feature Data 지정하기
+ Label Data 지정하기
+ Target Names 출력해보기
+ 데이터 Describe 해보기

In [64]:
# feature data 지정하기
breast_cancer_data = breast_cancer.data
breast_cancer_data.shape

(569, 30)

In [65]:
# label data 지정하기
breast_cancer_label = breast_cancer.target
breast_cancer_label.shape

(569,)

In [66]:
# feature_names
breast_cancer.feature_names[:5]

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness'], dtype='<U23')

In [67]:
# target_names
breast_cancer.target_names

array(['malignant', 'benign'], dtype='<U9')

In [68]:
# describe
print(breast_cancer.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [70]:
# 유방암 데이터 데이터프레임으로 만들기
breast_cancer_df = pd.DataFrame(data=breast_cancer_data, columns=breast_cancer.feature_names)
breast_cancer_df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [71]:
# label 컬럼 추가하기
breast_cancer_df['label'] = breast_cancer_label
breast_cancer_df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,label
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [72]:
# 라벨 컬럼 개수 확인하기
breast_cancer_df['label'].value_counts().sort_index()

0    212
1    357
Name: label, dtype: int64

### 3.4 train, test 데이터 분리

모델 학습과 테스트용 문제지와 정답지를 준비해 봅시다.

X_train, X_test, y_train, y_test를 생성하는 방법을 참고해 보세요.

In [75]:
X_train, X_test, y_train, y_test = train_test_split(breast_cancer_data, # input data(feature)
                                                    breast_cancer_label, # ouput data(label or target)
                                                    test_size=0.2, # 테스트셋의 비율(전체 중 20%, 기본값은 0.25)
                                                    stratify=breast_cancer.target, # 각 class들을 같은 비율로 나눠준다.
                                                    random_state=8) # random 시드값

print('X_train 개수: ', len(X_train), ', X_test 개수: ', len(X_test))

X_train 개수:  455 , X_test 개수:  114


In [76]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print()

# 비율 확인하기
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

(455, 30) (114, 30) (455,) (114,)

(array([0, 1]), array([170, 285]))
(array([0, 1]), array([42, 72]))


### 3.5 다양한 모델로 학습시켜보기

학습데이터 X_train, y_train 을 활용해 분류기 모델을 만들어 봅시다. 어떤 모델이 가장 좋은 성능을 보일까요?

+ Decision Tree 사용해보기
+ Random Forest 사용해보기
+ SVM 사용해 보기
+ SGD Classifier 사용해보기
+ Logistic Regression 사용해보기

#### 3.5.1 Decision Tree 모델

In [82]:
# 모델 생성하기
decision_tree = DecisionTreeClassifier(random_state=32)
print(decision_tree._estimator_type)

# 모델 학습하기
decision_tree.fit(X_train, y_train)

# 검증하기
y_pred = decision_tree.predict(X_test)

# 평가하기
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

classifier
0.8508771929824561
0.8767123287671232
0.8888888888888888
0.8827586206896552
              precision    recall  f1-score   support

           0       0.80      0.79      0.80        42
           1       0.88      0.89      0.88        72

    accuracy                           0.85       114
   macro avg       0.84      0.84      0.84       114
weighted avg       0.85      0.85      0.85       114

[[33  9]
 [ 8 64]]


#### 3.5.2 Random Forest 모델

In [83]:
# 모델 생성하기
random_forest = RandomForestClassifier(random_state=32)
print(random_forest._estimator_type)

# 모델 학습하기
random_forest.fit(X_train, y_train)

# 예측하기
y_pred = random_forest.predict(X_test)

# 평가하기
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

classifier
0.9736842105263158
0.9726027397260274
0.9861111111111112
0.9793103448275863
              precision    recall  f1-score   support

           0       0.98      0.95      0.96        42
           1       0.97      0.99      0.98        72

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

[[40  2]
 [ 1 71]]


#### 3.5.3 Support Vector Machine (SVM) 모델

In [84]:
# 모델 생성하기
svm_model = svm.SVC()
print(svm_model._estimator_type)

# 모델 학습하기
svm_model.fit(X_train, y_train)

# 예측하기
y_pred = random_forest.predict(X_test)

# 평가하기
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

classifier
0.9736842105263158
0.9726027397260274
0.9861111111111112
0.9793103448275863
              precision    recall  f1-score   support

           0       0.98      0.95      0.96        42
           1       0.97      0.99      0.98        72

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

[[40  2]
 [ 1 71]]


#### 3.5.4 Stochastic Gradient Descent Classifier (SGDClassifier) 모델

In [85]:
# 모델 생성하기
sgd_model = SGDClassifier()
print(sgd_model._estimator_type)

# 모델 학습하기
sgd_model.fit(X_train, y_train)

# 예측하기
y_pred = sgd_model.predict(X_test)

# 평가하기
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

classifier
0.8859649122807017
0.8470588235294118
1.0
0.9171974522292993
              precision    recall  f1-score   support

           0       1.00      0.69      0.82        42
           1       0.85      1.00      0.92        72

    accuracy                           0.89       114
   macro avg       0.92      0.85      0.87       114
weighted avg       0.90      0.89      0.88       114

[[29 13]
 [ 0 72]]


#### 3.5.5 Logistic Regression

In [86]:
# 모델 생성하기
logistic_model = LogisticRegression()
print(logistic_model._estimator_type)

# 모델 학습하기
logistic_model.fit(X_train, y_train)

# 예측하기
y_pred = logistic_model.predict(X_test)

# 평가하기
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

classifier
0.9385964912280702
0.9577464788732394
0.9444444444444444
0.951048951048951
              precision    recall  f1-score   support

           0       0.91      0.93      0.92        42
           1       0.96      0.94      0.95        72

    accuracy                           0.94       114
   macro avg       0.93      0.94      0.93       114
weighted avg       0.94      0.94      0.94       114

[[39  3]
 [ 4 68]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### 3.6 모델 평가해보기

학습된 모델들의 테스트데이터 예측 결과를 어떻게 해석해야 할까요? 모델의 성능을 평가하는 지표로는 무엇이 좋을까요? sklearn.metrics 에서 제공하는 평가지표 중 적절한 것을 선택해 보세요. 선택하신 이유도 설명해 주세요.

병원에서 암을 진단한다고 했을 때 진단이 더 정확한 것이 중요하다. 그래서 예측값이 얼마나 정확한가를 나타내는 정밀도를 통해 평가했다.

분류 모델에서는 확실하게 랜덤포레스트 모델과 서포트벡터머신 모델이 강하다!

## 4. 마치며

데이터 안에서 새로운 의미를 찾아가는 것은 정말 즐겁다!

이번 프로젝트는 데이터를 분석했다기보다는 데이터를 불러와서 여러가지 모델을 돌려보고, 그 결과에 대해 평가하고 어떤 방식으로 평가할 것인지에 대해 더 초점을 맞췄다.

분류에 관한 여러가지 모델들을 사용해보면서 분류에 대해서 어느정도는 알게 됐다(느낌적인 느낌).

아쉬운 점이 있다면,

+ 분류 모델에 대해서 모델마다 어떤 방식으로 작동하는지 자세하게 알지 못한다는 점
+ 그래서 왜 점수가 다르게 나오는지를 확실하게 알 수가 없었다.
+ 그리고 평가지표에 대해 어떤 상황에서 어떤 지표를 사용해야 더 정확하게 평가할 수 있는지를 알 수 없어서 아쉬웠다.
+ 머신러닝과 딥러닝을 배울수록 수학과 통계에 대한 부분이 아쉽게 느껴진다. 확실히 공부가 필요하다고 느껴진다.

하지만 배울 것이 있다는 것에 대해 즐거운 생각이 든다는 것에 스스로에게 대견함을 느끼면서 더 잘하고 싶다는 욕심도 생긴다.

다음 프로젝트는 직접 만든 데이터를 가지고 머신러닝을 사용해볼 수 있기를.