# load_digits

## 데이터 셋 설명
- scikit-learn 의 load_digits 는 43명의 사람들이 손으로 쓴 숫자(0-9) 이미지 데이터셋이다.

## 머신러닝 모델로 분류하고자 하는 것

손글씨 데이터로
- Decision Tree
- Random Forest
- SVM
- SGD Classifier
- Logistic Regression

다양한 머신러닝 모델들을 사용해 손글씨 이미지를 제대로 0-9 까지 10가지 카테고리로 분류하기

## 1. 필요한 모듈 import 하기

In [1]:
##1. 필요한 모듈 import하기
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

## 2. 데이터 준비
- load_digits 에 어떤 정보들이 담겼을지, keys()라는 메서드로 확인

In [2]:
# Load the digits dataset and list the fields the returned object provides.
digits = load_digits()
digits.keys()

dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'images', 'DESCR'])

In [3]:
# Print the description text that ships with the dataset.
print(digits.DESCR)

.. _digits_dataset:

Optical recognition of handwritten digits dataset
--------------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 5620
    :Number of Attributes: 64
    :Attribute Information: 8x8 image of integer pixels in the range 0..16.
    :Missing Attribute Values: None
    :Creator: E. Alpaydin (alpaydin '@' boun.edu.tr)
    :Date: July; 1998

This is a copy of the test set of the UCI ML hand-written digits datasets
https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits

The data set contains images of hand-written digits: 10 classes where
each class refers to a digit.

Preprocessing programs made available by NIST were used to extract
normalized bitmaps of handwritten digits from a preprinted form. From a
total of 43 people, 30 contributed to the training set and different 13
to the test set. 32x32 bitmaps are divided into nonoverlapping blocks of
4x4 and the number of on pixels are counted in each blo

## 3. 데이터 이해하기
- Feature Data 지정하기
- Label Data 지정하기
- Target Names 출력해 보기
- 데이터 Describe 해 보기

In [4]:
# Feature data: taken from the dataset's `data` field.
digits_data = digits.data
digits_data.shape # (1797, 64): 1797 samples, each made up of 64 numbers.

(1797, 64)

In [5]:
# Label data: taken from the dataset's `target` field, one label per sample.
digits_label=digits.target
print(digits_label.shape)
digits_label

(1797,)


array([0, 1, 2, ..., 8, 9, 8])

In [6]:
digits.target_names  # the integers 0 through 9

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [7]:
digits.feature_names  # pixel values: each (8x8) image laid out as a single row

['pixel_0_0',
 'pixel_0_1',
 'pixel_0_2',
 'pixel_0_3',
 'pixel_0_4',
 'pixel_0_5',
 'pixel_0_6',
 'pixel_0_7',
 'pixel_1_0',
 'pixel_1_1',
 'pixel_1_2',
 'pixel_1_3',
 'pixel_1_4',
 'pixel_1_5',
 'pixel_1_6',
 'pixel_1_7',
 'pixel_2_0',
 'pixel_2_1',
 'pixel_2_2',
 'pixel_2_3',
 'pixel_2_4',
 'pixel_2_5',
 'pixel_2_6',
 'pixel_2_7',
 'pixel_3_0',
 'pixel_3_1',
 'pixel_3_2',
 'pixel_3_3',
 'pixel_3_4',
 'pixel_3_5',
 'pixel_3_6',
 'pixel_3_7',
 'pixel_4_0',
 'pixel_4_1',
 'pixel_4_2',
 'pixel_4_3',
 'pixel_4_4',
 'pixel_4_5',
 'pixel_4_6',
 'pixel_4_7',
 'pixel_5_0',
 'pixel_5_1',
 'pixel_5_2',
 'pixel_5_3',
 'pixel_5_4',
 'pixel_5_5',
 'pixel_5_6',
 'pixel_5_7',
 'pixel_6_0',
 'pixel_6_1',
 'pixel_6_2',
 'pixel_6_3',
 'pixel_6_4',
 'pixel_6_5',
 'pixel_6_6',
 'pixel_6_7',
 'pixel_7_0',
 'pixel_7_1',
 'pixel_7_2',
 'pixel_7_3',
 'pixel_7_4',
 'pixel_7_5',
 'pixel_7_6',
 'pixel_7_7']

In [8]:
# Build a DataFrame from the feature matrix, using the pixel names as columns.
digits_df = pd.DataFrame(data=digits_data, columns=digits.feature_names)
digits_df  # show load_digits as a DataFrame

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_6,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,9.0,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1792,0.0,0.0,4.0,10.0,13.0,6.0,0.0,0.0,0.0,1.0,...,4.0,0.0,0.0,0.0,2.0,14.0,15.0,9.0,0.0,0.0
1793,0.0,0.0,6.0,16.0,13.0,11.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,6.0,16.0,14.0,6.0,0.0,0.0
1794,0.0,0.0,1.0,11.0,15.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,9.0,13.0,6.0,0.0,0.0
1795,0.0,0.0,2.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,5.0,12.0,16.0,12.0,0.0,0.0


In [9]:
# Append the target digits as a "label" column next to the pixel columns.
digits_df["label"] = digits.target
digits_df

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7,label
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0,1
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0,2
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0,3
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1792,0.0,0.0,4.0,10.0,13.0,6.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,2.0,14.0,15.0,9.0,0.0,0.0,9
1793,0.0,0.0,6.0,16.0,13.0,11.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,16.0,14.0,6.0,0.0,0.0,0
1794,0.0,0.0,1.0,11.0,15.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,9.0,13.0,6.0,0.0,0.0,8
1795,0.0,0.0,2.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,12.0,16.0,12.0,0.0,0.0,9


## 4. Train,Test 데이터 분리
Train (1078, 64), Test (719, 64)

In [10]:
# 4. Split into train and test sets.
# test_size=0.4 holds out 40% of the samples for testing; random_state is
# pinned so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    digits_data, digits_label, test_size=0.4, random_state=10
)

print('X_train 개수: ', len(X_train), ', X_test 개수: ', len(X_test))

X_train 개수:  1078 , X_test 개수:  719


In [11]:
# Shapes of the training features and training labels.
X_train.shape, y_train.shape

((1078, 64), (1078,))

In [12]:
# Shapes of the test features and test labels.
X_test.shape, y_test.shape

((719, 64), (719,))

In [13]:
# Display the label arrays produced by the split.
y_train, y_test

(array([1, 5, 9, ..., 1, 8, 2]),
 array([5, 2, 5, 4, 8, 2, 4, 3, 3, 0, 8, 7, 0, 1, 8, 6, 9, 7, 9, 7, 1, 8,
        6, 7, 8, 8, 5, 3, 5, 9, 3, 3, 7, 3, 4, 1, 9, 2, 5, 4, 2, 1, 0, 9,
        2, 3, 6, 1, 9, 4, 4, 9, 8, 4, 8, 5, 9, 7, 8, 0, 4, 5, 8, 4, 7, 9,
        0, 7, 1, 3, 9, 3, 3, 8, 0, 7, 3, 6, 5, 2, 0, 8, 8, 0, 1, 1, 2, 8,
        8, 8, 2, 6, 3, 4, 7, 9, 8, 2, 9, 2, 5, 0, 8, 0, 4, 8, 8, 0, 6, 7,
        3, 3, 9, 1, 5, 4, 6, 0, 8, 8, 1, 1, 7, 9, 9, 5, 2, 3, 3, 9, 7, 6,
        2, 5, 4, 3, 3, 7, 6, 7, 2, 7, 4, 9, 5, 1, 9, 4, 6, 1, 1, 1, 4, 0,
        4, 9, 1, 2, 3, 5, 0, 3, 4, 1, 5, 4, 9, 3, 5, 6, 4, 0, 8, 6, 7, 0,
        9, 9, 4, 7, 3, 5, 2, 0, 6, 7, 5, 3, 9, 7, 1, 3, 2, 8, 3, 3, 1, 7,
        1, 1, 1, 7, 1, 6, 7, 6, 9, 5, 2, 3, 5, 2, 9, 5, 4, 8, 2, 9, 1, 5,
        0, 2, 3, 9, 0, 2, 0, 2, 1, 0, 5, 0, 6, 4, 2, 1, 9, 0, 9, 0, 6, 9,
        4, 4, 9, 7, 5, 6, 1, 8, 7, 0, 8, 6, 2, 0, 1, 2, 3, 8, 4, 4, 3, 5,
        7, 9, 7, 2, 0, 2, 0, 9, 2, 8, 6, 3, 6, 0, 6, 6, 6, 7, 1, 6, 1, 7,
     

## 5. 다양한 모델로 학습시켜보기

- Decision Tree
- Random Forest
- SVM
- SGD Classifier
- Logistic Regression

In [14]:
# (1) Decision Tree
# Create the tree with a pinned random_state, then print its estimator type.
decision_tree = DecisionTreeClassifier(random_state=32)
print(decision_tree._estimator_type)

classifier


In [15]:
# Train the decision tree on the training split.
decision_tree.fit(X_train, y_train)

DecisionTreeClassifier(random_state=32)

In [16]:
# Predict a digit for every test sample and display the predictions.
y_pred = decision_tree.predict(X_test)
y_pred

array([5, 2, 5, 4, 1, 2, 4, 3, 3, 0, 8, 7, 0, 1, 8, 6, 9, 7, 9, 7, 1, 8,
       6, 7, 8, 8, 5, 3, 6, 9, 3, 3, 7, 3, 4, 1, 9, 2, 5, 4, 2, 1, 0, 9,
       2, 3, 6, 1, 9, 9, 4, 3, 8, 4, 8, 5, 9, 3, 1, 0, 4, 3, 1, 4, 7, 0,
       0, 7, 2, 3, 8, 3, 3, 8, 0, 7, 3, 6, 5, 2, 0, 8, 8, 0, 1, 2, 2, 8,
       8, 8, 3, 6, 3, 4, 7, 9, 1, 2, 9, 2, 5, 0, 8, 0, 6, 8, 8, 0, 6, 7,
       7, 3, 9, 1, 5, 4, 6, 0, 8, 8, 1, 2, 7, 2, 8, 5, 2, 3, 3, 7, 7, 6,
       2, 5, 4, 3, 3, 7, 6, 7, 2, 7, 4, 9, 3, 1, 9, 9, 7, 1, 2, 1, 4, 0,
       9, 9, 1, 2, 3, 5, 0, 3, 4, 1, 5, 4, 9, 3, 5, 6, 4, 0, 8, 6, 7, 0,
       9, 9, 4, 9, 3, 5, 2, 0, 6, 7, 5, 3, 9, 7, 1, 3, 2, 9, 3, 3, 1, 4,
       2, 8, 2, 7, 1, 6, 4, 6, 9, 5, 2, 3, 5, 2, 9, 7, 1, 8, 2, 8, 1, 5,
       7, 2, 3, 9, 0, 2, 0, 2, 1, 0, 5, 0, 6, 4, 2, 1, 9, 0, 9, 0, 6, 9,
       4, 4, 9, 1, 5, 6, 1, 8, 7, 0, 9, 6, 2, 0, 1, 2, 3, 8, 0, 1, 3, 5,
       7, 3, 7, 2, 0, 2, 0, 9, 2, 8, 6, 3, 3, 0, 6, 6, 6, 7, 1, 6, 1, 7,
       6, 0, 6, 3, 9, 5, 6, 2, 2, 1, 8, 4, 7, 3, 3,

In [17]:
# Display the true test labels for comparison with the predictions.
y_test

array([5, 2, 5, 4, 8, 2, 4, 3, 3, 0, 8, 7, 0, 1, 8, 6, 9, 7, 9, 7, 1, 8,
       6, 7, 8, 8, 5, 3, 5, 9, 3, 3, 7, 3, 4, 1, 9, 2, 5, 4, 2, 1, 0, 9,
       2, 3, 6, 1, 9, 4, 4, 9, 8, 4, 8, 5, 9, 7, 8, 0, 4, 5, 8, 4, 7, 9,
       0, 7, 1, 3, 9, 3, 3, 8, 0, 7, 3, 6, 5, 2, 0, 8, 8, 0, 1, 1, 2, 8,
       8, 8, 2, 6, 3, 4, 7, 9, 8, 2, 9, 2, 5, 0, 8, 0, 4, 8, 8, 0, 6, 7,
       3, 3, 9, 1, 5, 4, 6, 0, 8, 8, 1, 1, 7, 9, 9, 5, 2, 3, 3, 9, 7, 6,
       2, 5, 4, 3, 3, 7, 6, 7, 2, 7, 4, 9, 5, 1, 9, 4, 6, 1, 1, 1, 4, 0,
       4, 9, 1, 2, 3, 5, 0, 3, 4, 1, 5, 4, 9, 3, 5, 6, 4, 0, 8, 6, 7, 0,
       9, 9, 4, 7, 3, 5, 2, 0, 6, 7, 5, 3, 9, 7, 1, 3, 2, 8, 3, 3, 1, 7,
       1, 1, 1, 7, 1, 6, 7, 6, 9, 5, 2, 3, 5, 2, 9, 5, 4, 8, 2, 9, 1, 5,
       0, 2, 3, 9, 0, 2, 0, 2, 1, 0, 5, 0, 6, 4, 2, 1, 9, 0, 9, 0, 6, 9,
       4, 4, 9, 7, 5, 6, 1, 8, 7, 0, 8, 6, 2, 0, 1, 2, 3, 8, 4, 4, 3, 5,
       7, 9, 7, 2, 0, 2, 0, 9, 2, 8, 6, 3, 6, 0, 6, 6, 6, 7, 1, 6, 1, 7,
       6, 0, 6, 3, 7, 4, 6, 2, 8, 0, 8, 4, 7, 3, 3,

In [18]:
accuracy = accuracy_score(y_test, y_pred)  # accuracy of the predictions against the true test labels
accuracy

0.8428372739916551

In [19]:
# Per-class precision, recall, f1-score and support for the current predictions.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.97      0.96        71
           1       0.79      0.78      0.78        77
           2       0.86      0.88      0.87        75
           3       0.82      0.86      0.84        74
           4       0.91      0.75      0.82        68
           5       0.89      0.84      0.87        70
           6       0.89      0.93      0.91        70
           7       0.81      0.87      0.84        78
           8       0.83      0.80      0.82        66
           9       0.71      0.73      0.72        70

    accuracy                           0.84       719
   macro avg       0.85      0.84      0.84       719
weighted avg       0.84      0.84      0.84       719



In [20]:
# (2) Random Forest
# Create the forest with a pinned random_state, matching the decision tree.
random_forest = RandomForestClassifier(random_state=32)
# Check the estimator type of the random forest that was just created.
print(random_forest._estimator_type)

classifier


In [21]:
# Train the random forest on the training split.
random_forest.fit(X_train, y_train)

RandomForestClassifier(random_state=32)

In [22]:
# Predict the test digits with the random forest and score the predictions.
y_pred = random_forest.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)  # accuracy of the predictions against the true test labels
accuracy

0.9652294853963839

In [23]:
# Per-class precision, recall, f1-score and support for the current predictions.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98        71
           1       0.99      0.97      0.98        77
           2       0.99      0.99      0.99        75
           3       0.97      0.97      0.97        74
           4       0.96      0.97      0.96        68
           5       0.91      0.97      0.94        70
           6       1.00      0.99      0.99        70
           7       0.95      0.99      0.97        78
           8       0.95      0.91      0.93        66
           9       0.97      0.90      0.93        70

    accuracy                           0.97       719
   macro avg       0.97      0.96      0.96       719
weighted avg       0.97      0.97      0.97       719



In [24]:
# (3) Support Vector Machine (SVM)
# SVC is created with no arguments, i.e. with the library's default settings.
svm_model = svm.SVC()

print(svm_model._estimator_type)

classifier


In [25]:
# Train the SVM on the training split.
svm_model.fit(X_train, y_train)

SVC()

In [26]:
# Predict a digit for every test sample with the trained SVM.
y_pred = svm_model.predict(X_test)

In [27]:
accuracy = accuracy_score(y_test, y_pred)  # accuracy of the predictions against the true test labels
accuracy

0.9819193324061196

In [28]:
# Per-class precision, recall, f1-score and support for the current predictions.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99        71
           1       0.97      1.00      0.99        77
           2       0.99      1.00      0.99        75
           3       1.00      0.97      0.99        74
           4       0.99      0.97      0.98        68
           5       0.96      0.99      0.97        70
           6       1.00      1.00      1.00        70
           7       1.00      1.00      1.00        78
           8       0.94      0.95      0.95        66
           9       0.97      0.94      0.96        70

    accuracy                           0.98       719
   macro avg       0.98      0.98      0.98       719
weighted avg       0.98      0.98      0.98       719



In [29]:
# (4) SGD Classifier
# NOTE(review): no random_state is passed here, unlike the tree-based models,
# so the fitted model and its scores may differ between runs — confirm whether
# a fixed seed is wanted for a reproducible comparison.
sgd_model = SGDClassifier()
print(sgd_model._estimator_type)

classifier


In [30]:
# Train the SGD classifier and predict the test digits in one chained call
# (fit returns the estimator itself, so predict can follow directly).
y_pred = sgd_model.fit(X_train, y_train).predict(X_test)

# Accuracy of the predictions against the true test labels.
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9457579972183588

In [31]:
# Per-class precision, recall, f1-score and support for the current predictions.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        71
           1       0.93      0.90      0.91        77
           2       0.95      1.00      0.97        75
           3       0.99      0.91      0.94        74
           4       0.97      0.96      0.96        68
           5       0.88      1.00      0.93        70
           6       1.00      1.00      1.00        70
           7       0.94      0.96      0.95        78
           8       0.85      0.92      0.88        66
           9       0.98      0.81      0.89        70

    accuracy                           0.95       719
   macro avg       0.95      0.95      0.95       719
weighted avg       0.95      0.95      0.95       719



In [32]:
# (5) Logistic Regression
# Uses the liblinear solver with the iteration cap set to 1000.
# NOTE(review): newer scikit-learn releases reportedly warn when liblinear is
# used on a multiclass target such as these 10 digits — confirm against the
# installed version.
logistic_model = LogisticRegression(solver='liblinear',max_iter=1000)
print(logistic_model._estimator_type)

classifier


In [33]:
# Train the logistic regression model and predict the test digits in one
# chained call (fit returns the estimator itself).
y_pred = logistic_model.fit(X_train, y_train).predict(X_test)

# Accuracy of the predictions against the true test labels.
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9568845618915159

In [34]:
# Per-class precision, recall, f1-score and support for the current predictions.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        71
           1       0.90      0.95      0.92        77
           2       0.97      1.00      0.99        75
           3       0.96      0.95      0.95        74
           4       0.97      0.96      0.96        68
           5       0.92      0.96      0.94        70
           6       1.00      1.00      1.00        70
           7       0.96      0.96      0.96        78
           8       0.91      0.92      0.92        66
           9       0.98      0.87      0.92        70

    accuracy                           0.96       719
   macro avg       0.96      0.96      0.96       719
weighted avg       0.96      0.96      0.96       719



## 6. 모델평가하기
숫자를 분류하는 모델에서는 Accuracy 지표가 중요하다. 픽셀값으로 이루어진 이미지들이 얼마나 정확히 분류가 되었는지 알아야 하기 때문이다. 또한 테스트 데이터의 클래스별 샘플 수(support)가 66~78개로 고르게 분포되어 있어, Accuracy 가 특정 클래스에 치우치지 않고 전체 분류 성능을 잘 대표한다.

- Decision Tree
Accuracy : 0.8428372739916551
- Random Forest
Accuracy : 0.9652294853963839
- Support Vector Machine(SVM)
Accuracy : 0.9819193324061196
- SGD Classifier
Accuracy : 0.9457579972183588
- Logistic Regression
Accuracy : 0.9568845618915159

위 결과에서 SVM의 Accuracy 가 0.9819193324061196 으로 가장 높다. 따라서, 5가지의 모델 중에서 숫자 분류기로는 SVM이 가장 적합하다고 볼 수 있다.