# 손글씨 이미지 데이터(MIST)를 이용한 예측 시스템 
---
- MIST(바이너리 데이터) 
- 바이너리 데이터를 디코딩하여(디코딩 처리법)
- 샘플데이터(전체 데이터의 일부분)을 이용하여 머신러닝 모델을 구축 
- 예측 수행(64%정도 예상)
- 머신러닝 시험(내일) 테스트 => 정확도를 높이는 것  
---

In [7]:
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request as rq
# 경로 
import os , os.path, gzip

In [2]:
target_url = 'http://yann.lecun.com/exdb/mnist/'

In [3]:
res  = rq.urlopen(target_url)
soup = BeautifulSoup(res, 'html5lib')

In [4]:
tmp = soup.find_all('tt')[:4]
type(tmp)

list

In [5]:
# bs4 => string(스트링)
# Selenium => 텍스트 
for tt in tmp :
    print(tt.a.string)

train-images-idx3-ubyte.gz
train-labels-idx1-ubyte.gz
t10k-images-idx3-ubyte.gz
t10k-labels-idx1-ubyte.gz


In [6]:
for tt in soup.find_all('tt')[:4] :
    print(target_url + tt.a.string )

http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz


In [7]:
# files = [target_url + tt.a.string for tt in soup.find_all('tt')[:4]]
# files

In [8]:
files = [tt.a.string for tt in soup.find_all('tt')[:4]]
files

['train-images-idx3-ubyte.gz',
 'train-labels-idx1-ubyte.gz',
 't10k-images-idx3-ubyte.gz',
 't10k-labels-idx1-ubyte.gz']

In [9]:
savePath = './data/mnist'
if not os.path.exists(savePath):
    # 디렉토리를만들어라 
    os.makedirs( savePath )

In [10]:
from tqdm import tqdm_notebook  # 진행바
# 저장 
for file in tqdm_notebook(files):
    # 로컬에 저장 위치 
    print(f'{savePath}/{file}')
    local_path = f'{savePath}/{file}'
    if not os.path.exists(local_path):
        # 디렉토리를만들어라 
        rq.urlretrieve( target_url + file, local_path)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

./data/mnist/train-images-idx3-ubyte.gz
./data/mnist/train-labels-idx1-ubyte.gz
./data/mnist/t10k-images-idx3-ubyte.gz
./data/mnist/t10k-labels-idx1-ubyte.gz



In [11]:
# 압축 해제 
for file in tqdm_notebook(files):
    # 원본 파일(* .gz)
    ori_path = f'{savePath}/{file}'
    # 압축 해제 파일 (*) <= .gz 제거
    raw_path = f'{savePath}/{file[:-3]}'
    print(raw_path)
    # 파일 오픈해서 -> 기록 
    with gzip.open(ori_path, 'rb') as fg :
        # 대용량이면 분활해서 
        # 소용량이니까 그냥 전체 읽기 
        tmp = fg.read()
        with open(raw_path, 'wb') as f:
            f.write(tmp)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

./data/mnist/train-images-idx3-ubyte
./data/mnist/train-labels-idx1-ubyte
./data/mnist/t10k-images-idx3-ubyte
./data/mnist/t10k-labels-idx1-ubyte



## 규격의한 본파일 -> high endian(빅 엔딩안으로 저장 )

In [8]:
# 바이너리 데이터를 읽을때 사용한 모듈
import struct 

In [9]:
def decode_mnist( dataType='train', dir='./data/mnist', samples=1000 ):
    
    label_name = f'{dir}/{dataType}-labels-idx1-ubyte'  
    image_name = f'{dir}/{dataType}-images-idx3-ubyte'
    print(label_name, '\n', image_name)
    
    label_f = open( label_name, 'rb')
    image_f = open( image_name, 'rb')
    
    csv_f = open( f'{dir}/{dataType}.csv', 'w', encoding='utf-8' )

    LABEL_HEAD_SIZE = 4 + 4
    magic, label_cnt = struct.unpack('>II', label_f.read(LABEL_HEAD_SIZE) )
    print(magic, label_cnt)

    IMAGE_HEAD_SIZE = 4 + 4 + 4 + 4
    magic_img, image_cnt, row, col = struct.unpack('>IIII', image_f.read(IMAGE_HEAD_SIZE) )
    print(magic_img, image_cnt, row, col)
    pixels = row * col
    

    for idx in range(label_cnt):
        if idx >= samples:
            break
    
        label_value = struct.unpack( 'B', label_f.read(1) )    
        label = label_value[0] # (5,) => 0번째만 추출

        binary_data = image_f.read( pixels )
        strPixelData= list( map( lambda x:str(x) , binary_data ) ) 
        csv_f.write( str(label) + ',' )
        csv_f.write( ','.join(strPixelData) + '\n' )
        #break
        if idx == 0: # 1회만 수행
            with open('test.pgm', 'w', encoding='utf-8') as f:
                f.write( 'P2 28 28 255\n' + ' '.join(strPixelData) )


    if label_f:label_f.close()
    if image_f:image_f.close()
    if csv_f:csv_f.close()
        
decode_mnist()

./data/mnist/train-labels-idx1-ubyte 
 ./data/mnist/train-images-idx3-ubyte
2049 60000
2051 60000 28 28


In [14]:
decode_mnist( dataType='train', samples=750)
decode_mnist( dataType='t10k' , samples=250)

./data/mnist/train-labels-idx1-ubyte 
 ./data/mnist/train-images-idx3-ubyte
2049 60000
2051 60000 28 28
./data/mnist/t10k-labels-idx1-ubyte 
 ./data/mnist/t10k-images-idx3-ubyte
2049 10000
2051 10000 28 28


---

## 4. 데이터 분석 및 탐색 

- 픽셀 데이터의 정규화 추가
- 훈련에 필요한 데이터의 형태도 준비
---

In [10]:
def load_csv( dataType='train', dir='./data/mnist' ):

    # 0. 데이터를 담는 자료구조
    labels = list()
    images = list()

    # 1. csv 파일 오픈
    with open( f'{dir}/{dataType}.csv', 'r' ) as f:
    # 한줄씩 읽는다
        for line in f:      
            # 분해
            tmp = line.strip().split(',')
            #print(tmp)
            # 정답 데이터 담기 -> 타입은 수치로 변환
            labels.append( int(tmp[0]) )
            # 각 픽셀을 256개(색상의 총수)로 정규화 하여서 리스트로처리
            images.append( list( map( lambda x:int(x)/256, tmp[1:] ) ) )
    return { 'labels':labels   ,'images':images }

#load_csv()

---

## 5.데이터 모델링 
---

In [11]:
# sklearn 모듈 가져오기
from sklearn import svm, model_selection, metrics

In [17]:
# 알고리즘 생성
clf = svm.SVC()

In [18]:
# 훈련, 테스트 데이터 준비
train = load_csv()
test  = load_csv( dataType='t10k' )
# 데이터 : 75,25
len(train['labels']), len(test['labels'])

(750, 250)

In [19]:
X_train = train['images']
y_train = train['labels']

X_test = test['images']
y_test = test['labels']

In [20]:
# 3. 학습
clf.fit(X_train, y_train)
# clf.fit( train['images'], train['labels'] )

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [21]:
# 4. 예측
predict = clf.predict( X_test )
# print(predict)

In [22]:
# 5. 성능평가 정확도 확인
metrics.accuracy_score(y_test, predict )

0.9

In [23]:
t = metrics.classification_report( y_test, predict )
print( t )

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        34
           2       0.81      0.92      0.86        24
           3       1.00      0.70      0.82        23
           4       0.91      0.94      0.93        33
           5       0.72      0.84      0.78        25
           6       1.00      0.86      0.93        22
           7       0.81      0.90      0.85        29
           8       0.93      0.93      0.93        14
           9       0.92      0.89      0.91        27

    accuracy                           0.90       250
   macro avg       0.91      0.90      0.90       250
weighted avg       0.91      0.90      0.90       250



## 세트 전체

In [24]:
decode_mnist( dataType='train', samples = 60000) 
decode_mnist( dataType='t10k' , samples = 10000) 

./data/mnist/train-labels-idx1-ubyte 
 ./data/mnist/train-images-idx3-ubyte
2049 60000
2051 60000 28 28
./data/mnist/t10k-labels-idx1-ubyte 
 ./data/mnist/t10k-images-idx3-ubyte
2049 10000
2051 10000 28 28


In [25]:
# 데이터 준비
train = load_csv()
test  = load_csv( dataType='t10k' )
len(train['labels']), len(test['labels']) # 데이터 준비 완료 

(60000, 10000)

In [26]:
# 담기
X_train = train['images']
y_train = train['labels']

X_test = test['images']
y_test = test['labels']

In [27]:
# 정확도 확인
clf = svm.SVC()
clf.fit(X_train, y_train)
predict = clf.predict( X_test )

In [28]:
metrics.accuracy_score(y_test, predict )

0.9792

In [12]:
# 데이터를 학습용과 테스트용으로 나눌수 있는 함수
from sklearn.model_selection import train_test_split
# 데이터 표준화
from sklearn.preprocessing import StandardScaler
# Perceptron 머신러닝을 위한 클래스
from sklearn.linear_model import Perceptron
# 로지스트 회귀를 위한 클래스 
from sklearn.linear_model import LogisticRegression
# SVM을 위한 클래스 
from sklearn.svm import SVC
# 의사결정 나무를 위한 클래스 
from sklearn.tree import DecisionTreeClassifier
# 랜덤 포레스트
from sklearn.ensemble import RandomForestClassifier
# 정확도 계산을 위한 함수 
from sklearn.metrics import accuracy_score

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

In [30]:
scaler = MinMaxScaler().fit( X_train )
scaler

MinMaxScaler(copy=True, feature_range=(0, 1))

In [31]:
# 스케일러를 통해서 변환
X_train_scaled = scaler.transform( X_train )
X_test_scaled = scaler.transform( X_test ) 

In [32]:
param_grid = {
  'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000],
  'gamma':[0.001, 0.01, 0.1, 1, 10, 100, 1000]
}

In [33]:
grid   = GridSearchCV( SVC(), param_grid, cv=5 )
grid_R = RandomForestClassifier(criterion='entropy', n_estimators=10, max_depth=3,n_jobs=2, random_state=0)
grid_L = LogisticRegression(C=1000.0,random_state=0)
grid_S = SVC( kernel='linear', C= 1.0, random_state= 0 )
grid_D = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)

In [34]:
tmp = [grid,grid_R,grid_L,grid_S,grid_D]

In [None]:
for arg in tqdm_notebook(tmp) :
    arg.fit( X_train, y_train )
    arg.score( X_test_scaled, y_test )
    print('=')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

In [None]:
grid.best_params_

In [None]:
grid.best_score_

In [None]:
grid.score( X_test_scaled, y_test )

In [13]:
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

In [None]:
# 생성 방법 비교
pipe_std1 = Pipeline( [ ('scaler', MinMaxScaler()),
                        ('classifier', SVC())   
                      ] )

pipe_std2 = make_pipeline( MinMaxScaler(), SVC() )

In [None]:
pipe = Pipeline( [  ('preprocessing', StandardScaler()),
                    ('classifier',    SVC())   
                  ] )

In [None]:
# 하이퍼파라미터 튜닝
param_grid = [
  {
    'preprocessing':[StandardScaler(), MinMaxScaler()],
    'classifier':[SVC()],
    'classifier__C':[0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'classifier__gamma':[0.001, 0.01, 0.1, 1, 10, 100, 1000]
  },
  {
    'preprocessing':[None],
    'classifier':[RandomForestClassifier(n_estimators=100)],
    'classifier__max_features':[1,2,3]
  }
]

In [None]:
grid = GridSearchCV( pipe, param_grid, cv=5 )

In [None]:
grid.fit( X_train, y_train )

In [None]:
grid.best_params_

In [None]:
# 최고 점수
grid.best_score_

In [None]:
# 예측및 평가
grid.score( X_test, y_test )

---

In [39]:
def tmp_test(al_data , t_data, s_data):

    data = load_csv()
    X_data = data['images'] # X_test
    y_data = data['labels'] # y_test

    decode_mnist( dataType = t_data , samples = s_data )
    # 알고리즘
    clf = al_data
    # 학습     
    clf.fit(X_data, y_data)
    # 예측
    predict = clf.predict( X_data )
    # 실제 정답 
    ml_accuracy = accuracy_score(y_data, predict )
#     print(f'accuracy : {ml_accuracy}')
    # 리포트
    t = classification_report( y_data, predict )
    print('='*50)
    # 평가
    ml_score= clf.score( X_data, y_data )
    print(f'Accuracy : {ml_accuracy} || Score : {ml_score}')
    print('='*50)
    
    return print(t)


In [46]:
def tmp(al_data , t_data, s_data, mult = 1 ):

    data = load_csv()
    X_data = data['images'] # X_test
    y_data = data['labels'] # y_test
    
    data_cnt = int(s_data* mult)
    print(f'data_cnt:{data_cnt}')
    
    decode_mnist( dataType = t_data , samples = data_cnt  )

    
    # 알고리즘
    clf = al_data
    # 학습     
    clf.fit(X_data, y_data)
    # 예측
    predict = clf.predict( X_data )
    # 실제 정답 
    ml_accuracy = accuracy_score(y_data, predict )
#     print(f'accuracy : {ml_accuracy}')
    # 리포트
    t = classification_report( y_data, predict )
    print('='*50)
    # 평가
    ml_score= clf.score( X_data, y_data )
    print(f'Accuracy : {ml_accuracy} || Score : {ml_score}')
    print('='*50)
    
    return print(t)

In [48]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score , classification_report
tmp_test( SVC(), 't10k', 1000)

./data/mnist/t10k-labels-idx1-ubyte 
 ./data/mnist/t10k-images-idx3-ubyte
2049 10000
2051 10000 28 28
||Accuracy : 0.982 || Score : 0.982||
              precision    recall  f1-score   support

           0       0.97      1.00      0.98        97
           1       0.99      0.98      0.99       116
           2       0.99      0.97      0.98        99
           3       1.00      0.98      0.99        93
           4       0.96      1.00      0.98       105
           5       0.96      0.98      0.97        92
           6       1.00      0.99      0.99        94
           7       0.96      1.00      0.98       117
           8       1.00      0.98      0.99        87
           9       1.00      0.94      0.97       100

    accuracy                           0.98      1000
   macro avg       0.98      0.98      0.98      1000
weighted avg       0.98      0.98      0.98      1000



In [49]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score , classification_report
tmp( SVC(), 't10k', 1000, 6)

data_cnt:6000
./data/mnist/t10k-labels-idx1-ubyte 
 ./data/mnist/t10k-images-idx3-ubyte
2049 10000
2051 10000 28 28
Accuracy : 0.982 || Score : 0.982
              precision    recall  f1-score   support

           0       0.97      1.00      0.98        97
           1       0.99      0.98      0.99       116
           2       0.99      0.97      0.98        99
           3       1.00      0.98      0.99        93
           4       0.96      1.00      0.98       105
           5       0.96      0.98      0.97        92
           6       1.00      0.99      0.99        94
           7       0.96      1.00      0.98       117
           8       1.00      0.98      0.99        87
           9       1.00      0.94      0.97       100

    accuracy                           0.98      1000
   macro avg       0.98      0.98      0.98      1000
weighted avg       0.98      0.98      0.98      1000



In [None]:
tmp_test( SVC(), 't10k', 1000)