<a href="https://colab.research.google.com/github/HighLvRiver/MachineLearning/blob/master/2019_ML_JAM_MNIST_scikit_learn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import warnings

# Suppress FutureWarning messages.
warnings.simplefilter("ignore", category=FutureWarning)

In [0]:
from keras.datasets import mnist

# load_data() is unpacked as a (train, test) pair of (images, labels) tuples.
train_split, test_split = mnist.load_data()
X_train, y_train = train_split
X_test, y_test = test_split

for caption, array in (
    ("X_train original shape", X_train),
    ("y_train original shape", y_train),
):
    print(caption, array.shape)

In [0]:
# Report the dimensions of the training images and of their labels.
image_shape, label_shape = X_train.shape, y_train.shape
print("Image Data Shape", image_shape)
print("Label Data Shape", label_shape)

In [0]:
# Bare expression: the notebook renders X_train as this cell's output.
X_train

In [0]:
import matplotlib
import matplotlib.pyplot as plt

# Index of the training sample to preview.
n = 2

some_digit = X_train[n]
some_digit_image = some_digit.reshape(28, 28)  # view the sample as a 28x28 grid

# Draw the sample with the binary colormap and hide the axes.
plt.imshow(
    some_digit_image,
    cmap=matplotlib.cm.binary,
    interpolation="nearest",
)
plt.axis("off")

print("Label : ", y_train[n])

In [0]:
# Preview the first nine training images in a 3x3 grid, each titled with its label.
fig, axes = plt.subplots(3, 3)
for i, ax in enumerate(axes.flat):  # row-major order, same as subplot(3, 3, i + 1)
    ax.imshow(X_train[i], cmap='gray', interpolation='none')
    ax.set_title("Digit: {}".format(y_train[i]))
    ax.set_xticks([])
    ax.set_yticks([])
# Lay the grid out once, after all panels exist, rather than on every iteration.
fig.tight_layout()
fig

In [0]:
# Shuffle the training set (images and labels with the same permutation).

import numpy as np

# A seeded generator makes Restart & Run All produce the same ordering, and the
# permutation length follows the data instead of a hard-coded 60000.
shuffle_index = np.random.RandomState(42).permutation(len(X_train))
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

In [0]:
# Flatten every image into one feature row: (n_samples, h, w) -> (n_samples, h * w).
# Passing -1 lets NumPy infer the row length, so re-running this cell on data
# that is already flattened is a no-op instead of failing to unpack a 2-D shape.
X_train = X_train.reshape(len(X_train), -1)
X_test = X_test.reshape(len(X_test), -1)

In [0]:
# Train a binary classifier: build boolean targets for the "is it a 5?" task.

y_train_5 = y_train == 5  # True for 5s, False for every other digit
y_test_5 = y_test == 5

In [0]:
# Bare expression: the notebook renders the boolean target array as this cell's output.
y_train_5

In [0]:
# Show the training array's shape at this point in the notebook.
current_shape = X_train.shape
print("Image Data Shape", current_shape)

In [0]:
# Try stochastic gradient descent (SGD).

from sklearn.linear_model import SGDClassifier  # linear classifier trained with SGD

# SGD training is randomized; a fixed random_state makes the fitted model
# repeatable across runs.
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train_5)

In [0]:
# Estimate accuracy with 3-fold cross-validation.

from sklearn.model_selection import cross_val_score

cross_val_score(
    sgd_clf,
    X_train,
    y_train_5,
    cv=3,
    scoring="accuracy",
)

In [0]:
# Confusion matrix, step 1: collect cross-validated predictions.

from sklearn.model_selection import cross_val_predict

# cross_val_predict runs K-fold cross-validation but, instead of returning
# evaluation scores, returns the predictions obtained on each test fold.
y_train_pred = cross_val_predict(
    sgd_clf,
    X_train,
    y_train_5,
    cv=3,
)

In [0]:
from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_train_5, y_train_pred))
print('-----------------')

# Legend for the 2x2 matrix printed above, in the same row/column layout.
matrix_index = [['참 음성(TN)', '허위 양성(FP)'],['허위 음성(FN)', '참 양성(TP)']]

for row in matrix_index:
    # Every cell is followed by one space, including the last one in the row.
    print("".join(str(cell) + " " for cell in row))

In [0]:
from sklearn.metrics import precision_score, recall_score

# Precision = TP / (TP + FP), reported as a percentage.
precision_pct = precision_score(y_train_5, y_train_pred) * 100
print('precision = %.2f%%' % precision_pct)

In [0]:
# Recall = TP / (TP + FN), reported as a percentage.
recall_pct = recall_score(y_train_5, y_train_pred) * 100
print('recall = %.2f%%' % recall_pct)

In [0]:
# F1 score: the harmonic mean of precision and recall.

from sklearn.metrics import f1_score

f1_pct = f1_score(y_train_5, y_train_pred) * 100
print('f1 score = %.2f%%' % f1_pct)

In [0]:
from sklearn.metrics import precision_recall_curve

# Cross-validated decision-function scores (instead of class predictions).
y_scores = cross_val_predict(
    sgd_clf,
    X_train,
    y_train_5,
    cv=3,
    method="decision_function",
)

# Precision/recall values and the thresholds they were evaluated at.
pr_curve = precision_recall_curve(y_train_5, y_scores)
precisions, recalls, thresholds = pr_curve

In [0]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall against the decision threshold.

    The last element of ``precisions`` and of ``recalls`` is dropped before
    plotting, so both sequences are expected to be one element longer than
    ``thresholds``. Draws on the current pyplot axes and returns nothing.
    """
    curves = (
        (precisions, "b--", "precisions"),
        (recalls, "g-", "recalls"),
    )
    for values, line_format, legend_name in curves:
        plt.plot(thresholds, values[:-1], line_format, label=legend_name)
    plt.xlabel("thresholds")
    plt.legend(loc="center left")
    plt.ylim([0, 1])
  
# Draw the curves from the precision_recall_curve results computed earlier.
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)

In [0]:
# Bare expression: the notebook renders the decision scores as this cell's output.
y_scores

In [0]:
# Derive the decision threshold from the data instead of a hard-coded score:
# take the lowest threshold whose cross-validated precision reaches 90%.
# The curve is recomputed here so this cell does not rely on the global
# `thresholds`, which the ROC cell rebinds to the roc_curve thresholds.
pr_curve_90 = precision_recall_curve(y_train_5, y_scores)
pr_precisions, pr_thresholds = pr_curve_90[0], pr_curve_90[2]
# The precision array has one more entry than the thresholds; drop the last one.
reaches_90 = pr_precisions[:-1] >= 0.90
# np.argmax returns the index of the first True; if 90% precision is never
# reached, fall back to the highest available threshold.
threshold_90_precision = (
    pr_thresholds[np.argmax(reaches_90)] if reaches_90.any() else pr_thresholds[-1]
)
y_train_pred_90 = (y_scores >= threshold_90_precision)

In [0]:
# Precision of the thresholded predictions, as a percentage.
precision_90_pct = precision_score(y_train_5, y_train_pred_90) * 100
print('precision = %.2f%%' % precision_90_pct)

In [0]:
# Recall of the thresholded predictions, as a percentage.
recall_90_pct = recall_score(y_train_5, y_train_pred_90) * 100
print('recall = %.2f%%' % recall_90_pct)

In [0]:
# ROC curve (receiver operating characteristic)
## Curve of the true positive rate (TPR) against the false positive rate (FPR),
## i.e. sensitivity (recall) against 1 - specificity (true negative rate, TNR).

from sklearn.metrics import roc_curve

# Note: `thresholds` is rebound here and no longer holds the
# precision_recall_curve thresholds from the earlier cell.
roc_points = roc_curve(y_train_5, y_scores)
fpr, tpr, thresholds = roc_points

In [0]:
def plot_roc_curve(fpr, tpr, label=None):
    """Plot a ROC curve on the current pyplot axes.

    Draws ``tpr`` against ``fpr`` together with a dashed black diagonal from
    (0, 0) to (1, 1), and fixes both axes to the [0, 1] range.

    Args:
        fpr: x values of the curve (false positive rates).
        tpr: y values of the curve (true positive rates).
        label: optional legend label for the curve. This function does not
            call ``plt.legend``; the caller does that when a label is used.
    """
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')  # reference diagonal
    plt.axis([0, 1, 0, 1])
    plt.xlabel('FPR')  # x axis is the false positive rate (was misspelled 'FRP')
    plt.ylabel('TPR')
  
# Plot the ROC curve computed from the SGD decision scores.
plot_roc_curve(fpr, tpr)

In [0]:
# Compute the ROC AUC (area under the curve).
## A perfect classifier has a ROC AUC of 1; a completely random one has 0.5.

from sklearn.metrics import roc_auc_score

auc_pct = roc_auc_score(y_train_5, y_scores) * 100
print('AUC = %.2f%%' % auc_pct)

In [0]:
# Train a RandomForestClassifier and compare its ROC AUC with the SGD model.

from sklearn.ensemble import RandomForestClassifier  # random forest

# Fixed random_state so the forest, and the curve derived from it, is
# repeatable across runs.
forest_clf = RandomForestClassifier(random_state=42)
y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3, method="predict_proba")

y_scores_forest = y_probas_forest[:, 1]  # use the positive-class probability as the score
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5, y_scores_forest)

In [0]:
# Overlay both ROC curves: SGD as a dotted blue line, the forest via the helper.
plt.plot(fpr, tpr, "b:", label="SGD")
plot_roc_curve(fpr_forest, tpr_forest, label="Random Forest")
plt.legend(loc="lower right")

In [0]:
# Print the ROC AUC of both models, as percentages, one per line.
for template, scores in (
    ('SGD AUC           = %.2f%%', y_scores),
    ('Random Forest AUC = %.2f%%', y_scores_forest),
):
    print(template % (roc_auc_score(y_train_5, scores) * 100))

In [0]:
# Fit a random forest on the original digit labels (not the 5-vs-rest targets).
# Fixed random_state so the fitted model is repeatable across runs.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [0]:
# Score the fitted model on the test split and report it as a percentage.
accuracy = model.score(X_test, y_test)

accuracy_pct = accuracy * 100
print("accuracy = %.2f%%" % accuracy_pct)

In [0]:
# Predicted labels for the test images from the fitted model.
y_pred = model.predict(X_test)

In [0]:
# Bare expression: the notebook renders the predictions as this cell's output.
y_pred

In [0]:
import pandas as pd

# Table pairing each true test label with the label the model predicted.
test_result = pd.DataFrame(
    {
        "test_labels": y_test,
        "estimated_labels": y_pred,
    }
)

In [0]:
# Bare expression: the notebook renders the comparison table as this cell's output.
test_result

In [0]:
# Index of the test sample to inspect.
n = 9975

# Reshape the flattened sample back to a 28x28 grid for display.
some_digit_image = X_test[n].reshape(28, 28)
plt.imshow(
    some_digit_image,
    cmap=matplotlib.cm.binary,
    interpolation="nearest",
)
plt.axis("off")

# Print the true label followed by the model's prediction for this sample.
for caption, labels in (('Test labels :', y_test), ('Estimated labels :', y_pred)):
    print(caption, labels[n])