### MNIST

In [None]:
# Download the MNIST dataset from OpenML.
from sklearn.datasets import fetch_openml

# as_frame=False makes fetch_openml return NumPy arrays instead of a pandas
# DataFrame/Series; the positional indexing used further down (X[0],
# X_train[train_index]) only works on arrays.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
mnist.keys()

In [None]:
# Pull the feature matrix and the label vector out of the dataset bunch.
X = mnist["data"]
y = mnist["target"]
print(X.shape)  # 70,000 images, each described by 784 features
print(y.shape)

In [None]:
import matplotlib.pyplot as plt

# Take the first sample and reshape its 784 features into a 28x28 image.
some_digit = X[0]
some_digit_image = some_digit.reshape(28, 28)

# Render the image with the "binary" colormap and hide the axes.
plt.imshow(some_digit_image, cmap="binary")
plt.axis("off")
plt.show()

In [None]:
# Label of the first sample (the same index as some_digit).
y[0]

In [None]:
# Convert the labels in y from strings to unsigned 8-bit integers.
import numpy as np
y=y.astype(np.uint8)

In [None]:
# Training set and test set: the first 60,000 samples are used for training,
# everything after that for testing.
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

### 이진 분류기 훈련
- 사이킷런의 SGDClassifier 클래스를 사용해 확률적 경사 하강법(SGD) 분류기로 시작한다 → 이 분류기는 매우 큰 데이터셋을 효율적으로 처리하는 장점을 지닌다.


In [None]:
# Binary targets for the "5 detector": True where the digit is 5, False otherwise.
y_train_5 = (y_train == 5)
# Keep the binary test targets under their own name so the original
# multi-class y_test labels are not overwritten.
y_test_5 = (y_test == 5)

In [None]:
from sklearn.linear_model import SGDClassifier

# Train an SGD classifier on the binary 5 / not-5 targets.
# random_state=42 fixes the random seed so runs are reproducible.
sgd_clf=SGDClassifier(random_state=42)
sgd_clf.fit(X_train,y_train_5)

In [None]:
# Predict whether some_digit is a 5; the sample is wrapped in a list because
# predict() takes a batch of samples.
sgd_clf.predict([some_digit])

#### 교차검증 구현

In [None]:
# Hand-written stratified 3-fold cross-validation. It does roughly the same
# job as scikit-learn's cross_val_score(); because the folds are shuffled
# here, the printed accuracies can differ slightly from cross_val_score(cv=3).
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

skfolds = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)

for train_index, test_index in skfolds.split(X_train, y_train_5):
    # Start every fold from an unfitted copy of the classifier.
    clone_clf = clone(sgd_clf)
    X_train_folds = X_train[train_index]
    y_train_folds = y_train_5[train_index]
    X_test_fold = X_train[test_index]
    y_test_fold = y_train_5[test_index]

    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    # Vectorized count of correct predictions (the builtin sum() would loop
    # over the array element by element in Python).
    n_correct = (y_pred == y_test_fold).sum()
    print(n_correct / len(y_pred))  # accuracy on this fold


In [None]:
from sklearn.model_selection import cross_val_score
# Evaluate the SGD classifier with 3-fold cross-validation, scoring each fold by accuracy.
cross_val_score(sgd_clf,X_train,y_train_5,cv=3,scoring="accuracy")

In [None]:
# Dummy classifier that assigns every sample to the "not 5" class.
from sklearn.base import BaseEstimator


class Never5Classifier(BaseEstimator):
    """Baseline estimator whose prediction is always False ("not 5")."""

    def fit(self, X, y=None):
        """Learn nothing and return the estimator itself."""
        return self

    def predict(self, X):
        """Return an all-False boolean array with one row per sample in X.

        NOTE(review): the result is a column vector of shape (len(X), 1);
        scikit-learn predictors conventionally return shape (n_samples,).
        """
        n_samples = len(X)
        return np.zeros(shape=(n_samples, 1), dtype=bool)

In [None]:
# Cross-validated accuracy of the always-"not 5" baseline, for comparison
# with the accuracy of the SGD classifier.
never_5_clf=Never5Classifier()
cross_val_score(never_5_clf,X_train,y_train_5,cv=3,scoring="accuracy")

#### 오차 행렬
- cross_val_predict(): k-겹 교차 검증을 수행하지만 평가 점수를 반환하지 않고 각 테스트 폴드에서 얻은 예측을 반환한다.


In [None]:
# To build a confusion matrix we first need predictions that can be compared
# with the actual targets.
from sklearn.model_selection import cross_val_predict

# 3-fold cross-validation that returns the prediction made for each sample
# while it was in a test fold, instead of returning scores.
y_train_pred=cross_val_predict(sgd_clf,X_train,y_train_5,cv=3)

In [None]:
# Build the confusion matrix from the actual targets and the cross-validated predictions.
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train_5,y_train_pred)

In [None]:
# Pretend the classifier is perfect by using the true labels as predictions;
# the resulting confusion matrix has non-zero counts only on its main diagonal.
y_train_perfect_predictions=y_train_5 
confusion_matrix(y_train_5,y_train_perfect_predictions)

#### 정밀도와 재현율

- 정밀도: TP / (TP + FP)
- 재현율: TP / (TP + FN)



In [None]:
from sklearn.metrics import precision_score,recall_score
# Precision of the cross-validated predictions (recall_score is used in a later cell).
precision_score(y_train_5,y_train_pred)

In [None]:
# Compute the F1 score of the cross-validated predictions.
from sklearn.metrics import f1_score
f1_score(y_train_5,y_train_pred)

In [None]:
# Get the raw decision score for some_digit instead of a class prediction.
y_scores=sgd_clf.decision_function([some_digit])
print(y_scores)



In [None]:
# Apply a manual decision threshold: the sample counts as positive only if
# its score is strictly greater than 8000.
threshold=8000
y_some_digit_pred=(y_scores>threshold)
y_some_digit_pred

In [None]:
# Decision scores (rather than class predictions) for the training samples,
# obtained with 3-fold cross-validation. This rebinds y_scores from the
# single-sample score computed earlier.
y_scores=cross_val_predict(sgd_clf,X_train,y_train_5,cv=3, method="decision_function")

In [None]:
from sklearn.metrics import precision_recall_curve
# Compute precision and recall over the range of candidate decision thresholds.
precisions,recalls,thresholds=precision_recall_curve(y_train_5,y_scores)

In [None]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall as functions of the decision threshold.

    The last element of ``precisions`` and ``recalls`` is dropped so both
    curves line up with ``thresholds`` (precision_recall_curve returns one
    more precision/recall value than thresholds).

    Draws on the current matplotlib figure; the caller is responsible for
    calling ``plt.show()``.
    """
    plt.plot(thresholds, precisions[:-1], "b--", label="정밀도")
    plt.plot(thresholds, recalls[:-1], "g-", label="재현율")
    # Without a legend the labels passed above are never displayed.
    # NOTE(review): the Hangul labels need a font with Korean glyphs to
    # render correctly - confirm the notebook's matplotlib font setup.
    plt.legend()

# Draw the precision and recall curves for the cross-validated scores and display the figure.
plot_precision_recall_vs_threshold(precisions,recalls,thresholds)
plt.show()

In [None]:
# np.argmax on a boolean array gives the index of the first True, so this
# picks the threshold at the first position where precision is at least 0.90.
threshold_90_precision=thresholds[np.argmax(precisions>=0.90)]

In [None]:
# Classify a sample as positive when its score reaches the selected threshold.
y_train_pred_90=(y_scores>=threshold_90_precision)

In [None]:
# Precision of the predictions made with the selected threshold.
precision_score(y_train_5,y_train_pred_90)


In [None]:
# Recall of the predictions made with the selected threshold.
recall_score(y_train_5,y_train_pred_90)