In [1]:
import numpy as np
from sklearn.datasets import fetch_openml

# as_frame=False makes fetch_openml return NumPy arrays instead of a pandas
# DataFrame; the fold indexing used later (X_train[train_index]) relies on
# positional row indexing, which a DataFrame does not provide via [].
mnist = fetch_openml("mnist_784", version=1, as_frame=False)

In [2]:
# Pull features and labels out of the dataset; the target column comes back
# as strings, so cast it to unsigned 8-bit integers before comparing to digits.
X, y = mnist.data, mnist.target.astype(np.uint8)

# The first 60,000 samples form the training set, the remainder the test set.
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

In [3]:
# Boolean targets for the binary "is this digit a 5?" task:
# True for every 5, False for every other digit.
y_train_5, y_test_5 = (y_train == 5), (y_test == 5)

In [4]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(
    max_iter=1000,
    tol=1e-3,
    random_state=42,  # fixed seed so repeated runs give the same model
)
sgd_clf.fit(X_train, y_train_5)  # train on the binary 5 / not-5 labels

SGDClassifier(random_state=42)

## Measuring Accuracy Using Cross-Validation
    - The book uses a hand-written cross-validation loop; we are going to explore its building blocks a bit to understand it better.

In [24]:
from sklearn.model_selection import StratifiedKFold
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html

from sklearn.base import clone

# Dummy data: five samples with two features each.
X_array = np.array([[1, 2], [3, 4], [4, 3], [2, 1], [2, 1]])
# Dummy labels: class 1 appears twice, class 2 three times.
y_array = np.array([1, 2, 1, 2, 2])

# Two folds: the rarest class (label 1) has only two samples, so it can be
# represented in at most two test folds.
skfolds = StratifiedKFold(n_splits=2)

# Each iteration yields the row indices of the training part and the test part.
for train_index, test_index in skfolds.split(X_array, y_array):
    print(train_index, test_index)


[2 4] [0 1 3]
[0 1 3] [2 4]


In [28]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()

# !!! clone() only works on estimators, see
# https://scikit-learn.org/stable/modules/generated/sklearn.base.clone.html?highlight=clone#sklearn.base.clone
clone_lin_reg = clone(lin_reg)

# Display the copy next to the original.
clone_lin_reg, lin_reg

(LinearRegression(), LinearRegression())

#### The StratifiedKFold class performs stratified sampling to produce folds that contain a representative ratio of each class.

#### So let's see the for loop they use

In [21]:
skfolds = StratifiedKFold(n_splits=3)

# Hand-written 3-fold cross-validation: for every split, train a fresh copy of
# the classifier on the training folds and print its accuracy on the held-out fold.
for train_index, test_index in skfolds.split(X_train, y_train_5):
    # Clone so every fold starts from an unfitted estimator with the same parameters.
    clone_clf = clone(sgd_clf)
    # Index arrays select rows here, which assumes X_train / y_train_5 are NumPy arrays.
    X_train_folds = X_train[train_index]
    y_train_folds = y_train_5[train_index]
    X_test_fold = X_train[test_index]
    y_test_fold = y_train_5[test_index]
    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    # Count the correct predictions (True counts as 1). The vectorized .sum()
    # avoids looping over every element in Python like the builtin sum() does.
    n_correct = (y_pred == y_test_fold).sum()
    print(n_correct / len(y_pred))  # accuracy on this fold

0.95035
0.96035
0.9604


In [36]:
array_1 = np.array([0, 1, 1, 0])
array_2 = np.array([0, 1, 0, 0])

# Pretty handy trick: comparing two arrays element-wise gives a boolean mask,
# and summing that mask counts the matching positions (True counts as 1).
elementwise_match = array_1 == array_2
elementwise_match, sum(elementwise_match)

(array([ True,  True, False,  True]), 3)

In [29]:
from sklearn.model_selection import cross_val_score
# Same 3-fold accuracy evaluation as the manual loop, done by scikit-learn.
cross_val_score(
    sgd_clf,
    X_train,
    y_train_5,
    cv=3,
    scoring="accuracy",
)

array([0.95035, 0.96035, 0.9604 ])

### The accuracy scores are very misleading 
    - This is simply because only about 10% of the images are 5s, so if you always guess that an image is not a 5, you will be right about 90% of the time.
    - This demonstrates why accuracy is generally not the preferred performance measure for classifiers, especially when you are dealing with skewed datasets (i.e., when some classes are much more frequent than others).

In [56]:
# let's explore the "dumb classifier" in the book

from sklearn.base import BaseEstimator

class Never5Classifier(BaseEstimator):
    """Baseline "classifier" that predicts False (not a 5) for every sample."""

    def fit(self, X, y=None):
        """Learn nothing; return self so the estimator follows the scikit-learn fit convention."""
        return self

    def predict(self, X):
        """Return an all-False boolean array of shape (len(X), 1), one row per sample."""
        return np.zeros((len(X), 1), dtype=bool)

In [59]:
never_5_clf = Never5Classifier()

# Tiny dummy dataset: eight samples, exactly one of which (1/8) is labelled 5.
array_5_Xtrain = np.array([1, 2, 3, 4, 5, 6, 7, 0])  # dummy features
array_5_ytrain = np.array([5, 0, 0, 0, 0, 0, 0, 0])  # dummy labels

In [60]:
# Use the dumb classifier on the dummy features defined above; the original
# call referenced `array_5`, a name that is never defined in this notebook.
never_5_clf.predict(array_5_Xtrain)

array([[False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False]])

In [64]:
# Evaluate the dumb classifier with 2-fold cross-validation on the dummy data.
cross_val_score(
    never_5_clf,
    array_5_Xtrain,
    array_5_ytrain,
    cv=2,
    scoring="accuracy",
)

array([0.75, 1.  ])

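#### We get 75% and 100% accuracy :) — from a classifier that never predicts a 5, which is exactly why accuracy alone is misleading on skewed data.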