In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#Importing MNIST dataset from scikit-learn library

from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1)

In [3]:
mnist.keys()

dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'DESCR', 'details', 'categories', 'url'])

In [4]:
X = mnist['data']
y = mnist['target']

In [5]:
print(X.shape)
print(y.shape)

(70000, 784)
(70000,)


In [6]:
#X is a dataset having 70000 images and each image has 784 features.
#784 features refers to the image size of 28X28 pixels

X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [7]:
# y consists of the actual digits represented each 70000 images

y

array(['5', '0', '4', ..., '4', '5', '6'], dtype=object)

In [13]:
y = y.astype(np.uint8)   #Since y labels are strings, we need to make them integers

In [14]:
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

In [15]:
y_train_9 = (y_train == 9)
y_test_9 = (y_test == 9)

In [17]:
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)
sgd.fit(X_train, y_train_9)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=42, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [21]:
#To verify 
sgd.predict([X[4]])

array([ True])

In [22]:
#Above prediction says the digit in X[4] is indeed 9
#Lets check:-
y[4]

9

In [24]:
#So it works correctly
#Lets test on another digit
print(y[2])
sgd.predict([X[2]])

4


array([False])

In [25]:
#Seeing our model works correctly, we still need to measure its performance
#Hence now we will perform its PERFORMANCE MEASURE using K-fold Cross Validation

from sklearn.model_selection import cross_val_score
cross_val_score(sgd, X_train, y_train, cv=3, scoring='accuracy')

array([0.87365, 0.85835, 0.8689 ])

In [35]:
# 87% accuracy...Not that good accuracy percentage but does it accurately depict the accuracy?
# scoring='accuracy' measure isn't a good method especially for datasets where the distribution is partial
# For example the no. of 9s are far less than no. of non-9s. Hence predicting non-9 class is far very easy
# and will obviously have high accuracy.

#We see the no. of 9s are 1009
#and no. of non 9s are 8991

np.unique(y_test, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=uint8),
 array([ 980, 1135, 1032, 1010,  982,  892,  958, 1028,  974, 1009],
       dtype=int64))

In [36]:
# We need another method: Confusion Matrix
# For confusion matrix, we need to get the set of predictions instead of set of scores
# Hence we use cross_val_predict instead of cross_val_score

from sklearn.model_selection import cross_val_predict
y_train_pred = cross_val_predict(sgd, X_train, y_train_9, cv=3)

In [37]:
#Implementing Confusion Matrix

from sklearn.metrics import confusion_matrix
confusion_matrix(y_train_9, y_train_pred)

array([[52715,  1336],
       [ 1695,  4254]], dtype=int64)

In [38]:
#TP: 4254
#TN: 52715
#FP: 1336
#FN: 1695