# fetch_mldata was deprecated in scikit-learn 0.20 and removed in 0.22 (the
# mldata.org service it downloaded from is gone), so importing it fails on
# current releases. fetch_openml is the supported replacement; 'mnist_784'
# is the OpenML copy of the MNIST digits.
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, cache=True)
mnist

In [5]:
from sklearn.datasets import fetch_openml

In [None]:
# Fetch MNIST (OpenML dataset 'mnist_784', version 1) through scikit-learn;
# cache=True asks scikit-learn to keep the download for later runs.
mnist = fetch_openml(
    name='mnist_784',
    version=1,
    cache=True,
)

In [None]:
# Bare expression: when run as a notebook cell this echoes the loaded dataset object.
mnist

In [None]:
import numpy as np

# Split the dataset into features (x) and labels (y). Recent scikit-learn
# releases can return pandas objects from fetch_openml, which breaks the
# positional indexing used later in this notebook (x[60000],
# x_train[shuffle_index]). np.asarray normalises both to plain NumPy arrays
# and leaves them untouched when they already are arrays.
x, y = np.asarray(mnist["data"]), np.asarray(mnist["target"])

In [None]:
# Report the dimensions of the feature array and of the label array.
(x.shape, y.shape)

# exploring the dataset

In [None]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

In [None]:
# Pick the sample at index 60000 (the first row of the test slice taken below)
# and view its flat pixel vector as a 28x28 image.
some_digit = x[60000]
some_digit_image = some_digit.reshape((28, 28))

# Render it with the binary colormap, without axes.
plt.imshow(
    some_digit_image,
    cmap=matplotlib.cm.binary,
    interpolation="nearest",
)
plt.axis("off")
plt.show()

In [None]:
# The first 60,000 rows form the training set; everything after is the test set.
x_train, y_train = x[:60000], y[:60000]
x_test, y_test = x[60000:], y[60000:]

# Shuffling the training set, which guarantees that all cross-validation folds will be similar

In [None]:
import numpy as np

# Draw one random ordering of the 60,000 training indices and apply it to the
# features and the labels together, so each sample keeps its own label.
shuffle_index = np.random.permutation(60000)
x_train, y_train = x_train[shuffle_index], y_train[shuffle_index]

In [None]:
# Training a Binary Classifier
# Build boolean targets for a "5 vs. not-5" detector. The OpenML copy of MNIST
# stores its labels as strings ('0'..'9'), so they are cast to integers before
# the comparison; comparing the raw string labels with the int 5 never matches
# and would leave every target False.
y_train_5 = (y_train.astype(np.int8) == 5)  # True for all 5s, False for all other digits.

y_test_5 = (y_test.astype(np.int8) == 5)


In [None]:
# Bare expression: when run as a notebook cell this echoes the binary training targets.
y_train_5

# using Stochastic Gradient Descent (SGD) classifier


In [None]:
from sklearn.linear_model import SGDClassifier

In [None]:
# Fit an SGD-trained linear classifier on the binary "is it a 5?" targets.
# random_state is fixed so repeated runs on the same data give the same model.
sgd_clf = SGDClassifier(random_state=42)
# The training features are named x_train (lower case) in this notebook;
# the previous X_train raised NameError.
sgd_clf.fit(x_train, y_train_5)

In [None]:
# Ask the fitted classifier about the sample digit. predict works on a batch
# of samples, hence the single-element list.
single_sample_batch = [some_digit]
sgd_clf.predict(single_sample_batch)

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# 3-fold cross-validated accuracy of the binary 5-detector. It is scored
# against y_train_5, the targets sgd_clf is trained on; passing the raw
# 10-class y_train would evaluate a different (multiclass) problem.
cross_val_score(sgd_clf, x_train, y_train_5, cv=3, scoring="accuracy")

# Implementing Cross-Validation

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

In [None]:
# Manual 3-fold stratified cross-validation of the 5-detector: for each fold,
# train a fresh copy of the classifier on the other folds and print its
# accuracy on the held-out fold.
#
# random_state only has an effect together with shuffle=True, and recent
# scikit-learn raises ValueError when it is set without shuffling. The
# training set was already shuffled earlier, so the folds are left unshuffled
# and the unused seed is dropped.
skfolds = StratifiedKFold(n_splits=3)
for train_index, test_index in skfolds.split(x_train, y_train_5):
    clone_clf = clone(sgd_clf)  # unfitted copy with the same hyperparameters
    x_train_folds = x_train[train_index]
    y_train_folds = y_train_5[train_index]
    x_test_fold = x_train[test_index]
    y_test_fold = y_train_5[test_index]
    # Was X_train_folds (capital X), which is undefined and raised NameError.
    clone_clf.fit(x_train_folds, y_train_folds)
    y_pred = clone_clf.predict(x_test_fold)
    # Vectorised count of correct predictions instead of a Python-level sum().
    n_correct = (y_pred == y_test_fold).sum()
    print(n_correct / len(y_pred))  # accuracy on this fold