In [None]:
import numpy as np
from sklearn.datasets import fetch_openml
# Request plain NumPy arrays. Since scikit-learn 0.24 the default (as_frame='auto')
# returns pandas objects for this dataset, which breaks the positional indexing
# (X[0]) and .reshape() calls used later in this notebook.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
mnist.keys()

In [None]:
X, y = mnist['data'], mnist['target']

# OpenML serves the labels as strings ('0'..'9'). Cast them to integers so that
# later comparisons (y_train == 5, y_train >= 7) and arithmetic (y_train % 2)
# behave as intended instead of comparing strings against numbers.
y = y.astype(np.uint8)

print('X Shape: {}\ny Shape: {}'.format(X.shape, y.shape))

There are 70,000 images and each image has 784 features (28x28 pixels). Each feature represents one pixel's intensity - from 0 (white) to 255 (black).

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt

# Take the first sample and fold its 784 pixel values back into a 28x28 grid.
some_digit = X[0]
some_digit_image = some_digit.reshape((28, 28))

# Draw the grid with the binary colormap, no smoothing and no axis decorations.
plt.imshow(
    some_digit_image,
    interpolation='nearest',
    cmap=mpl.cm.binary,
)
plt.axis('off')
plt.show()

##### Create a Train & Test Set

In [None]:
# MNIST ships pre-split: the first 60,000 images are the training set and the
# remaining 10,000 images are the test set.
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

##### Measuring Accuracy

A good way to evaluate a model is to use cross-validation. 

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score

# Binary targets for a "5 vs. not-5" detector: True wherever the label is 5.
y_train_5 = y_train == 5
y_test_5 = y_test == 5

# Classifier trained with Stochastic Gradient Descent (SGD), with a fixed seed.
sgd_clf = SGDClassifier(random_state=42)

# 3-fold cross-validated accuracy on the training set.
cross_val_score(sgd_clf, X_train, y_train_5, scoring='accuracy', cv=3)

In [4]:
from sklearn.base import BaseEstimator

class Never5Classifier(BaseEstimator):
    """Baseline "classifier" that predicts not-5 (False) for every sample."""

    def fit(self, X, y=None):
        """Learn nothing; return self, as the scikit-learn estimator API expects."""
        return self

    def predict(self, X):
        """Return an all-False boolean array of shape (len(X), 1)."""
        return np.zeros((len(X), 1), dtype=bool)


# Score the baseline exactly like the SGD model so the two accuracies are comparable.
never_5_clf = Never5Classifier()
cross_val_score(never_5_clf, X_train, y_train_5, cv=3, scoring='accuracy')

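Only about 10% of the images are 5s, so a classifier that always predicts "not 5" is still right about 90% of the time. This demonstrates why accuracy is generally not the preferred performance measure for classifiers, especially when you're dealing with skewed datasets.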

A better method to evaluate performance is to use a confusion matrix.

In [5]:
from sklearn.model_selection import cross_val_predict

# Cross-validated predictions for the training set, using 3 folds.
y_train_pred = cross_val_predict(
    sgd_clf,
    X_train,
    y_train_5,
    cv=3,
)

In [6]:
from sklearn.metrics import confusion_matrix

# Rows are the actual classes, columns are the predicted classes.
confusion_matrix(y_true=y_train_5, y_pred=y_train_pred)

##### Precision & Recall

In [7]:
from sklearn.metrics import precision_score, recall_score

# A notebook cell only displays its last expression, so return both metrics
# together as a (precision, recall) pair; as separate statements the precision
# value was computed and then silently discarded.
precision_score(y_train_5, y_train_pred), recall_score(y_train_5, y_train_pred)

We can combine precision and recall into a single metric called F1 Score.

In [8]:
from sklearn.metrics import f1_score
# F1 score of the 5-detector on the cross-validated predictions.
f1_score(y_true=y_train_5, y_pred=y_train_pred)

##### ROC Curve & AUC

In [9]:
from sklearn.metrics import roc_curve

# roc_curve needs a continuous score per sample rather than hard class predictions.
# y_scores was never computed anywhere in this notebook, so derive cross-validated
# decision scores here instead of relying on leftover kernel state.
y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3, method='decision_function')

fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)

def plot_roc_curve(fpr, tpr, label=None):
    """Draw a ROC curve plus a dashed diagonal reference line.

    fpr, tpr -- false / true positive rates to plot on the x / y axes.
    label    -- optional legend entry for the curve.
    """
    plt.plot(fpr, tpr, label=label, linewidth=2)
    plt.plot((0, 1), (0, 1), 'k--')  # dashed diagonal
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate (Recall)')


plot_roc_curve(fpr, tpr)
plt.show()

In [10]:
from sklearn.metrics import roc_auc_score

# Area under the SGD classifier's ROC curve.
roc_auc_score(y_true=y_train_5, y_score=y_scores)

In [11]:
# Compare the RandomForestClassifier's ROC curve and ROC AUC score with the SGDClassifier's.

from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(random_state=42)

# Cross-validated class probabilities, one column per class.
y_probas_forest = cross_val_predict(
    forest_clf, X_train, y_train_5, method='predict_proba', cv=3
)

# ROC curves need one score per sample, not a probability per class:
# use the probability of the positive class (column 1) as the score.
y_scores_forest = y_probas_forest[:, 1]
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5, y_scores_forest)

# Dotted blue line for SGD, then the forest curve via the shared helper.
plt.plot(fpr, tpr, 'b:', label='SGD')
plot_roc_curve(fpr_forest, tpr_forest, label='Random Forest')
plt.legend(loc='lower right')
plt.show()

In [None]:
# Area under the random forest's ROC curve.
roc_auc_score(y_true=y_train_5, y_score=y_scores_forest)

### Multiclass Classification

In [51]:
# Scikit-Learn detects when a binary classifier is used for a multiclass task
# and automatically runs OvA (except for SVM, where it uses OvO).

# Train on the full digit labels this time (previously 'y_train_5').
sgd_clf.fit(X=X_train, y=y_train)
sgd_clf.predict(X=[some_digit])

In [None]:
# As proof, decision_function() should now return 10 scores, one per class.
# Note the highest score is for the 5.
some_digit_scores = sgd_clf.decision_function(X=[some_digit])
some_digit_scores

Scikit-Learn can also be forced to use OvO or OvA.

In [52]:
from sklearn.multiclass import OneVsOneClassifier

ovo_clf = OneVsOneClassifier(SGDClassifier(random_state=42))
ovo_clf.fit(X_train, y_train)

# Print the prediction explicitly: a bare expression in the middle of a cell is
# evaluated but never displayed.
print('Prediction: {}'.format(ovo_clf.predict([some_digit])))

# estimators_ holds one binary classifier per pair of classes (10 * 9 / 2 = 45 for
# the ten digits), so this counts classifiers, not classes.
print('Number of binary classifiers trained: {}'.format(len(ovo_clf.estimators_)))

In [None]:
# RandomForest can handle multiclass classification

forest_clf.fit(X_train, y_train)

# Print the predicted class explicitly: only the last expression of a cell is
# displayed, so the bare predict() call was previously discarded.
print('Prediction: {}'.format(forest_clf.predict([some_digit])))

# Per-class probability estimates for the same image (displayed as the cell output).
forest_clf.predict_proba([some_digit])

The probability array shows that the model estimates a 0.9 probability that the image is a 5.

Now of course we can evaluate these classifiers using cross-validation.

In [None]:
# 3-fold cross-validated accuracy of the SGDClassifier on all ten classes.
cross_val_score(sgd_clf, X_train, y_train, scoring='accuracy', cv=3)

84% is not that bad. We can scale the inputs to increase accuracy to 89%.

In [53]:
from sklearn.preprocessing import StandardScaler

# Standardise the pixel features (cast to float64 first), then repeat the same
# 3-fold accuracy evaluation on the scaled inputs.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(
    X_train.astype(np.float64)
)
cross_val_score(sgd_clf, X_train_scaled, y_train, scoring='accuracy', cv=3)

#### Error Analysis

Confusion Matrix

In [None]:
# Cross-validated multiclass predictions on the scaled inputs, then the 10x10 confusion matrix.
y_train_pred = cross_val_predict(
    sgd_clf, X_train_scaled, y_train, cv=3
)
conf_mx = confusion_matrix(y_true=y_train, y_pred=y_train_pred)
conf_mx

In [None]:
# Hopefully the diagonal cells are mostly white - black would mean a lot of error.
plt.matshow(
    conf_mx,
    cmap=plt.cm.gray,
)
plt.show()

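We need to determine whether the weaker result on 5s is due to a lack of data (fewer 5s in the dataset) or because the classifier simply performs worse on 5s. Dividing each row of the confusion matrix by the number of images in that class lets us compare error rates instead of absolute error counts.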

In [None]:
# Divide each row by its total so every cell becomes a fraction of that row's
# samples rather than a raw count.
row_sums = np.sum(conf_mx, axis=1, keepdims=True)
norm_conf_mx = conf_mx / row_sums

# Zero the diagonal to hide the correct predictions and leave only the errors visible.
np.fill_diagonal(norm_conf_mx, 0)
plt.matshow(
    norm_conf_mx,
    cmap=plt.cm.gray,
)
plt.show()

The 8 column (predicted class) has many misclassifications, meaning many other digits are wrongly predicted as 8s. But the 8 row (actual class) isn't bad at all, which means most 8s are correctly classified.

The 3s and 5s also get confused with each other.

In [None]:
def plot_digits(instances, images_per_row=10, **options):
    """Draw flattened 28x28 digit images as one grid on the current axes.

    instances      -- sequence of 784-value images.
    images_per_row -- maximum number of images per grid row.
    options        -- extra keyword arguments forwarded to plt.imshow.
    """
    size = 28
    instances = np.asarray(instances)
    plt.axis('off')
    if len(instances) == 0:
        # Nothing to draw (e.g. no sample matched the selection); leave the panel blank.
        return
    images_per_row = min(len(instances), images_per_row)
    n_rows = (len(instances) - 1) // images_per_row + 1
    # Pad with blank images so the last row of the grid is complete.
    n_empty = n_rows * images_per_row - len(instances)
    padded = np.concatenate(
        [instances.reshape(-1, size * size), np.zeros((n_empty, size * size))],
        axis=0,
    )
    # (rows, cols, 28, 28) -> (rows, 28, cols, 28) -> one big 2-D image.
    grid = (
        padded.reshape(n_rows, images_per_row, size, size)
        .transpose(0, 2, 1, 3)
        .reshape(n_rows * size, images_per_row * size)
    )
    plt.imshow(grid, cmap=plt.cm.binary, **options)


cl_a, cl_b = 3, 5

# Split the training images of the two classes by (actual, predicted) label.
X_aa = X_train[(y_train == cl_a) & (y_train_pred == cl_a)]
X_ab = X_train[(y_train == cl_a) & (y_train_pred == cl_b)]
X_ba = X_train[(y_train == cl_b) & (y_train_pred == cl_a)]
X_bb = X_train[(y_train == cl_b) & (y_train_pred == cl_b)]

# 2x2 layout: top row is actual 3s, bottom row is actual 5s;
# left column is predicted 3, right column is predicted 5.
plt.figure(figsize=(8, 8))
plt.subplot(221); plot_digits(X_aa[:25], images_per_row=5)
plt.subplot(222); plot_digits(X_ab[:25], images_per_row=5)
plt.subplot(223); plot_digits(X_ba[:25], images_per_row=5)
plt.subplot(224); plot_digits(X_bb[:25], images_per_row=5)
plt.show()

The reason for this confusion is that SGDClassifier is a linear model. It just assigns a weight per class to each pixel, and when it sees a new image it just sums up the weighted pixel intensities to get a score for each class. So since 3s and 5s differ by only a few pixels, the model easily confuses them.

The main difference between 3s and 5s is the position of the small line that joins the top line to the bottom arc. This classifier is quite sensitive to this.

### Multilabel Classification

Our goal here is to output multiple binary tags for each instance. For example, suppose a face-recognition system has been trained to recognize three people. When it is shown a picture containing the first and third person, it should output [1, 0, 1].

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Two binary labels per image: "is the digit large (7, 8 or 9)?" and "is it odd?".
y_train_large = y_train >= 7
y_train_odd = y_train % 2 == 1
y_multilabel = np.column_stack((y_train_large, y_train_odd))

knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_multilabel)

# One prediction per label for the sample digit.
knn_clf.predict([some_digit])

To evaluate a multilabel classifier, we can use the F1 score.

In [None]:
y_train_knn_pred = cross_val_predict(
    knn_clf, X_train, y_multilabel, cv=3
)

# 'macro' gives every label equal weight; use average='weighted' instead to weight
# each label by its support when the labels are unbalanced.
f1_score(y_multilabel, y_train_knn_pred, average='macro')

### Multioutput Classification

Multioutput classification is a generalization of multilabel classification where each label can be multiclass (i.e., it can have more than 2 possible values).

Let's say we want to remove noise from images. The classifier's output is multilabel (one label per pixel) and each label can have multiple values (pixel intensity ranges from 0 to 255). This is an example of multioutput classification.

In [None]:
# Use a dedicated, seeded generator so the noisy images are identical on every run.
rng = np.random.RandomState(42)

# Inputs: the original images plus integer noise in [0, 100) on every pixel.
# The noise takes the shape of the data itself rather than a hard-coded 784.
noise = rng.randint(0, 100, X_train.shape)
X_train_mod = X_train + noise
noise = rng.randint(0, 100, X_test.shape)
X_test_mod = X_test + noise

# Targets: the original, clean images.
y_train_mod = X_train
y_test_mod = X_test

Let's train the classifier to clean the noisy background.

In [None]:
def plot_digit(data):
    """Show a single flattened 28x28 digit image on the current axes."""
    image = np.asarray(data).reshape(28, 28)
    plt.imshow(image, cmap=plt.cm.binary, interpolation='nearest')
    plt.axis('off')


# Index of the noisy test image to clean. Neither this index nor plot_digit was
# defined anywhere in the notebook; 0 is an arbitrary choice - change it freely.
some_index = 0

knn_clf.fit(X_train_mod, y_train_mod)
clean_digit = knn_clf.predict([X_test_mod[some_index]])
plot_digit(clean_digit)
plt.show()