In [None]:
# Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org>
# License: BSD 3 clause
# The following workshop is a more in-depth tutorial based off of the MNIST tutorial from sklearn

# Workshop 1 - Classification of Handwritten Digits from MNIST Dataset

### The objective of the workshop is to learn a simple machine learning workflow in order to solve a classic machine learning problem: classifying handwritten digits from 0-9

#### Import some bread and butter libraries in Python

In [None]:
# Standard scientific Python imports
import matplotlib.pyplot as plt

# Import datasets, classifiers and performance metrics
from sklearn import datasets, svm, neural_network,metrics

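#### Load the handwritten digits dataset bundled with scikit-learn (a small MNIST-style dataset of 8x8 images, not the original 28x28 MNIST)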

In [None]:
# Load scikit-learn's bundled handwritten-digits dataset. The returned object
# exposes the `images` and `target` fields that the following cells rely on.
digits = datasets.load_digits()

#### Inspect the dataset: knowing what format the data is presented in and how it was generated can give insight into creating better models. The dataset in this workshop is a very popular one used for digit classification, and sklearn has formatted it into its own object type. As seen here, the data is split into different "fields".

#### Format data into feature variables and target variables list, inspect first element. We expect to see an 8x8 matrix as an array, paired with its target variable. 

In [None]:
# Pair every image matrix with its target digit as (image, label) tuples.
images_and_labels = [
    (img, lbl) for img, lbl in zip(digits.images, digits.target)
]

#### The data that we are interested in is made up of 8x8 images of digits. Let's have a look at the first 4 images, stored in the "images" attribute of the dataset.  NOTE: If we were working from image files, we could load them using matplotlib.pyplot.imread.  Notice that each image must have the same size. For these images, we know which digit they represent: it is given in the "target" of the dataset.

In [None]:
# Draw the first four training images side by side in a single figure.
# The figure is shown once, after the loop: calling plt.show() inside the loop
# would emit a separate, mostly empty figure for every image.
fig, axes = plt.subplots(1, 4, figsize=(8, 2.5))
for ax, (image, label) in zip(axes, images_and_labels[:4]):
    ax.set_axis_off()
    # gray_r renders high pixel values as dark ink on a light background.
    ax.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    ax.set_title('Training: %i' % label)
plt.show()

#### In order to feed the image data into a classifier, we flatten each image matrix into a vector

In [None]:
# Flatten each 2-D image into a 1-D feature vector: one row per sample,
# with the remaining dimensions collapsed into a single axis.
n_samples = digits.images.shape[0]
data = digits.images.reshape(n_samples, -1)

#### Below, we verify the shape of the first sample to see if we have correctly flattened the image matrix into a vector.

In [None]:
# Shape of one flattened sample; a flattened 8x8 image should give (64,).
data[0].shape 

#### Initialize a classifier from sklearn. We will use an "SVM" (support vector machine), which will be described in more depth later in the course. In sklearn, this is reduced to a simple function call.

In [None]:
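# Initialize an SVM model using the `svm` module imported earlier, and store it
# in a variable named `classifier` (the reporting cells below refer to that name)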

#### We then fit the classifier to the data, using the following format: clf.fit(X, Y), where X is the feature matrix and Y is the target variable. Note that the number of samples in the feature matrix must correspond to the number of samples in Y.

In [None]:
# Use the classifier's fit method to learn the digits on the first half of the
# data (the first n_samples // 2 rows of `data` and `digits.target`)


#### Using the classifier, predict on the rest of the data

In [None]:
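# Now predict the value of the digit on the second half.
# Store the true labels in `expected` and the predictions in `predicted`;
# the reporting and plotting cells below refer to those names.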

#### The classification report includes the classification metrics used, notably precision, recall, f1, and support

In [None]:
# Per-class precision / recall / f1 / support for the held-out predictions,
# printed together with the classifier's repr.
report = metrics.classification_report(expected, predicted)
print("Classification report for classifier %s:\n%s\n" % (classifier, report))

#### Print out the confusion matrix to see which classes are incorrectly classified

In [None]:
# Confusion matrix of true labels against predicted labels.
confusion = metrics.confusion_matrix(expected, predicted)
print("Confusion matrix:\n%s" % confusion)

### Print out some of the predictions and some of their corresponding images

In [None]:
# Pair each image from the second half of the dataset with its predicted digit
# and draw the first four in a single figure.
# The figure is shown once, after the loop: calling plt.show() inside the loop
# would emit a separate, mostly empty figure for every image.
images_and_predictions = list(zip(digits.images[n_samples // 2:], predicted))
fig, axes = plt.subplots(1, 4, figsize=(8, 2.5))
for ax, (image, prediction) in zip(axes, images_and_predictions[:4]):
    ax.set_axis_off()
    # gray_r renders high pixel values as dark ink on a light background.
    ax.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    ax.set_title('Prediction: %i' % prediction)
plt.show()

### Some things to play with during the workshop: try changing the train/test split; create a pivot variable to make the code more legible; create parameters so that users can easily change the model and evaluate the workflow

In [None]:
# Seeing is believing: sample around 20 examples and compare the expected
# output against the predictions made by each classifier

In [None]:
# import a few classifiers, explore from http://scikit-learn.org/stable/supervised_learning.html to compare classification algo performance.