In [None]:
# Load the dataset. Note that load_digits() returns scikit-learn's bundled set of
# 8x8 handwritten digits (the UCI optical-recognition digits), not the 28x28 MNIST set.
# It arrives as one array of samples plus one array of labels; the train/test split is done further down.
from sklearn.datasets import load_digits
digitdata = load_digits()

In [None]:
# NOTE(review): looks like a leftover kernel smoke test; it does not touch the dataset — consider deleting this cell.
print("Hello World")

In [None]:
# Check the container type of the samples: scikit-learn hands them over as a NumPy array.
type(digitdata.data)

In [None]:
# The type alone does not tell the whole story. Look at the shapes of the flattened samples,
# the labels and the 2-D images to see how high-dimensional the data is.
(digitdata.data.shape, digitdata.target.shape, digitdata.images.shape)

In [None]:
# From the above, we can see that our dataset contains 1797 images, each 8x8 in dimension and 1797 labels.
# So, each picture can be categorized as a 64 x 1 vector! The labels allow for us to classify each image. 
# Now, we can begin working on the data. 
# The code below will display some of the data for visualization!
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# One row of five panels, one per sample.
fig, axes = plt.subplots(1, 5, figsize=(20,4))
# Pair every axis with one of the first five flattened samples and its label.
for ax, pixels, digit in zip(axes, digitdata.data[:5], digitdata.target[:5]):
    # Each sample is stored flat (64 values); fold it back into 8x8 for display.
    ax.imshow(pixels.reshape(8, 8), cmap=plt.cm.gray)
    ax.set_title('Training: %i\n' % digit, fontsize = 20);

In [None]:
# Above are the first 5 images of the dataset, showing 5 different digits.
# These are grayscale images; in this dataset each pixel is an intensity between 0 and 16.
# We can now split the digits into training and test data.
# Scikit-learn does not ship a predefined split for this dataset, so train_test_split
# holds out a random 25% of the samples for testing; random_state=0 makes that split repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(digitdata.data, 
                                                    digitdata.target,
                                                   test_size=0.25,
                                                   random_state=0)

In [None]:
# As before, inspect the shapes of the training and test feature matrices.
X_train.shape, X_test.shape

In [None]:
# From the shapes above, each image is still represented as a 64-element vector. The training set contains
# 1347 samples and the test set contains 450 samples.
# For the purpose of this project, I am going to test logistic regression on this data, before and after
# applying dimension reduction methods. My hypothesis is that plain logistic regression will work best.
# Let's import logistic regression from scikit-learn!
from sklearn.linear_model import LogisticRegression
# Logistic regression classifier with an L2 penalty.
# `multi_class` is deliberately not passed: 'auto' has been the default since
# scikit-learn 0.22, and passing the argument explicitly raises a FutureWarning
# from 1.5 onwards because the parameter is being removed.
clf = LogisticRegression(fit_intercept=True,
                        penalty='l2', # L2 (ridge-style) regularisation; for background see "An Introduction to Statistical Learning"
                        solver='saga',
                        random_state=0, # 'saga' shuffles the samples, so pin the seed to get the same weights on every run
                        max_iter=10000, # generous cap on solver iterations
                        C=50) # C is the inverse regularisation strength, so 50 is a weak penalty

In [None]:
# Display the (not yet fitted) estimator to review how it is configured.
clf

In [None]:
# Fit on the training split, then list the class labels the model learned.
# There should be ten of them, one per digit. fit() returns the estimator
# itself, so the attribute can be read straight off the call.
clf.fit(X_train, y_train).classes_

In [None]:
# Ten classes were found. The model holds one weight per pixel for each class,
# so the coefficient matrix should have shape (10, 64): 10 labels x 64 pixels.
clf.coef_.shape


In [None]:
# Inspect the weights more closely: the 64 pixel weights for the digit 9, rounded to 2 decimals.
clf.coef_[9].round(2)

In [None]:
# Next, the intercept (bias) terms: one value per class.
clf.intercept_

In [None]:
# See how long fitting took: n_iter_ holds the number of iterations the solver ran
# before reaching the tolerance (or hitting max_iter).
clf.n_iter_[0]

In [None]:
# The weights of a single class can also be drawn as an 8x8 image.
coef = clf.coef_.copy()
plt.imshow(np.round(coef[0], 2).reshape(8, 8))
# A single panel only covers class 0, though,
# so next we draw one picture of the coefficients per class, 10 in total.

In [None]:
# Draw one 8x8 weight map per class. All panels share a symmetric colour range
# (-scale .. +scale), so they are directly comparable and a weight of zero sits
# at the centre of the diverging red/blue colormap.
coef = clf.coef_.copy()
scale = np.abs(coef).max()
plt.figure(figsize=(12,5))

for i in range(10): # 0-9
    # 2 rows x 5 columns holds exactly the ten classes; the previous 3x4 grid
    # left two empty slots and squeezed every panel.
    coef_plot = plt.subplot(2, 5, i + 1)

    coef_plot.imshow(coef[i].reshape(8,8), 
                     cmap=plt.cm.RdBu,
                     vmin=-scale, vmax=scale,
                    interpolation='bilinear')
    
    # Pixel coordinates carry no information here, so hide the ticks.
    coef_plot.set_xticks(()); coef_plot.set_yticks(())
    coef_plot.set_xlabel(f'Class {i}')

plt.suptitle('Coefficients for various classes');

In [None]:
# Now, let's focus on predictions for the digits data!

In [None]:
# Compare the predictions for the first nine test samples with their true labels.
print(clf.predict(X_test[:9]))
print(y_test[:9])
# We also need accuracy on both the training and the test data.
# First, the data the model was fitted on:
clf.score(X_train, y_train) # training score

In [None]:
# Accuracy on the held-out test split; kept in `score` because the heatmap title reuses it.
score = clf.score(X_test, y_test) # test score
score

In [None]:
# So, we have 100% accuracy for the training data, and yet 95.5% for the testing score!
# not bad!
# Now, we begin on the Confusion matrix. 
# For reference, please visit https://en.wikipedia.org/wiki/Confusion_matrix
# We do this by using scikit package 
from sklearn import metrics
# Predicted label for every test sample (despite the name, these are predictions, not an accuracy).
PredictionAccuracy = clf.predict(X_test)

# Tabulate true vs. predicted labels; rows and columns follow the order of clf.classes_.
cm = metrics.confusion_matrix(y_test, PredictionAccuracy, labels=clf.classes_)
cm

In [None]:
# The raw matrix printed above is hard to read at a glance.
# So, let's import seaborn! It is a nice package for statistical visualization, including heatmaps
import seaborn as sns

# Heatmap of the confusion matrix with the count written in every cell.
plt.figure(figsize=(10,10))
# fmt='d' prints the counts as plain integers; the default '.2g' would
# render any count of 100 or more in scientific notation (e.g. '1e+02').
sns.heatmap(cm, annot=True, fmt='d',
            linewidths=.5, square = True, cmap = 'YlOrRd');

plt.ylabel('Correct label')
plt.xlabel('Predicted label')
# Put the overall test accuracy in the title so the figure stands on its own.
all_sample_title = 'Model Accuracy Score: {0}'.format(score)
plt.title(all_sample_title);

In [None]:
# Now, let's focus on the digits which were misclassified!
# Positions (into X_test / y_test) of the test samples whose prediction differs
# from the true label. Comparing the two arrays element-wise replaces the manual
# counter loop and no longer overwrites the global `index`.
misclassified_images = np.flatnonzero(y_test != PredictionAccuracy).tolist()
print(misclassified_images)

plt.figure(figsize=(10,10))
plt.suptitle('Misclassifications');

# Show at most the first 20 mistakes on a 4x5 grid.
for plot_index, bad_index in enumerate(misclassified_images[0:20]):
    p = plt.subplot(4,5, plot_index+1)
    
    # Samples are stored flat (64 values); fold back into 8x8 for display.
    p.imshow(X_test[bad_index].reshape(8,8), cmap=plt.cm.gray,
            interpolation='bilinear')
    p.set_xticks(()); p.set_yticks(())
    
    p.set_title(f'Pred: {PredictionAccuracy[bad_index]}, Actual: {y_test[bad_index]}');