## Handwriting Digits Classifier: Naive Bayes

### Setup

First, let's import a few common modules, ensure Matplotlib plots figures inline, and prepare a function to save the figures. We also check that Python 3.5 or later is installed (although Python 2.x may work, it is deprecated so we strongly recommend you use Python 3 instead), as well as Scikit-Learn ≥0.20.

In [None]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare the (major, minor) numbers rather than the raw strings: a string
# comparison is lexicographic, so e.g. "0.9" >= "0.20" would wrongly pass.
# The regex keeps only the leading digits, so suffixes like "rc1" are ignored.
_sk_major, _sk_minor = map(int, re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups())
assert (_sk_major, _sk_minor) >= (0, 20)

# Common imports
import numpy as np
import os

# seed NumPy's global random generator to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# label font sizes: 14 for the axes, 12 for the x/y tick labels
for _rc_group, _label_size in (('axes', 14), ('xtick', 12), ('ytick', 12)):
    mpl.rc(_rc_group, labelsize=_label_size)

# Where to save the figures: <PROJECT_ROOT_DIR>/images/<CHAPTER_ID>
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
# create the directory (and any missing parents); no error if it already exists
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current Matplotlib figure under IMAGES_PATH.

    The file is named ``<fig_id>.<fig_extension>``. ``resolution`` is passed
    to ``plt.savefig`` as ``dpi``. When ``tight_layout`` is true,
    ``plt.tight_layout()`` is applied before saving.
    """
    target = os.path.join(IMAGES_PATH, "".join((fig_id, ".", fig_extension)))
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, dpi=resolution, format=fig_extension)

In [None]:
#pip install -U scikit-learn

# MNIST

In [None]:
from sklearn.datasets import fetch_openml

# fetch the 'mnist_784' dataset from OpenML
mnist = fetch_openml('mnist_784')

# inspect the type of the returned object and the keys it exposes
print(type(mnist))
print(mnist.keys())

In [None]:
# X: explanatory data, y: target (class labels)
X = mnist['data']
y = mnist['target']


In [None]:
# check X: its type and shape
print(type(X), X.shape)

# check y: its type and shape
print(type(y), y.shape)

In [None]:
# convert X to a numpy array
# np.asarray accepts both a pandas DataFrame and an ndarray, so this cell can
# safely be re-run (X.values raises AttributeError once X is already an array)
X = np.asarray(X)

X


In [None]:
# show the raw values of the first row of X (one flattened image)
print(X[0])

In [None]:
# number of values in one 28x28 image
28 * 28

In [None]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

# take a single sample
# it is a flat vector of 784 (28*28) values, so reshape it to 28x28 to show it as an image
some_digit = X[0]
some_digit_image = some_digit.reshape(28, 28)

# draw the image with the binary colormap, hide the axes, and save the figure
plt.imshow(some_digit_image, cmap=mpl.cm.binary)
plt.axis("off")
save_fig("some_digit_plot")
plt.show()

In [None]:
# the class label of X[0] (shown before the integer conversion done in the next cell)
y[0]


In [None]:
# class labels of all data
# convert them to unsigned 8-bit integers

y = y.astype(np.uint8)

y

In [None]:
def plot_digit(data):
    """Draw one digit: reshape ``data`` to 28x28 and show it with the axes hidden."""
    plt.imshow(data.reshape(28, 28), interpolation="nearest", cmap=mpl.cm.binary)
    plt.axis("off")

In [None]:
# EXTRA
def plot_digits(instances, images_per_row=10, **options):
    """Draw several digits as one image grid.

    Each instance is reshaped to 28x28. The tiles are laid out left to right
    with at most ``images_per_row`` per row, and the last row is padded with
    zeros so that every row has the same width. Extra keyword arguments are
    forwarded to ``plt.imshow``.
    """
    side = 28
    images_per_row = min(len(instances), images_per_row)
    n_rows = (len(instances) - 1) // images_per_row + 1
    tiles = [instance.reshape(side, side) for instance in instances]
    # one zero-filled block as wide as all the missing tiles of the last row
    missing = n_rows * images_per_row - len(instances)
    tiles.append(np.zeros((side, side * missing)))
    strips = [
        np.concatenate(tiles[start:start + images_per_row], axis=1)
        for start in range(0, n_rows * images_per_row, images_per_row)
    ]
    grid = np.concatenate(strips, axis=0)
    plt.imshow(grid, cmap=mpl.cm.binary, **options)
    plt.axis("off")

In [None]:
# show the first 100 images as a grid with 10 images per row, and save the figure
plt.figure(figsize=(9,9))
example_images = X[:100]

plot_digits(example_images, images_per_row=10)
save_fig("more_digits_plot")
plt.show()

# Binary classifier

In [None]:
# Prepare binary classes
# y_b is True for the data with class label 5
# and False for all other data

y_b = (y == 5)
y_b

In [None]:
# training and testing split with test data size 30% (you can play with this hyper-parameter)
# only the first 1000 samples are used here

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X[:1000], y_b[:1000], test_size=0.3)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

#### Naive Bayes classifier



In [None]:
from sklearn.naive_bayes import GaussianNB

# create a Gaussian Naive Bayes classifier and fit it on the training data
nb_clf = GaussianNB().fit(X_train, y_train)

# accuracy from 3-fold cross-validation on the training data
cross_val_score(nb_clf, X_train, y_train, cv = 3, scoring='accuracy')

In [None]:
# predict the class labels for test data
# get the performance evaluation metrics

y_pred = nb_clf.predict(X_test)

# print the confusion matrix
print(confusion_matrix(y_test, y_pred))

# Print classification report, using readable names for the two classes

target_names = ['not 5', '5']

result_metrics = classification_report(y_test, y_pred, target_names = target_names)

print(result_metrics)

#### Stochastic Gradient Descent (SGD) classifier

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html

In [None]:
from sklearn.linear_model import SGDClassifier

# create an SGD classifier with a fixed random_state and fit it on the training data
sgd_clf = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)

sgd_clf.fit(X_train, y_train)

In [None]:
# accuracy from 3-fold cross-validation on the training data
cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy")

In [None]:
# predict the class labels for the test data with the SGD classifier
y_pred = sgd_clf.predict(X_test)

# compute and print the confusion matrix
confusion_mat = confusion_matrix(y_test, y_pred)

print(confusion_mat)

# Print classification report
target_names = ['not 5', '5']

result_metrics = classification_report(y_test, y_pred, target_names=target_names)

print(result_metrics)

# the same report as a dictionary, so individual metrics can be looked up by class name
results = classification_report(y_test, y_pred, target_names=target_names, output_dict=True)

print(results)
print('\nClass not 5: precision is', results['not 5']['precision'])

#### using KFold 

In [None]:
from sklearn.model_selection import KFold # import k-fold validation

# Define the split: 3 folds, shuffled, with no fixed random_state
kf = KFold(n_splits=3, random_state=None, shuffle=True)

kf

In [None]:
sgd_clf = SGDClassifier(random_state=42)

target_names = ['not 5', '5']

# per-fold precision/recall of class 'not 5' (c0) and class '5' (c1)
c0_precisions = []
c0_recalls = []
c1_precisions = []
c1_recalls = []
for train_index, test_index in kf.split(X):

    X_train, y_train = X[train_index], y_b[train_index]
    X_test, y_test = X[test_index], y_b[test_index]

    # fit on this fold's training part and predict its held-out part
    sgd_clf.fit(X_train, y_train)
    y_pred = sgd_clf.predict(X_test)

    print(confusion_matrix(y_test, y_pred))

    # Get a dictionary of performance results and access the results
    results = classification_report(y_test, y_pred, target_names=target_names, output_dict=True)

    # example
    c0_precisions.append(results['not 5']['precision'])
    c0_recalls.append(results['not 5']['recall'])
    c1_precisions.append(results['5']['precision'])
    c1_recalls.append(results['5']['recall'])

# Average over the number of folds actually evaluated instead of a hard-coded 3,
# so the means stay correct if the number of splits in kf is changed.
print('not 5: precision', sum(c0_precisions) / len(c0_precisions))
print('not 5: recall', sum(c0_recalls) / len(c0_recalls))

print('5: precision', sum(c1_precisions) / len(c1_precisions))
print('5: recall', sum(c1_recalls) / len(c1_recalls))

#### Decision tree classifier

https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

In [None]:
from sklearn.tree import DecisionTreeClassifier

# decision tree with the gini criterion, limited to depth 10
tree_clf = DecisionTreeClassifier(criterion='gini', max_depth=10, random_state=42)
target_names = ['not 5', '5']

# for every fold: train on the training part, then report on the held-out part
for fold_train, fold_test in kf.split(X):
    X_train, y_train = X[fold_train], y_b[fold_train]
    X_test, y_test = X[fold_test], y_b[fold_test]

    y_pred = tree_clf.fit(X_train, y_train).predict(X_test)

    # confusion matrix followed by the classification report of this fold
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred, target_names=target_names))

#### The following shows the verification of results (decision tree classifier above)

- The last cross validation result

In [None]:
from sklearn.metrics import precision_score

# precision for y_test / y_pred as left by the last iteration of the loop above
precision_score(y_test, y_pred)

In [None]:
# precision calculation from confusion matrix: TP / (TP + FP)
# The counts are read from the matrix itself rather than typed in by hand,
# so the check stays valid when the shuffled folds differ between runs.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
p = tp / (tp + fp)
p

In [None]:
from sklearn.metrics import precision_score, recall_score
# recall for the same y_test / y_pred
recall_score(y_test, y_pred)

In [None]:
# recall calculation from confusion matrix: TP / (TP + FN)
# The counts are read from the matrix itself rather than typed in by hand,
# so the check stays valid when the shuffled folds differ between runs.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
r = tp / (tp + fn)
r

In [None]:
from sklearn.metrics import f1_score

# F1 score for the same y_test / y_pred
f1_score(y_test, y_pred)

In [None]:
# F1 computed by hand from the precision p and recall r obtained above
f1 = 2*p*r / (p+r)
f1

#### precision vs. recall trade-off

- When SGD classifier is used

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html

In [None]:
# take the first sample again and show its class label
some_digit = X[0]
y[0]

In [None]:
# decision-function score of the SGD classifier for this single digit
y_score = sgd_clf.decision_function([some_digit])
y_score

In [None]:
# apply a manually chosen threshold to the score
threshold = 8000

# the prediction is positive only when the score exceeds the threshold
y_some_digit_pred = (y_score > threshold)

y_some_digit_pred

In [None]:
from sklearn.model_selection import cross_val_predict

# decision-function scores from 3-fold cross-validated prediction on X_test / y_test
y_scores = cross_val_predict(sgd_clf, X_test, y_test, cv=3,
                             method="decision_function")

y_scores

In [None]:
from sklearn.metrics import precision_recall_curve

# precision and recall values with the matching thresholds, computed from the scores above
precisions, recalls, thresholds = precision_recall_curve(y_test, y_scores)

In [None]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision (dashed blue) and recall (solid green) against the threshold.

    The last element of ``precisions`` and of ``recalls`` is left out so that
    both series are plotted against ``thresholds``. The axes are fixed to
    thresholds in [-50000, 50000] and values in [0, 1].
    """
    curves = ((precisions, "b--", "Precision"), (recalls, "g-", "Recall"))
    for values, line_format, name in curves:
        plt.plot(thresholds, values[:-1], line_format, label=name, linewidth=2)
    plt.legend(loc="center right", fontsize=16)
    plt.xlabel("Threshold", fontsize=16)
    plt.grid(True)
    plt.axis([-50000, 50000, 0, 1])



# recall and threshold at the first position where precision reaches at least 0.90
# NOTE(review): thresholds is plotted against precisions[:-1], i.e. it appears to be one
# element shorter than precisions; if only the last precision value reaches 0.90 the
# thresholds lookup below could be out of range - confirm
recall_90_precision = recalls[np.argmax(precisions >= 0.90)]
threshold_90_precision = thresholds[np.argmax(precisions >= 0.90)]


plt.figure(figsize=(8, 4))                                                                  # Not shown
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)

# red dotted guide lines and red dots marking the 90%-precision threshold on both curves
plt.plot([threshold_90_precision, threshold_90_precision], [0., 0.9], "r:")                 # Not shown
plt.plot([-50000, threshold_90_precision], [0.9, 0.9], "r:")                                # Not shown
plt.plot([-50000, threshold_90_precision], [recall_90_precision, recall_90_precision], "r:")# Not shown
plt.plot([threshold_90_precision], [0.9], "ro")                                             # Not shown
plt.plot([threshold_90_precision], [recall_90_precision], "ro")                             # Not shown
save_fig("precision_recall_vs_threshold_plot")                                              # Not shown
plt.show()

In [None]:
def plot_precision_vs_recall(precisions, recalls):
    """Plot precision (y axis) against recall (x axis), both axes fixed to [0, 1]."""
    plt.plot(recalls, precisions, "b-", linewidth=2)
    for set_label, text in ((plt.xlabel, "Recall"), (plt.ylabel, "Precision")):
        set_label(text, fontsize=16)
    unit_range = [0, 1]
    plt.axis(unit_range + unit_range)
    plt.grid(True)

plt.figure(figsize=(8, 6))
plot_precision_vs_recall(precisions, recalls)

# Mark the 90%-precision operating point with the recall computed from the curve
# (recall_90_precision) instead of a hard-coded 0.60, so the marker lies on the
# curve for the data actually used.
plt.plot([recall_90_precision, recall_90_precision], [0., 0.9], "r:")
plt.plot([0.0, recall_90_precision], [0.9, 0.9], "r:")
plt.plot([recall_90_precision], [0.9], "ro")
save_fig("precision_vs_recall_plot")
plt.show()

In [None]:
# threshold at the first position where precision reaches at least 0.90
threshold_90_precision = thresholds[np.argmax(precisions >= 0.90)]

In [None]:
# display the selected threshold
threshold_90_precision

In [None]:
# predictions obtained by applying the selected threshold to the scores
y_pred_90 = (y_scores >= threshold_90_precision)

In [None]:
# precision of the thresholded predictions
precision_score(y_test, y_pred_90)

In [None]:
# recall of the thresholded predictions
recall_score(y_test, y_pred_90)

# ROC curves and scores

#### scikit-learn library

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html

In [None]:
from sklearn.metrics import roc_curve

# false positive rates, true positive rates and thresholds for the ROC curve
# note: this rebinds `thresholds`, which previously held the precision/recall thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_scores)

In [None]:
def plot_roc_curve(fpr, tpr, label=None):
    """Plot a ROC curve (TPR against FPR) with a dashed diagonal, axes fixed to [0, 1]."""
    unit_range = [0, 1]
    plt.plot(fpr, tpr, label=label, linewidth=2)
    plt.plot(unit_range, unit_range, 'k--')  # dashed diagonal
    plt.axis(unit_range + unit_range)
    axis_titles = (
        (plt.xlabel, 'False Positive Rate (Fall-Out)'),
        (plt.ylabel, 'True Positive Rate (Recall)'),
    )
    for set_label, text in axis_titles:
        set_label(text, fontsize=16)
    plt.grid(True)

plt.figure(figsize=(8, 6))
plot_roc_curve(fpr, tpr)

# Mark the operating point that corresponds to the 90%-precision threshold.
# Its coordinates are derived from the curve (the first point whose TPR reaches
# recall_90_precision) instead of hard-coded values, so the marker matches the
# data actually used.
fpr_90 = fpr[np.argmax(tpr >= recall_90_precision)]
plt.plot([fpr_90, fpr_90], [0., recall_90_precision], "r:")
plt.plot([0.0, fpr_90], [recall_90_precision, recall_90_precision], "r:")
plt.plot([fpr_90], [recall_90_precision], "ro")
save_fig("roc_curve_plot")
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

# ROC AUC score computed from the same scores
roc_auc_score(y_test, y_scores)

In [None]:
# 3-fold cross-validated predictions on the training data
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train, cv=3)

# precision of those predictions
precision_score(y_train, y_train_pred)

In [None]:
# recall of the cross-validated training predictions
recall_score(y_train, y_train_pred)

# Multiclass classification

- 10 digits

In [None]:
# split the first 1000 samples again, this time with the digit labels y (not the binary y_b)
X_train, X_test, y_train, y_test = train_test_split(X[:1000], y[:1000], test_size=0.30)

In [None]:
# accuracy from 3-fold cross-validation on the multiclass training data
cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy")

In [None]:
# fit on the multiclass training data and predict the test data
sgd_clf.fit(X_train, y_train)

y_pred = sgd_clf.predict(X_test)
print(y_pred)

# compute and print the confusion matrix
confusion_mat = confusion_matrix(y_test, y_pred)
print(confusion_mat)

# Print classification report, one name per digit class
target_names = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

result_metrics = classification_report(y_test, y_pred, target_names=target_names)

print(result_metrics)


### How normalization affects the model performance

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# learn the scaling statistics from the training data only ...
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
# ... and apply those same statistics to the test data. Using transform (not
# fit_transform) here matters: refitting on the test set would scale it
# differently from the data the model is trained on.
X_test_scaled = scaler.transform(X_test.astype(np.float64))

# accuracy from 3-fold cross-validation on the scaled training data
cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring="accuracy")

In [None]:
# cross_val_score does not fit sgd_clf itself, so at this point sgd_clf is still
# the model trained on the unscaled data. Refit it on the scaled training set
# before predicting on the scaled test set.
sgd_clf.fit(X_train_scaled, y_train)
y_pred_norm = sgd_clf.predict(X_test_scaled)

# compute and print the confusion matrix
confusion_mat = confusion_matrix(y_test, y_pred_norm)

print(confusion_mat)

# Print classification report, one name per digit class
target_names = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

result_metrics = classification_report(y_test, y_pred_norm, target_names=target_names)

print(result_metrics)