# Machine Learning Models - Chad R. - K-Nearest Neighbors & Gaussian Naive Bayes for Optical Character Recognition

## K-Nearest Neighbor

 Importing all the necessary libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, metrics, svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [2]:
# Load scikit-learn's bundled digits dataset; `digits.data` (features) and
# `digits.target` (labels) feed the train/test split in the next cell.
digits = datasets.load_digits()

In [3]:
# Hold out half of the samples for testing. An explicit integer seed keeps the
# split reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, test_size=0.5, random_state=0
)

This declares the training and testing variables. The test size is set to half of the dataset, so the other half becomes the training set.

In [4]:
# Baseline configuration: collect predictions in a fresh list and vote with a
# single nearest neighbour. Later cells loop over other values of k.
predictions = []
k = 1

I only set k = 1 here as a baseline test. This value is replaced in parts 1 and 2 below.

In [5]:
# Start from an empty list inside this cell so that re-running it does not
# append a second batch of predictions on top of the first.
predictions = []
for measures in X_test:
    # Squared Euclidean distance from this test sample to every training sample.
    distance = np.sum((X_train - measures) ** 2, axis=1)
    # Indices of the k closest training samples.
    neighbor_idxs = np.argsort(distance)[:k]
    neighbor_classes = y_train[neighbor_idxs]
    # Majority vote: count each label among the neighbours and take the most frequent.
    pred_class = np.bincount(neighbor_classes).argmax()
    predictions.append(pred_class)

In [6]:
# Fraction of test samples whose predicted label equals the true label.
accuracy = np.mean(np.asarray(predictions) == y_test)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.98


A function to make parts 1 and 2 easier to calculate.

In [7]:
def knn(X_train, y_train, X_test, k):
    """Classify each row of ``X_test`` by majority vote among its ``k`` nearest training rows.

    Distances are squared Euclidean distances between feature rows.

    Parameters
    ----------
    X_train : array-like, shape (n_train, n_features)
        Training feature rows.
    y_train : array-like of non-negative ints, shape (n_train,)
        Training labels. They must be non-negative integers because the vote
        is tallied with ``np.bincount``.
    X_test : array-like, shape (n_test, n_features)
        Rows to classify.
    k : int
        Number of neighbours that vote. If ``k`` exceeds ``n_train`` every
        training row votes.

    Returns
    -------
    numpy.ndarray
        One predicted label per row of ``X_test``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    # A negative k would silently turn the [:k] slice into "all but the last |k|",
    # so reject anything below one neighbour up front.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    # Accept plain lists as well as arrays.
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    X_test = np.asarray(X_test)

    predictions = []
    for measures in X_test:
        # Squared distance from this test row to every training row.
        distance = np.sum((X_train - measures) ** 2, axis=1)
        neighbor_idxs = np.argsort(distance)[:k]
        neighbor_classes = y_train[neighbor_idxs]
        # Most frequent label among the neighbours.
        pred_class = np.bincount(neighbor_classes).argmax()
        predictions.append(pred_class)
    return np.array(predictions)

### Differing k Values

The block below runs the `knn` function for each value of k and reports the test accuracy for each.

In [8]:
# Evaluate the classifier at several neighbourhood sizes and keep one
# (k, accuracy) pair per setting.
k_vals = [1, 3, 5, 100, 500]
results = []
for k in k_vals:
    guess = knn(X_train, y_train, X_test, k)
    accuracy = (guess == y_test).mean()
    results += [(k, accuracy)]

# Report the sweep as a two-column listing.
print("k  Accuracy")
for k, acc in results:
    print(f"{k} {acc:.4f}")

k  Accuracy
1 0.9822
3 0.9789
5 0.9778
100 0.8754
500 0.5139


### Classification Matrix

In [11]:
k_vals = [1, 3, 5, 100, 500]
results = []
# Row/column captions for the ten digit classes; built once because they do
# not depend on k.
num_labels = [str(i) for i in range(10)]
for k in k_vals:
    guess = knn(X_train, y_train, X_test, k)
    accuracy = np.mean(guess == y_test)
    results.append((k, accuracy))
    # confusion_matrix(y_true, y_pred): rows are true labels, columns are predictions.
    classmat = confusion_matrix(y_test, guess)
    print(f"\nClassification Matrix for k={k}:")
    # Four leading spaces = 3-character row label + 1 separator, so every
    # header digit sits directly above its right-aligned column of counts.
    print("    " + " ".join(f"{n:>3}" for n in num_labels))
    for idx, row in enumerate(classmat):
        # The space after the label keeps it from running into the first count.
        print(f"{num_labels[idx]:>3} " + " ".join(f"{val:>3}" for val in row))
    


Classification Matric for k=1:
    0   1   2   3   4   5   6   7   8   9
  0 89   0   0   0   0   0   0   0   0   0
  1  0  90   0   0   0   0   0   0   0   0
  2  0   0  92   0   0   0   0   0   0   0
  3  0   0   0  92   0   0   0   0   0   1
  4  0   0   0   0  76   0   0   0   0   0
  5  0   1   0   1   1 102   1   0   0   2
  6  0   0   0   0   0   0  89   0   0   0
  7  0   0   0   0   0   0   0  78   0   0
  8  0   2   0   2   0   0   0   0  87   1
  9  0   0   0   2   0   2   0   0   0  88

Classification Matric for k=3:
    0   1   2   3   4   5   6   7   8   9
  0 89   0   0   0   0   0   0   0   0   0
  1  0  90   0   0   0   0   0   0   0   0
  2  0   1  91   0   0   0   0   0   0   0
  3  0   0   1  92   0   0   0   0   0   0
  4  0   0   0   0  75   0   0   1   0   0
  5  0   1   0   1   0 103   1   0   0   2
  6  0   0   0   0   0   0  89   0   0   0
  7  0   0   0   0   0   0   0  78   0   0
  8  0   5   0   1   0   0   1   0  84   1
  9  0   0   0   1   0   2   0   0 

Some of the most common errors in the KNN model are 2 being confused with 1 and 3; 3 with 2; 5 with 9; 8 with 1, 3, and 2; and 9 being confused with 3 and 5. This matches my intuition: these mistakes really only appear with higher k values, and the digits in each pair listed are somewhat similar to one another. For example, the way an 8 is written could be mistaken for a 1 by a computer analyzing a large number of its nearest neighbors.

## Gaussian Naive Bayes

This section works with the same data as the K-Nearest Neighbor section and reuses some of its variables.

In [12]:
# Empirical class priors: the share of training samples carrying each label.
classes, counts = np.unique(y_train, return_counts=True)
priors = np.true_divide(counts, len(y_train))

In [13]:
# Per-class, per-feature Gaussian parameters: one row per class, one column per feature.
means = np.zeros((len(classes), X_train.shape[1]))
variances = np.zeros_like(means)

for row, label in enumerate(classes):
    members = X_train[y_train == label]
    means[row] = members.mean(axis=0)
    # The small constant keeps every variance strictly positive, so features
    # that never vary within a class cannot cause a division by zero later.
    variances[row] = members.var(axis=0) + 1e-6

In [14]:
def gaussian_likelihood(x, mean, var):
    """Evaluate the normal density N(mean, var) at ``x``.

    Works element-wise, so ``x``, ``mean`` and ``var`` may be scalars or
    arrays that broadcast against each other.
    """
    squared_dev = (x - mean) ** 2
    density_kernel = np.exp(-squared_dev / (2 * var))
    normaliser = 1.0 / np.sqrt(2.0 * np.pi * var)
    return normaliser * density_kernel

In [15]:
def predict_naive_bayes(x):
    """Return the most probable class label for one sample under Gaussian naive Bayes.

    Uses the notebook-level ``classes``, ``priors``, ``means`` and
    ``variances`` computed from the training set, and scores each class by
    log prior plus the summed per-feature log likelihoods.

    Parameters
    ----------
    x : numpy.ndarray, shape (n_features,)
        Feature vector of the sample to classify.

    Returns
    -------
    The entry of ``classes`` with the highest log posterior.
    """
    log_posteriors = []
    for class_idx in range(len(classes)):
        log_prior = np.log(priors[class_idx])
        # The 1e-9 floor keeps log() finite when a likelihood underflows to 0.
        log_likelihood = np.sum(
            np.log(gaussian_likelihood(x, means[class_idx], variances[class_idx]) + 1e-9)
        )
        log_posteriors.append(log_prior + log_likelihood)
    # argmax gives a position in `classes`; map it back to the actual label so
    # the result is correct even when labels are not 0..n-1.
    return classes[np.argmax(log_posteriors)]

### Overall Accuracy

In [16]:
# Classify every test sample, then score the share of exact label matches.
nb_preds = np.array(list(map(predict_naive_bayes, X_test)))
accuracy = (nb_preds == y_test).mean()
print(f"Naive Bayes accuracy: {accuracy:.4f}")

Naive Bayes accuracy: 0.8587


### Classification Matrix

In [18]:
# confusion_matrix(y_true, y_pred): rows are true labels, columns are predictions.
classmat = confusion_matrix(y_test, nb_preds)
num_labels = [str(i) for i in range(10)]
# Compute the score here instead of reading the shared `accuracy` name, which
# the KNN cells also assign; this keeps the title correct whatever ran last.
nb_accuracy = np.mean(nb_preds == y_test)
print(f"\nGaussian Naive Bayes Classification Matrix (Accuracy: {nb_accuracy:.4f}):")
# Four leading spaces = 3-character row label + 1 separator, so every header
# digit sits directly above its right-aligned column of counts.
print("    " + " ".join(f"{n:>3}" for n in num_labels))
for idx, row in enumerate(classmat):
    print(f"{num_labels[idx]:>3} " + " ".join(f"{val:>3}" for val in row))


Gaussian Naive Bayes Classification Matrix (Accuracy: 0.8587):
     0   1   2   3   4   5   6   7   8   9
  0  89   0   0   0   0   0   0   0   0   0
  1   0  77   1   0   0   1   0   0   8   3
  2   0   7  57   6   0   0   0   1  21   0
  3   0   1   1  80   0   1   0   3   5   2
  4   0   2   0   0  68   0   1   4   1   0
  5   0   0   0   2   1  98   1   4   1   1
  6   1   2   0   0   0   1  85   0   0   0
  7   0   0   0   0   1   0   0  77   0   0
  8   0   4   0   3   0   3   0   1  81   0
  9   1   1   0  11   1   3   0   7   8  60


The errors are consistent with those of the KNN model, which showed similar confusions of 9 with 3 and 2 with 8. I believe the lower accuracy is because naive Bayes is not what we would commonly use for image classification; that would be KNN or a CNN. Even so, looking at the images the computer sees when it examines these handwritten digits, the conclusions it comes to make some sense given the poor quality of what it sees.