In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import numpy.linalg as la
import sklearn.metrics as met
import scipy.linalg as scipy

Here we add a constant column of ones to the data so that the bias term is learned as one more entry of each weight vector.

In [2]:
# Read the MNIST training data (column 0 is used as the label below).
df = pd.read_csv('data/mnist_train.csv')
# Constant feature so the bias is learned as one more weight.
df.insert(1, 'bias', 1.0)

# Feature matrix: column 0 is the bias, columns 1 onwards are the pixels.
X = df.iloc[:, 1:].to_numpy()
# Scale every pixel column to lie between 0 and 1. The slice starts at 1 so
# that only the bias column is skipped; starting at 2 would leave the first
# pixel column unscaled.
X[:, 1:] /= 255.0
Y = df.iloc[:, 0].to_numpy()

In this step we filter out the columns that are 0 for more than 90% of the entries.

In [3]:
# Fraction of samples in which each column of X holds a non-zero value.
column_percentages = np.count_nonzero(X, axis=0) / X.shape[0]
# A column is kept only when it is non-zero in more than 10% of the samples.
selected_columns = np.greater(column_percentages, 0.1)
# Training matrix restricted to the retained columns.
cleaned_train = X[:, selected_columns]
print(np.shape(cleaned_train))

(60000, 344)


Here we create the label vectors for each one of our classifiers

In [4]:
# One +1/-1 target vector per digit: +1 where the sample is that digit and -1
# everywhere else. Digits are visited in their order of first appearance in
# the label column, which fixes the order of the entries in this list.
# np.where does the comparison in one vectorised pass instead of a Python
# loop over every sample for every digit.
list_train = [np.where(Y == digit, 1, -1) for digit in df['label'].unique()]


Here we map the index of the classifier with the highest confidence to its associated digit label.

In [5]:
# Digit recognised by each one-vs-rest classifier, indexed by the classifier's
# position in the weight list (classifier 0 detects 5, classifier 1 detects 0,
# and so on). The weights passed to get_classification must follow this order.
CLASSIFIER_DIGITS = (5, 0, 4, 1, 9, 2, 3, 6, 7, 8)


def map_to_digit(number):
    """Return the digit handled by classifier index ``number``.

    Parameters
    ----------
    number : int
        Position of a classifier in the weight list.

    Returns
    -------
    int
        The digit that classifier detects, or -1 when ``number`` is not a
        known classifier index.
    """
    mapping = dict(enumerate(CLASSIFIER_DIGITS))
    return mapping.get(number, -1)


def get_classification(data, weights):
    """Predict a digit for every row of ``data``.

    Each row is scored by every classifier and assigned the digit of the
    classifier with the highest score.

    Parameters
    ----------
    data : ndarray of shape (n_samples, n_features)
        Feature matrix, one sample per row.
    weights : sequence of ndarray of shape (n_features,)
        One weight vector per classifier, ordered as in CLASSIFIER_DIGITS.

    Returns
    -------
    ndarray of shape (n_samples,)
        Predicted digit per sample; -1 where the winning classifier index has
        no entry in CLASSIFIER_DIGITS.
    """
    scores = data @ np.vstack(weights).T
    guesses = np.argmax(scores, axis=1)
    # Translate classifier indices to digits with a single table lookup
    # rather than calling map_to_digit once per sample. This also works when
    # there are zero samples.
    lookup = np.asarray(CLASSIFIER_DIGITS)
    in_table = guesses < lookup.size
    # Clamp before indexing so out-of-table indices cannot raise; those
    # positions are replaced by -1 afterwards.
    clamped = np.minimum(guesses, lookup.size - 1)
    return np.where(in_table, lookup[clamped], -1)

This is the step where we perform the QR decomposition of the training matrix and use it to obtain the least-squares weights for each one of our classifiers.

In [6]:
# QR decomposition of the training matrix: cleaned_train = Q @ R.
Q, R = np.linalg.qr(cleaned_train)
# For each label vector, solve the triangular system R w = Q^T y to get the
# weights of the corresponding classifier.
weight_list = [scipy.solve_triangular(R, Q.T @ label) for label in list_train]

Here we obtain our classifications and compute the confusion matrix. Our error rate here is about 15%, which suggests the model classifies the training data reasonably well. Now let us see how it does on the testing data.

In [7]:
# Predict a digit for every training sample from the fitted weights.
training_classifications = get_classification(cleaned_train, weight_list)
# Confusion matrix: rows are the true digits, columns the predicted digits,
# with row/column totals added by margins=True.
df_confusion = pd.crosstab(
    index=Y,
    columns=training_classifications,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
print(df_confusion)


# Share of training samples whose prediction differs from the true label.
print("Error rate is ", np.mean(Y != training_classifications))

Predicted     0     1     2     3     4     5     6     7     8     9    All
Actual                                                                      
0          5578     6    32    22    21    88   104    10    58     4   5923
1             2  6536    44    16     6    38     6    21    66     7   6742
2            82   301  4729   131   132    16   168   171   195    33   5958
3            36   181   193  5049    24   121    82   167   140   138   6131
4             8   108    50     3  5088    34    98    42    75   336   5842
5           190   112    34   498   131  3718   260    95   242   141   5421
6            85    88    50     5    55    89  5518     2    23     3   5918
7            64   190    49    44   129     9    10  5536    17   217   6265
8            57   489    64   227    88   155   103    49  4446   173   5851
9            78    83    36   107   374    29    16   396    46  4784   5949
All        6180  8094  5281  6102  6048  4297  6365  6489  5308  5836  60000

We repeat the same steps for our testing data

In [8]:
# Read the MNIST test data and prepare it the same way as the training data.
df2 = pd.read_csv('data/mnist_test.csv')
# Constant feature matching the bias column added to the training data.
df2.insert(1, 'bias', 1.0)

# Feature matrix: column 0 is the bias, columns 1 onwards are the pixels.
X_test = df2.iloc[:, 1:].to_numpy()
# Scale every pixel column to lie between 0 and 1. The slice starts at 1 so
# that only the bias column is skipped; starting at 2 would leave the first
# pixel column unscaled.
X_test[:, 1:] /= 255.0
Y_test = df2.iloc[:, 0].to_numpy()

In [9]:
# Keep exactly the feature columns that were retained for the training set,
# so the test matrix lines up with the fitted weights.
cleaned_test = X_test[:, selected_columns]
print(np.shape(cleaned_test))

(10000, 344)


Here we observed an error rate of 14.4%. This is surprising, as the model performed slightly better on the testing data than on the training data, but the two error rates are roughly similar. This indicates our model did not overfit, which is good.

In [10]:
# Predict a digit for every test sample using the weights fitted on the
# training data.
tst_class = get_classification(cleaned_test, weight_list)
# Confusion matrix: rows are the true digits, columns the predicted digits,
# with row/column totals added by margins=True.
df_confusion2 = pd.crosstab(
    index=Y_test,
    columns=tst_class,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
print(df_confusion2)


# Share of test samples whose prediction differs from the true label.
print("Error rate is ", np.mean(Y_test != tst_class))

Predicted     0     1    2     3     4    5     6     7    8    9    All
Actual                                                                  
0           936     1    3     2     1   12    17     2    5    1    980
1             0  1099    3     3     2    1     5     3   19    0   1135
2            18    59  822    19    16    1    23    34   36    4   1032
3             7    21   26   863     5   16    12    32   18   10   1010
4             1    24    5     1   864    3    16     3   10   55    982
5            23    16    4    82    27  610    38    31   42   19    892
6            15    10    5     0    19   14   892     1    2    0    958
7             4    42   15     6    16    0     3   899    0   43   1028
8            14    50   10    34    19   22    26    18  757   24    974
9            19    13    7    12    70    5     4    57    7  815   1009
All        1037  1335  900  1022  1039  684  1036  1080  896  971  10000
Error rate is  0.1443


In [11]:
#print(weight_list[1])
# +1/-1 targets on the test set for the digit 0: +1 where the label is 0 and
# -1 elsewhere, built with one vectorised comparison instead of a Python-level
# loop over every label.
newY = np.where(Y_test == 0, 1, -1)

[-7.41738431e-01 -4.47561204e-02  1.57613190e-02  2.92590027e-02
 -3.93377988e-02  2.77487481e-02 -3.11306992e-02 -1.22409507e-02
 -4.54178206e-02 -3.01630712e-03 -5.98146399e-02 -8.23959563e-02
 -8.06097991e-03  6.46713148e-04 -2.24318270e-02  3.13106751e-02
  5.94932211e-04  1.61432524e-02  1.17897664e-02  1.56509766e-02
 -4.17891929e-05  2.28552504e-02  7.97734181e-03 -6.74672232e-02
 -1.40812316e-02  2.00630290e-02  1.13748734e-03 -1.51752608e-02
 -7.14412335e-04  7.07339537e-03  2.94859972e-02  1.73632499e-02
  6.00248178e-03  2.47610234e-02  1.07122495e-02  2.44609048e-02
  1.53369213e-03  4.29942195e-03  5.32495106e-02 -7.90213807e-02
  1.85818510e-02 -2.25485636e-02  3.51359259e-02 -1.45260412e-02
  1.82861033e-02  1.92334327e-02  1.80033481e-02  3.42737646e-02
  2.98748796e-02  1.41713595e-03  3.60368860e-02  3.04466877e-02
  1.18512690e-02 -1.76865316e-02 -3.40373433e-02  9.53718320e-02
 -1.85399712e-01  1.73573948e-02 -7.00988063e-03  5.09844727e-03
  4.38805957e-03  2.19388