We start with the same preprocessing steps as in Q5.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import numpy.linalg as la
import sklearn.metrics as met
import scipy.linalg as scipy

In [3]:
# Read the MNIST training data; column 0 of the CSV holds the digit label.
# NOTE(review): assumes the remaining CSV columns are the raw pixel values - confirm.
df = pd.read_csv('data/mnist_train.csv')
# Insert a constant bias feature immediately after the label column.
df.insert(1, 'bias', 1.0)

# Feature matrix: bias in column 0, pixel columns from column 1 onward.
# dtype=float makes the in-place division below well defined.
X = df.iloc[:, 1:].to_numpy(dtype=float)
# Scale every pixel column by 1/255 (values then lie between 0 and 1).
# The slice starts at 1 so only the bias column is skipped; the previous
# slice `2:` also skipped the first pixel column and left it unscaled.
X[:, 1:] /= 255.0
# Digit labels (first CSV column).
Y = df.iloc[:, 0].to_numpy()

In [4]:
# Fraction of samples in which each feature column is non-zero.
column_percentages = (X != 0).mean(axis=0)
# Boolean mask of the columns that are non-zero in more than 10% of the
# samples (columns that are zero at least 90% of the time are dropped).
selected_columns = column_percentages > 0.1
# Keep only the selected columns and report the reduced shape.
X = X[:, selected_columns]
print(np.shape(X))

(60000, 344)


Below we generate a random matrix with entries drawn from {-1, +1} and multiply it by the transpose of X. We then apply an elementwise max(0, ·) so that only positive correlations are kept.

In [5]:
# Random sign matrix used to build the extra features: 5000 rows of +/-1
# entries, one column per retained input feature.
# A seeded Generator makes the random features reproducible after a kernel
# restart. The column count is read from X instead of being hard-coded, so it
# stays in sync with the column-selection step.
rng = np.random.default_rng(0)
Rand = rng.choice([-1, 1], size=(5000, X.shape[1]))

In [6]:
# Project every sample onto the random sign vectors: RX has one row per
# random feature and one column per row of X.
RX = np.matmul(Rand, X.T)
print(RX)

[[-5.43529412 11.84313725  5.97647059 ... -3.30588235  9.85098039
  12.33333333]
 [11.77254902  9.4745098   9.34901961 ...  3.79215686 17.01960784
   6.75686275]
 [-0.97254902 -1.12941176 -6.02352941 ...  5.38431373 -1.5372549
  -2.76470588]
 ...
 [ 0.38431373 20.18039216  9.70196078 ... -5.6745098   4.26666667
  17.95686275]
 [ 4.29019608 -8.61176471  8.87843137 ...  3.16470588 -7.43529412
   1.35294118]
 [12.76862745  9.63921569 -2.41568627 ...  2.09019608  7.5372549
  24.09803922]]


In [7]:
# Elementwise max(0, .): negative projections are replaced by zero.
RX = np.maximum(0.0, RX)
print(RX)

[[ 0.         11.84313725  5.97647059 ...  0.          9.85098039
  12.33333333]
 [11.77254902  9.4745098   9.34901961 ...  3.79215686 17.01960784
   6.75686275]
 [ 0.          0.          0.         ...  5.38431373  0.
   0.        ]
 ...
 [ 0.38431373 20.18039216  9.70196078 ...  0.          4.26666667
  17.95686275]
 [ 4.29019608  0.          8.87843137 ...  3.16470588  0.
   1.35294118]
 [12.76862745  9.63921569  0.         ...  2.09019608  7.5372549
  24.09803922]]


We concatenate these new features with our original feature matrix and then fit a least-squares binary classifier (digit 0 vs. everything else), like the ones in Q5.

In [8]:
# Augmented feature matrix: the selected columns of X followed by the
# random features (RX transposed so that rows are samples).
result_matrix = np.hstack((X, RX.T))

In [9]:
# Binary targets for the "is this digit a 0?" classifier: +1 where the label
# is 0, -1 otherwise. Vectorised with np.where instead of a Python-level loop
# over every label.
label_train = np.where(Y == 0, 1, -1)

In [10]:
# Economy-size QR factorisation of the augmented feature matrix
# (result_matrix = Q @ R, with R upper triangular).
# check_finite=False skips SciPy's NaN/inf validation of the input.
# NOTE: `scipy` is this notebook's alias for scipy.linalg, not the top-level package.
Q, R = scipy.qr(result_matrix,mode='economic',check_finite=False)

In [11]:
# Solve the triangular system R w = Q^T y for the classifier weights.
rhs = Q.T @ label_train
weights = scipy.solve_triangular(R, rhs)

In [12]:
# One weight per column of the augmented feature matrix.
print(np.shape(weights))

(5344,)


In [13]:
# Real-valued classifier scores for every training sample.
predictions = np.matmul(result_matrix, weights)

In [14]:
# Show the shape of the score vector, then the scores themselves.
print(predictions.shape, predictions, sep="\n")

(60000,)
[-1.02357496  1.12024667 -1.13545865 ... -0.98007273 -0.92337733
 -0.98308799]


In [15]:
# Threshold the scores at zero: strictly positive -> +1 (digit 0), everything
# else -> -1. Vectorised with np.where instead of a Python-level loop.
classifications = np.where(predictions > 0.0, 1, -1)

In [16]:
# Display the predicted +/-1 labels for the training samples as a quick sanity check.
print(classifications)

[-1  1 -1 ... -1 -1 -1]


We observe a training error rate of about 0.2%. This is the lowest error rate computed so far for any of the binary classifiers written for our MNIST questions. It seems these features capture some latent information in the image that gives more information than the standard pixel values alone.

In [17]:
# Confusion matrix (with row/column totals) of actual vs. predicted labels
# on the training set.
df_confusion = pd.crosstab(
    label_train,
    classifications,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
print(df_confusion)

# Training error rate: fraction of samples whose predicted label is wrong.
print("Error rate is ", np.mean(label_train != classifications))

Predicted     -1     1    All
Actual                       
-1         54051    26  54077
1            111  5812   5923
All        54162  5838  60000
Error rate is  0.0022833333333333334


In [18]:
# Read the MNIST test data; column 0 of the CSV holds the digit label.
# NOTE(review): assumes the remaining CSV columns are the raw pixel values - confirm.
df = pd.read_csv('data/mnist_test.csv')
# Insert a constant bias feature immediately after the label column, so the
# test features are laid out like the training features.
df.insert(1, 'bias', 1.0)

# Feature matrix: bias in column 0, pixel columns from column 1 onward.
# dtype=float makes the in-place division below well defined.
X = df.iloc[:, 1:].to_numpy(dtype=float)
# Scale every pixel column by 1/255 (values then lie between 0 and 1).
# The slice starts at 1 so only the bias column is skipped; the previous
# slice `2:` also skipped the first pixel column and left it unscaled.
X[:, 1:] /= 255.0
# Digit labels (first CSV column).
Y = df.iloc[:, 0].to_numpy()

In [19]:
# Apply the column mask computed earlier (`selected_columns`) so the test
# features use the same columns as the fitted weights.
X = X[:, selected_columns]
print(np.shape(X))

(10000, 344)


In [20]:
# Project the test samples onto the existing random sign matrix `Rand`.
RX = np.matmul(Rand, X.T)
print(RX)


[[10.43921569 -8.68235294  1.24705882 ...  2.21960784  3.34117647
   9.36078431]
 [-0.03921569 10.10980392  6.45490196 ... 14.47843137 11.6627451
  29.34509804]
 [ 9.49019608 -7.96078431 -7.29411765 ...  3.45882353 -3.29411765
   2.00392157]
 ...
 [ 8.79215686  1.45882353 -2.24313725 ... 12.84705882 11.1372549
   6.14509804]
 [-2.6745098  -1.75686275  8.03921569 ... 11.76470588  2.
  -6.60784314]
 [ 3.19215686  3.9372549   9.70980392 ...  9.31764706  8.94117647
  31.37647059]]


In [21]:
# Elementwise max(0, .) on the test projections, then append the resulting
# random features to the selected test columns (rows are samples).
RX = np.maximum(0.0, RX)
result_matrix = np.hstack((X, RX.T))
print(RX)

[[10.43921569  0.          1.24705882 ...  2.21960784  3.34117647
   9.36078431]
 [ 0.         10.10980392  6.45490196 ... 14.47843137 11.6627451
  29.34509804]
 [ 9.49019608  0.          0.         ...  3.45882353  0.
   2.00392157]
 ...
 [ 8.79215686  1.45882353  0.         ... 12.84705882 11.1372549
   6.14509804]
 [ 0.          0.          8.03921569 ... 11.76470588  2.
   0.        ]
 [ 3.19215686  3.9372549   9.70980392 ...  9.31764706  8.94117647
  31.37647059]]


In [22]:
# Binary targets for the test set: +1 where the label is 0, -1 otherwise.
# Vectorised with np.where instead of a Python-level loop over every label.
label_test = np.where(Y == 0, 1, -1)

In [23]:
# Real-valued classifier scores for every test sample, using the weights
# computed earlier in the notebook.
predictions_test = np.matmul(result_matrix, weights)

In [24]:
# Show the shape of the test score vector and the scores themselves.
print(predictions_test.shape)
print(predictions_test)
# Threshold the scores at zero: strictly positive -> +1 (digit 0), everything
# else -> -1. Vectorised with np.where instead of a Python-level loop.
classifications_test = np.where(predictions_test > 0.0, 1, -1)
print(classifications_test)

(10000,)
[-0.94851943 -0.79179232 -1.05804777 ... -0.91271914 -1.0789264
 -0.98047174]
[-1 -1 -1 ... -1 -1 -1]


We observe that the test error rate is almost identical to the training error rate. This suggests that our model generalizes and that the features we generated carry information about unseen data as well. This classifier did far better than our Q5 binary classifier for 0, which had an error rate of 0.0182 (screenshot provided in submission). That is roughly a 7x reduction in error rate from adding our features.

In [25]:
# Confusion matrix (with row/column totals) of actual vs. predicted labels
# on the test set.
df_confusion = pd.crosstab(
    label_test,
    classifications_test,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)
print(df_confusion)

# Test error rate: fraction of samples whose predicted label is wrong.
print("Error rate is ", np.mean(label_test != classifications_test))

Predicted    -1    1    All
Actual                     
-1         9014    6   9020
1            19  961    980
All        9033  967  10000
Error rate is  0.0025
