## Number classifier task

In [10]:
import numpy as np
import matplotlib.pyplot as plt
import scipy

In [3]:
#Helper extraction functions

def extract_data(path):
    """Read a comma-separated text file into a NumPy array of 16-bit integers.

    Args:
        path: Location of the file; handed directly to ``np.loadtxt``.

    Returns:
        ``np.ndarray`` of dtype ``np.int16`` holding the parsed values.
    """
    parsed = np.loadtxt(path, delimiter=',', dtype=np.int16)
    return parsed

def reshape_matrix(mat, side=28):
    """Reshape each row of a matrix into a square image.

    Args:
        mat: 2-D array-like with one flattened image per row; every row must
            hold exactly ``side * side`` values.
        side: Edge length of each image in pixels. Defaults to 28, the value
            that used to be hard-coded.

    Returns:
        A new ``np.ndarray`` of shape ``(n_rows, side, side)`` with the same
        dtype as the input. It is an independent copy, so modifying it leaves
        ``mat`` untouched. An input with zero rows yields shape
        ``(0, side, side)``.

    Raises:
        ValueError: If the rows do not contain ``side * side`` values.
    """
    # np.array copies its input, so the reshaped result never aliases `mat`.
    flat = np.array(mat)
    # Pass the row count explicitly so that an input with zero rows still
    # produces a well-defined (0, side, side) result.
    return flat.reshape(flat.shape[0], side, side)

In [4]:
# Load the four CSV files (presumably training/test vectors and their labels,
# judging by the file names). They are read in the listed order and unpacked
# into one variable each.
trainv, testv, trainlab, testlab = map(
    extract_data,
    (
        "numbers_data/trainv.csv",
        "numbers_data/testv.csv",
        "numbers_data/trainlab.csv",
        "numbers_data/testlab.csv",
    ),
)

# Optional image view of every row via reshape_matrix (currently disabled)
# reshapedTrainv = reshape_matrix(trainv)
# reshapedTestv = reshape_matrix(testv)

In [5]:
# Toy example: nearest-neighbour lookup on tiny data to explore the NumPy calls

# Four flattened 2x2 "images" that are already classified; row k is filled with k + 1
train = np.array([[value] * 4 for value in range(1, 5)])

# Class name of each training row, in the same order
label = np.array([str(value) for value in range(1, 5)])

# Flattened images we want to classify
test = np.array([[0] * 4, [1] * 4, [5] * 4])

print(train.shape, test.shape, sep="\n")

# Broadcasting: insert a length-1 axis in each operand so that every test row
# is subtracted from every training row. Result shape: (n_test, n_train, n_pixels).
diff = train[np.newaxis, :, :] - test[:, np.newaxis, :]
print("Difference", diff, sep="\n")

# Euclidean length along the pixel axis. Row r of the result holds the
# distances between test sample r and every training sample.
norm1 = np.linalg.norm(diff, axis=-1)
print("\nNorm 1", norm1, sep="\n")


# Per test sample, the position of the closest training sample
nn_idx = norm1.argmin(axis=1)
print("\nClosest training sample index", nn_idx, sep="\n")

# Translate those positions into class names
predicted_class = np.take(label, nn_idx)
print("\nClosest training sample class", predicted_class, sep="\n")

(2, 4)
(3, 4)
Difference
[[[ 1  1  1  1]
  [ 2  2  2  2]]

 [[ 0  0  0  0]
  [ 1  1  1  1]]

 [[-4 -4 -4 -4]
  [-3 -3 -3 -3]]]

Norm 1
[[2. 4.]
 [0. 2.]
 [8. 6.]]

Closest training sample index
[0 0 1]

Closest training sample class
['1' '1' '2']


In [29]:
# Classify test samples with a 1-nearest-neighbour search over the training
# set and print the fraction that was classified correctly.
import scipy.spatial


start = 0
# Never ask for more samples than the test set actually holds.
n_samples = min(int(1e4), len(testv))
# Number of test rows handled per distance matrix; each chunk produces a
# (chunk, n_train) matrix, so this limits how much is held at once.
buff_size = 500

nn_idx = np.zeros(n_samples, dtype=int)

for i in range(start, n_samples, buff_size):
    # Clamp the chunk end so the last chunk stops at n_samples even when
    # (n_samples - start) is not a multiple of buff_size; otherwise the two
    # sides of the assignment below would have different lengths.
    stop = min(i + buff_size, n_samples)
    # Pairwise Euclidean distances between this chunk and all training rows.
    # Per the original author's note, cdist is about 10x faster than the
    # broadcast alternative:
    #   diff = (trainv - testv[i:stop, np.newaxis])
    #   norm = np.linalg.norm(diff, axis=2)
    norm = scipy.spatial.distance.cdist(testv[i:stop], trainv, metric="euclidean")
    nn_idx[i:stop] = np.argmin(norm, axis=1)

nn_pred = trainlab[nn_idx]
# Score only the samples that were actually classified (start .. n_samples);
# entries before `start` were never filled in.
correct_predictions = (testlab[start:n_samples] == nn_pred[start:n_samples])
print(np.mean(correct_predictions))



0.9691


In [28]:
# Report how many nearest-neighbour indices there are, then write them to a
# text file, one integer per line.
# NOTE(review): nn_idx appears to hold positions of training samples rather
# than class labels — confirm that is what "pred_no_cluster.txt" should contain.
print(np.shape(nn_idx))
np.savetxt(fname="pred_no_cluster.txt", X=nn_idx, fmt="%.d")


(10000,)
