In [168]:
import sklearn.datasets
import numpy as np
from numpy.linalg import norm
from scipy.spatial.distance import hamming
import matplotlib.pyplot as plt
import pandas as pd
import os
import torch
import time

np.random.seed(1)
NUM_FEATURES = 784

TRAIN_SUBSET = 5000
TEST_SUBSET = 1000

In [2]:
dataset = sklearn.datasets.fetch_openml("mnist_784", version=1)

In [140]:
X = dataset.data.to_numpy()
y = dataset.target.to_numpy()

In [169]:
def randomVector(n):
    """
    Randomly generates hypervector of size n
    """
    return (2*np.random.randint(0, 2, n)) - 1

def bind(a, b):
    return np.multiply(a, b)

def randomFlip(vec, idxs):
    """
    Returns a vector with the inx bits flipped.
    """
    assert len(idxs) <= len(vec), "Number of bits to flip greater than vector length"
    
    flipped_vec = np.copy(vec)
    
    for i in idxs:
        flipped_vec[i] *= -1
        
    return flipped_vec
    
def cosSim(a, b):
    """
    Performs cosine similarity on vectors a and b
    """
    
    if norm(a) == 0 or norm(b) == 0:
        return 0
    else:
        return (np.dot(a, b) / (norm(a) * norm(b)))

def hammingDist(a, b):
    """
    Normalized hamming distance between two vectors.
    Vectors are assumed to be binarized.
    """
    assert len(a) == len(b), "Vectors are not the same length"
    return hamming(a, b)
    #return np.count_nonzero(a!=b) / len(a)

In [170]:
def buildLevelVectors(levels, dim):
    """
    Level hypervectors, level hypervector at i is copied from level hypervector i-1 with D/(2*255) bits flipped
    """
    levelvectors = []
    min_vector = randomVector(dim)
    
    levelvectors.append(min_vector)
    
    window = 0
    indexes = np.arange(0, dim)
    np.random.shuffle(indexes)
    bit_flips = int(dim/(2*255))

    for i in range(1, levels):
        mid_vec = np.copy(levelvectors[i-1])
        mid_vec = randomFlip(mid_vec, indexes[window:window+bit_flips])
        window += bit_flips
        
        levelvectors.append(mid_vec)
        
    return np.array(levelvectors)

def buildPosVectors(n, dim):
    """
    Array of n random hypervectors to encode position of features.
    """
    posvectors = []
    for i in range(n):
        posvectors.append(randomVector(dim))
    return np.array(posvectors)

def rmrBinarize(x):
    """
    Random majority rule. 0 are randomly set to 1 or -1
    """
    h = [i/i if i != 0 else np.random.choice([-1.0, 1.0]) for i in x]
    return np.array(h)

def fmrBinarize(x, k):
    """
    Fixed majority rule. 0 are set to k
    """
    h = [i/i if i != 0 else k for i in x]
    return np.array(h)

In [171]:
class Encoder:
    def __init__(self, dim, n_features, binarizer):
        self.D = dim
        self.n_features = n_features
        self.B = buildPosVectors(n_features, dim)
        self.levelvectors = buildLevelVectors(256, dim)
        self.binarizer = binarizer        
        self.classvecs = {}
        
    def binarize(self, x):
        if self.binarizer == 'rmr':
            return rmrBinarize(x)
        else:
            return fmrBinarize(x, -1)
    
    def encodeVector(self, X):
        h = np.zeros(self.D)
        for i, feature in enumerate(X):
            #h += bind(self.levelvectors[int(feature)], self.B[i])
            h = self.bundle(h, bind(self.levelvectors[int(feature)], self.B[i]))
        
        return self.binarize(h)
        
    
    def bundle(self, a, b):
        """
        Adding with binarizing afterwards
        """
        return self.binarize(a + b)
    
    def train(self, X, y):
        assert len(X) == len(y), "len(X) len(y) not equal"
        
        #Encode and add all train data
        for i, obs in enumerate(X):
            target = y[i]
            h = self.encodeVector(obs)
            if target not in self.classvecs.keys():
                self.classvecs[target] = h
            else:
                self.classvecs[target] += h
                
        #Binarize classvectors
        for key, val in self.classvecs.items():
            self.classvecs[key] = self.binarize(val)
                
    def predict(self, x):
        h = self.encodeVector(x)
        dists = {}
        min_dist = None
        min_key = None
        
        for key, val in self.classvecs.items():
            score = hammingDist(h, val)
            dists[key] = score
            if min_dist == None or min_dist < score:
                min_dist = score
                min_key = key
            
        return min_key, dists
    
    def predict_all(self, X, y):
        correct_matrix = np.zeros((10,10))
        acc = 0.0
        for i, x in enumerate(X):
            pred, _ = self.predict(x)
            if pred == y[i]:
                acc += 1.0
            correct_matrix[int(y[i])][int(pred)] += 1
        
        acc = acc / len(X)
        return acc, correct_matrix

In [172]:
encoder = Encoder(10240, NUM_FEATURES, 'rmr')

In [173]:
start = time.time()
encoder.train(X[0:TRAIN_SUBSET], y[0:TRAIN_SUBSET])
end = time.time()
print("%f seconds elapsed. Avg %f seconds per observation" % (end-start, (end-start) / TRAIN_SUBSET))

97.246207 seconds elapsed. Avg 0.019449 seconds per observation


In [174]:
acc, mat = encoder.predict_all(X[TRAIN_SUBSET:TRAIN_SUBSET+TEST_SUBSET], y[TRAIN_SUBSET:TRAIN_SUBSET+TEST_SUBSET])

In [175]:
print(acc)
print(mat)

0.082
[[ 9.  0. 23.  0.  0. 43.  0. 10. 28.  0.]
 [ 9.  0. 25.  0.  0. 36.  0.  9. 29.  0.]
 [ 4.  0. 15.  0.  0. 37.  0.  6. 31.  0.]
 [12.  0. 20.  0.  0. 37.  0. 15. 31.  0.]
 [10.  0. 20.  0.  0. 36.  0.  6. 16.  0.]
 [ 5.  0. 25.  0.  0. 26.  0.  7. 17.  0.]
 [10.  0. 26.  0.  0. 36.  0. 10. 25.  0.]
 [ 6.  0. 21.  0.  0. 35.  0. 11. 28.  0.]
 [11.  0. 18.  0.  0. 31.  0.  8. 21.  0.]
 [ 9.  0. 13.  0.  0. 39.  0. 14. 31.  0.]]


In [176]:
for key, val in encoder.classvecs.items():
        print(np.sum(val))

10238.0
10240.0
10240.0
10240.0
10240.0
10238.0
10240.0
10240.0
10238.0
10236.0
