In [1]:
import sklearn.datasets
import numpy as np
from numpy.linalg import norm
import matplotlib.pyplot as plt
import pandas as pd
import os
import torch
import time

np.random.seed(1)
NUM_FEATURES = 784

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = sklearn.datasets.fetch_openml("mnist_784", version=1)

In [3]:
X = dataset.data.to_numpy()
y = dataset.target.to_numpy()

In [8]:
def randomVector(n):
    """
    Randomly generates hypervector of size n
    """
    return (2*np.random.randint(0, 2, n)) - 1

def bind(a, b):
    return np.multiply(a, b)

def randomFlip(vec, idxs):
    """
    Returns a vector with the inx bits flipped.
    """
    assert len(idxs) <= len(vec), "Number of bits to flip greater than vector length"
    
    flipped_vec = np.copy(vec)
    
    for i in idxs:
        flipped_vec[i] *= -1
        
    return flipped_vec
    

def hammingDist(a, b):
    """
    Normalized hamming distance between two vectors.
    Vectors are assumed to be binarized.
    """
    assert len(a) == len(b), "Vectors are not the same length"
    return np.count_nonzero(a!=b) / len(a)

In [163]:
def buildLevelVectors(levels, dim):
    """
    Level hypervectors, level hypervector at i is copied from level hypervector i-1 with D/(2*255) bits flipped
    """
    levelvectors = []
    min_vector = randomVector(dim)
    
    levelvectors.append(min_vector)
    
    window = 0
    indexes = np.arange(0, dim)
    np.random.shuffle(indexes)
    bit_flips = int(dim/(2*255))

    for i in range(1, levels):
        mid_vec = np.copy(levelvectors[i-1])
        mid_vec = randomFlip(mid_vec, indexes[window:window+bit_flips])
        window += bit_flips

        levelvectors.append(mid_vec)
        
    return np.array(levelvectors)

def buildPosVectors(n, dim):
    """
    Array of n random hypervectors to encode position of features.
    """
    posvectors = []
    for i in range(n):
        posvectors.append(randomVector(dim))
    return np.array(posvectors)

def rmrBinarize(x):
    """
    Random majority rule. 0 are randomly set to 1 or -1
    """
    h = [i/i if i != 0 else np.random.choice([-1.0, 1.0]) for i in x]
    return h

def fmrBinarize(x, k):
    """
    Fixed majority rule. 0 are set to k
    """
    h = [i/i if i != 0 else k for i in x]
    return h

In [164]:
class Encoder:
    def __init__(self, dim, n_features, binarizer):
        self.D = dim
        self.n_features = n_features
        self.B = buildPosVectors(n_features, dim)
        self.levelvectors = buildLevelVectors(256, dim)
        self.binarizer = binarizer        
        self.classvecs = {}
        
    def binarize(self, x):
        if self.binarizer == 'rmr':
            return rmrBinarize(x)
        else:
            return fmrBinarize(x, -1)
    
    def encodeVector(self, X):
        h = np.zeros(self.D)
        for i, feature in enumerate(X):
            h = np.add(h, np.multiply(self.levelvectors[int(feature)], self.B[i]))
        
        return h
        
    
    def train(self, X, y):
        assert len(X) == len(y), "len(X) len(y) not equal"
        
        #Encode and add all train data
        for i, obs in enumerate(X):
            target = y[i]
            h = self.encodeVector(obs)
            h = self.binarize(h)
            if target not in self.classvecs.keys():
                self.classvecs[target] = h
            else:
                self.classvecs[target] = np.add(self.classvecs[target], h)
                
        #Binarize classvectors
        for key, val in self.classvecs.items():
            self.classvecs[key] = self.binarize(val)
                
    def predict(self, x):
        h = self.encodeVector(x)
        h = self.binarize(h)
        dists = {}
        
        for key, val in self.classvecs.items():
            dists[key] = hammingDist(h, val)
            
        print(list(self.classvecs.keys())[np.argmax(self.classvecs.values())])
            
        return dists

In [167]:
encoder = Encoder(10240, NUM_FEATURES, 'rmr')

In [168]:
TRAIN_SUBSET = 35000

start = time.time()
encoder.train(X[0:TRAIN_SUBSET], y[0:TRAIN_SUBSET])
end = time.time()
print("%f seconds elapsed. Avg %f seconds per observation" % (end-start, (end-start) / TRAIN_SUBSET))

626.501752 seconds elapsed. Avg 0.017900 seconds per observation


In [None]:
for i in range(TRAIN_SUBSET):
    print("------------")
    encoder.predict(X[i])
    print(y[i])
    print("------------")

In [128]:
print(np.array(encoder.classvecs['0']).shape)

(10240,)


In [169]:
for key, val in encoder.classvecs.items():
    #print(val)
    print(np.sum(val))

10240.0
10238.0
10240.0
10236.0
10238.0
10240.0
10240.0
10240.0
10238.0
10240.0
