In [362]:
import numpy as np
import math
import random
import itertools
from scipy.spatial import distance

In [363]:
def normalize_vector(vector):
    '''
    Returns the normalized vector of the provided one, a length 1
    vector with the same direction as the original.
    '''
    vector = np.array(vector)
    return (vector/math.sqrt(np.sum(pow(vector, 2)))).tolist()

In [364]:
def extended_normalization(vector):
    '''
    When two vectors of different lengths but in the same direction are normalized, 
    the result will be the same for both of them.
    
    In order to avoid collisions of normalized vectors, an extended normalization can
    be applied, where an extra dimension is added to the vector to be normalized. By
    adding a component of length 1 in the new dimension and normalizing the result,
    it is ensured that no two vectors which were originally in the same dimensions
    will produce the same output.
    '''
    vector = np.append(vector, 1)
    return (vector/math.sqrt(np.sum(pow(vector, 2)))).tolist()

In [365]:
def get_random_unit_vector(components):
    '''
    Returns a random vector of length 1 with the required number of components
    '''
    return normalize_vector([random.random() for i in range(components)])

In [462]:
class SOM:
    def __init__ (self, dimensions, topology='rectangular', max_iter=100):
        '''
        dimensions - Array-like, containing the size of the output
        layer in each dimension.
        
        topology - The neighborhood relation between output-layer neurons.
        The default topology is rectangular. A future improvement would be
        to implement a hexagonal topology option.
        
        max_iter - The maximum number of epochs before the algorithm stops
        '''
        self.dimensions = dimensions
        self.output_layer = None
        self.max_iter = max_iter
        self.topology = topology
        #Output layer indexes is the cartesian product of the lists containing
        #the possible indexes to each dimension of the output layer matrix. That
        #is, it is a list containing the indexes to all the positions of the 
        #output layer. It is used to iterate over the output layer, since the 
        #number of dimensions it will have is unknown before the execution
        indexes_list = [list(range(i)) for i in self.dimensions]
        self.output_layer_indexes = itertools.product(*indexes_list)
        return
    def __weight_initialization(self, num_attr):
        '''
        Initializes the output layer to a matrix with the dimensions specified
        for this map, filled with random unit vectors of dimension num_attr
        
        num_attr - The number of attributes of the instances this network will
        be trained with
        '''
        self.output_layer = np.zeros(self.dimensions, dtype=object)
        for i in self.output_layer_indexes:
            self.output_layer[i] = get_random_unit_vector(num_attr)
        #Maybe cast the output layer to a regular list here?
        return
    def __get_output(self, instance):
        '''
        Calculates the output of each neuron for the provided instance.
        Returns a matrix with the same dimensions as the output layer
        containing the outputs
        '''
        output_matrix = np.zeros(self.dimensions)
        for i in self.output_layer_indexes:
            output_matrix[i] = distance.euclidean(instance, self.output_layer[i])
        return output_matrix
    def __get_neighbors(self, index, radius):
        '''
        Returns a matrix containing the indexes of the neurons in distance radius
        to the one specified by index.
        In order to implement different possible topologies, this should have a different
        way of computing the indexes depending on that
        '''
        indexes_list = [list(range(i-math.ceil(radius/2), i+math.ceil(radius/2)+1)) for i in index]
        #Just like obtaining the indexes into the output layer, the indexes to this
        #area of the output matrix are obtained by computing the cartesian product 
        #of the indexes into each dimension, limiting the indexes to the correct range
        indexes = np.array(list(itertools.product(*indexes_list)))
        #Finally, we divide the indexes in each dimension modulo the size of that dimension,
        #in order to obtain the wrapped around indexes
        return indexes%self.dimensions
    def fit(self, X):
        '''
        Initializes the output layer neurons to random unit vectors, normalizes
        the input vectors, and applies the learning algorithm to the input data, X
        
        X - The data to train the algorithm on
        '''
        #Since extended normalization will be used, the inputs will have an extra 
        #dimension, which is why I add 1 here so the output layer vectors will have
        #the number of attributes of an instance plus 1 components
        #Weight initialization
        self.__weight_initialization(len(X[0]) + 1)
        #Input data normalization
        X = [extended_normalization(v) for v in X]
        #Training
        for epoch in range(self.max_iter):
            #Chose a vector at random from the training data
            input_vector = random.choice(X)
            #Get the output for each neuron in the output layer
            output_matrix = self.__get_output(input_vector)
            #Get the index of the neuron that provides the smallest output
            #Np unravel index gives us the multi-dimensional index to a matrix
            #of the specified shape given the index into the flattened version of
            #the matrix, which np.argmin returns
            best_matching_unit = np.unravel_index(np.argmin(output_matrix, axis=None), 
                                                  output_matrix.shape)
            #This would be another way to do it, testing should be done in order 
            #to determine which of the two is the most efficient. I would think this 
            #secong one is probably better as the matrix indexes are already computed
            #best_matching_unit = self.output_layer_indexes[np.argmin(output_matrix)]
            
            #Obtain the indexes of the neighbors of the best matching unit within a
            #specified radius, including the BMU. Note that the indexes that go out of
            #bounds wrap around the matrix
            neighbors = self.__get_neighbors(best_matching_unit, 3)
        return
    def predict(self, x):
        '''
        Determines the Best Matching Unit for the instance x, and returns the predicted
        class for it based on the class associated to the neuron
        '''
        return
    def print_map():
        '''
        Shows a graphical representation of the output layer
        '''
        return

In [463]:
som = SOM([10, 10])
som.fit(data)

UnboundLocalError: local variable 'indexes' referenced before assignment

In [368]:
data = [[1,2,3],
        [1,1,1],
        [2,3,4],
        [4,2,4],
        [2,2,5]]

In [369]:
random.choice(data)

[1, 2, 3]

In [370]:
a = np.array([[[3],[1]],
     [[0],[2]],
     [[3],[3]]])

In [356]:
np.argmin([np.array(a)[i] for i in [(0,0),(0,1),(1,0),(1,1),(2,0),(2,1)]])

2

In [360]:
np.unravel_index(np.argmin(a, axis=None), a.shape)

(1, 0, 0)

In [375]:
a = np.array([[1,2,3,4,5],
             [6,7,8,9,10],
             [11,12,13,14,15],
             [16,17,18,19,20],
             [21,22,23,24,25]])
i = [2,2]
a[i[0]-1:i[0]+2, i[1]-1:i[1]+2]

array([[ 7,  8,  9],
       [12, 13, 14],
       [17, 18, 19]])

In [456]:
#a[range(-2,1), range(-2,1)]
cell = [0,0]
indexes = [list(range(i-1, i+2)) for i in cell]
list(itertools.product(*indexes))

[(-1, -1), (-1, 0), (-1, 1), (0, -1), (0, 0), (0, 1), (1, -1), (1, 0), (1, 1)]

In [442]:
np.ravel_multi_index([-1,-1],(5,5), mode="wrap")

24

In [458]:
dimensions = (5,4)
x = [[-1, -1], [-1, 0], [-1, 1], [0, -1], [0, 0], [0, 1], [1, -1], [1, 0], [1, 1]]
for d in dimensions:
    map(lambda i: x[])

[[-1, -1], [-1, 0], [-1, 1], [0, -1], [0, 0], [0, 1], [1, -1], [1, 0], [1, 1]]

In [461]:
np.array([[(-1, -1), (-1, 0), (-1, 1)], [(0, -1), (0, 0), (0, 1)], [(1, -1), (1, 0), (1, 1)]])%(5,4)

array([[[4, 3],
        [4, 0],
        [4, 1]],

       [[0, 3],
        [0, 0],
        [0, 1]],

       [[1, 3],
        [1, 0],
        [1, 1]]])

In [402]:
index = [4,4]
slices = [list(range(i-1, i+2)) for i in index]
a.take(slices, mode="wrap")

array([[[4, 5, 6]],

       [[4, 5, 6]]])

In [400]:
slices

[[3, 4, 5], [3, 4, 5]]

In [385]:
x=[slice(i,j) for i,j in zip([1,2,3], [4,5,6])]
x

[slice(1, 4, None), slice(2, 5, None), slice(3, 6, None)]