In [1]:
import numpy as np
import pandas as pd

from sklearn import datasets

In [2]:
pd.options.mode.chained_assignment = None

np.random.seed(1)

In [3]:
CLUSTERS_COUNT = 3

iris = datasets.load_iris()
data = iris.data
target = iris.target
target_names = iris.target_names

preprocessed_data = data.copy()

print(pd.read_csv('./data/iris.data'))

DIMENSIONAL = data.shape[1]
SIZE = data.shape[0]

As = np.array([np.identity(DIMENSIONAL), np.identity(DIMENSIONAL), np.identity(DIMENSIONAL)])

EPSILON = 0.0005

     5.1  3.5  1.4  0.2     Iris-setosa
0    4.9  3.0  1.4  0.2     Iris-setosa
1    4.7  3.2  1.3  0.2     Iris-setosa
2    4.6  3.1  1.5  0.2     Iris-setosa
3    5.0  3.6  1.4  0.2     Iris-setosa
4    5.4  3.9  1.7  0.4     Iris-setosa
..   ...  ...  ...  ...             ...
144  6.7  3.0  5.2  2.3  Iris-virginica
145  6.3  2.5  5.0  1.9  Iris-virginica
146  6.5  3.0  5.2  2.0  Iris-virginica
147  6.2  3.4  5.4  2.3  Iris-virginica
148  5.9  3.0  5.1  1.8  Iris-virginica

[149 rows x 5 columns]


In [4]:
def encode_on_hypercube(data):
    """
    Метод для кодирования на гиперкуб
    
    :param data: DataFrame instance
    return DataFrame instance
    """
    def encode_by_column(column):
        minimum, maximum = np.min(column), np.max(column)
        return np.apply_along_axis(lambda x: 2 * (x - minimum) / (maximum - minimum) - 1, 0, column)
    
    return np.apply_along_axis(encode_by_column, 0, data)

In [5]:
preprocessed_data = encode_on_hypercube(preprocessed_data)
np.random.shuffle(preprocessed_data)

cluster_column = np.zeros((data.shape[0], 1))
preprocessed_data = np.append(preprocessed_data, cluster_column, axis=1)
preprocessed_data

array([[-1.66666667e-01,  6.66666667e-01, -9.32203390e-01,
        -9.16666667e-01,  0.00000000e+00],
       [-5.55555556e-01, -5.83333333e-01, -3.22033898e-01,
        -1.66666667e-01,  0.00000000e+00],
       [ 2.77777778e-01, -1.66666667e-01,  1.52542373e-01,
         8.33333333e-02,  0.00000000e+00],
       [-3.88888889e-01,  5.83333333e-01, -8.98305085e-01,
        -7.50000000e-01,  0.00000000e+00],
       [ 1.00000000e+00,  5.00000000e-01,  8.30508475e-01,
         5.83333333e-01,  0.00000000e+00],
       [ 1.11111111e-01,  8.33333333e-02,  2.54237288e-01,
         2.50000000e-01,  0.00000000e+00],
       [ 4.44444444e-01, -8.33333333e-02,  3.89830508e-01,
         8.33333333e-01,  0.00000000e+00],
       [-5.55555556e-01,  5.00000000e-01, -6.94915254e-01,
        -7.50000000e-01,  0.00000000e+00],
       [-7.77777778e-01,  0.00000000e+00, -7.96610169e-01,
        -9.16666667e-01,  0.00000000e+00],
       [ 4.44444444e-01,  0.00000000e+00,  5.93220339e-01,
         8.33333333e-01

In [6]:
centroids = np.random.uniform(low=-1, high=1, size=(CLUSTERS_COUNT, DIMENSIONAL))
centroids

array([[ 0.24672023, -0.96835751,  0.85887447,  0.38179384],
       [ 0.9946457 , -0.65531898, -0.7257285 ,  0.86519093],
       [ 0.39363632, -0.86799965,  0.51092611,  0.50775238]])

In [7]:
def get_distance(row, center, A):
    diff = row - center
    result = np.dot(np.dot(diff.reshape((1, DIMENSIONAL)), A), diff.reshape((DIMENSIONAL, 1)) ).item()

    return result

In [8]:
def get_owners(data, centers, As):
    owners = np.zeros((SIZE, CLUSTERS_COUNT))
    for index, row in enumerate(data):
        distances = np.array([1 / get_distance(row[:DIMENSIONAL], center, np.linalg.inv(As[n_index])) 
                              for n_index, center in enumerate(centers)])
        sum_distances = np.sum(distances)

        owners[index] = np.array([
            dist / sum_distances for dist in distances
        ])
    
    assert owners.shape == (SIZE, CLUSTERS_COUNT)

    return owners

In [9]:
def get_centroids(data, owners):
    sqrt_owners = np.power(owners, 2)
    
    sum_center = np.apply_along_axis(np.sum, 0, sqrt_owners)
    
    sum_data = np.zeros((CLUSTERS_COUNT, DIMENSIONAL))
    for index, row in enumerate(data):
        sum_data += np.dot(
            sqrt_owners[index].reshape((CLUSTERS_COUNT, 1)),
            row[:DIMENSIONAL].reshape((1, DIMENSIONAL))
        )
    
    result = sum_data / sum_center.reshape((CLUSTERS_COUNT, 1))

    assert result.shape == (CLUSTERS_COUNT, DIMENSIONAL)
    return result

In [10]:
def get_new_Fs(data, centroids, owners):
    Fs = np.zeros((CLUSTERS_COUNT, DIMENSIONAL, DIMENSIONAL))
    
    sqrt_owners = np.power(owners, 2)
    
    for index, center in enumerate(centroids):

        for n_index, row in enumerate(data):
            diff = row[:DIMENSIONAL] - center   

            Fs[index] += np.dot(
                (sqrt_owners[n_index][index] * diff).reshape((DIMENSIONAL, 1)),
                diff.reshape((1, DIMENSIONAL))
            )
        Fs[index] /= np.sum(sqrt_owners.T[index])

    return Fs

In [11]:
def get_new_As(data, centroids, owners):
    Fs = get_new_Fs(data, centroids, owners)
    
    As = np.zeros((CLUSTERS_COUNT, DIMENSIONAL, DIMENSIONAL))
    for index, F in enumerate(Fs):
        As[index] = np.sqrt(np.sqrt(1 / np.linalg.det(F))) * F
    
    return As

In [12]:
def get_answer(data, centroids, As):
    owners = get_owners(data, centroids, As)
    new_centroids = get_centroids(data, owners)
    
    As = get_new_As(data, new_centroids, owners)
        
    while any([get_distance(new_centroids[index], centroids[index], As[index]) > EPSILON
               for index in range(centroids.shape[0])
              ]):
        owners = get_owners(data, new_centroids, As)

        centroids = new_centroids
        new_centroids = get_centroids(data, owners)
        As = get_new_As(data, new_centroids, owners)

    return data, new_centroids, As, owners

In [13]:
result_data, new_centroids, new_As, new_owners = get_answer(preprocessed_data, centroids, As)

In [14]:
new_centroids

array([[ 0.02088759, -0.32852148,  0.20517664,  0.10045009],
       [-0.60248446,  0.19874842, -0.84131775, -0.87884282],
       [ 0.16637381, -0.18693857,  0.45071742,  0.58512411]])

In [15]:
new_owners

array([[0.05130057, 0.924354  , 0.02434543],
       [0.65291093, 0.12338887, 0.2237002 ],
       [0.71599381, 0.02850744, 0.25549876],
       [0.0446127 , 0.93596316, 0.01942415],
       [0.61347406, 0.03379547, 0.35273047],
       [0.65593731, 0.02219096, 0.32187173],
       [0.18408134, 0.03928561, 0.77663304],
       [0.06144453, 0.91010733, 0.02844814],
       [0.02906311, 0.95879084, 0.01214606],
       [0.07364012, 0.00650281, 0.91985708],
       [0.81781988, 0.01179456, 0.17038556],
       [0.02619802, 0.96207014, 0.01173183],
       [0.28052564, 0.00969458, 0.70977978],
       [0.73610501, 0.02156954, 0.24232545],
       [0.81992782, 0.00587385, 0.17419833],
       [0.05447456, 0.92466491, 0.02086053],
       [0.88909696, 0.0108437 , 0.10005935],
       [0.55934211, 0.02627387, 0.41438401],
       [0.05467016, 0.92353054, 0.0217993 ],
       [0.03752114, 0.94822626, 0.0142526 ],
       [0.55845993, 0.03733832, 0.40420175],
       [0.5013591 , 0.03945517, 0.45918573],
       [0.