In [1]:
import numpy as np
import pandas as pd

from sklearn import datasets

In [2]:
# Silence pandas' chained-assignment warning for the whole session.
pd.set_option("mode.chained_assignment", None)

# Fix NumPy's global RNG so the shuffle and the random centroids are reproducible.
np.random.seed(seed=1)

In [3]:
# Number of centroids to fit.
CLUSTERS_COUNT = 3

# Convergence threshold: get_answer compares it with the *squared* shift of
# every centroid between two consecutive iterations.
EPSILON = 0.0005

iris = datasets.load_iris()
data, target, target_names = iris.data, iris.target, iris.target_names

# Working copy, so the raw feature matrix stays untouched.
preprocessed_data = data.copy()

# Rows are samples, columns are features.
SIZE, DIMENSIONAL = data.shape

In [4]:
def encode_on_hypercube(data):
    """
    Rescale every column linearly onto the interval [-1, 1].

    The column minimum maps to -1 and the column maximum to 1. A column whose
    values are all equal has no spread; it is mapped to 0 instead of producing
    NaN through a 0/0 division.

    :param data: array-like of numbers; scaling is done along axis 0
    return a new float ndarray with the same shape as ``data``
    """
    values = np.asarray(data, dtype=float)

    minimum = values.min(axis=0)
    span = values.max(axis=0) - minimum

    # Constant columns have span == 0; divide by 1 there and overwrite below.
    is_constant = span == 0
    safe_span = np.where(is_constant, 1.0, span)

    encoded = 2 * (values - minimum) / safe_span - 1
    return np.where(is_constant, 0.0, encoded)

In [5]:
# Always start from the untouched `data` so this cell can be re-run safely:
# feeding `preprocessed_data` back into itself would re-encode the label
# column appended below and keep adding columns on every execution.
encoded_data = encode_on_hypercube(data)

# In-place row shuffle. NOTE: afterwards the rows no longer line up with `target`.
np.random.shuffle(encoded_data)

# Extra last column holds the cluster index of every sample (0 until assigned).
cluster_column = np.zeros((encoded_data.shape[0], 1))
preprocessed_data = np.append(encoded_data, cluster_column, axis=1)
preprocessed_data

array([[-1.66666667e-01,  6.66666667e-01, -9.32203390e-01,
        -9.16666667e-01,  0.00000000e+00],
       [-5.55555556e-01, -5.83333333e-01, -3.22033898e-01,
        -1.66666667e-01,  0.00000000e+00],
       [ 2.77777778e-01, -1.66666667e-01,  1.52542373e-01,
         8.33333333e-02,  0.00000000e+00],
       [-3.88888889e-01,  5.83333333e-01, -8.98305085e-01,
        -7.50000000e-01,  0.00000000e+00],
       [ 1.00000000e+00,  5.00000000e-01,  8.30508475e-01,
         5.83333333e-01,  0.00000000e+00],
       [ 1.11111111e-01,  8.33333333e-02,  2.54237288e-01,
         2.50000000e-01,  0.00000000e+00],
       [ 4.44444444e-01, -8.33333333e-02,  3.89830508e-01,
         8.33333333e-01,  0.00000000e+00],
       [-5.55555556e-01,  5.00000000e-01, -6.94915254e-01,
        -7.50000000e-01,  0.00000000e+00],
       [-7.77777778e-01,  0.00000000e+00, -7.96610169e-01,
        -9.16666667e-01,  0.00000000e+00],
       [ 4.44444444e-01,  0.00000000e+00,  5.93220339e-01,
         8.33333333e-01

In [6]:
# Initial centroids: one row per cluster, each coordinate drawn uniformly
# from [-1, 1), the same range the features were encoded onto.
centroids = np.random.uniform(-1, 1, (CLUSTERS_COUNT, DIMENSIONAL))
centroids

array([[ 0.24672023, -0.96835751,  0.85887447,  0.38179384],
       [ 0.9946457 , -0.65531898, -0.7257285 ,  0.86519093],
       [ 0.39363632, -0.86799965,  0.51092611,  0.50775238]])

In [7]:
def get_distance(row, center, sqrt=True):
    """
    Euclidean distance between two points.

    The vector length is taken from the inputs themselves, so the helper works
    for any dimensionality rather than only for the module-level DIMENSIONAL.

    :param row: coordinates of the first point (array-like)
    :param center: coordinates of the second point (array-like, same size)
    :param sqrt: if False, return the squared distance instead
    return ndarray of shape (1, 1) holding the (squared) distance
    """
    diff = np.asarray(row) - np.asarray(center)

    # Row vector times column vector: a (1, 1) array with the sum of squares.
    squared = np.dot(diff.reshape((1, -1)), diff.reshape((-1, 1)))

    return np.sqrt(squared) if sqrt else squared

In [8]:
def identify_members_of_clusters(data, centroids):
    """
    Perform one assignment/update step of the clustering loop.

    Every sample is assigned to its nearest centroid (Euclidean distance) and
    every centroid is then moved to the mean of the samples assigned to it.
    A centroid that receives no samples keeps its previous position; taking
    the mean of an empty selection would otherwise turn it into NaN.

    :param data: ndarray of shape (n_samples, n_features + 1); the last column
                 stores the cluster index of each sample
    :param centroids: ndarray of shape (n_clusters, n_features)
    return tuple (new_data, new_centroids): a copy of ``data`` with the last
           column rewritten, and the updated centroids as a float array
    """
    old_centroids = np.asarray(centroids, dtype=float)
    n_features = old_centroids.shape[1]
    assert data.ndim == 2 and data.shape[1] == n_features + 1

    new_data = data.copy()
    points = new_data[:, :n_features]

    # (n_samples, n_clusters) squared distances; the square root is skipped
    # because it does not change which centroid is the nearest one.
    deltas = points[:, np.newaxis, :] - old_centroids[np.newaxis, :, :]
    labels = (deltas ** 2).sum(axis=2).argmin(axis=1)
    new_data[:, -1] = labels

    new_centroids = old_centroids.copy()
    for index in range(new_centroids.shape[0]):
        members = points[labels == index]
        if members.shape[0]:  # empty cluster: leave the centroid where it was
            new_centroids[index] = members.mean(axis=0)

    assert new_data.shape == data.shape
    assert new_centroids.shape == old_centroids.shape

    return new_data, new_centroids

In [9]:
def get_answer(data, centroids, epsilon=None, max_iterations=None):
    """
    Repeat the assignment/update step until the centroids stop moving.

    Iteration stops once no centroid has moved by more than ``epsilon``
    (measured as squared Euclidean distance) between two consecutive steps,
    or once ``max_iterations`` steps have been performed.

    :param data: ndarray of samples with the cluster-index column, as accepted
                 by identify_members_of_clusters
    :param centroids: ndarray with the starting centroids (not modified)
    :param epsilon: squared-shift threshold; None means the module-level EPSILON
    :param max_iterations: optional upper bound on the number of steps; None
                           means no bound. At least one step is always made.
    return tuple (data, centroids) produced by the last step
    """
    threshold = EPSILON if epsilon is None else epsilon
    centroids = centroids.copy()

    iterations = 0
    while True:
        data, new_centroids = identify_members_of_clusters(data, centroids)
        iterations += 1

        moved = any(
            get_distance(new_center, old_center, sqrt=False) > threshold
            for new_center, old_center in zip(new_centroids, centroids)
        )
        out_of_budget = max_iterations is not None and iterations >= max_iterations
        if not moved or out_of_budget:
            return data, new_centroids

        centroids = new_centroids

In [10]:
# Fit the clusters. NOTE: this rebinds `centroids` from the random starting
# points to the fitted ones, so re-running the cell starts from the fitted ones.
clustered_data, centroids = get_answer(preprocessed_data, centroids)

In [11]:
def print_results(data, centroids):
    """
    Print a text report about a finished clustering.

    The report contains the number of samples per cluster, the centroid
    coordinates, the sum of sample-to-centroid distances per cluster and the
    individual distances rounded to three decimals.

    :param data: ndarray of shape (n_samples, n_features + 1) whose column
                 ``n_features`` holds the cluster index of each sample
    :param centroids: ndarray of shape (n_clusters, n_features)
    return None; everything is written to standard output
    """
    n_features = centroids.shape[1]
    labels = data[:, n_features]

    # Distance of every sample to its own centroid, grouped by cluster.
    # Computed once and reused for the counts, the sums and the listing.
    distances = [
        [get_distance(row[:n_features], centroid).item() for row in data[labels == index]]
        for index, centroid in enumerate(centroids)
    ]

    for index, cluster_distances in enumerate(distances):
        print(f'Count objects in the {index + 1} cluster:', len(cluster_distances))

    print()
    print('Centroids:', centroids, sep='\n')

    # Plain Python floats keep the printed list free of NumPy scalar wrappers.
    sum_distances = [float(np.sum(cluster_distances)) for cluster_distances in distances]
    print()
    print('Sum of distances:', sum_distances)

    print()
    print('Distances:')
    for index, cluster_distances in enumerate(distances, start=1):
        print(f'\tCluster №{index}:', [round(value, 3) for value in cluster_distances])

In [12]:
# Report cluster sizes, centroids and sample-to-centroid distances for the fit above.
print_results(clustered_data, centroids)

Count objects in the 1 cluster: 52
Count objects in the 2 cluster: 50
Count objects in the 3 cluster: 48

Centroids:
[[ 0.31517094 -0.13782051  0.52542373  0.5849359 ]
 [-0.60777778  0.19       -0.84338983 -0.87833333]
 [-0.15393519 -0.42013889  0.1059322   0.01909722]]

Sum of distances: [23.549118459487193, 16.392683040148054, 18.78251181589291]

Distances:
	Cluster №1: [0.984, 0.526, 0.316, 0.319, 0.289, 0.305, 0.404, 0.525, 0.658, 0.261, 0.632, 0.487, 0.114, 0.289, 0.864, 0.964, 0.166, 0.488, 0.298, 0.436, 0.73, 0.435, 0.217, 0.412, 0.215, 0.476, 0.472, 0.194, 0.54, 0.246, 0.334, 0.479, 0.502, 0.644, 0.271, 0.472, 0.723, 0.486, 0.593, 0.496, 0.657, 0.323, 0.534, 0.579, 0.141, 0.328, 0.502, 0.615, 0.579, 0.459, 0.231, 0.338]
	Cluster №2: [0.657, 0.471, 0.371, 0.262, 0.475, 0.392, 0.255, 0.213, 0.093, 0.701, 0.318, 0.282, 0.119, 0.094, 0.503, 0.15, 0.378, 0.07, 0.289, 0.132, 0.394, 0.264, 0.232, 0.32, 0.318, 0.283, 0.583, 0.495, 0.13, 0.256, 0.305, 0.906, 0.984, 0.139, 0.305, 0.193, 