## K-means

In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [72]:
# Load the training table, discard its 'index' column, and keep only the
# two feature columns for the first 100 rows.
df = (
    pd.read_csv('train.csv')
    .drop(columns=['index'])
    .loc[:, ['x_0', 'x_1']]
    .head(100)
)
df

Unnamed: 0,x_0,x_1
0,0.001377,-0.312330
1,0.013704,-0.027573
2,0.052899,-0.187180
3,-0.106739,0.048862
4,0.041890,0.121760
...,...,...
95,0.042901,-0.176575
96,-0.043968,-0.371420
97,0.052376,-0.066440
98,-0.036547,0.584502


In [73]:
# Work on a plain NumPy copy of the frame: one row per point, one column per feature.
X = df.to_numpy(copy=True)

Попробуем выделить три кластера. 

In [74]:
# Step 1: choose the number of clusters k
k = 3

In [75]:
# Step 2: pick k random data points to act as the initial centroids
np.random.seed(2)  # fixed seed so the same rows are drawn on every run
centroidd_id = np.random.choice(len(X), size=k, replace=False)  # replace=False -> no row index is drawn twice
centroidd_id

array([83, 30, 56])

In [76]:
# The initial centroids are the coordinates of the randomly selected rows of X.
centroids = X[centroidd_id, :]
centroids

array([[-0.0164101 ,  0.13093701],
       [-0.1114876 ,  0.01196211],
       [-0.1239887 , -0.11231261]])

In [77]:
# Step 3: build k clusters by assigning each point to the cluster whose centre is closest.
# Demonstration for a single point: its distance to every centroid.
idx_1 = 0
np.linalg.norm(centroids - X[idx_1], axis=1)  # Euclidean distance, one value per centroid

array([0.44362403, 0.34337141, 0.2360584 ])

In [78]:
# One empty member list per cluster label 0..k-1 (each label gets its own list object).
clusters = dict((label, []) for label in range(k))
clusters

{0: [], 1: [], 2: []}

In [79]:
# Assign every point to the cluster of its nearest centroid.
# The clusters are rebuilt from scratch here so that re-running this cell
# does not append every point a second time.
clusters = {label: [] for label in range(len(centroids))}
for point in X:
    dist = np.linalg.norm(centroids - point, axis=1)  # distance from this point to each centroid
    nearest = dist.argmin()  # label of the closest centroid
    clusters[nearest].append(point)
clusters

{0: [array([0.04189022, 0.12176042]),
  array([-0.00366688,  0.06810156]),
  array([-0.00745486,  0.0786887 ]),
  array([-0.03355507,  0.16201834]),
  array([0.04436351, 0.09402095]),
  array([-0.05591833,  0.5972369 ]),
  array([-0.00743656,  0.16972911]),
  array([0.0792793, 0.2097884]),
  array([-0.0074243 ,  0.19244531]),
  array([-0.02570546,  0.04901224]),
  array([-0.03700836,  0.6714787 ]),
  array([0.06534123, 0.4234762 ]),
  array([0.08514876, 0.09001365]),
  array([0.01337011, 0.1620346 ]),
  array([-0.00543568,  0.14880796]),
  array([ 0.03957844, -0.00704764]),
  array([-0.08298477,  0.31393093]),
  array([0.03130806, 0.15252323]),
  array([-0.05440154,  0.1386931 ]),
  array([0.03005235, 0.18631707]),
  array([0.0193193 , 0.22401619]),
  array([-0.02169619,  0.4885225 ]),
  array([0.00183747, 0.23569678]),
  array([-0.06194757,  0.3017113 ]),
  array([0.00023287, 0.20723954]),
  array([-0.12123806,  0.22248346]),
  array([-0.00972699,  0.3127796 ]),
  array([0.10703822, 0

In [80]:
# Cluster sizes in label order; iterating over the dict keeps this correct for any k.
print(*(len(members) for members in clusters.values()))

43 18 39


In [81]:
# Step 4: compute the new centroid of each cluster (the mean coordinate of its points)
new_centroids = {
    label: np.mean(members, axis=0)
    for label, members in clusters.items()
}
new_centroids

{0: array([0.00216888, 0.20562117]),
 1: array([-0.01807667, -0.01438927]),
 2: array([ 0.01609812, -0.27023689])}

In [82]:
# Convergence check (if the centroids did not change, the algorithm has converged).
# First put the new centroids into an array whose rows follow the cluster labels,
# so that row i can be compared with centroids[i].
# The isinstance guard keeps this cell re-runnable: after the first run
# new_centroids is already an array and no longer has dict methods.
if isinstance(new_centroids, dict):
    new_centroids = np.array([new_centroids[label] for label in sorted(new_centroids)])

In [83]:
# The algorithm has converged only if NO centroid moved by more than the tolerance.
# Every centroid is checked: the earlier loop left as soon as the first centroid
# was unchanged, reporting convergence without looking at the remaining ones.
stop = not any(
    np.linalg.norm(centroids[i] - new_centroids[i]) > 1e-4
    for i in range(len(centroids))
)

In [84]:
# Display the centroids currently in use; the preceding cells compute new_centroids
# but do not reassign `centroids`, so these are still the initial ones.
centroids

array([[-0.0164101 ,  0.13093701],
       [-0.1114876 ,  0.01196211],
       [-0.1239887 , -0.11231261]])

In [85]:
# Display the current point-to-cluster assignment (cluster label -> list of member points).
clusters

{0: [array([0.04189022, 0.12176042]),
  array([-0.00366688,  0.06810156]),
  array([-0.00745486,  0.0786887 ]),
  array([-0.03355507,  0.16201834]),
  array([0.04436351, 0.09402095]),
  array([-0.05591833,  0.5972369 ]),
  array([-0.00743656,  0.16972911]),
  array([0.0792793, 0.2097884]),
  array([-0.0074243 ,  0.19244531]),
  array([-0.02570546,  0.04901224]),
  array([-0.03700836,  0.6714787 ]),
  array([0.06534123, 0.4234762 ]),
  array([0.08514876, 0.09001365]),
  array([0.01337011, 0.1620346 ]),
  array([-0.00543568,  0.14880796]),
  array([ 0.03957844, -0.00704764]),
  array([-0.08298477,  0.31393093]),
  array([0.03130806, 0.15252323]),
  array([-0.05440154,  0.1386931 ]),
  array([0.03005235, 0.18631707]),
  array([0.0193193 , 0.22401619]),
  array([-0.02169619,  0.4885225 ]),
  array([0.00183747, 0.23569678]),
  array([-0.06194757,  0.3017113 ]),
  array([0.00023287, 0.20723954]),
  array([-0.12123806,  0.22248346]),
  array([-0.00972699,  0.3127796 ]),
  array([0.10703822, 0

In [86]:
# Cluster sizes in label order; iterating over the dict keeps this correct for any k.
print(*(len(members) for members in clusters.values()))

43 18 39
