<a href="https://colab.research.google.com/github/JakubPac/kurs_ml/blob/main/k_means_interpretation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import plotly.graph_objects as go
import random
sns.set()
np.random.seed(42)
random.seed(42)

In [12]:
from sklearn.datasets import make_blobs

data = make_blobs(n_samples = 40, centers = 2, cluster_std = 1.0, center_box = (-4.0, 4.0), random_state = 42)[0]

df = pd.DataFrame(data, columns = ['x1', 'x2'])
df.head()

Unnamed: 0,x1,x2
0,0.37743,0.069424
1,2.217347,2.327304
2,1.376777,0.603609
3,-1.467097,3.139985
4,-1.605386,5.457993


Wizualizacja danych

In [13]:
fig = px.scatter(df, 'x1', 'x2', width = 700, height = 400, title = 'Algorytm K-średnich')
fig.update_traces(marker_size = 12)

Implementacja algorytmu K-srednich


In [14]:
# Wyznaczenie wartości brzegowych
np.set_printoptions(precision = 6, suppress = True)

x1_min = df.x1.min()
x1_max = df.x1.max()

x2_min = df.x2.min()
x2_max = df.x2.max()

print(x1_min, x1_max)
print(x2_min, x2_max)

-2.728596881734133 3.333845579232757
-1.1983010410246 5.457992635788267


In [17]:
# Losowe wygenerowanie współrzednych centroidów
centroid_1 = np.array([random.uniform(x1_min, x1_max), random.uniform(x2_min, x2_max)])
centroid_2 = np.array([random.uniform(x1_min, x1_max), random.uniform(x2_min, x2_max)])
print(centroid_1)
print(centroid_2)

[1.736217 3.306009]
[ 2.68019  -0.619611]


Wizualizacja tzw. punktów startowych centroidów

In [18]:
fig = px.scatter(df, 'x1', 'x2', width = 700, height = 400, title = 'Algorytm K-średnich - inicjalizacja centroidow')
fig.add_trace(go.Scatter(x = [centroid_1[0]], y = [centroid_1[1]], name = 'centroid 1', mode = 'markers', marker_line_width = 3))
fig.add_trace(go.Scatter(x = [centroid_2[0]], y = [centroid_2[1]], name = 'centroid 2', mode = 'markers', marker_line_width = 3))
fig.update_traces(marker_size = 12, showlegend = False)

In [21]:
# przypisanie punktów do najblizszego centroidu
from numpy.linalg import norm
clusters = []
for point in data:
  centroid_1_dist = norm(centroid_1 - point)
  centroid_2_dist = norm(centroid_2 - point)
  cluster = 1
  if centroid_1_dist > centroid_2_dist:
    cluster = 2
  clusters.append(cluster)

df['cluster'] = clusters
df.head()

Unnamed: 0,x1,x2,cluster
0,0.37743,0.069424,2
1,2.217347,2.327304,1
2,1.376777,0.603609,2
3,-1.467097,3.139985,1
4,-1.605386,5.457993,1


In [22]:
# Wizualizacja przypisania
fig = px.scatter(df, 'x1', 'x2', color = 'cluster', width = 700, height = 400, title = 'Algorytm K-średnich - iteracja 1')
fig.add_trace(go.Scatter(x = [centroid_1[0]], y = [centroid_1[1]], name = 'centroid 1', mode = 'markers', marker_line_width = 3))
fig.add_trace(go.Scatter(x = [centroid_2[0]], y = [centroid_2[1]], name = 'centroid 2', mode = 'markers', marker_line_width = 3))
fig.update_traces(marker_size = 12, showlegend = False)

In [23]:
# obliczenie nowych wspołrzednych centroidów
new_centroid_1 = [df[df.cluster == 1].x1.mean(), df[df.cluster == 1].x2.mean()]
new_centroid_2 = [df[df.cluster == 2].x1.mean(), df[df.cluster == 2].x2.mean()]

print(new_centroid_1, new_centroid_2)

[np.float64(-0.301469631807515), np.float64(2.8393022378105033)] [np.float64(1.7253260399682575), np.float64(0.0936407565578158)]


In [25]:
# Wizualizacja nowych centroidów
fig = px.scatter(df, 'x1', 'x2', color = 'cluster', width = 700, height = 400, title = 'Algorytm K-średnich - iteracja 1')
fig.add_trace(go.Scatter(x = [centroid_1[0]], y = [centroid_1[1]], name = 'centroid 1', mode = 'markers', marker_line_width = 3))
fig.add_trace(go.Scatter(x = [centroid_2[0]], y = [centroid_2[1]], name = 'centroid 2', mode = 'markers', marker_line_width = 3))
fig.add_trace(go.Scatter(x = [new_centroid_1[0]], y = [new_centroid_1[1]], name = 'centroid 1', mode = 'markers', marker_line_width = 3))
fig.add_trace(go.Scatter(x = [new_centroid_2[0]], y = [new_centroid_2[1]], name = 'centroid 2', mode = 'markers', marker_line_width = 3))
fig.update_traces(marker_size = 12, showlegend = False)

In [26]:
fig = px.scatter(df, 'x1', 'x2', color = 'cluster', width = 700, height = 400, title = 'Algorytm K-średnich - iteracja 1')
fig.add_trace(go.Scatter(x = [new_centroid_1[0]], y = [new_centroid_1[1]], name = 'centroid 1', mode = 'markers', marker_line_width = 3))
fig.add_trace(go.Scatter(x = [new_centroid_2[0]], y = [new_centroid_2[1]], name = 'centroid 2', mode = 'markers', marker_line_width = 3))
fig.update_traces(marker_size = 12, showlegend = False)

In [27]:
# ponowne przypisanie punktów

clusters = []
for point in data:
  centroid_1_dist = norm(new_centroid_1 - point)
  centroid_2_dist = norm(new_centroid_2 - point)
  cluster = 1
  if centroid_1_dist > centroid_2_dist:
    cluster = 2
  clusters.append(cluster)

df['cluster'] = clusters
df.head()

Unnamed: 0,x1,x2,cluster
0,0.37743,0.069424,2
1,2.217347,2.327304,2
2,1.376777,0.603609,2
3,-1.467097,3.139985,1
4,-1.605386,5.457993,1


In [28]:
fig = px.scatter(df, 'x1', 'x2', color = 'cluster', width = 700, height = 400, title = 'Algorytm K-średnich - iteracja 2')
fig.add_trace(go.Scatter(x = [new_centroid_1[0]], y = [new_centroid_1[1]], name = 'centroid 1', mode = 'markers', marker_line_width = 3))
fig.add_trace(go.Scatter(x = [new_centroid_2[0]], y = [new_centroid_2[1]], name = 'centroid 2', mode = 'markers', marker_line_width = 3))
fig.update_traces(marker_size = 12, showlegend = False)

In [29]:
new2_centroid_1 = [df[df.cluster == 1].x1.mean(), df[df.cluster == 1].x2.mean()]
new2_centroid_2 = [df[df.cluster == 2].x1.mean(), df[df.cluster == 2].x2.mean()]

print(new2_centroid_1, new2_centroid_2)

[np.float64(-1.0482224342187576), np.float64(3.1518843876019713)] [np.float64(1.849825592780309), np.float64(0.7793531495220656)]


In [33]:
fig = px.scatter(df, 'x1', 'x2', color = 'cluster', width = 700, height = 400, title = 'Algorytm K-średnich - iteracja 2')
fig.add_trace(go.Scatter(x = [new2_centroid_1[0]], y = [new2_centroid_1[1]], name = 'centroid 1', mode = 'markers', marker_line_width = 3))
fig.add_trace(go.Scatter(x = [new2_centroid_2[0]], y = [new2_centroid_2[1]], name = 'centroid 2', mode = 'markers', marker_line_width = 3))
fig.add_trace(go.Scatter(x = [new_centroid_1[0]], y = [new_centroid_1[1]], name = 'centroid 1', mode = 'markers', marker_line_width = 3))
fig.add_trace(go.Scatter(x = [new_centroid_2[0]], y = [new_centroid_2[1]], name = 'centroid 2', mode = 'markers', marker_line_width = 3))
fig.update_traces(marker_size = 12, showlegend = False)

In [34]:
fig = px.scatter(df, 'x1', 'x2', color = 'cluster', width = 700, height = 400, title = 'Algorytm K-średnich - iteracja 2')
fig.add_trace(go.Scatter(x = [new2_centroid_1[0]], y = [new2_centroid_1[1]], name = 'centroid 1', mode = 'markers', marker_line_width = 3))
fig.add_trace(go.Scatter(x = [new2_centroid_2[0]], y = [new2_centroid_2[1]], name = 'centroid 2', mode = 'markers', marker_line_width = 3))
fig.update_traces(marker_size = 12, showlegend = False)

In [36]:
clusters = []
for point in data:
  centroid_1_dist = norm(new2_centroid_1 - point)
  centroid_2_dist = norm(new2_centroid_2 - point)
  cluster = 1
  if centroid_1_dist > centroid_2_dist:
    cluster = 2
  clusters.append(cluster)

df['cluster'] = clusters
df.head()

Unnamed: 0,x1,x2,cluster
0,0.37743,0.069424,2
1,2.217347,2.327304,2
2,1.376777,0.603609,2
3,-1.467097,3.139985,1
4,-1.605386,5.457993,1


In [37]:
fig = px.scatter(df, 'x1', 'x2', color = 'cluster', width = 700, height = 400, title = 'Algorytm K-średnich - iteracja 3')
fig.add_trace(go.Scatter(x = [new2_centroid_1[0]], y = [new2_centroid_1[1]], name = 'centroid 1', mode = 'markers', marker_line_width = 3))
fig.add_trace(go.Scatter(x = [new2_centroid_2[0]], y = [new2_centroid_2[1]], name = 'centroid 2', mode = 'markers', marker_line_width = 3))
fig.update_traces(marker_size = 12, showlegend = False)

Implementacja algorytmu K_srednich podsumowanie

In [38]:
for i in range(10):
  clusters = []
  for point in data:
    centroid_1_dist = norm(centroid_1 - point)
    centroid_2_dist = norm(centroid_2 - point)
    cluster = 1
    if centroid_1_dist > centroid_2_dist:
      cluster = 2
    clusters.append(cluster)
  df['cluster'] = clusters

  centroid_1 = [df[df.cluster == 1].x1.mean(), df[df.cluster == 1].x2.mean()]
  centroid_2 = [df[df.cluster == 2].x1.mean(), df[df.cluster == 2].x2.mean()]

print(centroid_1, centroid_2)

[np.float64(-1.184810430866379), np.float64(3.18988309513586)] [np.float64(1.8482624297593075), np.float64(0.8622246431993411)]


In [39]:
fig = px.scatter(df, 'x1', 'x2', color = 'cluster', width = 700, height = 400, title = 'Algorytm K-średnich - koncowy rezultat')
fig.add_trace(go.Scatter(x = [centroid_1[0]], y = [centroid_1[1]], name = 'centroid 1', mode = 'markers', marker_line_width = 3))
fig.add_trace(go.Scatter(x = [centroid_2[0]], y = [centroid_2[1]], name = 'centroid 2', mode = 'markers', marker_line_width = 3))
fig.update_traces(marker_size = 12, showlegend = False)