# Import Libraries

In [0]:
import numpy as np
from numpy.linalg import norm
import pandas as pd
import random
import plotly.express as px
import plotly.graph_objects as go

np.random.seed(42)
np.set_printoptions(precision = 6)
random.seed(42)

# Generating Data

In [3]:
from sklearn.datasets import make_blobs

data = make_blobs(n_samples = 40, centers = 2, cluster_std = 1.0, 
                  center_box=(-4.0,4.0), random_state=42)[0]
df= pd.DataFrame(data, columns=['x1', 'x2'])
df.head()

Unnamed: 0,x1,x2
0,0.37743,0.069424
1,2.217347,2.327304
2,1.376777,0.603609
3,-1.467097,3.139985
4,-1.605386,5.457993


# Data Visualization

In [4]:
fig = px.scatter(df, 'x1', 'x2', width=950, height=500, title ='K-means '
'Algorithm')
fig.update_traces(marker_size=12)

# K-means Implementation

In [5]:
# determination of boundary values
x1_min = df.x1.min()
x1_max = df.x1.max()

x2_min = df.x2.min()
x2_max = df.x2.max()

print(x1_min, x1_max)
print(x2_min, x2_max)

-2.728596881734133 3.333845579232757
-1.1983010410246 5.457992635788267


In [6]:
# randomly coordinating centroid coordinates

centroid_1 = np.array([random.uniform(x1_min, x1_max), 
                       random.uniform(x2_min, x2_max)])
centroid_2 = np.array([random.uniform(x1_min, x2_max),
                       random.uniform(x2_min, x2_max)])
print(centroid_1)
print(centroid_2)

[ 1.147891 -1.031822]
[-0.477045  0.287455]


In [8]:
# visualization of centroid start points
fig = px.scatter(df, 'x1', 'x2', width=950, height=500, 
                 title='Algorithm K-means - initialization of centroids')
fig.add_trace(go.Scatter(x=[centroid_1[0]], y=[centroid_1[1]], 
                      name='centroid 1', mode='markers', marker_line_width=3))
fig.add_trace(go.Scatter(x=[centroid_2[0]], y=[centroid_2[1]],
                        name='centroid 2', mode='markers', marker_line_width=3))
fig.update_traces(marker_size=12, showlegend=False)

In [9]:
# assigning points to the nearest centroid
clusters = []
for point in data:
    centroid_1_dist = norm(centroid_1 - point)
    centroid_2_dist = norm(centroid_2 - point)
    cluster = 1
    if centroid_1_dist > centroid_2_dist:
        cluster = 2
    clusters.append(cluster)
    
df['cluster'] = clusters
df.head()

Unnamed: 0,x1,x2,cluster
0,0.37743,0.069424,2
1,2.217347,2.327304,2
2,1.376777,0.603609,1
3,-1.467097,3.139985,2
4,-1.605386,5.457993,2


In [10]:
# assignment visualization
fig = px.scatter(df, 'x1', 'x2', color='cluster', width=950, height=500, 
                 title='Algorithm K-means - iteration 1 - '
                 'assigning points to the nearest centroid')
fig.add_trace(go.Scatter(x=[centroid_1[0]], y=[centroid_1[1]], 
                        name='centroid 1', mode='markers', marker_line_width=3))
fig.add_trace(go.Scatter(x=[centroid_2[0]], y=[centroid_2[1]],
                        name='centroid 2', mode='markers', marker_line_width=3))
fig.update_traces(marker_size=12)
fig.update_layout(showlegend=False)

In [11]:
# calculation of new centroid coordinates
new_centroid_1 = [df[df.cluster == 1].x1.mean(), df[df.cluster == 1].x2.mean()]
new_centroid_2 = [df[df.cluster == 2].x1.mean(), df[df.cluster == 2].x2.mean()]

print(new_centroid_1, new_centroid_2)

[2.1105888553354877, 0.5092865045639642] [-0.538967826671177, 2.759227684424036]


In [12]:
# visualization of centroid updates

fig = px.scatter(df, 'x1', 'x2', color='cluster', width=950, height=500, 
                 title='Algorithm K-means - calculation of new centroids')
fig.add_trace(go.Scatter(x=[centroid_1[0]], y=[centroid_1[1]],
                        name='centroid 1', mode='markers', marker_line_width=3))
fig.add_trace(go.Scatter(x=[centroid_2[0]], y=[centroid_2[1]], 
                        name='centroid 2', mode='markers', marker_line_width=3))
fig.add_trace(go.Scatter(x=[new_centroid_1[0]], y=[new_centroid_1[1]],
                        name='centroid 1', mode='markers', marker_line_width=3))
fig.add_trace(go.Scatter(x=[new_centroid_2[0]], y=[new_centroid_2[1]],
                        name='centroid 2', mode='markers', marker_line_width=3))
fig.update_traces(marker_size=12)
fig.update_layout(showlegend=False)

In [13]:
fig = px.scatter(df, 'x1', 'x2', color='cluster', width=950, height=500, 
                 title='Algorithm K-means - centroid upgrade')
fig.add_trace(go.Scatter(x=[new_centroid_1[0]], y=[new_centroid_1[1]],
                        name='centroid 1', mode='markers', marker_line_width=3))
fig.add_trace(go.Scatter(x=[new_centroid_2[0]], y=[new_centroid_2[1]],
                        name='centroid 2', mode='markers', marker_line_width=3))
fig.update_traces(marker_size=12)
fig.update_layout(showlegend=False)

In [14]:
# reassigning points to the nearest centroid
clusters = []
for point in data:
    centroid_1_dist = norm(new_centroid_1 - point)
    centroid_2_dist = norm(new_centroid_2 - point)
    cluster = 1
    if centroid_1_dist > centroid_2_dist:
        cluster = 2
    clusters.append(cluster)

df['cluster'] = clusters
df.head()

Unnamed: 0,x1,x2,cluster
0,0.37743,0.069424,1
1,2.217347,2.327304,1
2,1.376777,0.603609,1
3,-1.467097,3.139985,2
4,-1.605386,5.457993,2


In [15]:

fig = px.scatter(df, 'x1', 'x2', color='cluster', width=950, height=500, 
                 title='Algorithm K-means - iteration 2 -'
                 'reassignment of points')
fig.add_trace(go.Scatter(x=[new_centroid_1[0]], y=[new_centroid_1[1]],
                        name='centroid 1', mode='markers', marker_line_width=3))
fig.add_trace(go.Scatter(x=[new_centroid_2[0]], y=[new_centroid_2[1]],
                        name='centroid 2', mode='markers', marker_line_width=3))
fig.update_traces(marker_size=12)
fig.update_layout(showlegend=False)

In [16]:
# centroid upgrade
new_2_centroid_1 = [df[df.cluster == 1].x1.mean(),
                    df[df.cluster == 1].x2.mean()]
new_2_centroid_2 = [df[df.cluster == 2].x1.mean(),
                    df[df.cluster == 2].x2.mean()]

print(new_2_centroid_1, new_2_centroid_2)

[1.848262429759308, 0.8622246431993411] [-1.184810430866379, 3.18988309513586]


In [18]:
# centroid upgrade
fig = px.scatter(df, 'x1', 'x2', color='cluster', width=950, height=500,
                 title='Algorithm K-means - recalculation of centroids')
fig.add_trace(go.Scatter(x=[new_centroid_1[0]], y=[new_centroid_1[1]],
                        name='centroid 1', mode='markers', marker_line_width=3))
fig.add_trace(go.Scatter(x=[new_centroid_2[0]], y=[new_centroid_2[1]],
                        name='centroid 2', mode='markers', marker_line_width=3))
fig.add_trace(go.Scatter(x=[new_2_centroid_1[0]], y=[new_2_centroid_1[1]],
                        name='centroid 1', mode='markers', marker_line_width=3))
fig.add_trace(go.Scatter(x=[new_2_centroid_2[0]], y=[new_2_centroid_2[1]],
                        name='centroid 2', mode='markers', marker_line_width=3))
fig.update_traces(marker_size=12)
fig.update_layout(showlegend=False)

In [19]:
clusters = []
for point in data:
    centroid_1_dist = norm(new_centroid_1 - point)
    centroid_2_dist = norm(new_centroid_2 - point)
    cluster = 1
    if centroid_1_dist > centroid_2_dist:
        cluster = 2
    clusters.append(cluster)

df['cluster'] = clusters
df.head()

Unnamed: 0,x1,x2,cluster
0,0.37743,0.069424,1
1,2.217347,2.327304,1
2,1.376777,0.603609,1
3,-1.467097,3.139985,2
4,-1.605386,5.457993,2


In [20]:
fig = px.scatter(df, 'x1', 'x2', color='cluster', width=950, height=500, 
                 title='Algorithm K-means - centroid uprgade')
fig.add_trace(go.Scatter(x=[new_2_centroid_1[0]], y=[new_2_centroid_1[1]],
                        name='centroid 1', mode='markers', marker_line_width=3))
fig.add_trace(go.Scatter(x=[new_2_centroid_2[0]], y=[new_2_centroid_2[1]], 
                        name='centroid 2', mode='markers', marker_line_width=3))
fig.update_traces(marker_size=12)
fig.update_layout(showlegend=False)

# Implementation of the K-means algorithm - Summary

In [21]:
data = make_blobs(n_samples=40, centers=2, cluster_std=1.0, 
                  center_box=(-4.0, 4.0), random_state=42)[0]
df = pd.DataFrame(data, columns=['x1', 'x2'])
df.head()

x1_min = df.x1.min()
x1_max = df.x1.max()

x2_min = df.x2.min()
x2_max = df.x2.max()

centroid_1 = np.array([random.uniform(x1_min, x1_max), 
                       random.uniform(x2_min, x2_max)])
centroid_2 = np.array([random.uniform(x1_min, x1_max), 
                       random.uniform(x2_min, x2_max)])

for i in range(10):
    clusters = []
    for point in data:
        centroid_1_dist = norm(centroid_1 - point)
        centroid_2_dist = norm(centroid_2 - point)
        cluster = 1
        if centroid_1_dist > centroid_2_dist:
            cluster = 2
        clusters.append(cluster)

    df['cluster'] = clusters

    centroid_1 = [df[df.cluster == 1].x1.mean(), df[df.cluster == 1].x2.mean()]
    centroid_2 = [df[df.cluster == 2].x1.mean(), df[df.cluster == 2].x2.mean()]

print(new_centroid_1, new_centroid_2)

[2.1105888553354877, 0.5092865045639642] [-0.538967826671177, 2.759227684424036]


In [22]:
fig = px.scatter(df, 'x1', 'x2', color='cluster', width=950, height=500, 
                 title='Algorithm K-means - results')
fig.add_trace(go.Scatter(x=[new_2_centroid_1[0]], y=[new_2_centroid_1[1]],
                        name='centroid 1', mode='markers', marker_line_width=3))
fig.add_trace(go.Scatter(x=[new_2_centroid_2[0]], y=[new_2_centroid_2[1]],
                        name='centroid 2', mode='markers', marker_line_width=3))
fig.update_traces(marker_size=12)
fig.update_layout(showlegend=False)