In [10]:
!pip install plotly scipy




[notice] A new release of pip available: 22.2.2 -> 22.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
import numpy as np
import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'notebook_connected'

data = px.data.iris()

data

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_id
0,5.1,3.5,1.4,0.2,setosa,1
1,4.9,3.0,1.4,0.2,setosa,1
2,4.7,3.2,1.3,0.2,setosa,1
3,4.6,3.1,1.5,0.2,setosa,1
4,5.0,3.6,1.4,0.2,setosa,1
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica,3
146,6.3,2.5,5.0,1.9,virginica,3
147,6.5,3.0,5.2,2.0,virginica,3
148,6.2,3.4,5.4,2.3,virginica,3


In [12]:
fig = px.scatter_3d(data, x= 'sepal_length', y = 'sepal_width', z = 'petal_length')

fig.show()

In [13]:
inputs = data[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']].to_numpy()
inputs.shape

(150, 4)

In [14]:
from scipy import spatial

def findClosest(targetVector, vectors):
    return spatial.KDTree(vectors).query(targetVector)


def kMeans(inputs, numberofClusters):
    numberofFeatures = inputs.shape[1]

    #creating random centroids
    centroids = np.random.rand(numberofClusters, numberofFeatures) 
    centroids *= inputs.max(0) # scale to data

    # initiate point assignments
    assignments = np.zeros(len(inputs), dtype = int)

    #do until centroids stop changing
    oldCentroids = np.random.rand(numberofClusters, numberofFeatures)
    while not np.array_equal(centroids, oldCentroids):
        oldCentroids = centroids.copy()
        # expectations: assign each data point to centroid
        for i in range(assignments.size):
            distance, clusterIndex = findClosest(inputs[i], centroids)    
            assignments[i] = clusterIndex
        print(assignments)
        # learning: each centroid will be the center of all points assigned to it
        for clusterIndex in range(numberofClusters):

            allVectorsInCluster = inputs[assignments == clusterIndex]
            #allVectorsInCluster = np.array([inputs[i] for i in range(len(inputs)) if assignments[i]==clusterIndex])
            if len(allVectorsInCluster) == 0:
                print('one of the randomly chosen centroids has no data')
                return kMeans(inputs, numberofClusters)
            centroids[clusterIndex] = allVectorsInCluster.mean(0)

    return assignments


In [15]:
clusters = kMeans(inputs, 3)

[2 2 2 2 2 0 0 2 2 2 2 2 2 2 2 0 0 2 2 0 2 0 0 2 2 2 2 2 2 2 2 2 0 0 2 2 2
 2 2 2 2 2 2 0 0 2 0 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 0 2 2 2 0 2 2 2
 2 2 2 2 2 2 2 2 2 2 0 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 0 2 0 2 2 0 0
 2 2 2 0 0 2 0 2 2 0 0 2 2 0 2 2 0 2 2 2 2 2 2 2 2 0 0 0 2 0 2 2 0 0 2 2 2
 0 0]
one of the randomly chosen centroids has no data
[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
one of the randomly chosen centroids has no data
[1 1 1 1 1 0 1 1 1 1 0 1 1 1 0 0 0 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1
 1 1 1 1 1 1 1 0 1 0 1 0 1 2 2 2 2 2 2 2 1 2 0 1 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 1 2 2 2 2 2 2 2 0 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 

In [16]:
clusters

array([0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [17]:
data['kmeans_cluster'] = np.array(clusters, dtype=str)
data

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_id,kmeans_cluster
0,5.1,3.5,1.4,0.2,setosa,1,0
1,4.9,3.0,1.4,0.2,setosa,1,1
2,4.7,3.2,1.3,0.2,setosa,1,1
3,4.6,3.1,1.5,0.2,setosa,1,1
4,5.0,3.6,1.4,0.2,setosa,1,0
...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica,3,2
146,6.3,2.5,5.0,1.9,virginica,3,2
147,6.5,3.0,5.2,2.0,virginica,3,2
148,6.2,3.4,5.4,2.3,virginica,3,2


In [18]:
fig = px.scatter_3d(data, x= 'sepal_length', y = 'sepal_width', z = 'petal_length', color = 'kmeans_cluster')

fig.show()

fig = px.scatter_3d(data, x= 'sepal_length', y = 'sepal_width', z = 'petal_length', color = 'species')

fig.show()