In [20]:
!pip install plotly
!pip install pandas
!pip install nbformat



In [21]:
# the code below will not run if you do not have numpy, pandas, and plotly installed
# do this to install from powershell/cmd:
#   python -m pip install pandas plotly nbformat
#    or
#   python3 -m pip install pandas plotly nbformat
# do this to install from linux/mac terminal:
#   sudo python -m pip install pandas plotly nbformat
#    or
#   sudo python3 -m pip install pandas plotly nbformat


# first, let's load what we need from numpy, plotly, and scipy
import numpy as np
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "notebook_connected"


In [22]:
# let's create a data variable which will have the Iris dataset (from plotly.express.data.iris())
data = px.data.iris()

# let's see this data
data

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_id
0,5.1,3.5,1.4,0.2,setosa,1
1,4.9,3.0,1.4,0.2,setosa,1
2,4.7,3.2,1.3,0.2,setosa,1
3,4.6,3.1,1.5,0.2,setosa,1
4,5.0,3.6,1.4,0.2,setosa,1
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica,3
146,6.3,2.5,5.0,1.9,virginica,3
147,6.5,3.0,5.2,2.0,virginica,3
148,6.2,3.4,5.4,2.3,virginica,3


In [23]:
# create a 3d scatter-plot to show the data across sepal_length, sepal_width, petal_width dimensions
fig = px.scatter_3d(data, x='sepal_length', y='sepal_width', z='petal_width', color='species')
fig.show()


In [35]:
# create a 3d scatter-plot to show the data across sepal_length, sepal_width, petal_length dimensions
fig = px.scatter_3d(data, x='sepal_length', y='sepal_width', z='petal_length', color='species')
fig.show()

#feel free to change dimensions to see this data in different ways...

In [25]:
# let's create a numpy array called inputs with the 4 dimensions of iris data
inputs = data[['sepal_length','sepal_width','petal_length','petal_width']].to_numpy()


In [26]:
# below are some helper functions that you can use when writing your kmeans algorithm

# find which vector from the array of vectors is closest to targetVector
def findClosest(targetVector, vectors):
    # there is a built-in scipy function which creates a searchable KDTree of vectors:
    #   kdtree = spatial.KDTree(vectors)
    # this can be later queries for closest vector:
    #   kdtree.query(targetVector)
    # however, we'll just use vector math to get our closest vector, without a need to import scipy:
    return (((vectors - targetVector)**2).sum(1)).argmin()

# makes random vectors
def makeRandomVectors(numberOfVectors, sizeOfEachVector):
    return np.random.rand(numberOfVectors, sizeOfEachVector)

# get a vector that contains means of vectors
def averageVector(vectors):
    return vectors.mean(0)

# get a vector that contains max values from vectors
def maxVector(vectors):
    return vectors.max(0)

# filter vectors by array of booleans
def filterVectors(vectors, filterArray):
    return vectors[filterArray]

# get array of booleans depending on whether each value in vector is equal to testValue
def isValueInVector(vector, testValue):
    return vector == testValue

# return true if any value in vector is true
def anyTrueValues(vector):
    return vector.any()


In [27]:
# k-means
def kMeans(inputVectors, numberOfClusters=2):
    # make random centroids
    centroids = makeRandomVectors(numberOfClusters, inputVectors.shape[1])
    centroids *= maxVector(inputVectors)
    print('random centroids', centroids)
    
    clusters = np.zeros(len(inputVectors), dtype=int)

    #TODO: replace pseudo-code below with python code
    oldCentroids = np.zeros(centroids.shape)
    # do until centroids stop changing
    while not np.array_equal(centroids, oldCentroids):
        oldCentroids = centroids
        # Expectations: assign each point to its closest centroid
        for i in range(len(inputVectors)):
            clusters[i] = findClosest(inputVectors[i], centroids)
        # Check for bad centroids:
        #    if any of the random centroids have no values assigned to them, restart kMeans
        for clusterIndex in range(numberOfClusters):
            if clusterIndex not in clusters:
                return kMeans(inputVectors, numberOfClusters)
        # Maximization: compute new centroids
        for clusterIndex in range(numberOfClusters):
            # [clusters == clusterIndex]
            # isVectorInCluster = isValueInVector(clusters, clusterIndex)
            allVectorsInCluster = inputVectors[clusters == clusterIndex]
            centroids[clusterIndex] = averageVector( allVectorsInCluster )
        print('new centroids', centroids)
    
    #return all the assignments (should be 150 for the Iris dataset)


In [28]:
# use k-means to cluster the data into 2 clusters
clusters = kMeans(inputs,2)

random centroids [[7.24266319 3.77614114 1.12495113 0.45856934]
 [1.89625468 0.50277911 5.72084838 2.18949069]]
new centroids [[5.84859155 3.07183099 3.67957746 1.15492958]
 [5.75       2.7375     5.1625     1.975     ]]


In [29]:
# add k-means-generated cluster id's to data
data['kmeans cluster']=np.array(clusters,dtype=str)
data

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_id,kmeans cluster
0,5.1,3.5,1.4,0.2,setosa,1,
1,4.9,3.0,1.4,0.2,setosa,1,
2,4.7,3.2,1.3,0.2,setosa,1,
3,4.6,3.1,1.5,0.2,setosa,1,
4,5.0,3.6,1.4,0.2,setosa,1,
...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica,3,
146,6.3,2.5,5.0,1.9,virginica,3,
147,6.5,3.0,5.2,2.0,virginica,3,
148,6.2,3.4,5.4,2.3,virginica,3,


In [30]:
# create a 3d scatter-plot to show the data across sepal_length, sepal_width, petal_width dimensions
fig = px.scatter_3d(data, x='sepal_length', y='sepal_width', z='petal_width', color='kmeans cluster')
fig.show()

In [31]:
# use k-means to cluster the data into 3 clusters
clusters = kMeans(inputs, 3)

random centroids [[7.18385635 3.14377803 2.4977114  1.2527792 ]
 [6.91275162 1.16566249 5.65093656 2.47456173]
 [6.25703987 1.5178237  0.00866986 0.99906207]]
new centroids [[5.52777778 3.14861111 2.92916667 0.81805556]
 [6.5442623  2.93442623 5.40983607 1.9295082 ]
 [4.66470588 3.08235294 1.34705882 0.18823529]]


In [32]:
# add k-means-generated cluster id's to data
data['kmeans cluster']=np.array(clusters,dtype=str)
data

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_id,kmeans cluster
0,5.1,3.5,1.4,0.2,setosa,1,
1,4.9,3.0,1.4,0.2,setosa,1,
2,4.7,3.2,1.3,0.2,setosa,1,
3,4.6,3.1,1.5,0.2,setosa,1,
4,5.0,3.6,1.4,0.2,setosa,1,
...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica,3,
146,6.3,2.5,5.0,1.9,virginica,3,
147,6.5,3.0,5.2,2.0,virginica,3,
148,6.2,3.4,5.4,2.3,virginica,3,


In [33]:
# create a 3d scatter-plot to show the data across sepal_length, sepal_width, petal_width dimensions
fig = px.scatter_3d(data, x='sepal_length', y='sepal_width', z='petal_width', color='kmeans cluster')
fig.show()