In [2]:
!/opt/venv/bin/python -m pip install --upgrade pip
# Handling pip upgrades

import numpy as np
import pandas as pd
import numpy.matlib
import matplotlib.pyplot as plt

# Making plotly as the backend for pandas
!pip install plotly
pd.options.plotting.backend = "plotly"

# Setting the theme
import plotly.io as pio
import plotly.express as px
pio.templates.default = "plotly_white"

from sklearn.model_selection import train_test_split
import pprint

Requirement already up-to-date: pip in /opt/venv/lib/python3.7/site-packages (20.2.4)
Collecting plotly
  Downloading plotly-4.13.0-py2.py3-none-any.whl (13.1 MB)
[K     |████████████████████████████████| 13.1 MB 5.7 MB/s 
Collecting retrying>=1.3.3
  Downloading retrying-1.3.3.tar.gz (10 kB)
Building wheels for collected packages: retrying
  Building wheel for retrying (setup.py) ... [?25ldone
[?25h  Created wheel for retrying: filename=retrying-1.3.3-py3-none-any.whl size=11429 sha256=2669f8f79d190dda00580c4a989f4704ade9f64d6a6fc54446a6625190ade920
  Stored in directory: /home/jovyan/.cache/pip/wheels/f9/8d/8d/f6af3f7f9eea3553bc2fe6d53e4b287dad18b06a861ac56ddf
Successfully built retrying
Installing collected packages: retrying, plotly
Successfully installed plotly-4.13.0 retrying-1.3.3


In [3]:
iris_set = pd.read_csv("./iris/iris.data")
# add the column names, as it doesn't have it
attributes = ["sepal_length", "sepal_width", "petal_length", "petal_width", "species"]
iris_set.columns = attributes
iris_set = iris_set.sample(frac=1).reset_index(drop=True) # Shuffle
iris_set

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.0,3.4,1.5,0.2,Iris-setosa
1,5.8,2.7,4.1,1.0,Iris-versicolor
2,4.5,2.3,1.3,0.3,Iris-setosa
3,4.9,3.0,1.4,0.2,Iris-setosa
4,5.8,4.0,1.2,0.2,Iris-setosa
...,...,...,...,...,...
144,5.4,3.7,1.5,0.2,Iris-setosa
145,7.7,3.0,6.1,2.3,Iris-virginica
146,6.7,3.1,4.4,1.4,Iris-versicolor
147,5.9,3.2,4.8,1.8,Iris-versicolor


In [4]:
X = iris_set[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
X = np.array(X)
X[:3]

array([[5. , 3.4, 1.5, 0.2],
       [5.8, 2.7, 4.1, 1. ],
       [4.5, 2.3, 1.3, 0.3]])

In [5]:
Y = (iris_set['species'])
Y = np.array(Y)
Y[:3]

array(['Iris-setosa', 'Iris-versicolor', 'Iris-setosa'], dtype=object)

In [6]:
print("First five rows")
print(iris_set.head())
print("*********")
print("columns",iris_set.columns)
print("*********")
print("shape:",iris_set.shape)
print("*********")
print("Size:",iris_set.size)
print("*********")
print("no of samples available for each type")
print(iris_set['species'].value_counts())
print("*********")
print(iris_set.describe())

First five rows
   sepal_length  sepal_width  petal_length  petal_width          species
0           5.0          3.4           1.5          0.2      Iris-setosa
1           5.8          2.7           4.1          1.0  Iris-versicolor
2           4.5          2.3           1.3          0.3      Iris-setosa
3           4.9          3.0           1.4          0.2      Iris-setosa
4           5.8          4.0           1.2          0.2      Iris-setosa
*********
columns Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')
*********
shape: (149, 5)
*********
Size: 745
*********
no of samples available for each type
Iris-virginica     50
Iris-versicolor    50
Iris-setosa        49
Name: species, dtype: int64
*********
       sepal_length  sepal_width  petal_length  petal_width
count    149.000000   149.000000    149.000000   149.000000
mean       5.848322     3.051007      3.774497     1.205369
std        0.828594     0.433499      1.

In [7]:
#convert required data into numpy array and sort it
cluster_data = iris_set[['sepal_length',
                        'sepal_width',
                        'petal_length',
                        'petal_width']].copy(deep=True)
cluster_data.dropna(axis=0, inplace=True)
cluster_data.sort_values(by=['sepal_length',
                        'sepal_width',
                        'petal_length',
                        'petal_width'], inplace=True)
cluster_array = np.array(cluster_data)

In [8]:
print(cluster_array[:10])

[[4.3 3.  1.1 0.1]
 [4.4 2.9 1.4 0.2]
 [4.4 3.  1.3 0.2]
 [4.4 3.2 1.3 0.2]
 [4.5 2.3 1.3 0.3]
 [4.6 3.1 1.5 0.2]
 [4.6 3.2 1.4 0.2]
 [4.6 3.4 1.4 0.3]
 [4.6 3.6 1.  0.2]
 [4.7 3.2 1.3 0.2]]


In [9]:
# Calculate Euclidean distance between two observations
def calc_distance(X1, X2):
    return (sum((X1 - X2)**2))**0.5

In [10]:
# Assign cluster clusters based on closest centroid
def assign_clusters(centroids, cluster_array):
    clusters = []
    for i in range(cluster_array.shape[0]):
        distances = []
        for centroid in centroids:
            distances.append(calc_distance(centroid, 
                                           cluster_array[i]))
        cluster = [z for z, val in enumerate(distances) if val==min(distances)]
        clusters.append(cluster[0])
    return clusters

In [11]:
# Calculate new centroids based on each cluster's mean
def calc_centroids(clusters, cluster_array):
    new_centroids = []
    cluster_df = pd.concat([pd.DataFrame(cluster_array),
                            pd.DataFrame(clusters, 
                                         columns=['cluster'])], 
                           axis=1)
    for c in set(cluster_df['cluster']):
        current_cluster = cluster_df[cluster_df['cluster']\
                                     ==c][cluster_df.columns[:-1]]
        cluster_mean = current_cluster.mean(axis=0)
        new_centroids.append(cluster_mean)
    return new_centroids

In [12]:
# Calculate variance within each cluster
def calc_centroid_variance(clusters, cluster_array):
    sum_squares = []
    cluster_df = pd.concat([pd.DataFrame(cluster_array),
                            pd.DataFrame(clusters, 
                                         columns=['cluster'])], 
                           axis=1)
    for c in set(cluster_df['cluster']):
        current_cluster = cluster_df[cluster_df['cluster']\
                                     ==c][cluster_df.columns[:-1]]
        cluster_mean = current_cluster.mean(axis=0)
        mean_repmat = np.matlib.repmat(cluster_mean, 
                                       current_cluster.shape[0],1)
        sum_squares.append(np.sum(np.sum((current_cluster - mean_repmat)**2)))
    return sum_squares

In [49]:
#perform K-Means clustering with 4 clusters
k = 4
cluster_vars = []

centroids = [cluster_array[i+2] for i in range(k)]
clusters = assign_clusters(centroids, cluster_array)
initial_clusters = clusters
print(0, round(np.mean(calc_centroid_variance(clusters, cluster_array))))

for i in range(len(cluster_array)):
    centroids = calc_centroids(clusters, cluster_array)
    clusters = assign_clusters(centroids, cluster_array)
    cluster_var = np.mean(calc_centroid_variance(clusters, 
                                                 cluster_array))
    cluster_vars.append(cluster_var)
    print(i+1, round(cluster_var))

0 151
1 32
2 25
3 20
4 19
5 19
6 19
7 19
8 18
9 18
10 18
11 18
12 18
13 18
14 18
15 18
16 18
17 18
18 18
19 18
20 18
21 18
22 18
23 18
24 18
25 18
26 18
27 18
28 18
29 18
30 18
31 18
32 18
33 18
34 18
35 18
36 18
37 18
38 18
39 18
40 18
41 18
42 18
43 18
44 18
45 18
46 18
47 18
48 18
49 18
50 18
51 18
52 18
53 18
54 18
55 18
56 18
57 18
58 18
59 18
60 18
61 18
62 18
63 18
64 18
65 18
66 18
67 18
68 18
69 18
70 18
71 18
72 18
73 18
74 18
75 18
76 18
77 18
78 18
79 18
80 18
81 18
82 18
83 18
84 18
85 18
86 18
87 18
88 18
89 18
90 18
91 18
92 18
93 18
94 18
95 18
96 18
97 18
98 18
99 18
100 18
101 18
102 18
103 18
104 18
105 18
106 18
107 18
108 18
109 18
110 18
111 18
112 18
113 18
114 18
115 18
116 18
117 18
118 18
119 18
120 18
121 18
122 18
123 18
124 18
125 18
126 18
127 18
128 18
129 18
130 18
131 18
132 18
133 18
134 18
135 18
136 18
137 18
138 18
139 18
140 18
141 18
142 18
143 18
144 18
145 18
146 18
147 18
148 18
149 18


In [50]:
#plot error curve
px.line(cluster_vars, labels=dict(value="Variance", index="Iteration")).update_layout(showlegend=False)

In [51]:
clustered_data = pd.DataFrame(data=cluster_array, columns=['sepal_length','sepal_width','petal_length','petal_width'] )

In [52]:
clustered_data['initial'] = initial_clusters
clustered_data['final'] = clusters

In [53]:
clustered_data

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,initial,final
0,4.3,3.0,1.1,0.1,0,0
1,4.4,2.9,1.4,0.2,0,0
2,4.4,3.0,1.3,0.2,0,0
3,4.4,3.2,1.3,0.2,1,0
4,4.5,2.3,1.3,0.3,2,0
...,...,...,...,...,...,...
144,7.7,2.6,6.9,2.3,3,3
145,7.7,2.8,6.7,2.0,3,3
146,7.7,3.0,6.1,2.3,3,3
147,7.7,3.8,6.7,2.2,3,3


In [54]:
px.scatter(clustered_data, x='sepal_width', y="sepal_length", color="initial", hover_data=['petal_width','petal_length'])

In [55]:
px.scatter(clustered_data, x='sepal_width', y="sepal_length", color="final", hover_data=['petal_width','petal_length'])