In [3]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import euclidean

In [4]:
# Load the iris CSV into a DataFrame and preview its first five rows.

pd_data = pd.read_csv(filepath_or_buffer='iris-data.csv')
pd_data.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [5]:
# Build the numeric feature matrix used for clustering.
#
# Columns are selected by name rather than by position so that:
#   * the 'Unnamed: 0' column (it appears to be a row counter written out with
#     the CSV -- confirm against the file) is not clustered as a feature;
#   * the 'class' label, and a 'predicted' column if a later cell has already
#     added one, are excluded, which keeps this cell safe to re-run.
# The result is cast to float because `.values` on a frame that mixes numbers
# and strings yields an object-dtype array.

feature_columns = [
    col for col in pd_data.columns
    if col not in ('class', 'predicted') and not str(col).startswith('Unnamed')
]
data = pd_data[feature_columns].values.astype(float)

In [6]:
def init_centroids(data, k, rng=None):
    """Pick ``k`` distinct rows of ``data`` to serve as initial centroids.

    Parameters
    ----------
    data : ndarray
        Array whose first axis indexes the points.
    k : int
        Number of centroids to draw.
    rng : optional
        Object exposing ``choice(a, size=..., replace=...)``, for example a
        ``numpy.random.Generator``. When omitted, NumPy's global generator is
        used, so ``np.random.seed`` still controls the draw.

    Returns
    -------
    ndarray
        ``data`` indexed by the ``k`` sampled row numbers.

    Raises
    ------
    ValueError
        Raised by NumPy when ``k`` exceeds the number of rows.
    """
    sampler = np.random if rng is None else rng
    # replace=False: drawing the same row twice would create two identical
    # centroids, and one of the two clusters would then receive no points.
    picked_rows = sampler.choice(data.shape[0], size=k, replace=False)
    return data[picked_rows]

In [7]:
def count_dists(data, centroids):
    """Return the squared Euclidean distance from every point to every centroid.

    Row ``j`` of the result holds the distances of all rows of ``data`` to
    ``centroids[j]``. No square root is taken.
    """
    per_centroid = [
        ((data - center) ** 2).sum(axis=1)
        for center in centroids
    ]
    return np.array(per_centroid)

In [8]:
def nearest_centroid(dists):
    """Return, for each column of ``dists``, the row index of its smallest entry.

    With ``dists`` laid out as one row per centroid, this is the index of the
    closest centroid for every point.
    """
    closest = np.argmin(dists, axis=0)
    return closest

In [9]:
def move_centroids(data, nearest, centroids=None):
    """Move every centroid to the mean of the points assigned to it.

    Parameters
    ----------
    data : array-like
        Points, one per row.
    nearest : array-like of int
        Cluster index assigned to each row of ``data``.
    centroids : sequence, optional
        Current centroids. When given, the result always has
        ``len(centroids)`` entries and a cluster that received no points keeps
        its current position. When omitted, the number of clusters is inferred
        as ``max(nearest) + 1`` and a cluster without points is returned as a
        NaN vector.

    Returns
    -------
    list of ndarray
        One centroid per cluster, in cluster-index order. An empty ``nearest``
        with no ``centroids`` yields an empty list.
    """
    data = np.asarray(data)
    nearest = np.asarray(nearest)

    if centroids is not None:
        n_clusters = len(centroids)
    elif nearest.size:
        n_clusters = int(nearest.max()) + 1
    else:
        n_clusters = 0

    moved = []
    for cluster in range(n_clusters):
        members = data[nearest == cluster]
        if len(members):
            moved.append(members.mean(axis=0))
        elif centroids is not None:
            # Nothing was assigned here: leave the centroid where it is rather
            # than averaging an empty slice.
            moved.append(np.array(centroids[cluster], dtype=float))
        else:
            moved.append(np.full(data.shape[1:], np.nan))
    return moved

In [10]:
def _fill_empty_clusters(moved, previous):
    """Return ``previous`` as a float array, overwritten by every usable row of ``moved``."""
    filled = np.array(previous, dtype=float)
    for idx, centroid in zip(range(len(filled)), moved):
        centroid = np.asarray(centroid, dtype=float)
        # A missing or NaN centroid means the cluster received no points.
        if centroid.shape == filled[idx].shape and not np.isnan(centroid).any():
            filled[idx] = centroid
    return filled


def clusterize(data, k, n_iters):
    """Cluster ``data`` into ``k`` groups by alternating assignment and centroid moves.

    Parameters
    ----------
    data : ndarray
        Points, one per row.
    k : int
        Number of clusters.
    n_iters : int
        Maximum number of assignment steps. Values below 1 return the
        assignment to the randomly chosen initial centroids.

    Returns
    -------
    ndarray of int
        Index of the cluster assigned to each row of ``data``.
    """
    centroids = init_centroids(data, k)
    nearest = nearest_centroid(count_dists(data, centroids))

    # The initial assignment above counts as the first step, so at most
    # n_iters - 1 centroid moves are needed.
    for _ in range(n_iters - 1):
        centroids = _fill_empty_clusters(move_centroids(data, nearest), centroids)
        updated = nearest_centroid(count_dists(data, centroids))
        if np.array_equal(updated, nearest):
            # Same assignment means the same centroids next time, so any
            # further iteration would repeat this one.
            break
        nearest = updated
    return nearest

In [11]:
def accuracy(pd_data):
    """Print, for each predicted cluster, how many rows of each real class it holds.

    Clusters are visited in order of first appearance in the ``'predicted'``
    column; a ``None`` entry is skipped. Nothing is returned.
    """
    for label in pd_data['predicted'].unique():
        if label is None:
            continue
        in_cluster = pd_data['predicted'] == label
        print('class =', label)
        print(pd_data.loc[in_cluster, 'class'].value_counts())

In [12]:
# Cluster the data with the hand-written k-means and compare the clusters
# against the real classes.

k = 3
n_iters = 5

# The initial centroids are drawn with NumPy's global random generator, so
# seed it here: without a seed every run reports different clusters.
np.random.seed(0)

predicted = clusterize(data, k, n_iters)
pd_data['predicted'] = predicted

accuracy(pd_data)

class = 1
Iris-setosa    50
Name: class, dtype: int64
class = 0
Iris-versicolor    47
Iris-virginica     13
Name: class, dtype: int64
class = 2
Iris-virginica     37
Iris-versicolor     3
Name: class, dtype: int64


In [17]:
# Fit scikit-learn's KMeans on the same feature matrix and display its cluster
# centres (presumably as a reference for the hand-written implementation).
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(data)
kmeans.cluster_centers_

array([[ 5.9016129 ,  2.7483871 ,  4.39354839,  1.43387097],
       [ 5.006     ,  3.418     ,  1.464     ,  0.244     ],
       [ 6.85      ,  3.07368421,  5.74210526,  2.07105263]])