In [1113]:
import numpy as np
import pandas as pd
from copy import deepcopy

import sklearn
from sklearn import metrics
from scipy import stats
from sklearn.metrics import accuracy_score

from collections import Counter
import time
from sklearn.metrics.pairwise import cosine_similarity

In [1114]:
def manhattan(a, b, ax=1):
    """Manhattan (L1) distance: the sum of absolute coordinate differences.

    Parameters
    ----------
    a, b : operands combined as ``a - b`` (NumPy broadcasting applies).
    ax : axis along which the absolute differences are summed;
        ``None`` sums over every element.

    Returns
    -------
    The absolute differences of ``a`` and ``b`` summed along ``ax``.
    """
    abs_diff = np.abs(a - b)
    return np.sum(abs_diff, axis=ax)

def euclidean(a, b, ax=1):
    """Squared Euclidean distance: the sum of squared coordinate differences.

    No square root is taken, so the result is the *squared* L2 distance.

    Parameters
    ----------
    a, b : operands combined as ``a - b`` (NumPy broadcasting applies).
    ax : axis along which the squared differences are summed;
        ``None`` sums over every element.

    Returns
    -------
    The squared differences of ``a`` and ``b`` summed along ``ax``.
    """
    delta = a - b
    return np.sum(delta ** 2, axis=ax)

def cosine(a,b, ax=1):
    """Cosine distance ``1 - (a . b) / (||a|| * ||b||)`` between vectors.

    Vectors are taken along the last axis of each operand, so a 1-D ``a``
    against a 2-D ``b`` (one vector per row) yields one distance per row of
    ``b``, and two 2-D operands yield the full pairwise distance matrix.

    Parameters
    ----------
    a, b : array-like, 1-D vectors or 2-D arrays holding one vector per row.
    ax : accepted so the signature matches the other metrics; not used.

    Returns
    -------
    ``1 - cosine similarity`` with the same shape as ``np.dot(a, b.T)``.
    A zero-length vector makes the denominator zero, so its entries come
    out as NaN rather than raising.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    similarity = np.dot(a, b.T)
    # Each vector needs its own norm; the outer product lines the norms up
    # with the entries of the dot product for every 1-D / 2-D combination.
    norm_a = np.linalg.norm(a, axis=-1)
    norm_b = np.linalg.norm(b, axis=-1)
    # The parentheses matter: the dot product is divided by the *product*
    # of the two norms.
    return 1 - similarity / np.multiply.outer(norm_a, norm_b)
        
def jaccard(a, b, ax=1):
    """Generalised Jaccard distance: ``1 - sum(min(a, b)) / sum(max(a, b))``.

    Parameters
    ----------
    a, b : operands compared element-wise (NumPy broadcasting applies).
    ax : axis along which the element-wise minima and maxima are summed;
        ``None`` sums over every element.

    Returns
    -------
    One minus the ratio of the summed minima to the summed maxima.
    """
    overlap = np.sum(np.minimum(a, b), axis=ax)
    union = np.sum(np.maximum(a, b), axis=ax)
    return 1 - overlap / union

In [1115]:
def SSE(datapoints, clusters, centroids, distance=euclidean):
    """Total within-cluster error of a clustering.

    For every centroid ``i`` the points labelled ``i`` in ``clusters`` are
    compared with that centroid using ``distance``, and all resulting
    values are added up.

    Parameters
    ----------
    datapoints : array with one point per row.
    clusters : array of cluster labels, one per row of ``datapoints``.
    centroids : sequence of centroids; position ``i`` belongs to label ``i``.
    distance : callable ``distance(points, centroid)``; defaults to
        ``euclidean``.

    Returns
    -------
    The sum of ``distance`` over all clusters (``0`` when there are no
    centroids).
    """
    per_cluster_error = (
        np.sum(distance(datapoints[np.where(clusters == label)], center))
        for label, center in enumerate(centroids)
    )
    return sum(per_cluster_error)

In [1116]:
def predict(clusters, y, k=3):
    """Relabel each cluster with the most frequent true label among its members.

    Parameters
    ----------
    clusters : array-like of cluster ids (``0 .. k-1``), one per sample.
    y : array-like of true labels aligned with ``clusters``.
    k : number of cluster ids to process (ids ``0`` to ``k - 1``).

    Returns
    -------
    A new array with the shape and dtype of ``clusters`` in which every
    member of cluster ``i`` carries that cluster's majority label (ties go
    to the smallest label). The input ``clusters`` is left untouched, so
    the call can be repeated safely.
    """
    cluster_ids = np.asarray(clusters)
    y = np.asarray(y)
    predicted = np.array(cluster_ids)  # independent copy; never alias the input

    for i in range(k):
        # Membership is always read from the original ids, so a label written
        # for one cluster can never be mistaken for the id of a later one.
        members = cluster_ids == i
        if not members.any():
            # An empty cluster has no majority label; nothing to relabel.
            continue
        values, counts = np.unique(y[members], return_counts=True)
        predicted[members] = int(values[np.argmax(counts)])
    return predicted

In [1117]:
def Kmeans(datapoints, centroid, K, distance, max_iter):
    """Cluster ``datapoints`` into ``K`` groups by alternating assignment and mean update.

    Parameters
    ----------
    datapoints : 2-D array with one sample per row.
    centroid : initial centroids (one per row) or ``None``. With ``None``,
        ``K`` distinct rows of ``datapoints`` are drawn using a fixed seed
        (99). A supplied array is copied and never modified.
    K : number of clusters.
    distance : key of the module-level ``distance_type`` table, or a
        callable ``f(a, b, ax)`` with the same contract as those metrics.
    max_iter : maximum number of assignment/update iterations.

    Returns
    -------
    (clusters, count)
        ``clusters`` is a float array holding the cluster index of each
        sample. ``count`` is the 1-based iteration counter at exit: one more
        than the number of completed iterations when the run ends because
        the centroids stopped moving, otherwise the number of completed
        iterations.

    Notes
    -----
    Each iteration prints its number and the current/previous SSE. The loop
    ends when (a) the centroids did not move, (b) the total centroid
    movement equals the previous iteration's total (prints ``breaks``), or
    (c) ``max_iter`` iterations have been performed.
    """
    print("Distance Metric:", distance)

    # A metric may be given by name (looked up in distance_type) or directly.
    metric = distance if callable(distance) else distance_type[distance]

    datapoints = np.asarray(datapoints)

    ### Initializing centroid
    if centroid is None:
        # A private generator seeded with 99 draws the same rows as seeding
        # the global generator would, without resetting global random state.
        rng = np.random.RandomState(99)
        centroid = datapoints[rng.choice(len(datapoints), size=K, replace=False)]
    # Work on a copy so the caller's array is never modified, and make it
    # floating point so that cluster means are not truncated.
    centroid = np.array(centroid)
    if not np.issubdtype(centroid.dtype, np.floating):
        centroid = centroid.astype(float)

    ### Cluster labels (0 .. K-1)
    clusters = np.zeros(len(datapoints))

    ### Movement of the centroids relative to the previous iteration; the
    ### all-ones array only seeds the first stagnation comparison.
    prev_centroid = np.ones(centroid.shape)
    error = np.asarray(metric(centroid, prev_centroid, None))

    count = 1
    prev_sse = 0

    # The body always runs at least once, so initial centroids that happen to
    # equal the all-ones sentinel cannot skip clustering altogether.
    while True:
        # Assign every point to its closest centroid
        for i in range(len(datapoints)):
            clusters[i] = np.argmin(metric(datapoints[i], centroid))

        # Remember the centroids that produced this assignment
        prev_centroid = centroid.copy(order='K')
        curr_sse = SSE(datapoints, clusters, centroid)
        print('Iteration: {}'.format(count))
        print('Current SSE: {}'.format(curr_sse))
        print('Previous SSE: {}'.format(prev_sse))

        # Move each centroid to the mean of its members
        for i in range(K):
            points = [row for row, label in zip(datapoints, clusters) if label == i]
            if points:
                centroid[i] = np.mean(points, axis=0)
            # An empty cluster keeps its previous centroid; the mean of no
            # points would be NaN and corrupt every later distance.

        error_old = error
        error = np.asarray(metric(centroid, prev_centroid, None))

        # Stagnation: total movement identical to the previous iteration
        if np.sum(error_old) == np.sum(error):
            print('breaks')
            break
        # Iteration cap: stop once max_iter iterations have been performed
        if count >= max_iter:
            break
        count = count + 1
        prev_sse = curr_sse
        # Converged: no centroid moved during this iteration
        if not error.any():
            break
    return clusters, count

In [1118]:
### For 'Euclidean' distance Metric
def euclidean_dist(df, datapoints):
    """Cluster with the 'euclidean' metric and report iterations and accuracy.

    Runs ``Kmeans`` with three clusters, randomly chosen initial centroids
    and at most 100 iterations, maps every cluster to its majority class
    with ``predict``, and prints the accuracy against ``df['Class']``.

    Parameters
    ----------
    df : DataFrame whose 'Class' column holds the true labels.
    datapoints : feature array passed to ``Kmeans``, row-aligned with ``df``.
    """
    labels, n_iter = Kmeans(datapoints, centroid=None, K=3,
                            distance='euclidean', max_iter=100)
    print("Max Iteration Required: ", n_iter)

    # Cluster ids are arbitrary, so translate them into class labels first.
    truth = df['Class'].values
    predicted = predict(labels, truth)
    print("Accuracy: ", sklearn.metrics.accuracy_score(truth, predicted))


In [1119]:
### For 'Cosine' distance Metric
def cosine_dist(df, datapoints):
    """Cluster with the 'cosine' metric and report iterations and accuracy.

    Runs ``Kmeans`` with three clusters, randomly chosen initial centroids
    and at most 100 iterations, then prints the iteration counter and the
    accuracy against ``df['Class']``.

    Parameters
    ----------
    df : DataFrame whose 'Class' column holds the true labels.
    datapoints : feature array passed to ``Kmeans``, row-aligned with ``df``.
    """
    clusters, count = Kmeans(datapoints, centroid=None, K=3, distance='cosine', max_iter= 100)
    print("Max Iteration Required: ", count)

    # Cluster ids are arbitrary: scoring them directly against the class
    # labels measures nothing. Map each cluster to its majority class first,
    # exactly as euclidean_dist and jaccard_dist do.
    pred_val = predict(clusters, df['Class'].values)
    print("Accuracy: ", sklearn.metrics.accuracy_score(df['Class'].values, pred_val))

In [1120]:
### For 'Jaccard' distance Metric
def jaccard_dist(df, datapoints):
    """Cluster with the 'jaccard' metric and report iterations and accuracy.

    Runs ``Kmeans`` with three clusters, randomly chosen initial centroids
    and at most 100 iterations, maps every cluster to its majority class
    with ``predict``, and prints the accuracy against ``df['Class']``.

    Parameters
    ----------
    df : DataFrame whose 'Class' column holds the true labels.
    datapoints : feature array passed to ``Kmeans``, row-aligned with ``df``.
    """
    kmeans_result = Kmeans(datapoints, centroid=None, K=3, distance='jaccard', max_iter=100)
    assignments, iterations = kmeans_result
    print("Max Iteration Required: ", iterations)

    # Replace arbitrary cluster ids by each cluster's majority class.
    majority_labels = predict(assignments, df['Class'].values)
    accuracy = sklearn.metrics.accuracy_score(df['Class'].values, majority_labels)
    print("Accuracy: ", accuracy)

In [1121]:
if __name__ == '__main__':

    # Name -> implementation table that Kmeans uses to resolve its
    # `distance` argument.
    distance_type = dict(
        manhattan=manhattan,
        euclidean=euclidean,
        cosine=cosine,
        jaccard=jaccard,
    )

    # Load the Iris data; column names are supplied explicitly via `names`.
    df = pd.read_csv(
        "iris.data",
        names=['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Class'],
        sep=",",
    )

    # Encode the species names in 'Class' as the integer labels 0, 1 and 2.
    class_map = {
        'Iris-setosa': 0,
        'Iris-versicolor': 1,
        'Iris-virginica': 2,
    }
    df['Class'] = df['Class'].map(class_map).astype('int32')

    # Feature matrix: every column except the trailing 'Class' column.
    datapoints = df[df.columns[:-1]].values
    


In [1122]:
    # Time one complete euclidean_dist run (clustering plus the accuracy
    # report) in wall-clock seconds.
    start = time.time()
    euclidean_dist(df, datapoints)
    end = time.time()
    print("Time Esapsed: ", end-start)
##### For 'Euclidean' distance Metric #####

Distance Metric: euclidean
Iteration: 1
Current SSE: 133.26
Previous SSE: 0
Iteration: 2
Current SSE: 79.13199166666668
Previous SSE: 133.26
Iteration: 3
Current SSE: 78.940841426146
Previous SSE: 79.13199166666668
Max Iteration Required:  4
Accuracy:  0.8933333333333333
Time Esapsed:  0.014958620071411133


In [1123]:
#     start = time.time()
#     cosine_dist(df, datapoints)
#     end = time.time()
#     print("Time Esapsed: ", end-start)
##### For 'Cosine' distance Metric #####

In [1124]:
#     start = time.time()
#     jaccard_dist(df, datapoints)
#     end = time.time()
#     print("Time Esapsed: ", end-start)
##### For 'Jaccard' distance Metric #####