In [1]:
%config IPCompleter.greedy=True
import pandas as pd
import numpy as np
from random import choice
import math

In [2]:
# Load the data set from a CSV file in the current working directory.
# read_csv takes the first line of the file as the header by default; the
# column names used later in this notebook ('p', 'e.1') come from that header.
# NOTE(review): if the file has no header row, its first record is consumed as
# column names instead of data -- confirm against the file.
data = pd.read_csv('agaricus-lepiota.data')

In [3]:
def hamming_distance_and_choose_cluster(row, cluster_modes):
    """Return the position of the cluster mode that differs least from ``row``.

    The distance to a mode is the number of entries for which
    ``row.eq(mode)`` is False. When several modes share the smallest
    distance, the earliest one wins (``np.argmin`` semantics).

    Parameters
    ----------
    row : pandas.Series
        The record to assign.
    cluster_modes : sequence of pandas.Series
        Candidate cluster centres.

    Returns
    -------
    Integer position (into ``cluster_modes``) of the closest mode.
    """
    mismatch_counts = [
        (~row.eq(mode)).sum()  # number of positions where row and mode disagree
        for mode in cluster_modes
    ]
    return np.argmin(mismatch_counts)

In [4]:
#calculate cluster modes after cluster assignment
def calculate_cluster_modes(data, number_of_clusters):
    """Compute the per-column mode of every cluster.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the records plus a ``'cluster'`` column with the
        cluster id (``0 .. number_of_clusters - 1``) of each row. The frame
        is not modified.
    number_of_clusters : int
        Number of cluster ids to compute a mode for.

    Returns
    -------
    list of pandas.Series
        One Series per cluster id, in id order, holding the most frequent
        value of every column except ``'cluster'``. When a column has
        several equally frequent values, the first row of
        ``DataFrame.mode()`` is used.

    Raises
    ------
    KeyError
        If ``data`` has no ``'cluster'`` column, or if a cluster id has no
        rows (its mode is undefined).
    """
    cluster_modes = []
    for cluster_id in range(number_of_clusters):
        members = data[data['cluster'] == cluster_id]
        if members.empty:
            # Same exception type as the bare ``.loc[0]`` lookup on an empty
            # mode frame would raise, but with a message naming the cause.
            raise KeyError(
                'cluster {} has no rows, so its mode is undefined'.format(cluster_id))
        # ``columns=`` keyword: pandas 2.0 no longer accepts ``axis`` as a
        # positional argument of ``drop``.
        modes = members.mode().drop(columns='cluster')
        cluster_modes.append(modes.iloc[0])
    return cluster_modes
    

In [5]:
#for termination condition
def cluster_sets_are_equal(cluster_modes, new_cluster_modes):
    """Tell whether two lists of cluster modes are element-wise identical.

    The lists are equal when they have the same length and every pair of
    Series at the same position satisfies ``Series.equals``.

    Returns
    -------
    bool
        True when both lists describe the same modes in the same order.
    """
    same_size = len(cluster_modes) == len(new_cluster_modes)
    return same_size and all(
        old_mode.equals(new_mode)
        for old_mode, new_mode in zip(cluster_modes, new_cluster_modes)
    )

In [6]:
def k_mode_clustering(data, number_of_clusters, label_column='p',
                      missing_column='e.1', missing_marker='?',
                      output_path='prediction.csv'):
    """Cluster categorical records with k-modes and return the labelled frame.

    Steps:

    1. Drop ``label_column`` so it cannot influence the clustering.
    2. Pick ``number_of_clusters`` distinct random seed rows among the rows
       whose ``missing_column`` is not ``missing_marker``.
    3. Repeatedly assign every row to its closest centroid and recompute
       each centroid as the per-column mode of its cluster, until the
       centroids no longer change.
    4. On the first pass only, replace every ``missing_marker`` in
       ``missing_column`` by the value of the seed centroid of the cluster
       the row was assigned to.

    Parameters
    ----------
    data : pandas.DataFrame
        Input records; not modified.
    number_of_clusters : int
        Number of clusters to build.
    label_column : str, default 'p'
        Column removed before clustering.
    missing_column : str, default 'e.1'
        Column that may contain ``missing_marker``.
    missing_marker : default '?'
        Value that marks a missing entry in ``missing_column``.
    output_path : str or None, default 'prediction.csv'
        Where the resulting frame is written as CSV; ``None`` skips writing.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` without ``label_column``, with missing entries
        filled in and a ``'cluster'`` column holding each row's cluster id.

    Raises
    ------
    ValueError
        If there are fewer rows without missing values than requested
        clusters, so distinct seeds cannot be drawn.
    KeyError
        If ``label_column`` or ``missing_column`` is not a column of ``data``.

    Notes
    -----
    Seeds are drawn with ``random.choice``; seed the ``random`` module
    beforehand for reproducible results.
    NOTE(review): how a cluster that ends up with no rows is handled depends
    on ``calculate_cluster_modes`` -- confirm.
    """
    # ``drop`` returns a new frame, so the caller's data stays untouched.
    df_with_cluster = data.drop(columns=label_column)

    # Seeds must not contain the missing marker. ``drop=True`` keeps the seed
    # Series labelled exactly like the data rows (no extra 'index' entry).
    is_complete = df_with_cluster[missing_column] != missing_marker
    seed_candidates = df_with_cluster[is_complete].reset_index(drop=True)
    if number_of_clusters > len(seed_candidates):
        raise ValueError(
            'cannot pick {} distinct seed rows from {} rows without missing values'
            .format(number_of_clusters, len(seed_candidates)))

    # Draw distinct random seed positions without replacement.
    free_positions = list(range(len(seed_candidates)))
    seed_centroids = []
    for _ in range(number_of_clusters):
        position = choice(free_positions)
        free_positions.remove(position)
        seed_centroids.append(seed_candidates.iloc[position])

    cluster_centroids = seed_centroids
    new_cluster_centroids = []

    iteration = 0
    while not cluster_sets_are_equal(cluster_centroids, new_cluster_centroids):
        print('On iteration number: {}'.format(iteration))
        new_cluster_centroids = cluster_centroids

        # Re-assign every row to its closest current centroid.
        df_with_cluster = df_with_cluster.drop(columns='cluster', errors='ignore')
        df_with_cluster['cluster'] = df_with_cluster.apply(
            hamming_distance_and_choose_cluster,
            args=(new_cluster_centroids,),
            axis=1,
        )

        if iteration == 0:
            # Fill missing entries from the seed centroid of the assigned
            # cluster. The value is read from the seed itself: the seed's
            # position refers to the filtered frame and is not a valid label
            # of the full frame. Seeds never hold the marker, so neither
            # does the fill value.
            for cluster_id, seed in enumerate(seed_centroids):
                fill_value = seed[missing_column]
                print(fill_value)
                mask = ((df_with_cluster[missing_column] == missing_marker)
                        & (df_with_cluster['cluster'] == cluster_id))
                df_with_cluster.loc[mask, missing_column] = fill_value

        # Modes are computed after the imputation so the missing marker
        # cannot become part of a centroid.
        cluster_centroids = calculate_cluster_modes(df_with_cluster, number_of_clusters)

        # Number of entries still marked as missing.
        print(int((df_with_cluster[missing_column] == missing_marker).sum()))
        iteration += 1

    if output_path is not None:
        df_with_cluster.to_csv(output_path)
    return df_with_cluster

In [None]:
# Cluster the loaded frame into 14 groups. The call prints its progress on
# every iteration and also writes the result to 'prediction.csv'.
res = k_mode_clustering(data, 14)

On iteration number: 0
c
b
b
c
b
b
b
e
b
r
b
b
b
r
0
On iteration number: 1


In [57]:
# Distinct values of column 'e.1' in the clustered result, e.g. to check
# whether any '?' placeholder is left.
res['e.1'].unique()

array(['c', 'e', 'b', 'r'], dtype=object)

In [26]:
# Display the loaded frame (long frames are shown abbreviated).
# NOTE(review): this cell's execution count is out of sequence with the cells
# above, so the recorded output may not match a fresh top-to-bottom run -- confirm.
data

Unnamed: 0,p,x,s,n,t,p.1,f,c,n.1,k,...,s.2,w,w.1,p.2,w.2,o,p.3,k.1,s.3,u
3983,e,x,y,b,t,n,f,c,b,e,...,s,e,w,p,w,t,e,w,c,w
4022,p,x,y,e,f,y,f,c,n,b,...,s,w,w,p,w,o,e,w,v,p
4075,e,f,y,u,f,n,f,c,n,h,...,f,w,w,p,w,o,f,h,y,d
4099,p,x,y,e,f,y,f,c,n,b,...,s,p,p,p,w,o,e,w,v,d
4103,p,x,y,n,f,f,f,c,n,b,...,s,p,p,p,w,o,e,w,v,l
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8118,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8119,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8120,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8121,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l
