In [243]:
%config IPCompleter.greedy=True
import pandas as pd
import numpy as np
from random import choice
import math

In [244]:
# Load the mushroom records from a CSV file in the working directory.
# read_csv runs with its default header handling, so the file's first line
# becomes the column labels; the clustering code below addresses columns by
# the labels 'p' and 'e.1' that result from this.
# NOTE(review): the displayed result has value-like column names (x, s, n, ...),
# which suggests the file has no header row and its first record is being
# consumed as the header -- confirm before changing this call.
data = pd.read_csv('agaricus-lepiota.data')

In [245]:
def hamming_distance_and_choose_cluster(data, cluster_modes, num_features=22):
    """Assign every row of ``data`` to the cluster mode it differs from least.

    The distance between a row and a mode is the Hamming distance: the number
    of feature columns whose values are not equal. When several modes are
    equally close, the one that appears first in ``cluster_modes`` wins.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to assign. Its first ``num_features`` columns are the features.
        The frame is left untouched; the result is a new frame.
    cluster_modes : sequence of pandas.Series
        One Series per cluster, indexed by feature column label.
    num_features : int, default 22
        Number of leading columns of ``data`` that take part in the distance.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with an integer ``'cluster'`` column holding the
        position (in ``cluster_modes``) of the nearest mode.

    Raises
    ------
    ValueError
        If ``cluster_modes`` is empty.
    """
    cluster_modes = list(cluster_modes)
    if not cluster_modes:
        raise ValueError('cluster_modes must contain at least one mode')

    # Slice the features once, up front. Slicing inside the loop would pick up
    # helper columns added along the way whenever data has fewer than
    # num_features columns.
    features = data.iloc[:, 0:num_features]

    # One distance column per cluster, kept in a separate frame so the
    # caller's DataFrame never receives temporary columns.
    distances = pd.DataFrame({
        cluster_id: features.ne(mode).sum(axis=1)
        for cluster_id, mode in enumerate(cluster_modes)
    })

    assigned = data.copy()
    # idxmin returns the first column label holding the row minimum, which is
    # the cluster number because the columns are labelled 0..k-1.
    assigned['cluster'] = distances.idxmin(axis=1).astype(int)
    return assigned

In [246]:
#calculate cluster modes after cluster assignment
def calculate_cluster_modes(data, number_of_clusters):
    """Compute the per-column mode of every cluster.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``'cluster'`` column holding integer cluster numbers.
    number_of_clusters : int
        Clusters ``0 .. number_of_clusters - 1`` are processed in order.

    Returns
    -------
    list of pandas.Series
        One Series per cluster, indexed by column label (without
        ``'cluster'``), holding one modal value for each column.

    Raises
    ------
    KeyError
        If a cluster has no rows, because no mode can be computed for it.
    """
    cluster_modes = []
    for cluster_id in range(number_of_clusters):
        members = data[data['cluster'] == cluster_id]
        if members.empty:
            raise KeyError(
                'cluster {} has no rows; cannot compute its mode'.format(cluster_id)
            )
        # mode() can return several rows when values tie; row 0 always holds
        # one modal value for every column. The axis is given by keyword
        # because newer pandas releases reject a positional axis in drop().
        modes = members.mode().drop(columns='cluster', errors='ignore')
        cluster_modes.append(modes.loc[0])
    return cluster_modes
    

In [247]:
#for termination condition
def cluster_sets_are_equal(cluster_modes, new_cluster_modes):
    """Return True when both mode lists have equal length and equal entries.

    Entries are compared pairwise, position by position, with the ``equals``
    method of each element of ``cluster_modes``.
    """
    if len(cluster_modes) != len(new_cluster_modes):
        return False
    return all(
        current.equals(candidate)
        for current, candidate in zip(cluster_modes, new_cluster_modes)
    )

In [248]:
def k_mode_clustering(data, number_of_clusters, label_column='p',
                      missing_column='e.1', missing_marker='?',
                      output_path='prediction.csv'):
    """Cluster the rows of ``data`` with k-modes and save the labelled frame.

    Steps:

    1. Drop ``label_column`` so it does not take part in the distance.
    2. Pick ``number_of_clusters`` distinct random rows as initial centroids,
       drawing only from rows whose ``missing_column`` is not
       ``missing_marker``.
    3. Repeat: assign rows to the nearest centroid, then recompute the
       centroids as per-cluster modes, until the centroids stop changing.
    4. After the first assignment only, replace ``missing_marker`` in
       ``missing_column`` with the value of the seed row of the cluster the
       row was assigned to.

    Parameters
    ----------
    data : pandas.DataFrame
        Input records. Not modified.
    number_of_clusters : int
        Number of clusters to form.
    label_column : str, default 'p'
        Column removed before clustering.
    missing_column : str, default 'e.1'
        Column that may contain ``missing_marker``.
    missing_marker : str, default '?'
        Value that marks a missing entry in ``missing_column``.
    output_path : str, default 'prediction.csv'
        Destination of the CSV written with the final assignment.

    Returns
    -------
    pandas.DataFrame
        The clustered frame (without ``label_column``) with a ``'cluster'``
        column.

    Raises
    ------
    ValueError
        If fewer rows without ``missing_marker`` exist than clusters requested.
    """
    # Local import: the notebook's import cell only brings in ``choice``.
    from random import sample

    # drop() returns a new frame, so the caller's data stays as it was.
    df_with_cluster = data.drop(columns=label_column)

    # Seed rows must have a known value in missing_column. Computing the
    # eligible rows once avoids rebuilding the candidate list on every draw,
    # and avoids looping forever when only rows with missing values are left.
    has_value = df_with_cluster[missing_column] != missing_marker
    eligible_rows = df_with_cluster.index[has_value].tolist()
    seed_count = max(number_of_clusters, 0)
    if seed_count > len(eligible_rows):
        raise ValueError(
            'cannot pick {} initial centroids: only {} rows have a value in {!r}'.format(
                seed_count, len(eligible_rows), missing_column
            )
        )
    seed_rows = sample(eligible_rows, seed_count)
    cluster_centroids = [df_with_cluster.loc[row] for row in seed_rows]

    previous_centroids = []
    iteration = 0
    while not cluster_sets_are_equal(cluster_centroids, previous_centroids):
        print('On iteration number: {}'.format(iteration))
        previous_centroids = cluster_centroids
        # Remove the previous assignment so it is not counted as a feature.
        df_with_cluster = df_with_cluster.drop(columns='cluster', errors='ignore')
        df_with_cluster = hamming_distance_and_choose_cluster(
            df_with_cluster, previous_centroids
        )
        cluster_centroids = calculate_cluster_modes(df_with_cluster, number_of_clusters)

        if iteration == 0:
            # Fill missing entries once, using the seed row of the cluster
            # each affected row landed in.
            for cluster_id, seed_row in enumerate(seed_rows):
                mask = (
                    (df_with_cluster[missing_column] == missing_marker)
                    & (df_with_cluster['cluster'] == cluster_id)
                )
                df_with_cluster.loc[mask, missing_column] = (
                    df_with_cluster.loc[seed_row, missing_column]
                )

        iteration += 1

    df_with_cluster.to_csv(output_path)
    return df_with_cluster

In [249]:
# Cluster the loaded records into 12 clusters. The call prints one progress
# line per iteration and, as a side effect, writes 'prediction.csv'.
res = k_mode_clustering(data, 12)
# Bare expression so the notebook displays the clustered frame.
res

On iteration number: 0
On iteration number: 1
On iteration number: 2
On iteration number: 3
On iteration number: 4


Unnamed: 0,x,s,n,t,p.1,f,c,n.1,k,e,...,w,w.1,p.2,w.2,o,p.3,k.1,s.3,u,cluster
0,x,s,y,t,a,f,c,b,k,e,...,w,w,p,w,o,p,n,n,g,0
1,b,s,w,t,l,f,c,b,n,e,...,w,w,p,w,o,p,n,n,m,0
2,x,y,w,t,p,f,c,n,n,e,...,w,w,p,w,o,p,k,s,u,0
3,x,s,g,f,n,f,w,b,k,t,...,w,w,p,w,o,e,n,a,g,6
4,x,y,y,t,a,f,c,b,n,e,...,w,w,p,w,o,p,k,n,g,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8118,k,s,n,f,n,a,c,b,y,e,...,o,o,p,o,o,p,b,c,l,1
8119,x,s,n,f,n,a,c,b,y,e,...,o,o,p,n,o,p,b,v,l,10
8120,f,s,n,f,n,a,c,b,n,e,...,o,o,p,o,o,p,b,c,l,2
8121,k,y,n,f,y,f,c,n,b,t,...,w,w,p,w,o,e,w,v,l,10
