# K Modes Clustering
In this notebook we'll work with the K-Modes clustering algorithm, applying it to a clustering task on a single categorical variable.  
As usual, let's import the libraries first:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Now, let's load a small dummy dataset and take a look at it:

In [7]:
# Load the dummy dataset (columns: Individual, Skill) and display it.
data_path = 'data1.csv'
data = pd.read_csv(data_path)
data

Unnamed: 0,Individual,Skill
0,1,C
1,1,Python
2,1,Java
3,1,Haskell
4,2,Python
5,2,React
6,2,JS
7,2,PHP
8,3,C++
9,3,JS


In [9]:
# Build an individual-by-skill count matrix: one row per Individual, one column
# per Skill, each cell holding how many times that individual lists that skill.
#
# pd.crosstab does this in one vectorised step. The previous row-wise apply
# followed by groupby().sum() also summed the string `Skill` column, which on
# pandas >= 2.0 concatenates the strings into an extra, meaningless feature.

# Skills in order of first appearance, so the column layout does not depend on
# crosstab's alphabetical sorting; dropna() keeps a missing skill from ever
# becoming a column label.
skill_order = data['Skill'].dropna().unique()

one_hot_df = (
    pd.crosstab(data['Individual'], data['Skill'])
    .reindex(columns=skill_order, fill_value=0)
    # Drop the "Skill" label on the column axis so only skill names are shown.
    .rename_axis(columns=None)
)
one_hot_df

Unnamed: 0_level_0,C,Python,Java,Haskell,React,JS,PHP,C++,Flutter,Android,iOS,Fortran,Pascal,.NET,C#,MATLAB
Individual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,1,1,1,1,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1


In [10]:
from kmodes.kmodes import KModes

# Cluster the individuals by their skill sets with k-modes.
# The 'Huang' initialisation is random, so random_state is pinned to make
# Restart & Run All reproduce the same clusters.
km = KModes(n_clusters=3, init='Huang', n_init=4, verbose=1, random_state=42)
# Fit on the individual-by-skill matrix; `clusters` holds one label per individual.
clusters = km.fit_predict(one_hot_df)
# Each row of `kmodes` is a cluster mode: one value per skill column.
kmodes = km.cluster_centroids_

# For each cluster mode, print the skill columns whose modal value is non-zero.
# A mode with no non-zero entry is reported as the "no-style" cluster.
for i, cent in enumerate(kmodes):
    print("\ncluster " + str(i) + ": ")
    # np.nonzero returns a 1-tuple for a 1-D row; [0] is the array of positions.
    skills = one_hot_df.columns[np.nonzero(cent)[0]]
    if len(skills) == 0:
        print("no-style cluster")
    else:
        for j in skills:
            print(j)

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 0, cost: 8.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 0, cost: 8.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 0, cost: 8.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 0, cost: 7.0
Best run was number 4

cluster 0: 
JS

cluster 1: 
C
Python
Java
Haskell

cluster 2: 
C
Fortran
Pascal
.NET
C#
MATLAB


# Test on Bank Dataset

In [18]:
# Read the semicolon-delimited bank file and preview its first five rows.
bank_path = 'bank/bank-full.csv'
bank = pd.read_csv(bank_path, delimiter=';')
bank.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [20]:
from kmodes.kmodes import KModes

# Cluster the bank rows with k-modes (5 clusters, a single initialisation).
# The 'Huang' initialisation is random, so random_state is pinned to make
# re-running the cell reproduce the same clusters.
# NOTE(review): every column is passed in, including the numeric ones (age,
# balance, duration, ...) and the `y` column; k-modes compares values purely by
# equality, so each distinct number acts as its own category. Confirm this is
# intended.
km = KModes(n_clusters=5, init='Huang', n_init=1, verbose=1, random_state=42)
# Fit on the bank dataframe; `clusters` holds one cluster label per row.
clusters = km.fit_predict(bank)
# Each row of `kmodes` is a cluster mode: one representative value per column,
# in the column order of `bank`.
kmodes = km.cluster_centroids_

# Print the mode of every cluster.
for centroid in kmodes:
    print(centroid)

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 9523, cost: 331454.0
Run 1, iteration: 2/100, moves: 2526, cost: 331454.0
['33' 'blue-collar' 'married' 'secondary' 'no' '0' 'yes' 'no' 'cellular'
 '20' 'may' '151' '1' '-1' '0' 'unknown' 'no']
['38' 'management' 'married' 'tertiary' 'no' '0' 'yes' 'no' 'cellular' '7'
 'may' '122' '2' '-1' '0' 'unknown' 'no']
['30' 'technician' 'single' 'secondary' 'no' '0' 'no' 'no' 'cellular' '28'
 'aug' '90' '1' '-1' '0' 'unknown' 'no']
['39' 'blue-collar' 'married' 'primary' 'no' '0' 'yes' 'no' 'unknown' '6'
 'jun' '18' '1' '-1' '0' 'unknown' 'no']
['59' 'blue-collar' 'married' 'primary' 'no' '0' 'no' 'no' 'cellular' '3'
 'feb' '133' '1' '-1' '0' 'unknown' 'no']
