In [1]:
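"""
KModes

K-modes clustering for categorical data: individuals are compared by the
number of mismatching attributes, and each cluster centre is the
attribute-wise mode of the individuals assigned to it.
"""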
#imports
import time
import numpy as np
from scipy import stats

class K_modes:
    """K-modes clustering for categorical data.

    Individuals are compared with the simple matching (Hamming) distance,
    i.e. the number of attributes on which two rows differ, and every
    cluster centre is the attribute-wise mode of its members.

    Parameters
    ----------
    n_clusters : int
        Number of clusters to build.
    max_iter : int, optional
        Upper bound on the number of assign/update rounds. Guards against
        an endless loop when the modes never stabilise (for example when
        the data contains NaN, which never compares equal to itself).
    """

    def __init__(self, n_clusters, max_iter=100):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        # list of 1-D arrays, one mode per cluster (filled by fit)
        self.modes = []
        # list of int, cluster index of every training row (filled by fit)
        self.labels = []

    @staticmethod
    def _column_modes(rows):
        """Return the most frequent value of every column of ``rows``.

        Ties are resolved in favour of the smallest value, because
        ``np.unique`` returns the distinct values sorted and ``argmax``
        keeps the first maximum.
        """
        result = np.empty(rows.shape[1], dtype=rows.dtype)
        for col in range(rows.shape[1]):
            values, counts = np.unique(rows[:, col], return_counts=True)
            result[col] = values[np.argmax(counts)]
        return result

    def fit(self, X_train):
        """Cluster the rows of ``X_train`` and store the result on the instance.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
            Categorical observations, one individual per row.

        Returns
        -------
        K_modes
            ``self``, with ``modes`` and ``labels`` replaced by the result
            of this run (earlier results are discarded).

        Raises
        ------
        ValueError
            If ``X_train`` is not two-dimensional, or if ``n_clusters`` is
            not between 1 and the number of rows.
        """
        X_train = np.asarray(X_train)
        if X_train.ndim != 2:
            raise ValueError("X_train must be a 2-D array of shape (n_samples, n_features)")
        n_samples = X_train.shape[0]
        if not 0 < self.n_clusters <= n_samples:
            raise ValueError("n_clusters must be between 1 and the number of rows of X_train")

        # Random selection of the initial cluster modes among the ROWS
        # (individuals) of the data, without replacement.
        seeds = np.random.choice(n_samples, self.n_clusters, replace=False)
        modes = [X_train[idx].copy() for idx in seeds]
        labels = np.zeros(n_samples, dtype=int)

        for _ in range(self.max_iter):
            # Distance between every individual and every mode: number of
            # attributes that differ. Shape (n_samples, n_clusters).
            centres = np.stack(modes)
            mismatches = np.count_nonzero(
                X_train[:, None, :] != centres[None, :, :], axis=2
            )

            # Assign each individual to the closest mode; argmin keeps the
            # lowest cluster index when several modes are equally close.
            labels = mismatches.argmin(axis=1)

            # Compute the new modes. A cluster that lost all its members
            # keeps its previous mode instead of failing on an empty set.
            new_modes = []
            for cluster in range(self.n_clusters):
                members = X_train[labels == cluster]
                if len(members):
                    new_modes.append(self._column_modes(members))
                else:
                    new_modes.append(modes[cluster])

            # Stop as soon as no mode changed during this round.
            converged = all(
                np.array_equal(new, old) for new, old in zip(new_modes, modes)
            )
            modes = new_modes
            if converged:
                break

        self.modes = modes
        self.labels = labels.tolist()
        return self




# Toy categorical data set: 10 individuals described by 5 attributes.
data = np.array([[1,2,1,2,3],[1,1,1,2,2],
                [3,1,2,2,1],[1,2,2,1,3],
                [3,3,3,2,1],[1,1,1,1,2],
                [1,3,1,3,3],[3,1,2,2,3],
                [1,1,2,3,1],[1,2,2,1,3]])

# K_modes draws its initial modes from NumPy's global RNG; seed it so the
# printed modes and labels are the same on every run of this cell.
np.random.seed(0)

# Time the clustering only (printing is kept out of the measurement) and use
# the monotonic high-resolution clock intended for benchmarking.
start_time = time.perf_counter()
k = K_modes(3)
k.fit(data)
elapsed = time.perf_counter() - start_time

print(k.modes)
print(k.labels)
print("--- %s seconds ---" % elapsed)

[array([1, 2, 1, 1, 3]), array([3, 1, 2, 2, 1]), array([1, 1, 1, 1, 2])]
[0, 2, 1, 0, 1, 2, 0, 1, 1, 0]
--- 0.02713632583618164 seconds ---


In [4]:
# Test on the iris dataset.
import seaborn as sns
# Load the iris table through seaborn (presumably a pandas DataFrame, given
# the .head() call below — confirm against the seaborn documentation).
iris = sns.load_dataset('iris')
print(iris.head())
# Convert the table to a NumPy array for K_modes. The columns mix floats and
# strings, so the result has dtype=object (see the modes printed further down).
# NOTE: this rebinds `iris` from the table to the array.
iris=np.array(iris)


   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa


In [5]:
# Replace the species names in column 4 by integer codes, in place.
# Any name other than 'setosa' / 'versicolor' is coded 2.
species_codes = {'setosa': 0, 'versicolor': 1}
for row in range(len(iris)):
    species = iris[row, 4]
    # Values that are no longer strings were encoded by an earlier run of
    # this cell; leaving them alone makes the cell safe to re-run (otherwise
    # every code would fall into the default case and become 2).
    if isinstance(species, str):
        iris[row, 4] = species_codes.get(species, 2)

In [3]:
# K_modes draws its initial modes from NumPy's global RNG; seed it so the
# printed modes and labels are the same on every run of this cell.
np.random.seed(0)

# Time the clustering only (printing is kept out of the measurement) and use
# the monotonic high-resolution clock intended for benchmarking.
start_time = time.perf_counter()
k = K_modes(5)
k.fit(iris)
elapsed = time.perf_counter() - start_time

print(k.modes)

print("--- %s seconds ---" % elapsed)
print(k.labels)

[array([5.6, 2.8, 4.5, 1.3, 1], dtype=object), array([6.3, 3.0, 5.1, 1.8, 2], dtype=object), array([5.1, 3.4, 1.5, 0.2, 0], dtype=object), array([6.7, 3.1, 1.5, 0.2, 0], dtype=object), array([4.4, 3.2, 1.3, 0.2, 0], dtype=object)]
--- 0.08621644973754883 seconds ---
[2, 2, 4, 3, 2, 2, 2, 2, 4, 3, 2, 2, 1, 1, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 3, 2, 2, 2, 3, 4, 4, 2, 4, 2, 4, 4, 4, 2, 2, 1, 2, 4, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [93]:
# KMeans test: compare execution times with the K_modes run.
from sklearn.cluster import KMeans

start_time = time.time()
# Build the estimator, then fit it on the iris array (fit returns the
# estimator itself, so `kmeans` is the fitted model either way).
kmeans = KMeans(n_clusters=5, random_state=0)
kmeans.fit(iris)

print("--- %s seconds ---" % (time.time() - start_time))
# Last expression of the cell: display the cluster index of every row.
kmeans.labels_

--- 0.08493614196777344 seconds ---


array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 2, 3, 2, 3, 2, 3, 3, 2, 3, 2, 3, 2,
       3, 3, 2, 3, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 2, 3, 2, 2, 2,
       3, 3, 3, 2, 3, 3, 3, 3, 3, 2, 3, 3, 0, 0, 4, 0, 0, 4, 3, 4, 0, 4,
       0, 0, 0, 0, 0, 0, 0, 4, 4, 0, 0, 0, 4, 0, 0, 4, 0, 0, 0, 4, 4, 4,
       0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)