In [176]:
#Self organizing map clustering algorithm
import numpy as np
from scipy.spatial import distance
import matplotlib.pyplot as plt

In [177]:
# Print floating-point arrays in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True) 


In [178]:
# Datasets

# Read the comma-separated file into a numpy array.
dataset = np.loadtxt("pima_indians.csv", delimiter=",")

# The last column becomes y; every other column becomes X.
X, y = dataset[:, :-1], dataset[:, -1]

# Print the array
print(X)

[[  6.    148.     72.    ...  33.6     0.627  50.   ]
 [  1.     85.     66.    ...  26.6     0.351  31.   ]
 [  8.    183.     64.    ...  23.3     0.672  32.   ]
 ...
 [  5.    121.     72.    ...  26.2     0.245  30.   ]
 [  1.    126.     60.    ...  30.1     0.349  47.   ]
 [  1.     93.     70.    ...  30.4     0.315  23.   ]]


In [179]:
# Hyper-parameters.
# k is the number of clusters; p is not referenced in any of the visible cells.
k, p = 2, 0
alpha = 0.7  # Initial learning rate

In [180]:
# Report the dataset dimensions: n rows (samples) and d columns (features).
n, d = X.shape

print("The training data: \n", X)
print("\nTotal number of data: ",n)
print("Total number of features: ",d)
print("Total number of Clusters: ",k)

# Zero-filled (k, d+1) array; it is not referenced in any of the visible cells.
C = np.zeros(shape=(k, d + 1))


The training data: 
 [[  6.    148.     72.    ...  33.6     0.627  50.   ]
 [  1.     85.     66.    ...  26.6     0.351  31.   ]
 [  8.    183.     64.    ...  23.3     0.672  32.   ]
 ...
 [  5.    121.     72.    ...  26.2     0.245  30.   ]
 [  1.    126.     60.    ...  30.1     0.349  47.   ]
 [  1.     93.     70.    ...  30.4     0.315  23.   ]]

Total number of data:  768
Total number of features:  8
Total number of Clusters:  2


In [181]:
# Initialize weights
# One column of d weights per cluster, drawn uniformly from [0, 1).
# A fixed seed makes the initial weights (and therefore the clustering that
# follows) reproducible across kernel restarts.
WEIGHT_INIT_SEED = 42
weight_rng = np.random.default_rng(WEIGHT_INIT_SEED)
weight = weight_rng.random((d, k))
print("\nThe initial weight: \n", np.round(weight,2))
print(weight.shape)
print(X.shape)


The initial weight: 
 [[0.18 0.81]
 [0.54 0.59]
 [0.46 0.42]
 [0.53 0.04]
 [0.3  0.16]
 [0.06 0.11]
 [0.5  0.89]
 [0.52 0.22]]
(8, 2)
(768, 8)


In [182]:
# Train the map: for every sample, find the closest weight column (the
# winning unit) and pull that column towards the sample.
num_iter = 100

# Decay a local copy of the learning rate so the configured `alpha` is left
# untouched and re-running this cell starts from the same rate again.
learning_rate = alpha

# Winning unit of every sample, recorded during the final iteration only.
final_assignments = []

# NOTE: `weight` is updated in place, so re-running this cell without
# re-running the initialization cell continues from the trained weights.
for it in range(num_iter): # Total number of iterations
    for i in range(n):
        sample = X[i, 0:d]

        # Squared Euclidean distance from the sample to each of the k weight columns.
        sq_dist = np.sum(np.square(weight - sample[:, np.newaxis]), axis=0)

        # argmin always yields a winner (the first index on ties), so no
        # sentinel "large" distance is needed and jMin can never be stale.
        jMin = int(np.argmin(sq_dist))

        # Collect the cluster number from the final iteration
        if it == num_iter - 1:
            final_assignments.append(jMin)

        # Adapt that weight
        weight[:, jMin] = weight[:, jMin] * (1 - learning_rate) + learning_rate * sample

    # Update learning rate (halved after every full pass over the data)
    learning_rate = 0.5 * learning_rate

# Float array, matching the dtype the original np.append-based collection produced.
cluster_no = np.array(final_assignments, dtype=float)
    

In [183]:
# Show the trained weights rounded to four decimals.
print("\nThe final weight: \n", weight.round(4))


The final weight: 
 [[  3.8905   3.6304]
 [114.9498 140.6055]
 [ 68.1894  72.6002]
 [ 17.3695  31.1479]
 [ 29.9658 241.0899]
 [ 31.1676  34.8501]
 [  0.4375   0.5688]
 [ 33.2799  33.2023]]


In [184]:
# Show the shape of the collected cluster assignments.
print(np.shape(cluster_no))

(768,)


In [185]:
# Attach the cluster assignments to the features as an extra trailing column.
cluster_column = cluster_no[:, np.newaxis]
clustered_data = np.concatenate((X, cluster_column), axis=1)
print(clustered_data)


[[  6.    148.     72.    ...   0.627  50.      0.   ]
 [  1.     85.     66.    ...   0.351  31.      0.   ]
 [  8.    183.     64.    ...   0.672  32.      0.   ]
 ...
 [  5.    121.     72.    ...   0.245  30.      0.   ]
 [  1.    126.     60.    ...   0.349  47.      0.   ]
 [  1.     93.     70.    ...   0.315  23.      0.   ]]


In [186]:
# The cluster assignments are the last column of the combined array.
last_column = clustered_data.shape[1] - 1
clusters = clustered_data[:, last_column]

In [187]:
# Score the clustering against the labels in y.
#
# Cluster ids are arbitrary: which unit ends up numbered 0 or 1 depends on the
# random initial weights, so comparing raw ids with y gives a number that can
# flip between runs. Instead, each cluster is mapped to the label that occurs
# most often among its members, and a sample counts as correct when its own
# label equals the label its cluster was mapped to.
correct = 0
for cluster_id in np.unique(clusters):
    member_labels = y[clusters == cluster_id]
    _, label_counts = np.unique(member_labels, return_counts=True)
    correct += int(label_counts.max())

total = len(y)
wrong = total - correct

# Guard the empty case so an empty label vector does not raise ZeroDivisionError.
accuracy = (correct / total) * 100 if total else float("nan")
print("Accuracy:", accuracy, "%")


Accuracy: 65.625 %
