In [1]:
#importing libraries and loading dataset
import numpy as np
from sklearn import datasets
df=datasets.load_iris().data
df

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

# k-MEDIODS CLUSTERING

In [2]:
import random
import copy

In [3]:

def euclidean_distance(a, b):          #euclidean distance between a and b, datapoints
    return np.sqrt(sum(np.square(a-b)))


def point_distribution(matrix,centroid_list):

    cluster_points = [[i] for i in centroid_list]
    label_list = []
    for datapoint in matrix:
        # Calculate the distance from the node point to each center and divide it to the nearest center point
        dist_list = [euclidean_distance(datapoint, centroid) for centroid in centroid_list]
        label = np.argmin(dist_list)  # Select the nearest cluster center
        label_list.append(label)
        cluster_points[label].append(datapoint)  # Add point to the nearest cluster
    return label_list, cluster_points


In [4]:

def pam(data, k):

    # Random initial cluster center
    index_list = list(range(len(data)))
    random.shuffle(index_list)
    shuffled_index = index_list[:k]
    centroids = data[shuffled_index, :]  # Array of center points
    labels = []  # Category label for each data
    stop_flag = False  # A sign that the algorithm stops iterating
    while not stop_flag:
        stop_flag = True
        cluster_points = [[i] for i in centroids]  # The i-th element is a collection of data points of the i-th type
        labels = []  # Category label for each data
        #Iterate over the data
        for datapoint in data:
            #Calculate the distance from the node point to each center and divide it to the nearest center point
            distances = [euclidean_distance(datapoint, i) for i in centroids]
            label = np.argmin(distances)  # Select the nearest cluster center
            labels.append(label)
            cluster_points[label].append(datapoint)  #Add point to the nearest cluster

        #Calculate the total distance between the current center point and all other points
        distances = []
        for i in range(k):
            distances.extend([euclidean_distance(j, centroids[i]) for j in cluster_points[i]])
        old_distances_sum = sum(distances)

        #Try to replace the center point with each non-central point in the entire data set. If the clustering error is reduced, change the center point
        for i in range(k):
            # Calculate the distance from each node to the center of the original cluster in the i-th cluster
            for datapoint in data:
                new_centroids = copy.deepcopy(centroids)  #Hypothetical center set
                new_centroids[i] = datapoint
                labels, cluster_points = point_distribution(data, new_centroids)
                #Calculate new clustering error
                distances = []
                for j in range(k):
                    distances.extend([euclidean_distance(p, new_centroids[j]) for p in cluster_points[j]])
                new_distances_sum = sum(distances)

                #Determine whether the clustering error is reduced
                if new_distances_sum < old_distances_sum:
                    old_distances_sum = new_distances_sum
                    centroids[i] = datapoint  #Modify the center of the i-th cluster
                    stop_flag = False
    return centroids, labels, old_distances_sum

In [5]:
def k_medoid_clustering(data,k):
    centroids,targets,old_dis=pam(data,k)
    print("CENTROIDS: \n",centroids)
    print("TARGETS:",targets)

In [6]:
k_medoid_clustering(df,5)

CENTROIDS: 
 [[6.2 2.8 4.8 1.8]
 [6.9 3.2 5.7 2.3]
 [5.7 2.8 4.1 1.3]
 [4.6 3.1 1.5 0.2]
 [5.2 3.5 1.5 0.2]]
TARGETS: [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 2, 0, 2, 0, 2, 0, 2, 2, 2, 2, 0, 2, 0, 2, 2, 0, 2, 4, 2, 0, 0, 2, 0, 0, 0, 0, 2, 2, 2, 2, 4, 2, 0, 0, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 1, 4, 1, 4, 1, 1, 2, 1, 1, 1, 0, 0, 1, 4, 4, 1, 1, 1, 1, 0, 1, 4, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 4, 1, 1, 4, 0, 1, 1, 1, 4, 1, 1, 1, 0, 0, 4, 4]


# AGGLOMERATIVE CLUSTERING

In [7]:
def euclidean_dist(a, b):          #euclidean distance between a and b, datapoints
    return np.sqrt(sum(np.square(a-b)))

def single_link_dist(cluster1,cluster2):       #TAKES IN TWO CLUSTERS. RETURNS SINGLE_LINK DISTANCE OR MINIMUM DISTANCE BETWEEN THEM. 
    dist_list=list()
    for i in cluster1:                  #datapoint(s) of cluster1
        for j in cluster2:              #datapoint(s) of cluster2
            dist_list.append(euclidean_dist(i,j))
    return(min(dist_list))





In [8]:

def aglomerative(data,k):
    """parameter data: data array
       parameter k: Number of clusters 
    """
    N=len(data)
    cluster_label=[[i] for i in range(N)]
    
    if k == N:
        return cluster_label
    else:
        for cluster_num in range(N-1,k-1,-1):
            
            counter=0
            for i in range(len(cluster_label)-1):
                cluster1=list()
                for t in cluster_label[i]:
                    cluster1.append(data[t])
                
                for j in range(i+1,len(cluster_label)):
                    cluster2=list()
                    for t1 in cluster_label[j]:
                        cluster2.append(data[t1])
                    
                    if counter == 0:
                        min_sl_dist=single_link_dist(cluster1,cluster2)
                        r,c=0,1
                        counter=2
                    else:
                        if single_link_dist(cluster1,cluster2)<min_sl_dist:
                            min_sl_dist = single_link_dist(cluster1,cluster2)
                            r,c=i,j
            
            cluster_label[r]=cluster_label[r]+cluster_label[c]
            del cluster_label[c]
    return cluster_label
    
    
    


In [9]:
clus=aglomerative(df,130)


In [10]:
for i in range(len(clus)):
    print("CLUSTER",i+1,":\n",clus[i])
print(len(clus))

CLUSTER 1 :
 [0, 17, 40, 4, 37, 7, 39, 49]
CLUSTER 2 :
 [1, 9, 34]
CLUSTER 3 :
 [2]
CLUSTER 4 :
 [3, 47]
CLUSTER 5 :
 [5]
CLUSTER 6 :
 [6]
CLUSTER 7 :
 [8, 38]
CLUSTER 8 :
 [10, 48]
CLUSTER 9 :
 [11]
CLUSTER 10 :
 [12]
CLUSTER 11 :
 [13]
CLUSTER 12 :
 [14]
CLUSTER 13 :
 [15]
CLUSTER 14 :
 [16]
CLUSTER 15 :
 [18]
CLUSTER 16 :
 [19, 21, 46]
CLUSTER 17 :
 [20]
CLUSTER 18 :
 [22]
CLUSTER 19 :
 [23]
CLUSTER 20 :
 [24]
CLUSTER 21 :
 [25]
CLUSTER 22 :
 [26]
CLUSTER 23 :
 [27]
CLUSTER 24 :
 [28]
CLUSTER 25 :
 [29, 30]
CLUSTER 26 :
 [31]
CLUSTER 27 :
 [32]
CLUSTER 28 :
 [33]
CLUSTER 29 :
 [35]
CLUSTER 30 :
 [36]
CLUSTER 31 :
 [41]
CLUSTER 32 :
 [42]
CLUSTER 33 :
 [43]
CLUSTER 34 :
 [44]
CLUSTER 35 :
 [45]
CLUSTER 36 :
 [50]
CLUSTER 37 :
 [51]
CLUSTER 38 :
 [52]
CLUSTER 39 :
 [53]
CLUSTER 40 :
 [54]
CLUSTER 41 :
 [55]
CLUSTER 42 :
 [56]
CLUSTER 43 :
 [57, 93]
CLUSTER 44 :
 [58]
CLUSTER 45 :
 [59]
CLUSTER 46 :
 [60]
CLUSTER 47 :
 [61]
CLUSTER 48 :
 [62]
CLUSTER 49 :
 [63]
CLUSTER 50 :
 [64]
CLUST