In [1]:
# Minkowski distance:

from math import *
import numpy as np


def p_root(value, root):
    root = 1 / float(root)
    return round ((value) ** (root), 3)

def minkowski_distance(x, y, p):
    return (p_root(sum(pow(abs(a-b), p) for a, b in zip(x, y)), p))


# vector1 = [0, 2, 3, 4]
# vector2 = [2, 4, 3, 7]
p = 1

# print(minkowski_distance(vector1, vector2, p))

In [2]:
# Load data and define K:

import pandas as pd
from scipy.stats import zscore
from collections import Counter

df = pd.read_csv("datasets/1.data")
df = df.fillna(0)

number_of_columns = len(df.columns)
list_of_classes = df['class'].to_list()
# Number of unique classes:
K = len(Counter(list_of_classes).keys())

print(K)
df.head()

3


Unnamed: 0,sepal_length_in_cm,sepal_width_in_cm,petal_length_in_cm,petal_width_in_cm,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
# Remove class from dataframe

data = df.drop("class", axis=1)
data.head()

Unnamed: 0,sepal_length_in_cm,sepal_width_in_cm,petal_length_in_cm,petal_width_in_cm
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [4]:
n_samples, n_features = data.shape

def matrix_of_distances(p = 1):
    distances = []
    for index, row in data.iterrows():
        sample = row.values.tolist()

        distance = [
            minkowski_distance(sample, row.values.tolist(), p) for index, row in data.iterrows()
        ]

        distances.append(distance)

    return distances




matrix = matrix_of_distances(p)

In [10]:
from operator import itemgetter

def K_Means(distances):

    # If we have more centers than elements, return the dataset itself
    if K > n_samples:
        return data
    
    # Starting with a random center 
    centroids = np.random.choice(n_samples, 1).tolist()
    print('First center:', centroids[0])
    
    # Select s that maximize dist(s, C)
    while len(centroids) < K:
        candidates = []
        for idx_row in range(n_samples):
            s_is_centroid = [idx_row == centroid for centroid in centroids]
            if not any(s_is_centroid):
                most_distant_centroid = max(distances[idx_row][centroid] for centroid in centroids)
                candidates.append({'distance': most_distant_centroid, 'index': idx_row})

        if len(candidates) > 0:
#             print('candidates', candidates)
            max_item = max(candidates, key=itemgetter('distance'))
#             print('max_item', max_item)
            s = max_item['index']
#             print('s', s)
            centroids.append(s)
        
    return centroids

C = K_Means(matrix)
print('Centers', C)

First center: 82
Centers [82, 117, 41]


In [11]:
# Calcule clusters
def calc_custers(centers, distances):
    # Initialize clusters with empty array
    clusters = [[] for _ in range(K)]
    for idx_row in range(n_samples):
        candidates = []
        for idx, center in enumerate(centers):
            candidates.append({'distance': distances[idx_row][center], 'center_index': idx})
        closest_centroid = min(candidates, key=itemgetter('distance'))
        clusters[closest_centroid['center_index']].append(idx_row)
        
    return clusters

clusters = calc_custers(C, matrix)
print('Clusters', clusters)

Clusters [[50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 101, 103, 106, 108, 110, 111, 113, 114, 116, 119, 121, 123, 126, 127, 128, 132, 133, 134, 137, 138, 142, 146, 147, 149], [100, 102, 104, 105, 107, 109, 112, 115, 117, 118, 120, 122, 124, 125, 129, 130, 131, 135, 136, 139, 140, 141, 143, 144, 145, 148], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]]


In [12]:
# Calculate radius
def calc_radius(clusters, distances):
    candidates = []
    for center_index, cluster in enumerate(clusters):
#         print('center_index', C[center_index], [distances[C[center_index]][element] for element in cluster])
        candidates.append(max(distances[C[center_index]][element] for element in cluster))
#     print('candidates', candidates)
    return max(candidates)
    
radius = calc_radius(clusters, matrix)
print(radius)

3.6


In [8]:
# Use sklearn do evaluate our solution
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=K, random_state=0,).fit(data)

In [13]:
print(kmeans.cluster_centers_)
print(kmeans.labels_)

def calc_sklearn_radius():
    candidates = []
    for center_idx, center in enumerate(kmeans.cluster_centers_):
        options = []
        for idx, cluster in enumerate(kmeans.labels_):
            if center_idx == cluster:
                options.append(minkowski_distance(center, data.iloc[idx].values.tolist(), p))
        print('options', options)
        candidates.append(max(options))
        
    print('candidates', candidates)
    return max(candidates)

sklearn_radius = calc_sklearn_radius()
print('p value', p)
print('Optimal radius', sklearn_radius)


[[5.9016129  2.7483871  4.39354839 1.43387097]
 [5.006      3.418      1.464      0.244     ]
 [6.85       3.07368421 5.74210526 2.07105263]]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 2 2 2 2 0 2 2 2 2
 2 2 0 0 2 2 2 2 0 2 0 2 0 2 2 0 0 2 2 2 2 2 0 2 2 2 2 0 2 2 2 0 2 2 2 0 2
 2 0]
options [1.89, 1.123, 1.377, 0.923, 0.494, 1.423, 2.877, 1.19, 1.277, 2.977, 0.513, 1.474, 0.69, 1.381, 1.19, 0.726, 0.877, 1.019, 1.377, 1.226, 0.777, 1.219, 0.79, 0.877, 0.99, 1.39, 0.423, 1.677, 1.677, 1.877, 0.877, 1.019, 0.926, 1.023, 1.523, 0.987, 0.981, 1.177, 0.79, 0.69, 0.877, 2.877, 0.677, 0.881, 0.681, 0.677, 2.777, 0.681, 1.323, 1.623, 1.623, 1.826, 1.319, 1.426, 1.319, 1.123, 1.323, 1.223, 1.123, 1.323, 1.719, 1.326]
options [0.284, 0.632, 0.732, 0.804, 0.296, 1.268, 0.544, 0.104, 1.232, 0.604, 0.756, 0.404, 0.832, 1.632, 1.684, 1.86