In [None]:
# Imports

from math import *
import numpy as np

import pandas as pd
from scipy.stats import zscore
import time
from collections import Counter

from sklearn.cluster import KMeans
from operator import itemgetter
from sklearn.metrics import silhouette_score, adjusted_rand_score


In [None]:
# Minkowski distance:

def p_root(value, root):
    """Return the ``root``-th root of ``value``, rounded to 3 decimal places.

    Args:
        value: Number whose root is taken.
        root: Degree of the root (2 for a square root, 3 for a cube root, ...).

    Returns:
        ``value ** (1 / root)`` rounded to three decimals.
    """
    exponent = 1 / float(root)
    return round(value ** exponent, 3)

def minkowski_distance(x, y, p):
    """Minkowski distance of order ``p`` between two sequences of numbers.

    Coordinates are paired positionally with ``zip``, so any extra trailing
    values of the longer sequence are ignored.

    Args:
        x: First point, as a sequence of coordinates.
        y: Second point, as a sequence of coordinates.
        p: Order of the distance.

    Returns:
        The ``p``-th root (as computed by ``p_root``) of the sum of
        ``|a - b| ** p`` over the paired coordinates.
    """
    powered_gaps = (pow(abs(a - b), p) for a, b in zip(x, y))
    return p_root(sum(powered_gaps), p)

# Order of the Minkowski distance used by the cells below
# (p = 1 is the Manhattan distance, p = 2 the Euclidean distance).
p = 1

In [None]:
# Load the dataset, derive K from the ground-truth classes and split off the features.

df = pd.read_csv("datasets/0.data").fillna(0)
number_of_columns = len(df.columns)
list_of_classes = df['class'].to_list()

# Distinct classes in order of first appearance; K is how many there are.
classes = Counter(list_of_classes).keys()
K = len(classes)
print('K=', K)

# Encode each ground-truth class as its position among the distinct classes.
class_to_label = {name: position for position, name in enumerate(classes)}
optimal_labels = [class_to_label[item] for item in list_of_classes]

# Feature table: every column except the class.
data = df.drop("class", axis=1)
# data = data.select_dtypes(include='number').apply(zscore)
data.head()

In [None]:
# Dataset dimensions, read as globals by the clustering helpers in later cells.
n_samples, n_features = data.shape

# One array of feature values per sample, in DataFrame row order.
rows = [sample.values for _, sample in data.iterrows()]

def matrix_of_distances(p = 1):
    """Build the upper-triangular matrix of pairwise Minkowski distances.

    Reads the module-level ``rows`` and ``n_samples``. Only cells with
    ``row < col`` receive a computed distance; the main diagonal and the
    lower triangle stay at zero.

    Args:
        p: Order of the Minkowski distance.

    Returns:
        An ``(n_samples, n_samples)`` NumPy array of distances.
    """
    distances = np.zeros((n_samples, n_samples))

    # Walk the matrix one diagonal at a time, starting from the main diagonal.
    for offset in range(n_samples):
        if offset % 100 == 0:
            print('DIAG: ', offset, '/', n_samples)
        if offset == 0:
            # Main diagonal: a sample compared with itself, already 0.0.
            continue
        for i in range(n_samples - offset):
            j = i + offset
            distances[i, j] = minkowski_distance(rows[i], rows[j], p)

    return distances

# Precompute every pairwise distance once; the clustering cells below only
# look values up in this matrix.
matrix = matrix_of_distances(p)
print('FINISH')

In [None]:
# Use sklearn to evaluate our solution

def calc_sklearn_radius(labels, centers):
    """Largest distance between any sample and the center of its cluster.

    Distances are Minkowski distances of order ``p`` (module-level) between a
    center's coordinates and the samples stored in the module-level ``rows``.

    Args:
        labels: Cluster index of every sample, aligned with ``rows``.
        centers: Coordinates of the cluster centers; position ``i`` holds the
            center of the cluster labelled ``i``.

    Returns:
        The maximum, over all non-empty clusters, of the distance from the
        center to its farthest member.

    Raises:
        ValueError: If no sample is assigned to any of the centers.
    """
    cluster_radii = []
    for center_idx, center in enumerate(centers):
        member_distances = [
            minkowski_distance(center, rows[idx], p)
            for idx, cluster in enumerate(labels)
            if cluster == center_idx
        ]
        # A center without samples has no radius; skip it (as calc_radius
        # does) instead of letting max() fail on an empty list.
        if member_distances:
            cluster_radii.append(max(member_distances))

    return max(cluster_radii)


# Baseline: scikit-learn's KMeans on the same data.
# Only the clustering and the radius computation are timed, which is what
# execute_calculations times for our own implementation; the quality metrics
# are computed after the clock stops so both timings are comparable.
start_time = time.time()

kmeans = KMeans(n_clusters=K).fit(data)
sklearn_labels = kmeans.labels_
sklearn_centers = kmeans.cluster_centers_

sklearn_radius = calc_sklearn_radius(sklearn_labels, sklearn_centers)

sklearn_end_time = round(time.time() - start_time, 5)

# Quality metrics (not part of the timed section).
sklearn_silhouette = round(silhouette_score(data, sklearn_labels), 5)
rand_score = round(adjusted_rand_score(optimal_labels, sklearn_labels), 5)

print('sklearn radius', sklearn_radius)
# print('sklearn centers', sklearn_centers)
# print('sklearn labels', sklearn_labels)
print('sklearn silhouette', sklearn_silhouette)
print('sklearn rand index', rand_score)
print('sklearn time', sklearn_end_time)


In [None]:
def select_index(x, y):
    """Order a pair of sample indices for lookup in an upper-triangular matrix.

    Args:
        x: One sample index.
        y: The other sample index.

    Returns:
        A dict whose ``'row'`` is the smaller index and whose ``'column'`` is
        the larger one.
    """
    if y < x:
        return {'row': y, 'column': x}
    if x < y:
        return {'row': x, 'column': y}
    # Neither is smaller: both entries take the first argument.
    return {'row': x, 'column': x}

def K_Means(distances):
    """Pick ``K`` cluster centers by greedy farthest-first traversal.

    Despite its name, this function performs no mean-update iterations: it
    starts from one randomly drawn sample and repeatedly adds the sample whose
    distance to its nearest already-chosen center is the largest. Ties go to
    the sample with the lowest index.

    Reads the module-level ``K`` and ``n_samples``.

    Args:
        distances: Pairwise distance matrix in which the distance between
            samples ``i`` and ``j`` is stored at ``[min(i, j)][max(i, j)]``.

    Returns:
        List of the sample indices chosen as centers. When ``K`` exceeds the
        number of samples, every sample index is returned.
    """
    # More centers requested than samples available: every sample is a center.
    # Return indices (not the DataFrame) so callers get the same kind of value
    # as in the regular case.
    if K > n_samples:
        return list(range(n_samples))

    # Start from a single random center.
    centroids = np.random.choice(n_samples, 1).tolist()

    while len(centroids) < K:
        chosen = set(centroids)

        # For every sample that is not yet a center, record the distance to
        # its nearest center.
        candidates = []
        for idx_row in range(n_samples):
            if idx_row in chosen:
                continue
            to_centers = []
            for centroid in centroids:
                indexes = select_index(idx_row, centroid)
                to_centers.append(distances[indexes['row']][indexes['column']])
            candidates.append((idx_row, min(to_centers)))

        if not candidates:
            # Nothing left to pick; stop rather than loop forever.
            break

        # The next center is the sample farthest from all current centers.
        farthest_idx, _ = max(candidates, key=itemgetter(1))
        centroids.append(farthest_idx)

    return centroids

# Calculate clusters
def calc_clusters(centers, distances):
    """Assign every sample to its closest center.

    Reads the module-level ``K`` and ``n_samples``. When several centers are
    equally close, the one listed first in ``centers`` wins.

    Args:
        centers: Sample indices of the chosen centers.
        distances: Pairwise distance matrix indexed as
            ``[smaller index][larger index]``.

    Returns:
        A dict with ``'clusters'`` (one list of sample indices per center
        position) and ``'labels'`` (the center position of every sample).
    """
    labels = []
    clusters = [[] for _ in range(K)]

    for sample in range(n_samples):
        # Distance from this sample to each center, in center order.
        to_centers = []
        for center in centers:
            cell = select_index(sample, center)
            to_centers.append(distances[cell['row']][cell['column']])

        # Position of the smallest distance (first one on ties).
        nearest = min(range(len(to_centers)), key=to_centers.__getitem__)
        clusters[nearest].append(sample)
        labels.append(nearest)

    return {'clusters': clusters, 'labels': labels}

def calc_radius(labels, distances, centers):
    """Largest distance between a center and any sample assigned to it.

    Args:
        labels: For each sample, the position in ``centers`` of its cluster.
        distances: Pairwise distance matrix indexed as
            ``[smaller index][larger index]``.
        centers: Sample indices of the cluster centers.

    Returns:
        The maximum per-cluster radius; clusters without members are ignored.

    Raises:
        ValueError: If no cluster has any member.
    """
    radii = []
    for cluster_id, center in enumerate(centers):
        spread = []
        for sample, label in enumerate(labels):
            if label != cluster_id:
                continue
            cell = select_index(center, sample)
            spread.append(distances[cell['row']][cell['column']])
        if spread:
            radii.append(max(spread))

    return max(radii)
 
def kmeans_calc():
    """Run one clustering pass over the precomputed distance ``matrix``.

    Chooses the centers, prints them, assigns every sample to a center and
    measures the resulting radius.

    Returns:
        A dict with ``'radius'`` (largest center-to-member distance) and
        ``'labels'`` (cluster position of every sample).
    """
    centers = K_Means(matrix)
    print('Centers', centers)

    labels = calc_clusters(centers, matrix)['labels']
    return {
        'radius': calc_radius(labels, matrix, centers),
        'labels': labels,
    }
    
    
def execute_calculations(max_iterations = 1):
    """Run the clustering several times and report per-run and average metrics.

    For each run the clustering itself is timed, then the silhouette score
    (against ``data``) and the adjusted Rand index (against
    ``optimal_labels``) are computed and printed with the radius. After all
    runs, the average of each metric is printed.

    Args:
        max_iterations: Number of independent runs to execute.
    """
    radii, silhouettes, rand_scores, durations = [], [], [], []

    for _ in range(max_iterations):
        # Time only the clustering, not the quality metrics.
        started = time.time()
        outcome = kmeans_calc()
        elapsed = round(time.time() - started, 5)

        run_labels = outcome['labels']
        silhouette = round(silhouette_score(data, run_labels), 5)
        rand = round(adjusted_rand_score(optimal_labels, run_labels), 5)

        radii.append(outcome['radius'])
        silhouettes.append(silhouette)
        rand_scores.append(rand)
        durations.append(elapsed)

        print('Radius', outcome['radius'])
        print('Silhouette', silhouette)
        print('Adjusted Rand', rand)
        print('Time', elapsed)
        print('----------------')

    # One summary line per metric, in the same order as the per-run report.
    summary = (
        ('Average radius of ', radii),
        ('Average silhouette of ', silhouettes),
        ('Average rand of ', rand_scores),
        ('Average time of ', durations),
    )
    for heading, values in summary:
        print(heading, max_iterations, ' executions: ', round(sum(values)/len(values), 5))
    print('----------------')
    
# Run the experiment once; pass a larger number to average the metrics over
# several runs, each starting from a different random first center.
execute_calculations(1)