# Intro

Since all data is preprocessed and the datasets for thresholds from 0.5 to 0.9 (in steps of 0.1) are ready, we can proceed to comparing the results of different clustering models.

# Libraries

In [1]:
import numpy as np
import pandas as pd
import warnings
from math import tau
from scipy.integrate import quad
warnings.filterwarnings('ignore')

# Data

In [2]:
# Pixel values are divided by 255.0 on load; ndmin=2 keeps the array 2-D even for a single row.
data = np.loadtxt("./../DATA/digits2k_pixels.data.gz", ndmin=2) / 255.0
# Turn every flattened row into a square image. This assumes the number of
# columns is a perfect square; otherwise reshape raises ValueError.
n_samples, n_pixels = data.shape
image_side = int(np.sqrt(n_pixels))
# reshape() is used instead of assigning to data.shape, which NumPy discourages.
data = data.reshape(n_samples, image_side, image_side)
# Labels read from digits2k_pixels.labels.gz, parsed as integers.
labels = np.loadtxt("./../DATA/digits2k_pixels.labels.gz", dtype='int')

In [3]:
# Precomputed Fourier coefficient sets; the file names indicate depth 10 and
# thresholds 0.5 through 0.9.
coef_05 = np.load('SETS/Fourier_depth_10_th_0.5.npy')
coef_06 = np.load('SETS/Fourier_depth_10_th_0.6.npy')
coef_07 = np.load('SETS/Fourier_depth_10_th_0.7.npy')
coef_08 = np.load('SETS/Fourier_depth_10_th_0.8.npy')
coef_09 = np.load('SETS/Fourier_depth_10_th_0.9.npy')
# Same arrays keyed by their threshold digits so they can be iterated together.
coef_dict = {
    '05': coef_05,
    '06': coef_06,
    '07': coef_07,
    '08': coef_08,
    '09': coef_09,
}

# Functions needed

## Metrics between Fourier Series

In [4]:
def find_mse_radius(coef_1, coef_2, **order):
    """Return the Euclidean distance between the coefficient magnitudes of two series.

    Both inputs are indexed as 2-D arrays: column 0 is read as the real part
    and column 1 as the imaginary part of each coefficient. The magnitude
    ``sqrt(re**2 + im**2)`` is computed per coefficient, and the result is the
    square root of the summed squared magnitude differences (no mean is taken,
    despite the ``mse`` in the name).

    Parameters
    ----------
    coef_1, coef_2 : array-like supporting ``[:, 0]`` / ``[:, 1]`` indexing
        Coefficient arrays to compare.
    **order
        Ignored. Accepted so the function can be called with ``order=...``
        like the order-aware metrics.

    Returns
    -------
    float
        Distance between the two magnitude vectors.
    """
    magnitudes = [
        np.sqrt(coef[:, 0]**2 + coef[:, 1]**2)
        for coef in (coef_1, coef_2)
    ]
    gap = magnitudes[1] - magnitudes[0]
    return np.sqrt(np.sum(gap**2))

In [5]:
def find_mse_distance(coef_1, coef_2, **order):
    """Return the Euclidean distance between two coefficient arrays.

    Column 0 of each input is read as the real parts and column 1 as the
    imaginary parts. The squared real and imaginary differences are added per
    coefficient, summed over all coefficients, and the square root of that
    total is returned (no mean is taken, despite the ``mse`` in the name).

    Parameters
    ----------
    coef_1, coef_2 : array-like supporting ``[:, 0]`` / ``[:, 1]`` indexing
        Coefficient arrays to compare.
    **order
        Ignored. Accepted so the function can be called with ``order=...``
        like the order-aware metrics.

    Returns
    -------
    float
        Distance between the two coefficient sets.
    """
    real_gap = coef_2[:, 0] - coef_1[:, 0]
    imag_gap = coef_2[:, 1] - coef_1[:, 1]
    squared_gap = real_gap**2 + imag_gap**2
    return np.sqrt(np.sum(squared_gap))

In [6]:
def find_mse_coef_order(coef_1, coef_2, order):
    """Return the summed per-coefficient distance after applying the order kernel.

    Each coefficient (column 0 = real part, column 1 = imaginary part) is
    multiplied by ``exp(-1j * n)`` for ``n = -order .. order``. The modulus of
    the difference between the two rotated series is then taken per
    coefficient and the moduli are summed.

    Unlike ``find_mse_distance``, the square root is applied per coefficient
    before summing. Every kernel entry has modulus 1, so mathematically the
    result equals the sum of ``|c2 - c1|`` over the raw coefficients, up to
    floating-point rounding.

    Parameters
    ----------
    coef_1, coef_2 : array-like supporting ``[:, 0]`` / ``[:, 1]`` indexing
        Coefficient arrays; they must broadcast against a kernel of length
        ``2 * order + 1`` (presumably one row per harmonic -- confirm).
    order : int
        Highest harmonic index used to build the kernel.

    Returns
    -------
    float
        Sum of the per-coefficient moduli of the difference.
    """
    kernel = np.array([np.exp(-k * 1j) for k in range(-order, order + 1)])

    def rotated(coef):
        # Combine the two columns into complex coefficients, then apply the kernel.
        return (coef[:, 0] + 1j * coef[:, 1]) * kernel

    gap = rotated(coef_2) - rotated(coef_1)
    gap_real = np.real(gap)
    gap_imag = np.imag(gap)
    return np.sum(np.sqrt(gap_real * gap_real + gap_imag * gap_imag))

In [7]:
def find_mse_radius_order(coef_1, coef_2, order):
    """Return the Euclidean distance between coefficient magnitudes after applying the order kernel.

    Each coefficient (column 0 = real part, column 1 = imaginary part) is
    multiplied by ``exp(-1j * n)`` for ``n = -order .. order``. The magnitudes
    of the rotated coefficients are compared, and the square root of the
    summed squared magnitude differences is returned.

    Every kernel entry has modulus 1, so the magnitudes are mathematically
    unchanged by the multiplication; the result matches ``find_mse_radius`` up
    to floating-point rounding.

    Parameters
    ----------
    coef_1, coef_2 : array-like supporting ``[:, 0]`` / ``[:, 1]`` indexing
        Coefficient arrays; they must broadcast against a kernel of length
        ``2 * order + 1`` (presumably one row per harmonic -- confirm).
    order : int
        Highest harmonic index used to build the kernel.

    Returns
    -------
    float
        Distance between the two magnitude vectors.
    """
    kernel = np.array([np.exp(-k * 1j) for k in range(-order, order + 1)])

    def magnitudes(coef):
        # Rotate the complex coefficients, then take their moduli.
        rotated = (coef[:, 0] + 1j * coef[:, 1]) * kernel
        return np.sqrt(np.real(rotated)**2 + np.imag(rotated)**2)

    gap = magnitudes(coef_2) - magnitudes(coef_1)
    return np.sqrt(np.sum(gap**2))

## Scoring of models

In [20]:
def labelDiversity(labels_pred):
    """Return the standard deviation of the predicted cluster sizes.

    Sizes come from ``np.bincount``, so ``labels_pred`` must contain
    non-negative integers, and any unused label below the maximum counts as a
    cluster of size zero. A lower value means more evenly sized clusters.

    Parameters
    ----------
    labels_pred : array-like of non-negative int
        Cluster label assigned to each sample.

    Returns
    -------
    float
        Population standard deviation of the per-label counts.
    """
    cluster_sizes = np.bincount(labels_pred)
    return np.std(cluster_sizes)

In [34]:
def dominanceOfPredictedClusters(labels_pred, labels_true):
    """Return the mean share of the most common true label within each predicted cluster.

    For every distinct value in ``labels_pred``, the true labels of the
    samples in that cluster are counted, and the fraction belonging to the
    most frequent true label is recorded. The result is the unweighted mean
    of those fractions over all clusters.

    Parameters
    ----------
    labels_pred : array-like of int
        Cluster label assigned to each sample.
    labels_true : array-like of non-negative int
        Reference label of each sample, aligned with ``labels_pred``.

    Returns
    -------
    float
        Mean per-cluster dominance; 1.0 when every cluster contains a single
        true label.
    """
    labels_pred = np.asarray(labels_pred)
    # Index the labels_true argument, not the module-level `labels` array,
    # so the function scores whatever reference labels the caller supplies.
    labels_true = np.asarray(labels_true)
    cluster_dominance = []
    for cluster in np.unique(labels_pred):
        true_counts = np.bincount(labels_true[labels_pred == cluster])
        cluster_dominance.append(np.max(true_counts) / np.sum(true_counts))
    return np.mean(cluster_dominance)

## Distance Matrix

In [35]:
def createDistanceMatrix(coef_data, metric, order = 10):
    """Build the symmetric pairwise distance matrix of ``coef_data`` under ``metric``.

    Only pairs with ``j >= i`` (diagonal included) are evaluated; each value
    is mirrored to the lower triangle, so ``metric`` is assumed to be
    symmetric.

    Parameters
    ----------
    coef_data : sequence
        Items to compare; must support ``len()`` and integer indexing.
    metric : callable
        Called as ``metric(item_i, item_j, order=order)`` and expected to
        return a scalar.
    order : int, optional
        Forwarded to ``metric`` as a keyword argument (default 10).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(coef_data), len(coef_data))``.
    """
    size = len(coef_data)
    result = np.empty((size, size))
    for row in range(size):
        anchor = coef_data[row]
        for col in range(row, size):
            # Store once, then mirror the stored value across the diagonal.
            result[row, col] = metric(anchor, coef_data[col], order=order)
            result[col, row] = result[row, col]
    return result

# Model base

In [36]:
from sklearn.cluster import *

In [37]:
# Number of clusters passed as n_clusters to every model registered below.
n = 10
# Registry of estimators, keyed by a short model name.
clustering_models = {}

In [38]:
# K-means: one entry per `algorithm` option, all with the same seed.
# NOTE(review): newer scikit-learn releases appear to have dropped the `n_jobs`
# argument and the 'auto'/'full' algorithm names -- confirm against the
# installed version before re-running.
clustering_models.update({
    'Kmeans_auto': KMeans(n_clusters=n, random_state=0, algorithm='auto', n_jobs=-1),
    'Kmeans_full': KMeans(n_clusters=n, random_state=0, algorithm='full', n_jobs=-1),
    'Kmeans_elkan': KMeans(n_clusters=n, random_state=0, algorithm='elkan', n_jobs=-1),
})

In [39]:
# Agglomerative clustering: one entry per linkage criterion.
clustering_models.update({
    'Aggl_ward': AgglomerativeClustering(n_clusters=n, linkage='ward'),
    'Aggl_complete': AgglomerativeClustering(n_clusters=n, linkage='complete'),
    'Aggl_average': AgglomerativeClustering(n_clusters=n, linkage='average'),
    'Aggl_single': AgglomerativeClustering(n_clusters=n, linkage='single'),
})

In [40]:
# DBSCAN: the distance matrix needs to be normalized first

In [41]:
# OPTICS: the number of clusters cannot be specified

# Testing

In [42]:
# Alias for the labels loaded from digits2k_pixels.labels.gz; passed as the
# reference labels when scoring the predicted clusters.
labels_true = labels

In [43]:
# Metric functions evaluated for every coefficient set, keyed by the name
# printed in the report.
metric_functions = {
    'mse_radius': find_mse_radius,
    'mse_distance': find_mse_distance,
    'mse_distance_ord': find_mse_coef_order,
    'mse_radius_ord': find_mse_radius_order,
}

# For each coefficient set: build one distance matrix per metric, fit every
# model on every matrix, and print the two scores.
for coef, coef_data in coef_dict.items():
    distance_matrixes = {
        metric_name: createDistanceMatrix(coef_data, metric=metric_fn, order = 10)
        for metric_name, metric_fn in metric_functions.items()
    }

    for model, estimator in clustering_models.items():
        for dist, dist_matrix in distance_matrixes.items():
            # NOTE(review): the estimators are built without a "precomputed"
            # option, so each row of the distance matrix is presumably treated
            # as a feature vector rather than as pairwise distances -- confirm
            # this is intended.
            estimator.fit(dist_matrix)
            labels_pred = estimator.labels_
            print('SET:', coef, 'Model:', model, 'Metric:', dist, 'Diversity:', labelDiversity(labels_pred), 'Dominance:', dominanceOfPredictedClusters(labels_pred, labels_true))

SET: 05 Model: Kmeans_auto Metric: mse_radius Diversity: 107.89161227824896 Dominance: 0.5421049496048028
SET: 05 Model: Kmeans_auto Metric: mse_distance Diversity: 76.78150819044909 Dominance: 0.46362542037766435
SET: 05 Model: Kmeans_auto Metric: mse_distance_ord Diversity: 87.04826247547966 Dominance: 0.5124655695236109
SET: 05 Model: Kmeans_auto Metric: mse_radius_ord Diversity: 107.89161227824896 Dominance: 0.5421049496048028
SET: 05 Model: Kmeans_full Metric: mse_radius Diversity: 107.89161227824896 Dominance: 0.5421049496048028
SET: 05 Model: Kmeans_full Metric: mse_distance Diversity: 76.78150819044909 Dominance: 0.46362542037766435
SET: 05 Model: Kmeans_full Metric: mse_distance_ord Diversity: 87.04826247547966 Dominance: 0.5124655695236109
SET: 05 Model: Kmeans_full Metric: mse_radius_ord Diversity: 107.89161227824896 Dominance: 0.5421049496048028
SET: 05 Model: Kmeans_elkan Metric: mse_radius Diversity: 107.89161227824896 Dominance: 0.5421049496048028
SET: 05 Model: Kmeans_e

SET: 07 Model: Aggl_average Metric: mse_radius Diversity: 218.34651359708036 Dominance: 0.503544326524601
SET: 07 Model: Aggl_average Metric: mse_distance Diversity: 311.6549373906982 Dominance: 0.6142783126331619
SET: 07 Model: Aggl_average Metric: mse_distance_ord Diversity: 228.85322807423975 Dominance: 0.4753654239002896
SET: 07 Model: Aggl_average Metric: mse_radius_ord Diversity: 218.34651359708036 Dominance: 0.503544326524601
SET: 07 Model: Aggl_single Metric: mse_radius Diversity: 596.6667411545577 Dominance: 0.8611557788944724
SET: 07 Model: Aggl_single Metric: mse_distance Diversity: 589.3625369838161 Dominance: 0.8611788617886178
SET: 07 Model: Aggl_single Metric: mse_distance_ord Diversity: 589.3694936116053 Dominance: 0.8611788617886178
SET: 07 Model: Aggl_single Metric: mse_radius_ord Diversity: 596.6667411545577 Dominance: 0.8611557788944724
SET: 08 Model: Kmeans_auto Metric: mse_radius Diversity: 107.63456693832146 Dominance: 0.4871462283515572
SET: 08 Model: Kmeans_aut

# Checking best model

In [72]:
# Coefficient set stored under key '05' (loaded from the th_0.5 file).
best_coef = coef_dict['05']

In [74]:
# Reuses the KMeans(algorithm='elkan') instance from the registry (not a copy),
# so fitting best_model also updates the registry entry.
best_model = clustering_models['Kmeans_elkan']

In [75]:
# Metric used to build the distance matrix for the selected configuration.
best_metric = find_mse_distance

In [76]:
# Pairwise distance matrix of the selected set; `order` is forwarded to the
# metric as a keyword argument.
best_dist_matrix = createDistanceMatrix(best_coef, best_metric, order = 10)

In [77]:
# NOTE(review): KMeans presumably treats each row of the distance matrix as a
# feature vector here, not as precomputed distances -- confirm this is intended.
best_model.fit(best_dist_matrix)

KMeans(algorithm='elkan', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=10, n_init=10, n_jobs=-1, precompute_distances='auto',
       random_state=0, tol=0.0001, verbose=0)

In [78]:
# Number of samples assigned to each cluster label by the fitted model.
np.bincount(best_model.labels_)

array([261, 305, 190, 277,  29, 237, 206, 172, 118, 205], dtype=int64)

In [87]:
# Histogram of the true labels among the samples assigned to cluster 9.
in_cluster_9 = best_model.labels_ == 9
np.bincount(labels_true[in_cluster_9])

array([ 1,  1, 41, 52,  0,  9,  0, 81,  1, 19], dtype=int64)

In [71]:
# True labels of the samples assigned to cluster 4.
# NOTE(review): this cell's execution count (71) precedes the fit in cell 77,
# and its recorded output lists 4 samples while the bincount above reports 29
# for label 4 -- the displayed output looks stale; re-run to confirm.
in_cluster_4 = best_model.labels_ == 4
labels_true[in_cluster_4]

array([0, 0, 0, 0])