In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import ujson as json
from sklearn.decomposition import PCA

import umap
import hdbscan
import sklearn.cluster as cluster
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score
from tqdm.auto import tqdm
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Plain white seaborn style; figures default to a 10x8 figsize.
sns.set(style='white', rc={'figure.figsize':(10,8)})

In [2]:
# Load the ResNet activations that were exported to JSON.
# NOTE(review): this is an absolute, machine-specific path -- the notebook only
# runs where this exact file exists. Consider a configurable data directory.
data_dir = r'C:\Users\Rodney\PycharmProjects\Thesis_cur-AI-tor\notebooks\micro_dataset1_resnet18_output_identity.json'
with open(data_dir) as fh:
    data_dict_list = json.load(fh)

# Merge every element of the loaded list into one dict; when a key repeats,
# the later element's value replaces the earlier one.
data_dict = {}
for record in data_dict_list:
    data_dict.update(record)

# One DataFrame row per key of the merged dict; X is the underlying value
# array that the cells below pass to UMAP.
df_x = pd.DataFrame.from_dict(data_dict, orient='index')
X = df_x.values

In [10]:
import itertools

# Candidate values for the joint UMAP + HDBSCAN search. Tuples are ordered
# (n_neighbors, n_components, min_samples, min_cluster_size).
# Alternative ranges the author left commented out:
#   n_neighbors_l      = [60, 120, 240, 480, 960]
#   n_components_l     = [10, 20, 40, 80, 160]
#   min_cluster_size_l = [500, 750, 1000, 1250, 1500]
n_neighbors_l = [360, 480, 720]
n_components_l = [50, 25]
min_samples_l = [10, 20]
min_cluster_size_l = [500]

ls = [n_neighbors_l, n_components_l, min_samples_l, min_cluster_size_l]

# Show every combination the grid expands to (rightmost axis varies fastest).
for combo in itertools.product(
    n_neighbors_l, n_components_l, min_samples_l, min_cluster_size_l
):
    print(combo)

(360, 50, 10, 500)
(360, 50, 20, 500)
(360, 25, 10, 500)
(360, 25, 20, 500)
(480, 50, 10, 500)
(480, 50, 20, 500)
(480, 25, 10, 500)
(480, 25, 20, 500)
(720, 50, 10, 500)
(720, 50, 20, 500)
(720, 25, 10, 500)
(720, 25, 20, 500)


In [24]:
# UMAP-only grid: every (n_neighbors, n_components) pair to embed with.
# n_neighbors values left commented out by the author: [360, 480, 720]
n_neighbors_l = [1440, 2880]
n_components_l = [10]
ls = [n_neighbors_l, n_components_l]
umap_params = [
    (n_neighbors, n_components)
    for n_neighbors in n_neighbors_l
    for n_components in n_components_l
]
umap_params

[(1440, 10), (2880, 10)]

In [25]:
# Generate UMAP embeddings: one per (n_neighbors, n_components) pair in
# umap_params, appended in the same order.
embedding_list = []
for hparams in tqdm(umap_params):
    n_neighbors, n_components = hparams
    reducer = umap.UMAP(
        n_neighbors=n_neighbors,
        n_components=n_components,
        min_dist=0.0,
        random_state=3,  # fixed seed shared by every run in this notebook
    )
    clusterable_embedding = reducer.fit_transform(X)
    embedding_list.append(clusterable_embedding)

  0%|          | 0/2 [00:00<?, ?it/s]

In [26]:
# HDBSCAN-only grid: every (min_samples, min_cluster_size) pair, with
# min_cluster_size varying fastest.
min_samples_l = [1, 10, 20, 100]
min_cluster_size_l = [25, 250, 1000, 2000]
ls = [min_samples_l, min_cluster_size_l]
hdbscan_params = [
    (min_samples, min_cluster_size)
    for min_samples in min_samples_l
    for min_cluster_size in min_cluster_size_l
]
hdbscan_params

[(1, 25),
 (1, 250),
 (1, 1000),
 (1, 2000),
 (10, 25),
 (10, 250),
 (10, 1000),
 (10, 2000),
 (20, 25),
 (20, 250),
 (20, 1000),
 (20, 2000),
 (100, 25),
 (100, 250),
 (100, 1000),
 (100, 2000)]

In [27]:
# Sweep HDBSCAN over every precomputed UMAP embedding.
#
# results maps (n_neighbors, n_components, min_samples, min_cluster_size) to
#   'embedding': (index into embedding_list, the embedding itself)
#   'metric':    fraction of the rows of X that received a label >= 0
#
# The UMAP settings are part of the key on purpose: keyed by the HDBSCAN pair
# alone, each embedding's entries would overwrite those of the embedding
# processed before it, leaving only the last embedding's results.
results = {}
# tqdm wraps the list rather than enumerate() so it knows the total.
for i, clusterable_embedding in enumerate(tqdm(embedding_list)):
    # embedding_list[i] was produced from umap_params[i].
    umap_hparams = tuple(umap_params[i])
    for hparams in tqdm(hdbscan_params, leave=False):
        labels = hdbscan.HDBSCAN(
            min_samples=hparams[0],
            min_cluster_size=hparams[1],
        ).fit_predict(clusterable_embedding)

        clustered = (labels >= 0)
        metric = np.sum(clustered) / X.shape[0]

        results[umap_hparams + tuple(hparams)] = {
            'embedding': (i, clusterable_embedding),
            'metric': metric,
        }
        print(f'hparams: {hparams}, metric: {metric}')

0it [00:00, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

hparams: (1, 25), metric: 0.9637390790251593
hparams: (1, 250), metric: 0.38888523944032055
hparams: (1, 1000), metric: 0.3337055770873021
hparams: (1, 2000), metric: 0.0
hparams: (10, 25), metric: 0.29606516455363596
hparams: (10, 250), metric: 0.30368521316429087
hparams: (10, 1000), metric: 0.2696577547132628
hparams: (10, 2000), metric: 0.0
hparams: (20, 25), metric: 0.29934966826512516
hparams: (20, 250), metric: 0.27583262169086253
hparams: (20, 1000), metric: 0.31222492281416275
hparams: (20, 2000), metric: 0.0
hparams: (100, 25), metric: 0.3231951652105367
hparams: (100, 250), metric: 0.29797017670629966
hparams: (100, 1000), metric: 0.3193851409052092
hparams: (100, 2000), metric: 0.0


  0%|          | 0/16 [00:00<?, ?it/s]

hparams: (1, 25), metric: 0.4941864284306641
hparams: (1, 250), metric: 0.4482690665440452
hparams: (1, 1000), metric: 0.4628522630230572
hparams: (1, 2000), metric: 0.0
hparams: (10, 25), metric: 0.7520856598567957
hparams: (10, 250), metric: 0.30368521316429087
hparams: (10, 1000), metric: 0.3260855284766472
hparams: (10, 2000), metric: 0.0
hparams: (20, 25), metric: 0.32569138803126846
hparams: (20, 250), metric: 0.3629376601195559
hparams: (20, 1000), metric: 0.3817250213492741
hparams: (20, 2000), metric: 0.2843723313407344
hparams: (100, 25), metric: 0.34776325297247584
hparams: (100, 250), metric: 0.4608815607961637
hparams: (100, 1000), metric: 0.5488405701898443
hparams: (100, 2000), metric: 0.5488405701898443


In [11]:
# Joint UMAP + HDBSCAN hyperparameter search.
#
# The grid is spelled out in this cell instead of being read from the shared
# `ls` variable: other cells rebind `ls` to two-axis grids, and with such a
# grid the four-way unpacking below cannot work when the notebook is run from
# top to bottom.
search_grid = [
    [360, 480, 720],  # UMAP n_neighbors
    [50, 25],         # UMAP n_components
    [10, 20],         # HDBSCAN min_samples
    [500],            # HDBSCAN min_cluster_size
]
hparams_list = list(itertools.product(*search_grid))

# results maps each 4-tuple of hyperparameters to
#   'embedding': the UMAP embedding used for that run
#   'metric':    fraction of the rows of X that received a label >= 0
results = {}
embedded_with = None  # (n_neighbors, n_components) of the embedding currently held
for hparams in tqdm(hparams_list):
    n_neighbors, n_components, min_samples, min_cluster_size = hparams

    # product() varies the HDBSCAN axes fastest, so consecutive combinations
    # share their UMAP settings. The embedding depends only on those settings
    # and the fixed random_state, so it is refit only when they change;
    # entries with the same UMAP settings therefore share one array object.
    if (n_neighbors, n_components) != embedded_with:
        clusterable_embedding = umap.UMAP(
            n_neighbors=n_neighbors,
            min_dist=0.0,
            n_components=n_components,
            random_state=3,
        ).fit_transform(X)
        embedded_with = (n_neighbors, n_components)

    labels = hdbscan.HDBSCAN(
        min_samples=min_samples,
        min_cluster_size=min_cluster_size,
    ).fit_predict(clusterable_embedding)

    clustered = (labels >= 0)
    metric = np.sum(clustered) / X.shape[0]

    results[hparams] = {
        'embedding': clusterable_embedding,
        'metric': metric,
    }
    print(f'hparams: {hparams}, metric: {metric}')

  0%|          | 0/12 [00:00<?, ?it/s]

hparams: (360, 50, 10, 500), metric: 0.26906654404519476
hparams: (360, 50, 20, 500), metric: 0.30473625435196744
hparams: (360, 25, 10, 500), metric: 0.2750443408001051
hparams: (360, 25, 20, 500), metric: 0.38435262431846545
hparams: (480, 50, 10, 500), metric: 0.41457005846416606


KeyboardInterrupt: 

In [None]:
# 372
# NOTE(review): the meaning of "372" above is not evident from this notebook.
# Print the stored metric of every entry in results, in insertion order.
for entry in results.values():
    print(entry['metric'])

In [33]:

# Count how many entries of `labels` carry each label value. `labels` is
# whatever an earlier cell assigned last; values below 0 are the ones the
# cells above count as not clustered.
pd.DataFrame(labels).value_counts()

-1    6868
 1    4578
 0    3777
dtype: int64

In [34]:


# Single run with one hand-picked configuration: embed X with UMAP, then
# cluster that embedding with HDBSCAN.
reducer = umap.UMAP(
    n_neighbors=1440,
    n_components=10,
    min_dist=0.0,
    random_state=3,
)
clusterable_embedding = reducer.fit_transform(X)

clusterer = hdbscan.HDBSCAN(min_samples=1, min_cluster_size=500)
labels = clusterer.fit_predict(clusterable_embedding)

# Boolean mask: True wherever the assigned label is >= 0.
clustered = labels >= 0

In [35]:
# Fraction of the rows of X whose label is >= 0.
clustered.sum() / len(X)

0.37581291466859357