# Non-Parametric, Unsupervised UMAP and HDBSCAN

In [1]:
#Imports

### Unfortunately, UMAP takes a while to import. One of its dependencies (pynndescent) uses numba, 
### which is the cause of the performance bottleneck here.

import cv2
import hdbscan
import json 
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import random
import re
import seaborn as sns
import skimage 
import umap
import umap.plot

from bokeh.embed import json_item
from bokeh.models import HoverTool
from bokeh.models.tools import LassoSelectTool
from bokeh.plotting import show as bokeh_show, output_notebook
from glob import glob
from hdbscan import HDBSCAN
from IPython.display import display, HTML, Image, Javascript
from pathlib import Path
from sklearn.preprocessing import StandardScaler

In [2]:
#Constants
# Side length (pixels) that every image is resized to; images become IMG_SIZE x IMG_SIZE.
IMG_SIZE = 256
# Directory that is searched recursively for input images.
IMAGES_DIR = '../images/final_to_match'
# Upper bound on the number of image files collected for the analysis.
MAX_FILES = 300

We first collect the paths of up to `MAX_FILES` JPEG images found under `IMAGES_DIR`...

In [3]:
# Recursively collect up to MAX_FILES JPEG paths below IMAGES_DIR.
#
# Path.rglob yields entries in an arbitrary, filesystem-dependent order, so the
# matches are sorted before truncating; otherwise a different subset of images
# could be selected on another machine or after the directory changes.
images_root = Path(IMAGES_DIR)

jpeg_paths = sorted(
    path
    for path in images_root.rglob('*')
    # Case-insensitive extension match; directories are ignored.
    if path.suffix.lower() == ".jpg" and path.is_file()
)
image_files = jpeg_paths[:MAX_FILES]

image_files[0:4]

[PosixPath('../images/final_to_match/HG_161020_047_AM_N9.JPG'),
 PosixPath('../images/final_to_match/HG_161020_052_AM_P5.JPG'),
 PosixPath('../images/final_to_match/HG_161020_062_AM_N8.JPG'),
 PosixPath('../images/final_to_match/HG_161020_071_AM_N7.JPG')]

Then we read each image and preprocess it: convert to grayscale, resize to `IMG_SIZE` × `IMG_SIZE`, and cast to `float32`...

In [4]:
# Read every collected file, convert it to a grayscale float32 image of
# IMG_SIZE x IMG_SIZE, and keep `images`, `names` and `files` index-aligned.
images = []
names = []
files = []

total = len(image_files)

for i, file in enumerate(image_files):
    # Report progress at every 5% step. The check uses integer arithmetic:
    # the float test `i / total * 100 % 5 == 0` silently drops steps whose
    # quotient is not exactly representable (e.g. 165/300*100 evaluates to
    # 55.00000000000001, so "55.0% done" was never printed).
    if (i * 100) % (total * 5) == 0:
        print(f'{i * 100 / total}% done')

    file_str = str(file)
    image = cv2.imread(file_str)
    if image is None:
        # cv2.imread signals an unreadable/corrupt file by returning None
        # rather than raising; skip it so cvtColor does not crash and the
        # three lists stay aligned.
        print(f'Skipping unreadable image: {file_str}')
        continue

    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    image = cv2.resize(image, (IMG_SIZE, IMG_SIZE))
    image = image.astype('float32')
    # Optional per-image normalisation (currently disabled):
    #image = (image - image.mean(axis=(0, 1), keepdims = True)) / image.std(axis=(0, 1), keepdims = True)
    #image = cv2.normalize(image, None, 0, 1, cv2.NORM_MINMAX)

    images.append(image)
    names.append(file.stem)
    files.append(file_str)

print("Complete!")

0.0% done
5.0% done
10.0% done
15.0% done
20.0% done
25.0% done
30.0% done
35.0% done
40.0% done
45.0% done
50.0% done
60.0% done
65.0% done
70.0% done
75.0% done
80.0% done
85.0% done
90.0% done
95.0% done
Complete!


We then convert the list of images to a NumPy array.

In [5]:
# Combine the list of per-image arrays into a single numpy array and show its shape.
images = np.asarray(images)
images.shape

(300, 256, 256)

In [6]:
# Flatten each image into one row vector: one row per image, one column per pixel.
n_images = images.shape[0]
data = images.reshape(n_images, -1)
# Optional feature standardisation (currently disabled):
#data = StandardScaler().fit_transform(data)
data.shape

(300, 65536)

We then fit UMAP to the data.

In [7]:
# Fit a 2-component UMAP embedding of the flattened images. random_state is
# fixed so that repeated runs produce the same layout.
mapper = umap.UMAP(
    n_neighbors = 200,
    min_dist = 0.5,
    n_components = 2,
    metric = 'euclidean',
    random_state = 100,
    densmap = False
).fit(data)

# Use the embedding learned during fit() rather than re-projecting the training
# data with transform(). transform() is meant for new samples; running it on
# the training set is redundant and, depending on the UMAP version, may not
# reproduce `embedding_` exactly. Since the plot is drawn from `mapper`, taking
# `embedding_` keeps the clustered coordinates identical to the plotted ones.
umap_res = mapper.embedding_

We apply HDBSCAN to the UMAP results.

In [8]:
# Cluster the 2-D UMAP coordinates with HDBSCAN.
hdbscan_params = dict(
    algorithm = 'best',
    approx_min_span_tree = True,
    gen_min_span_tree = False,
    leaf_size = 40,
    metric = 'euclidean',
    min_cluster_size = 15,
    min_samples = 15,
    p = None,
)

cluster = HDBSCAN(**hdbscan_params).fit(umap_res)

The results are shown below.

In [9]:
# Per-point metadata that backs the hover tooltips of the interactive plot.
n_points = data.shape[0]
labels = cluster.labels_

hover_columns = {
    "file": files,
    'index' : np.arange(n_points),
    'name': names,
    'class': labels,
    'probability': cluster.probabilities_,
    # 1 for points carrying the label -1, 0 for every other point.
    'outlier': [int(label == -1) for label in labels],
}

hover_data = pd.DataFrame(data = hover_columns, index = np.arange(n_points))

In [10]:
# Preview the first ten rows; the redundant "index" column is hidden for display only.
hover_data.head(10).drop(columns = "index")

Unnamed: 0,file,name,class,probability,outlier
0,../images/final_to_match/HG_161020_047_AM_N9.JPG,HG_161020_047_AM_N9,-1,0.0,1
1,../images/final_to_match/HG_161020_052_AM_P5.JPG,HG_161020_052_AM_P5,-1,0.0,1
2,../images/final_to_match/HG_161020_062_AM_N8.JPG,HG_161020_062_AM_N8,-1,0.0,1
3,../images/final_to_match/HG_161020_071_AM_N7.JPG,HG_161020_071_AM_N7,-1,0.0,1
4,../images/final_to_match/HG_161023_220_AM_N1.JPG,HG_161023_220_AM_N1,-1,0.0,1
5,../images/final_to_match/HG_161023_222_AM_N2.JPG,HG_161023_222_AM_N2,-1,0.0,1
6,../images/final_to_match/HG_161023_224_AM_N4.JPG,HG_161023_224_AM_N4,-1,0.0,1
7,../images/final_to_match/HG_161023_246_AM_P8.JPG,HG_161023_246_AM_P8,-1,0.0,1
8,../images/final_to_match/HG_161023_250_AM_N5.JPG,HG_161023_250_AM_N5,-1,0.0,1
9,../images/final_to_match/HG_161023_257_AM_P1.JPG,HG_161023_257_AM_P1,-1,0.0,1


In [11]:
# Custom HTML tooltip for the interactive plot: an image thumbnail followed by
# the point's name, row index, file path, cluster class, membership
# probability and outlier flag. In Bokeh tooltip templates, "@column" is
# replaced by the value of that data-source column for the hovered point
# (here the hover_data columns), and "$index" by the point's row index.
hover = HoverTool(
    tooltips="""
    <div>
        <div>
            <img
                src="@file" height="128" alt="@file" width="128"
                style="float: left; margin: 0px 15px 15px 0px;"
                border="2"
            ></img>
        </div>
        <div>
            <span style="font-size: 10px; font-weight: bold;">@name</span>
            <span style="font-size: 10px; color: #966;">[$index]</span>
        </div>
        <div>
            <span style="font-size: 9px; color: #966;">@file</span>
        </div>
        <div>
            <span style="font-size: 9px; color: #966;">Class: @class</span>
        </div>
        <div>
            <span style="font-size: 9px; color: #966;">Probability: @probability</span>
        </div>
        <div>
            <span style="font-size: 9px; color: #966;">Outlier: @outlier</span>
        </div>
    </div>
    """
)

In [15]:
# Load BokehJS into the notebook before any plot is built or shown.
output_notebook()

# Interactive scatter of the UMAP embedding, coloured by HDBSCAN label.
p = umap.plot.interactive(
    mapper,
    labels = cluster.labels_,
    hover_data = hover_data,
    point_size = 5,
    interactive_text_search = False
)

# Swap umap's auto-generated hover tool for the custom image tooltip.
# The default tool is removed by type rather than by position: deleting
# "whatever tool is last" silently removes the wrong tool if the library ever
# changes the order in which it registers its tools.
p.toolbar.tools = [
    tool for tool in p.toolbar.tools if not isinstance(tool, HoverTool)
]
p.add_tools(hover)
p.add_tools(LassoSelectTool())

bokeh_show(p)

The front-end is built with Vue.js, so the plot must be delivered in a JavaScript-friendly format. We convert the Bokeh plot into a JSON blob, which can then be rendered with BokehJS.

In [16]:
# Serialise the plot into a JSON string suitable for embedding with BokehJS,
# then preview its first 150 characters.
plot_item = json_item(p)
p_json = json.dumps(plot_item)
p_json[:150]

'{"target_id": null, "root_id": "1157", "doc": {"defs": [], "roots": {"references": [{"attributes": {"background_fill_color": "white", "below": [{"id":'

In [17]:
# Create the target <div>, then have BokehJS render the serialised plot into it.
display(HTML('<div id="umap"></div>'))

embed_script = f'''Bokeh.embed.embed_item({p_json}, "umap")'''
Javascript(embed_script)

<IPython.core.display.Javascript object>