In [1]:
#Imports

### Unfortunately, UMAP takes a while to import. One of its dependencies (pynndescent) uses numba, 
### which is the cause of the performance bottleneck here.

import cv2
import hdbscan
import json 
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import random
import re
import seaborn as sns
import skimage 
import umap
import umap.plot

from bokeh.embed import json_item
from bokeh.models import HoverTool
from bokeh.models.tools import LassoSelectTool
from bokeh.plotting import show as bokeh_show, output_notebook
from glob import glob
from joblib import dump, load
from hdbscan import HDBSCAN
from IPython.display import display, HTML, Image, Javascript
from pathlib import Path
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [2]:
#Constants
IMG_SIZE = 256
IMAGES_DIR = '../images/final_pigmentation_catalogue_2016'
TARGET_IMAGES_DIR = '../images/final_to_match'
MAX_FILES = 300

We first load the labelled images.

In [3]:
image_dirs = Path(IMAGES_DIR) 

labels = []
image_files = []

for image_dir in image_dirs.glob('*'):
    label = image_dir.stem
    for file in image_dir.glob('*'):
        ext = file.suffix.lower()
        if ext != ".png": continue
        image_files.append(file)
        labels.append(label)
    if len(image_files) >= MAX_FILES: break

Then we apply additional pre-processing.

In [4]:
images = []
names = []
files = []

for i, file in enumerate(image_files):
    file_str = str(file)
    image = cv2.imread(file_str)
    names.append(file.stem)
    files.append(file_str)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    image = cv2.resize(image, (IMG_SIZE, IMG_SIZE))
    image = image.astype('float32')
    image = cv2.bilateralFilter(image, 9, 50, 50)
    #image = cv2.normalize(image, np.zeros((IMG_SIZE, IMG_SIZE)), 0, 1, cv2.NORM_MINMAX)
    images.append(image)
    
    progress = i/MAX_FILES * 100
    if progress % 5 == 0: print(f'{progress}% done')
        
print("Complete!")

0.0% done
5.0% done
10.0% done
15.0% done
20.0% done
25.0% done
30.0% done
35.0% done
40.0% done
45.0% done
50.0% done
60.0% done
65.0% done
70.0% done
75.0% done
80.0% done
85.0% done
90.0% done
95.0% done
100.0% done
Complete!


The image data is converted into a numpy array.

In [5]:
images = np.asarray(images)
images.shape

(312, 256, 256)

Then flattened into 2D.

In [6]:
data = images.reshape((images.shape[0], -1))
#data = StandardScaler().fit_transform(data)
data.shape

(312, 65536)

UMAP will be fitted to the data. This time, the class labels will be provided.

A collection of all the class labels:

In [7]:
y_encoder = LabelEncoder()
y_encoder.fit(labels)
y_target = y_encoder.transform(labels)
set(labels)

{'0548',
 '0550',
 '0600',
 '0619',
 '0627',
 '0659',
 '0660',
 '0664',
 '0685',
 '0693',
 '0698',
 '0704',
 '0710',
 '0718',
 '0722',
 '0723',
 '0727'}

In [8]:
mapper = umap.UMAP(
    n_neighbors = 200,
    min_dist = 0.5,
    n_components = 2,
    metric = 'euclidean',
    random_state = 100,
    densmap = False
).fit(data, y = y_target)

umap_res = mapper.transform(data)

In [9]:
dump(mapper, 'umap.joblib') 

['umap.joblib']

We apply HDBSCAN to the UMAP results.

In [10]:
cluster = HDBSCAN(
    algorithm ='best', 
    approx_min_span_tree = True,
    gen_min_span_tree = False, 
    leaf_size = 40, 
    metric='euclidean', 
    min_cluster_size = 10,
    min_samples = 10, 
    p = None
).fit(umap_res)

In [11]:
hover_data = pd.DataFrame(
    index = np.arange(
        data.shape[0] 
    ), 
    data = {
        'index' : np.arange(data.shape[0]),
        'name': names,
        "file": files,
        "identity": labels,
        'cluster_class': cluster.labels_,
        'probability': cluster.probabilities_,
        'outlier': [ 1 if item == -1 else 0 for item in cluster.labels_ ]
    }
)

In [12]:
hover_data.head(10).drop("index", axis = 1)

Unnamed: 0,name,file,identity,cluster_class,probability,outlier
0,HG_110923_100_E2_N3,../images/final_pigmentation_catalogue_2016/05...,548,2,0.912379,0
1,HG_130711_093_E4_CL_N7,../images/final_pigmentation_catalogue_2016/05...,548,2,1.0,0
2,HG_130720_0333_E1_CL_AII,../images/final_pigmentation_catalogue_2016/05...,548,2,0.778601,0
3,HG_130720_0334_E1_CL_AII,../images/final_pigmentation_catalogue_2016/05...,548,2,0.586395,0
4,HG_130720_0359_E1_CL_AII,../images/final_pigmentation_catalogue_2016/05...,548,2,0.85475,0
5,HG_130720_0360_E1_CL_AII,../images/final_pigmentation_catalogue_2016/05...,548,2,1.0,0
6,HG_130720_0361_E1_CL_AII,../images/final_pigmentation_catalogue_2016/05...,548,2,1.0,0
7,HG_130720_0362_E1_CL_AII,../images/final_pigmentation_catalogue_2016/05...,548,2,1.0,0
8,HG_130720_0363_E1_CL_AII,../images/final_pigmentation_catalogue_2016/05...,548,2,1.0,0
9,HG_130720_0462_E1_KR_AII,../images/final_pigmentation_catalogue_2016/05...,548,2,0.51059,0


In [13]:
hover = HoverTool(
    tooltips="""
    <div>
        <div>
            <img
                src="@file" height="128" alt="@file" width="128"
                style="float: left; margin: 0px 15px 15px 0px;"
                border="2"
            ></img>
        </div>
        <div>
            <span style="font-size: 10px; font-weight: bold;">@name</span>
            <span style="font-size: 10px; color: #966;">[$index]</span>
        </div>
        <div>
            <span style="font-size: 9px; color: #966;">@file</span>
        </div>
        <div>
            <span style="font-size: 9px; color: #966;">Identity: @identity</span>
        </div>
        <div>
            <span style="font-size: 9px; color: #966;">Probability: @probability</span>
        </div>
        <div>
            <span style="font-size: 9px; color: #966;">Outlier: @outlier</span>
        </div>
    </div>
    """
)

In [28]:
p = umap.plot.interactive(mapper, labels = labels,hover_data = hover_data, point_size = 5, interactive_text_search = False)
del p.tools[len(p.tools)-1]
p.add_tools(hover)
p.add_tools(LassoSelectTool())
output_notebook()
bokeh_show(p)

The front-end has been built using Vue.js. Therefore, the data must be consumable using a JS-friendly format. We can convert the Bokeh plot into a JSON blob, and visualise it using Bokeh.js. 

In [15]:
p_json = json.dumps(json_item(p))
p_json[0:150]

'{"target_id": null, "root_id": "1004", "doc": {"defs": [], "roots": {"references": [{"attributes": {"callback": null, "tooltips": "\\n    <div>\\n      '

In [16]:
display(HTML('<div id="umap"></div>'))
Javascript(f'''Bokeh.embed.embed_item({p_json}, "umap")''')

<IPython.core.display.Javascript object>

Now, we want to apply UMAP to the unlabelled data.

In [17]:
sup_files = Path(TARGET_IMAGES_DIR) 
sup_image_files = []

for file in sup_files.rglob('*'):
    ext = file.suffix.lower()
    if ext == ".jpg": sup_image_files.append(file)
    if len(sup_image_files) >= MAX_FILES: break
    
sup_image_files[0:4]

[PosixPath('../images/final_to_match/HG_161020_047_AM_N9.JPG'),
 PosixPath('../images/final_to_match/HG_161020_052_AM_P5.JPG'),
 PosixPath('../images/final_to_match/HG_161020_062_AM_N8.JPG'),
 PosixPath('../images/final_to_match/HG_161020_071_AM_N7.JPG')]

In [18]:
sup_images = []
sup_names = []
sup_files = []

for i, file in enumerate(sup_image_files):
    file_str = str(file)
    image = cv2.imread(file_str)
    sup_names.append(file.stem)
    sup_files.append(file_str)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    image = cv2.resize(image, (IMG_SIZE, IMG_SIZE))
    image = image.astype('float32')
    #image = (image - image.mean(axis=(0, 1), keepdims = True)) / image.std(axis=(0, 1), keepdims = True)
    #image = cv2.normalize(image, None, 0, 1, cv2.NORM_MINMAX)
    sup_images.append(image)
    
    progress = i/MAX_FILES * 100
    if progress % 5 == 0: print(f'{progress}% done')
        
print("Complete!")

0.0% done
5.0% done
10.0% done
15.0% done
20.0% done
25.0% done
30.0% done
35.0% done
40.0% done
45.0% done
50.0% done
60.0% done
65.0% done
70.0% done
75.0% done
80.0% done
85.0% done
90.0% done
95.0% done
Complete!


In [19]:
sup_images = np.asarray(sup_images)
sup_images.shape

(300, 256, 256)

In [20]:
sup_data = sup_images.reshape((sup_images.shape[0], -1))
#data = StandardScaler().fit_transform(data)
sup_data.shape

(300, 65536)

In [21]:
sup_umap_res = mapper.transform(sup_data)

In [22]:
sup_cluster = HDBSCAN(
    algorithm ='best', 
    approx_min_span_tree = True,
    gen_min_span_tree = False, 
    leaf_size = 40, 
    metric='euclidean', 
    min_cluster_size = 15,
    min_samples = 15, 
    p = None
).fit(sup_umap_res)

In [23]:
sup_umap_res.shape

(300, 2)

In [24]:
sup_hover_data = pd.DataFrame(
    index = np.arange(
        sup_data.shape[0] 
    ), 
    data = {
        'index' : np.arange(sup_data.shape[0]),
        'name': sup_names,
        "file": sup_files,
        'class': sup_cluster.labels_,
        'probability': sup_cluster.probabilities_,
        'outlier': [ 1 if item == -1 else 0 for item in sup_cluster.labels_ ]
    }
)

In [25]:
sup_hover_data.head(10).drop("index", axis = 1)

Unnamed: 0,name,file,class,probability,outlier
0,HG_161020_047_AM_N9,../images/final_to_match/HG_161020_047_AM_N9.JPG,1,0.554439,0
1,HG_161020_052_AM_P5,../images/final_to_match/HG_161020_052_AM_P5.JPG,-1,0.0,1
2,HG_161020_062_AM_N8,../images/final_to_match/HG_161020_062_AM_N8.JPG,-1,0.0,1
3,HG_161020_071_AM_N7,../images/final_to_match/HG_161020_071_AM_N7.JPG,-1,0.0,1
4,HG_161023_220_AM_N1,../images/final_to_match/HG_161023_220_AM_N1.JPG,-1,0.0,1
5,HG_161023_222_AM_N2,../images/final_to_match/HG_161023_222_AM_N2.JPG,-1,0.0,1
6,HG_161023_224_AM_N4,../images/final_to_match/HG_161023_224_AM_N4.JPG,0,0.728885,0
7,HG_161023_246_AM_P8,../images/final_to_match/HG_161023_246_AM_P8.JPG,1,0.716484,0
8,HG_161023_250_AM_N5,../images/final_to_match/HG_161023_250_AM_N5.JPG,1,0.813304,0
9,HG_161023_257_AM_P1,../images/final_to_match/HG_161023_257_AM_P1.JPG,2,1.0,0


In [26]:
hover = HoverTool(
    tooltips="""
    <div>
        <div>
            <img
                src="@file" height="128" alt="@file" width="128"
                style="float: left; margin: 0px 15px 15px 0px;"
                border="2"
            ></img>
        </div>
        <div>
            <span style="font-size: 10px; font-weight: bold;">@name</span>
            <span style="font-size: 10px; color: #966;">[$index]</span>
        </div>
        <div>
            <span style="font-size: 9px; color: #966;">@file</span>
        </div>
        <div>
            <span style="font-size: 9px; color: #966;">Class: @class</span>
        </div>
        <div>
            <span style="font-size: 9px; color: #966;">Probability: @probability</span>
        </div>
        <div>
            <span style="font-size: 9px; color: #966;">Outlier: @outlier</span>
        </div>
    </div>
    """
)

In [27]:
sup_mapper = umap.UMAP()
sup_mapper.embedding = sup_umap_res
p = umap.plot.interactive(sup_mapper, labels = sup_cluster.labels_,hover_data = sup_hover_data, point_size = 5, interactive_text_search = False)
del p.tools[len(p.tools)-1]
p.add_tools(hover)
p.add_tools(LassoSelectTool())
output_notebook()
bokeh_show(p)