# Set-up

## Imports

In [None]:
import os
import json

import numpy as np
import matplotlib.pyplot as plt


In [None]:
from source.constants import DATA_DIR, FEATURE_VECTORS_SAVE_DIR, ANNOTATIONS_SAVE_DIR
from source.constants import ALL_CANCER_TYPES, ALL_IMG_NORMS, ALL_EXTRACTOR_MODELS, ALL_DIMENSIONALITY_REDUCTION_METHODS, ALL_CLUSTERING_ALGORITHMS, ALL_DISTANCE_METRICS

# Echo the imported project configuration (directories first, then the option
# sets) so the notebook output records what this run was configured with.
config_summary = {
    "DATA_DIR": DATA_DIR,
    "FEATURE_VECTORS_SAVE_DIR": FEATURE_VECTORS_SAVE_DIR,
    "ANNOTATIONS_SAVE_DIR": ANNOTATIONS_SAVE_DIR,
    "ALL_CANCER_TYPES": ALL_CANCER_TYPES,
    "ALL_EXTRACTOR_MODELS": ALL_EXTRACTOR_MODELS,
    "ALL_IMG_NORMS": ALL_IMG_NORMS,
    "ALL_DIMENSIONALITY_REDUCTION_METHODS": ALL_DIMENSIONALITY_REDUCTION_METHODS,
    "ALL_CLUSTERING_ALGORITHMS": ALL_CLUSTERING_ALGORITHMS,
    "ALL_DISTANCE_METRICS": ALL_DISTANCE_METRICS,
}
for setting_name, setting_value in config_summary.items():
    print(f"{setting_name}:", setting_value)

In [None]:
from source.eval_utils import precision_at_1, precision_at_k
from source.eval_utils import reduce_feature_dimensionality, get_clustering_labels, compute_clustering_metrics

In [None]:
from evaluate_clustering import get_true_connectivity, get_predicted_connectivity

In [None]:
%load_ext autoreload
%autoreload 2

## Notebook Constants

In [None]:
# TODO: Set the constants for the evaluation
CANCER_TYPE = 'lung_aca'
EXTRACTOR_NAME = 'UNI'
IMG_NORM = 'resize_only'
DISTANCE_METRIC = 'cosine'
DIMENSIONALITY_REDUCTION_METHOD = 'UMAP-2'
CLUSTERING_ALGORITHM = 'kmeans'

# Validate every constant against the option sets imported from
# source.constants. Each failure names the offending constant and the allowed
# values, so a typo is obvious from the traceback alone.
for constant_name, constant_value, allowed_values in (
    ("CANCER_TYPE", CANCER_TYPE, ALL_CANCER_TYPES),
    ("EXTRACTOR_NAME", EXTRACTOR_NAME, ALL_EXTRACTOR_MODELS),
    ("IMG_NORM", IMG_NORM, ALL_IMG_NORMS),
    ("CLUSTERING_ALGORITHM", CLUSTERING_ALGORITHM, ALL_CLUSTERING_ALGORITHMS),
    ("DIMENSIONALITY_REDUCTION_METHOD", DIMENSIONALITY_REDUCTION_METHOD, ALL_DIMENSIONALITY_REDUCTION_METHODS),
    ("DISTANCE_METRIC", DISTANCE_METRIC, ALL_DISTANCE_METRICS),
):
    assert constant_value in allowed_values, (
        f"{constant_name}={constant_value!r} is not one of {allowed_values}"
    )

In [None]:
# Input files for the selected cancer type / extractor / image normalisation.
# They are expected to already be there (presumably written by an earlier
# feature-extraction step - confirm); this notebook only reads them.
features_dir = f'{FEATURE_VECTORS_SAVE_DIR}/{CANCER_TYPE}/{EXTRACTOR_NAME}/{IMG_NORM}'
features_npy_path = f'{features_dir}/features.npy'
ids_2_imgpaths_json_path = f'{features_dir}/ids_2_img_paths.json'

# Name the missing file in the failure so the cause is visible immediately.
assert os.path.isfile(features_npy_path), f"Feature file not found: {features_npy_path}"
assert os.path.isfile(ids_2_imgpaths_json_path), (
    f"Id-to-image-path file not found: {ids_2_imgpaths_json_path}"
)

In [None]:
# Locate the saved manual annotations for the selected cancer type.
# NOTE(review): the extractor / normalisation sub-folders are fixed to
# 'UNI' / 'resize_only' instead of following EXTRACTOR_NAME / IMG_NORM -
# presumably the annotations only exist under that combination; confirm.
annotation_path_parts = (ANNOTATIONS_SAVE_DIR, CANCER_TYPE, 'UNI', 'resize_only')
manual_annotations_dir = os.path.join(*annotation_path_parts)
assert os.path.isdir(manual_annotations_dir)

# Show where the annotations live and what the directory contains.
print("Manual annotations directory", manual_annotations_dir)
annotation_dir_entries = os.listdir(manual_annotations_dir)
print("Manual annotations directory contents:\n", annotation_dir_entries)

In [None]:
# Number of dictionary entries echoed as a preview; printing every entry
# would flood the notebook output.
NUM_PREVIEW_ENTRIES = 5

features = np.load(features_npy_path)

if DISTANCE_METRIC == 'cosine':
    # L2-normalise each row so that Euclidean distance and cosine similarity
    # are monotonically related (for unit vectors, ||a - b||^2 = 2 - 2 * cos(a, b)).
    feature_norms = np.linalg.norm(features, axis=1, keepdims=True)
    # A zero-norm row would silently turn into NaNs under the division below.
    assert np.all(feature_norms > 0), "Cannot L2-normalise: some feature vectors have zero (or NaN) norm"
    features = features / feature_norms

# Load the id -> image path mapping
with open(ids_2_imgpaths_json_path, 'r') as f:
    ids_2_imgpaths = json.load(f)
print(f"Loaded {len(ids_2_imgpaths)} id -> image path entries; first {NUM_PREVIEW_ENTRIES}:")
print(dict(list(ids_2_imgpaths.items())[:NUM_PREVIEW_ENTRIES]))

# The image paths must be unique, otherwise inverting the mapping below would
# silently drop entries and the mapping would not be bijective.
assert len(set(ids_2_imgpaths.values())) == len(ids_2_imgpaths.values()), (
    "Duplicate image paths in ids_2_img_paths.json; cannot build a bijective mapping"
)
# Inverse mapping; the JSON keys are strings, so they are converted to int ids.
imgpaths_2_intids = {v: int(k) for k, v in ids_2_imgpaths.items()}
print(f"Built {len(imgpaths_2_intids)} image path -> int id entries; first {NUM_PREVIEW_ENTRIES}:")
print(dict(list(imgpaths_2_intids.items())[:NUM_PREVIEW_ENTRIES]))

## True Clustering

In [None]:
# Build the ground truth from the manual annotations: a connectivity matrix,
# its vector form, one label per annotated image and the number of clusters
# (meanings taken from the result names; see get_true_connectivity for details).
true_clustering = get_true_connectivity(manual_annotations_dir, ids_2_imgpaths)
(
    true_connectivity_matrix,
    true_connectivity_vector,
    true_cluster_labels,
    num_true_clusters,
) = true_clustering

print("Num true clusters:", num_true_clusters)
print("Total num images in all clusters:", len(true_cluster_labels))

# Dimensionality Reduction

In [None]:
# Label the value so the output is self-explanatory, matching the labelled
# "Clustering algorithm:" line printed later in the notebook.
print("Dimensionality reduction method:", DIMENSIONALITY_REDUCTION_METHOD)

In [None]:
# Reduce the features with the method chosen in the notebook constants and
# report the shape of the result.
# NOTE(review): some reduction methods (e.g. UMAP) are stochastic; whether
# reduce_feature_dimensionality fixes a seed is not visible here - confirm.
features_reduced = reduce_feature_dimensionality(
    features,
    method=DIMENSIONALITY_REDUCTION_METHOD,
)
reduced_shape = features_reduced.shape
print(reduced_shape)

# Precision@1, Precision@5 - Euclidean distance is used because the features were already normalized and then dimensionality-reduced

In [None]:
# Neighbours are ranked by Euclidean distance in the reduced feature space.
neighbour_metric = 'euclidean'
precision_at_1_value = precision_at_1(
    features_reduced, true_connectivity_matrix, metric=neighbour_metric
)
precision_at_5_value = precision_at_k(
    features_reduced, true_connectivity_matrix, k=5, metric=neighbour_metric
)

for precision_label, precision_value in (
    ("precision@1:", precision_at_1_value),
    ("precision@5:", precision_at_5_value),
):
    print(precision_label, precision_value)

In [None]:
# Cross-check the project's precision@1 against pytorch-metric-learning.
from pytorch_metric_learning.utils.accuracy_calculator import AccuracyCalculator

# Only 'precision_at_1' is read below, so restrict the calculator to that
# metric rather than computing its whole default metric suite.
accuracy_calculator = AccuracyCalculator(include=("precision_at_1",))
# No reference set is passed, so only the query embeddings and labels are supplied.
accuracy_dict = accuracy_calculator.get_accuracy(
    query=features_reduced,
    query_labels=true_cluster_labels
)

# Show both values side by side; they are presumably expected to agree - confirm.
precision_at_1_value, accuracy_dict['precision_at_1']

# Unsupervised Clustering

In [None]:
# Record which clustering algorithm this run uses.
print(f"Clustering algorithm: {CLUSTERING_ALGORITHM}")

In [None]:
# Cluster the reduced features, requesting as many clusters as the ground
# truth contains.
predicted_cluster_labels = get_clustering_labels(
    features=features_reduced,
    n_clusters=num_true_clusters,
    method=CLUSTERING_ALGORITHM,
)

# Distribution of the predicted cluster labels, binned into num_true_clusters
# equal-width bins and drawn as bars. Author's expectation: close to uniform,
# with 5000 / 250 = 20 images per cluster.
counts, bins = np.histogram(predicted_cluster_labels, bins=num_true_clusters)
left_bin_edges = bins[:-1]
plt.hist(left_bin_edges, bins=bins, weights=counts)
plt.show()

In [None]:
# Plot how many clusters have each size; per the author's note this should
# centre around 20 images per cluster.
# Use one unit-wide bin centred on every integer size. Passing the distinct
# sizes themselves as bin edges would merge the two largest sizes into the
# closed last bin and yield no bins at all when every cluster has the same size.
size_bin_edges = np.arange(counts.min(), counts.max() + 2) - 0.5
plt.hist(counts, bins=size_bin_edges)
plt.xlabel("Cluster size (number of images)")
plt.ylabel("Number of clusters")
plt.show()

In [None]:
# Connectivity vector implied by the predicted cluster labels.
predicted_connectivity_vector = get_predicted_connectivity(predicted_cluster_labels, ids_2_imgpaths)

# The predicted and true vectors must have the same shape before they are
# compared in the evaluation.
expected_connectivity_shape = true_connectivity_vector.shape
assert predicted_connectivity_vector.shape == expected_connectivity_shape
print("True connectivity vector shape:", expected_connectivity_shape)

## Evaluation

In [None]:
# Header line identifying the evaluated configuration ('#'-separated).
print(f"{EXTRACTOR_NAME}#{IMG_NORM}#{DISTANCE_METRIC}#{DIMENSIONALITY_REDUCTION_METHOD}#{CLUSTERING_ALGORITHM}")

# The report is wrapped in triple-backtick lines (presumably so it can be
# pasted as a Markdown code block).
print("\n```")
# Compute metrics
metrics = compute_clustering_metrics(
    true_connectivity_vector, predicted_connectivity_vector, true_cluster_labels, predicted_cluster_labels)
metrics['precision@1'] = precision_at_1_value
metrics['precision@5'] = precision_at_5_value

for metric, value in metrics.items():
    # NumPy scalars are matched explicitly: np.integer and e.g. np.float32 are
    # not instances of the built-in int / float and would otherwise fall
    # through to the multi-line fallback below.
    if isinstance(value, (int, np.integer)):
        print(f"{metric}: {value}")
    elif isinstance(value, (float, np.floating)):
        print(f"{metric}: {value:.4f}")
    else:
        # Anything else (e.g. arrays or nested structures) goes on its own line.
        print(f"{metric}:\n {value}")
print("```")