In [1]:
from mpl_toolkits.axes_grid1 import ImageGrid
import matplotlib.pyplot as plt
import pandas as pd
import random
import shutil
#import umap

# Own libraries
import sys
sys.path.append('/mnt/cephfs/home/users/krakovic/sharedscratch/Histomorphological-Phenotype-Learning')
from data_manipulation.data import Data
from data_manipulation.utils import store_data
from models.evaluation.folds import load_existing_split
from models.clustering.data_processing import *

## Data to deal with

In [None]:
## This data refers to the vector representations generated by the 10 epoch weights from DGX training

# Single root of the project checkout. Every path in this cell is derived from it,
# so moving the project only requires editing this one line.
main_path = '/mnt/cephfs/home/users/krakovic/sharedscratch/Histomorphological-Phenotype-Learning'

folds_pickle = os.path.join(main_path, 'utilities', 'fold_creation', 'lattice_5x_folds.pkl')

dataset        = 'LATTICeA_5x'
meta_field     = None
rep_key        = 'z_latent'
matching_field = 'samples'
fold_number    = 0
resolution     = 5.0
groupby        = 'leiden_%s' % resolution

# Folder holding the representations and clustering results of this run.
results_path       = os.path.join(main_path, 'results', 'BarlowTwins_3', dataset, 'h224_w224_n3_zdim128')
h5_complete_path   = os.path.join(results_path, 'hdf5_%s_he_complete_surv_sex.h5' % dataset)
h5_additional_path = None
adatas_path        = os.path.join(results_path, 'removal_folds_NoNAN', 'adatas')
# Name of the h5ad file: H5 name without the 'hdf5_' prefix and '.h5' suffix,
# followed by the groupby column (with '.' written as 'p') and the fold number.
adata_name         = h5_complete_path.split('/hdf5_')[1].split('.h5')[0] + '_%s__fold%s' % (groupby.replace('.', 'p'), fold_number)

# Output folder for the pickles with the indexes to remove (one sub-folder per dataset).
indexes_remove_path = os.path.join(main_path, 'utilities', 'files', 'indexes_to_remove')
data_remove_path    = os.path.join(indexes_remove_path, dataset)
# exist_ok avoids the check-then-create race and makes re-running the cell a no-op.
os.makedirs(data_remove_path, exist_ok=True)


### Dataset Images

In [3]:
# Data object giving access to the train / validation / test image H5 files of the dataset.
data = Data(
    dataset=dataset,
    marker='he',
    patch_h=224,
    patch_w=224,
    n_channels=3,
    batch_size=64,
    project_path='/mnt/cephfs/home/users/krakovic/sharedscratch/Histomorphological-Phenotype-Learning/',
    load=True,
)

Train Set: /mnt/cephfs/home/users/krakovic/sharedscratch/Histomorphological-Phenotype-Learning/datasets/LATTICeA_5x/he/patches_h224_w224/hdf5_LATTICeA_5x_he_train.h5
Validation Set: /mnt/cephfs/home/users/krakovic/sharedscratch/Histomorphological-Phenotype-Learning/datasets/LATTICeA_5x/he/patches_h224_w224/hdf5_LATTICeA_5x_he_validation.h5
Test Set: /mnt/cephfs/home/users/krakovic/sharedscratch/Histomorphological-Phenotype-Learning/datasets/LATTICeA_5x/he/patches_h224_w224/hdf5_LATTICeA_5x_he_test.h5



### Folds

### Representations, fold, and set frames.

In [4]:
# Keyword arguments shared by both representation loads.
frame_kwargs = dict(meta_field=meta_field, rep_key=rep_key)

# Complete set: loaded with check_meta=True.
complete_frame, complete_dims, complete_rest = representations_to_frame(
    h5_complete_path, check_meta=True, **frame_kwargs
)
# Additional set: same call without check_meta (h5_additional_path may be None).
additional_frame, additional_dims, additional_rest = representations_to_frame(
    h5_additional_path, **frame_kwargs
)

Loading representations: /mnt/cephfs/home/users/krakovic/sharedscratch/Histomorphological-Phenotype-Learning/results/BarlowTwins_3/LATTICeA_5x/h224_w224_n3_zdim128/hdf5_LATTICeA_5x_he_complete_surv_sex.h5
Meta field: None Keys: ['has_recurrence', 'hist_subtype', 'img_h_latent', 'img_z_latent', 'indexes', 'labels', 'li', 'male', 'original_set', 'os_event_data', 'os_event_ind', 'patterns', 'recurrence_event_data', 'rfs_event_data', 'rfs_event_ind', 'samples', 'slides', 'smoker', 'thoracic', 'tiles', 'vi']


In [7]:
# Load the previously created fold split from its pickle.
folds = load_existing_split(folds_pickle)

# Read the cluster assignments of the selected fold.
dataframes, complete_df, leiden_clusters = read_csvs(
    adatas_path,
    matching_field,
    groupby,
    fold_number,
    folds[fold_number],
    h5_complete_path,
    h5_additional_path,
)
# The last returned frame is used as the one for the additional dataset.
additional_df = dataframes[-1]

In [8]:
# Row position of every representation in the complete H5, kept so rows can be
# traced back to that file after the merge below.
complete_frame['complete_indexes'] = list(range(len(complete_frame)))

# Same integer dtype on both sides so the join keys compare equal.
complete_frame['indexes'] = complete_frame['indexes'].astype(int)
complete_df['indexes']    = complete_df['indexes'].astype(int)

# Cross check clusters with representations.
# The inner join drops entries that were not used for the clustering.
cluster_columns = complete_df[[groupby, 'indexes', 'original_set']]
complete_merged = complete_frame.merge(cluster_columns, how='inner', on=['indexes', 'original_set'])

### Clusters to review

In [None]:
# Image arrays of each split, keyed by the values used in the 'original_set' column.
data_dicts = {
    'train': data.training.images,
    'valid': data.validation.images,
    'test':  data.test.images,
}

def cluster_set_images(frame, data_dicts, cluster_id, groupby, batches=1, seed=None):
    """Show a random sample of the tiles assigned to one cluster.

    Up to ``100 * batches`` rows of the cluster are drawn at random, sorted, and
    displayed as figures of 5 x 20 tiles (100 tiles per figure).

    Args:
        frame: DataFrame with the cluster column ``groupby`` plus the columns
            'indexes' and 'original_set'.
        data_dicts: Mapping from an 'original_set' value to an indexable
            collection of images; ``data_dicts[original_set][index]`` is plotted.
        cluster_id: Value of ``frame[groupby]`` selecting the cluster.
        groupby: Name of the cluster column in ``frame``.
        batches: Maximum number of figures to draw.
        seed: Optional seed for the random selection. When None (default) the
            global ``random`` state is used, as before.

    Returns:
        None. Figures are displayed with ``plt.show()``.
    """
    tiles_per_batch = 100  # Matches the 5 x 20 ImageGrid below.

    # Select the cluster rows once and pair each index with its original set.
    in_cluster    = frame[groupby] == cluster_id
    indexes       = frame.loc[in_cluster, 'indexes'].values.tolist()
    original_sets = frame.loc[in_cluster, 'original_set'].values.tolist()
    combined      = list(zip(indexes, original_sets))

    if seed is None:
        random.shuffle(combined)
    else:
        random.Random(seed).shuffle(combined)
    combined_plot = sorted(combined[:tiles_per_batch*batches])

    for batch in range(batches):
        batch_items = combined_plot[tiles_per_batch*batch:tiles_per_batch*(batch+1)]
        if not batch_items:
            # The cluster has fewer tiles than requested: stop instead of drawing blank figures.
            if batch == 0:
                print('No samples found for cluster %s' % cluster_id)
            break

        # Division by 255 assumes 8-bit pixel values -- TODO confirm against the H5 contents.
        images_cluster = [data_dicts[original_set][int(index)]/255. for index, original_set in batch_items]

        sns.set_theme(style='white')
        try:
            fig = plt.figure(figsize=(30, 6))
            fig.suptitle('Cluster %s' % (cluster_id), fontsize=18, fontweight='bold')
            grid = ImageGrid(fig, 111, nrows_ncols=(5, 20), axes_pad=0.1,)

            # Remove ticks on every cell, including the ones left empty by a partial batch.
            for ax in grid:
                ax.set_xticks([])
                ax.set_yticks([])
            for ax, im in zip(grid, images_cluster):
                ax.imshow(im)

            plt.show()
        finally:
            # Restore the plotting theme even if drawing fails.
            sns.set_theme(style='darkgrid')

# for cluster_id in clusters_to_review:
#     print('Number of samples:', complete_merged[complete_merged[groupby]==cluster_id].shape[0])
#     cluster_set_images(complete_merged, data_dicts, cluster_id, groupby, batches=10)
#     print()
#     print()
#     print()

### Dump indexes to pickle

In [None]:
# After reviewing the clusters, some of them can be removed: these are clusters that
# do not contain any relevant information (e.g. artifacts, noise, etc.).
# The CSV carries one row per cluster with a 'remove' flag.
cluster_descriptions = pd.read_csv(
    '/mnt/cephfs/home/users/krakovic/sharedscratch/Histomorphological-Phenotype-Learning/results/BarlowTwins_3/LATTICeA_5x/h224_w224_n3_zdim128/removal/leiden_5p0_fold0/231212_DGX10e_removal_5p0_f0.csv'
)

# Keep the ids of the clusters whose 'remove' flag equals 1.
flagged_rows       = cluster_descriptions['remove'] == 1
clusters_to_remove = list(cluster_descriptions.loc[flagged_rows, 'cluster'])
clusters_to_remove

[6,
 9,
 16,
 19,
 28,
 29,
 30,
 33,
 42,
 46,
 47,
 50,
 53,
 58,
 59,
 72,
 73,
 74,
 79,
 80,
 85,
 92,
 94,
 100,
 102,
 105,
 107]

In [12]:
# Rows of the merged frame whose cluster was flagged for removal.
flagged_mask  = complete_merged[groupby].isin(clusters_to_remove)
data_clusters = complete_merged[flagged_mask]
data_clusters

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,rfs_event_data,rfs_event_ind,samples,slides,smoker,thoracic,tiles,vi,complete_indexes,leiden_5.0
8,0.217800,0.590432,0.406543,-0.705830,-0.132307,-0.418289,-0.347152,-0.105998,-0.047888,-0.031424,...,9.008219178,1,ACA_0799,ACA_0799_3_40x,1,0.0,50_2.jpeg,0.0,8,29
10,-0.839355,-0.503051,-0.776158,0.832314,-1.286932,1.161961,0.058325,0.638692,-0.968840,0.885668,...,64.96438356,0,ACA_0045,ACA_0045_11_40x,1,0.0,34048_87808.jpeg,0.0,10,19
14,0.023263,0.687327,0.530721,-0.488144,-0.025589,-0.273532,-0.113547,-0.265609,0.103211,-0.032787,...,59.93424658,1,ACA_1050,ACA_1050_10_40x,1,0.0,3_21.jpeg,0.0,14,58
16,-0.658249,-0.150491,1.225978,0.125077,-1.003166,-0.533260,-0.971227,0.711735,0.780629,-1.091792,...,27.15616438,0,ACA_0860,ACA_0860_7_40x,1,0.0,65_45.jpeg,0.0,16,42
20,1.327398,-0.043588,-0.925908,0.697587,-1.330123,0.275887,0.141777,-1.825047,1.691561,1.779177,...,34.3890411,1,ACA_0144,ACA_0144_13_40x,1,0.0,50_58.jpeg,0.0,20,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6992737,0.566942,1.821078,-1.845600,0.681373,-0.259512,2.698395,1.370832,0.994830,-2.114437,0.306831,...,5.589041096,1,ACA_0694,ACA_0694_2_40x,1,0.0,80640_16128.jpeg,1.0,6992737,46
6992738,0.741319,0.118185,-0.330407,-1.044611,-0.292770,-1.026443,-0.553536,0.462613,-0.475469,-0.051125,...,6.575342466,1,ACA_0488,ACA_0488_7_40x,0,0.0,62_16.jpeg,1.0,6992738,28
6992739,-2.394688,0.667215,-1.173937,-1.872837,0.429872,0.602509,1.588251,1.184795,-0.427774,-0.961876,...,2.630136986,0,ACA_0847,ACA_0847_14_40x,1,0.0,28_23.jpeg,0.0,6992739,94
6992744,0.713074,0.421853,-0.942992,0.057427,-1.603604,-0.150910,-0.488317,-1.150092,1.170694,1.507777,...,119.0465753,0,ACA_0725,ACA_0725_7_40x,1,0.0,77056_7168.jpeg,0.0,6992744,80


In [13]:
# Indexes to remove: 'complete' holds the row positions in the complete H5,
# the remaining keys hold the per-set indexes of the train/validation/test split.
indexes_set = {'complete': data_clusters['complete_indexes'].values.tolist()}
set_names = pd.unique(data_clusters['original_set'].values)
for set_name in set_names:
    in_set = data_clusters['original_set'] == set_name
    indexes_set[set_name] = data_clusters.loc[in_set, 'indexes'].astype(int).values.tolist()

# One pickle per key, named after the key.
for set_name, set_indexes in indexes_set.items():
    pickle_path = os.path.join(data_remove_path, '%s.pkl' % set_name)
    store_data(set_indexes, pickle_path)

### Additional dataset

In [83]:
# Store the indexes to remove for the additional dataset, when one is configured.
# With h5_additional_path = None there is no file name to derive the pickle name from,
# and the frames are presumably None as well (TODO confirm against representations_to_frame
# and read_csvs), so the cell is skipped instead of failing on attribute access.
if h5_additional_path is None or additional_frame is None or additional_df is None:
    print('No additional dataset provided, skipping.')
elif not additional_frame.shape[0] == additional_df.shape[0]:
    print('[Error] Expecting the same representations in the original H5 file and the cluster assignations.')
else:
    # Pickle is named after the text between 'hdf5_' and '_he' in the H5 file name.
    set_name = h5_additional_path.split('/hdf5_')[1].split('_he')[0]
    pickle_path = os.path.join(data_remove_path, '%s.pkl' % set_name)
    # NOTE: this rebinds indexes_set (a dict in the previous cell) to a plain list of row labels.
    indexes_set = additional_df[additional_df[groupby].isin(clusters_to_remove)].index.astype(int).values.tolist()
    # Only write a pickle when there is something to remove.
    if len(indexes_set) > 0:
        store_data(indexes_set, pickle_path)



### Copy the h5ad over for reference (costly in disk space, but necessary for backtracking)

In [None]:
# Rebuild the h5ad file name: H5 name without the 'hdf5_' prefix and '.h5' suffix,
# then the groupby column (with '.' written as 'p') and the fold number.
h5_stem    = h5_complete_path.split('/hdf5_')[1].split('.h5')[0]
adata_name = '%s_%s__fold%s' % (h5_stem, groupby.replace('.', 'p'), fold_number)
h5_path    = '%s.h5ad' % os.path.join(adatas_path, adata_name)

# Keep a copy (with file metadata) next to the pickles of indexes to remove.
shutil.copy2(h5_path, data_remove_path)