In [20]:
import multiprocessing
from concurrent.futures import ProcessPoolExecutor

import numpy as np
import pandas as pd
import scanpy as sc

from ccHBGF import ccHBGF

In [21]:
from clustering_utils import igraph_leiden

### Let's define our hyperparameters

In [22]:
# Input AnnData file (.h5ad) that gets clustered
ADATA_PATH = '../data/v0.3_clean_control/datasets/neuronal_raw_embed_propagated.h5ad'

# Leiden resolution handed to every clustering run (also embedded in the output column name)
RESOLUTION = 8

# Number of Leiden runs; run i uses i as its random seed
NSEEDS = 100

# Size of the process pool that executes the runs
NWORKERS = 20

### Let's load the dataset

In [None]:
# Read the dataset without backing mode; the `backed='r'` variant is left
# disabled on the next line.
# adata = sc.read_h5ad(ADATA_PATH, backed='r')
adata = sc.read_h5ad(ADATA_PATH)
adata  # last expression of the cell: displays a summary of the loaded object

### Let's get the adjacency matrix

In [24]:
# The neighbors entry in `adata.uns` records which `adata.obsp` slot holds the connectivities
connectivities_key = adata.uns['neighbors']['connectivities_key']
adjacency = adata.obsp[connectivities_key]

# Convert the adjacency matrix into an igraph graph
# NOTE(review): `sc._utils` is a private scanpy module and may move between releases
adata_igraph = sc._utils.get_igraph_from_adjacency(adjacency)

### Let's run the Leiden Community Detection

In [25]:
# One row per observation, one column per seeded Leiden run.
# The buffer is float64, so labels written into it are stored as floats.
n_obs = adata.shape[0]
labels_matrix = np.zeros(shape=(n_obs, NSEEDS), dtype=float)

In [26]:
def compute_leiden(seed):
    """Run one Leiden partition of the shared graph with `seed` as its random state.

    Reads the module-level `adata_igraph` and `RESOLUTION`, and returns whatever
    `igraph_leiden` returns (stored downstream as one column of labels).
    """
    labels = igraph_leiden(adata_igraph, RESOLUTION, random_state=seed)
    return labels

# Initialize a pool of processes.
# The workers have to be forked: `compute_leiden` is defined in the notebook's
# `__main__` and reads the module-level `adata_igraph`, neither of which a
# spawn/forkserver worker can import. Python 3.14 no longer uses fork as the
# default start method on POSIX, so request it explicitly.
fork_context = multiprocessing.get_context('fork')

with ProcessPoolExecutor(max_workers=NWORKERS, mp_context=fork_context) as executor:
    # Materialise the results while the pool is still alive so that a failing
    # worker raises here; `executor.map` yields results in seed order.
    results = list(executor.map(compute_leiden, range(NSEEDS)))

# Populate the labels_matrix with the results (column i <- run with seed i)
for i, labels in enumerate(results):
    labels_matrix[:, i] = labels

### Let's check for singleton clusters

In [27]:
def get_singleton_mask(arr, count_treshold):
    """Flag rows that fall into an undersized cluster in at least one column.

    Parameters
    ----------
    arr : array-like, 2-D
        Label matrix. Every column is treated as one independent clustering
        and every row as one item.
    count_treshold : int
        A label that occurs fewer than this many times within a column is
        considered a "singleton" cluster. (The spelling of the parameter name
        is kept so that existing keyword callers continue to work.)

    Returns
    -------
    numpy.ndarray of bool, shape ``(arr.shape[0],)``
        True for every row that belongs to a singleton cluster in *any*
        column, False otherwise.
    """
    arr = np.asarray(arr)
    mask = np.zeros(arr.shape, dtype=bool)

    for col_idx in range(arr.shape[1]):
        # `inverse` maps each row to the index of its label in the sorted
        # unique labels, so `counts[inverse]` is the size of each row's own
        # cluster; no per-label scan of the column is needed.
        _, inverse, counts = np.unique(
            arr[:, col_idx], return_inverse=True, return_counts=True
        )
        mask[:, col_idx] = counts[inverse.reshape(-1)] < count_treshold

    # A row is flagged as soon as one column puts it in a singleton cluster
    return mask.any(axis=1)

In [None]:
# Flag cells that land in a cluster with fewer than 3 members in any of the Leiden runs
singleton_mask = get_singleton_mask(labels_matrix, count_treshold=3)

# How many singletons exist?
# np.count_nonzero counts inside NumPy; the builtin sum() would iterate the
# mask element by element in Python.
n_singletons = np.count_nonzero(singleton_mask)
f"{n_singletons} singletons identified!"

### Let's run the consensus clustering

In [None]:
# Only the cells that were not flagged as singletons take part in the consensus
keep_mask = ~singleton_mask

# Flagged cells keep NaN; every other cell receives its consensus label
consensus_labels = np.full(adata.shape[0], fill_value=np.nan)
consensus_labels[keep_mask] = ccHBGF(labels_matrix[keep_mask], random_state=0, verbose=True)

In [30]:
# Column name carries the resolution formatted without decimals
# (e.g. RESOLUTION = 8 gives 'consensus_leiden_r8')
leiden_label = f'consensus_leiden_r{RESOLUTION:.0f}'
# Store the consensus labels as a categorical column in `adata.obs`
adata.obs[leiden_label] = pd.Categorical(consensus_labels)

In [None]:
# Both embeddings are drawn with identical styling, coloured by the consensus labels
plot_kwargs = dict(color=leiden_label, frameon=False, palette='Dark2', legend_loc=None)

sc.pl.umap(adata, **plot_kwargs)
sc.pl.tsne(adata, **plot_kwargs)

In [None]:
# Collect every `adata.uns` entry whose name contains 'color' first, then drop them
# (presumably the palettes stored while plotting - confirm they are not needed in the saved file)
color_keys = [key for key in adata.uns_keys() if 'color' in key]

for key in color_keys:
    del adata.uns[key]

In [34]:
# Save the annotated object to a new file in the same directory as the input dataset
adata.write('../data/v0.3_clean_control/datasets/neuronal_raw_embed_clustered.h5ad')