In [4]:
import scanpy as sc
import numpy as np
import pandas as pd
import os
import gc
import scanpy as sc
import numpy as np
import random
import os

# Base directory of the datasets; joined with the 'Dataset32' sub-folder when the data is loaded
root = './dataset/'

In [2]:
# Build the path to the Dataset32 .h5ad file under `root` and load it with scanpy
data_path = os.path.join(root, 'Dataset32', 'scRNA_count_cluster.h5ad')
data32 = sc.read(data_path)

# Last expression of the cell: display the summary of the loaded object
data32

AnnData object with n_obs × n_vars = 6178 × 21164
    obs: 'merge_cell_type'

In [3]:
# Number of distinct values in the 'merge_cell_type' observation annotation
len(set(data32.obs['merge_cell_type']))

14

# Finding the Leiden resolution for each target K (number of clusters)

In [None]:
def set_random_seed(seed):
    """Seed the global Python and NumPy RNGs and export seed-related env vars.

    Parameters
    ----------
    seed : int
        Value passed to ``random.seed`` and ``np.random.seed``; its string
        form is written to the ``PYTHONHASHSEED`` environment variable.

    Notes
    -----
    ``CUBLAS_WORKSPACE_CONFIG`` is always set to ``':4096:8'``. Both variables
    are only written into ``os.environ``; nothing in this function reads them.
    """
    # Seed the stdlib generator first, then NumPy's legacy global generator.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

    os.environ.update({
        'PYTHONHASHSEED': str(seed),
        'CUBLAS_WORKSPACE_CONFIG': ':4096:8',
    })

def leiden_clustering(adata, resolution, seed):
    """Run PCA, neighbour-graph construction and Leiden clustering on ``adata``.

    The three scanpy steps are executed in order, each with
    ``random_state=seed``; only the Leiden step receives ``resolution``.
    The scanpy calls are used for their effect on ``adata`` (their return
    values are ignored).

    Parameters
    ----------
    adata
        Object passed as first argument to every scanpy step.
    resolution
        Forwarded to ``sc.tl.leiden``.
    seed
        Forwarded as ``random_state`` to all three steps.

    Returns
    -------
    The same ``adata`` object that was passed in.
    """
    # (step, extra keyword arguments) in execution order
    pipeline = (
        (sc.tl.pca, {}),
        (sc.pp.neighbors, {}),
        (sc.tl.leiden, {'resolution': resolution}),
    )
    for step, extra_kwargs in pipeline:
        step(adata, random_state=seed, **extra_kwargs)
    return adata

def res_search(adata_pred, ncluster, seed=1234, max_iter=200, tol=1e-3):
    """Bisect the Leiden resolution until the clustering has ``ncluster`` clusters.

    The search interval is ``[0, 3]``. Every iteration reseeds the global RNGs
    with ``set_random_seed(seed)``, re-runs ``leiden_clustering`` on
    ``adata_pred`` at the interval midpoint and counts the distinct labels in
    ``adata_pred.obs['leiden']``. Too many clusters shrinks the interval from
    above, too few from below.

    The cluster count changes in jumps and may skip over the target, so the
    interval can shrink below ``tol`` without an exact hit. In that case the
    tried resolution whose cluster count was closest to ``ncluster`` is
    returned (the earliest one on ties), rather than whichever midpoint
    happened to be evaluated last.

    Parameters
    ----------
    adata_pred
        Object handed to ``leiden_clustering``; it is re-clustered on every
        iteration, so on return ``adata_pred.obs['leiden']`` holds the labels
        of the *last* resolution tried, which is not necessarily the returned
        one. Callers should re-run the clustering with the returned value.
    ncluster : int
        Desired number of clusters.
    seed : int, default 1234
        Seed used for reseeding and passed to ``leiden_clustering``.
    max_iter : int, default 200
        Maximum number of bisection steps.
    tol : float, default 1e-3
        The search stops once the interval width drops below this value.

    Returns
    -------
    float
        Resolution giving exactly ``ncluster`` clusters, or the closest one
        found when the search converges without an exact match.

    Raises
    ------
    RuntimeError
        If ``max_iter`` iterations pass without an exact match or convergence.
    """
    start, end = 0, 3  # resolution search range
    best_res = None    # resolution with the cluster count closest to the target
    best_count = None
    best_gap = None

    for i in range(max_iter):
        res = (start + end) / 2
        set_random_seed(seed)

        leiden_clustering(adata_pred, res, seed)

        count = len(set(adata_pred.obs['leiden']))
        print(f'Iter {i + 1}: res = {res:.4f}, count = {count}, target = {ncluster}')

        if count == ncluster:
            print(f'Found optimal resolution: {res}')
            return res

        # Remember the closest miss; strict '<' keeps the earliest one on ties.
        gap = abs(count - ncluster)
        if best_gap is None or gap < best_gap:
            best_res, best_count, best_gap = res, count, gap

        if count > ncluster:
            end = res
        else:
            start = res

        if abs(end - start) < tol:
            # No exact match exists at this tolerance: fall back to the best candidate.
            print(f'Converged to resolution: {best_res} (count = {best_count})')
            return best_res

    raise RuntimeError(f'Failed to find resolution after {max_iter} iterations')


# Target cluster counts; one Leiden labelling is stored per value as obs['leiden<K>'].
k_values = [5,10,15,20,25,30]
# Single seed shared by the resolution search and the final clustering run.
SEED = 1234

for k in k_values:
    print(f"\nProcessing K = {k}...")

    res = res_search(data32, k, seed=SEED)

    # Re-run at the selected resolution so obs['leiden'] holds exactly that result.
    leiden_clustering(data32, res, seed=SEED)

    # Store an independent copy so later runs cannot alias this column.
    labels = data32.obs['leiden'].copy()
    data32.obs[f'leiden{k}'] = labels

    # The search may converge without hitting k exactly; make that visible
    # instead of silently storing a column whose name promises k clusters.
    n_found = len(set(labels))
    if n_found != k:
        print(f"Warning: resolution {res} gives {n_found} clusters, not the requested {k}")

    print(f"Leiden clustering results for K = {k} stored in 'leiden{k}'")
    print(data32)

# Build the output path from `root`, the same way the input path is built.
out_path = os.path.join(root, 'Dataset32', 'Review_scRNA_count_cluster.h5ad')
data32.write(out_path)
print("\nAll Leiden clustering results saved")


Processing K = 5...


2025-03-03 17:16:01.560271: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-03 17:16:04.029563: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-03 17:16:04.029624: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-03 17:16:04.393708: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-03 17:16:05.183497: I tensorflow/core/platform/cpu_feature_guar

Iter 1: res = 1.5000, count = 25, target = 5
Iter 2: res = 0.7500, count = 15, target = 5
Iter 3: res = 0.3750, count = 9, target = 5
Iter 4: res = 0.1875, count = 4, target = 5
Iter 5: res = 0.2812, count = 8, target = 5
Iter 6: res = 0.2344, count = 5, target = 5
Found optimal resolution: 0.234375
Leiden clustering results for K = 5 stored in 'leiden5'
AnnData object with n_obs × n_vars = 6178 × 21164
    obs: 'merge_cell_type', 'leiden', 'leiden5'
    uns: 'pca', 'neighbors', 'leiden'
    obsm: 'X_pca'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'

Processing K = 10...
Iter 1: res = 1.5000, count = 25, target = 10
Iter 2: res = 0.7500, count = 15, target = 10
Iter 3: res = 0.3750, count = 9, target = 10
Iter 4: res = 0.5625, count = 12, target = 10
Iter 5: res = 0.4688, count = 9, target = 10
Iter 6: res = 0.5156, count = 11, target = 10
Iter 7: res = 0.4922, count = 11, target = 10
Iter 8: res = 0.4805, count = 11, target = 10
Iter 9: res = 0.4746, count = 11, target = 10

In [6]:
# Display the current summary of data32
data32

AnnData object with n_obs × n_vars = 6178 × 21164
    obs: 'merge_cell_type', 'leiden', 'leiden5', 'leiden10', 'leiden15', 'leiden20', 'leiden25', 'leiden30'
    uns: 'pca', 'neighbors', 'leiden'
    obsm: 'X_pca'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'

## Then, in `dataset.py`, replace the `merge_cell_type` column with the Leiden clustering column for the chosen K value:

## If K = 5: `self.seq_cluster = self.seq_data.obs['leiden5'].cat.codes.values`