# Extract selected AnnData fields to test the (original) reimplemented kBET
- use metric-dev

In [1]:
import os
import sys
import scib
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from collections import Counter

In [2]:
# Add the parent directory to sys.path
sys.path.append(os.path.abspath('..'))
from kbet_utils import NeighborsError, diffusion_nn

### Sc 04

In [3]:
scenario = "sc_04"
scenario

'sc_04'

In [5]:
# Load the scenario dataset from the current working directory.
# Alternative location, kept for reference:
# adata = sc.read_h5ad("../scenarios/2025-10-28_sc_04.h5ad")
adata = sc.read_h5ad("2025-10-28_sc_04.h5ad")

adata

AnnData object with n_obs × n_vars = 1800 × 2000
    obs: 'cell_id', 'batch', 'cell_type'

In [6]:
# Run scib's preprocessing on `adata`. Judging by the log and the object shown
# after this cell, it selects HVGs and adds PCA, a neighbors graph and a UMAP
# embedding to `adata`.
print("Reducing data...")
scib.preprocessing.reduce_data(adata, batch_key="batch", umap=True)

adata

Reducing data...
HVG
Using 2000 HVGs from full intersect set
Using 0 HVGs from n_batch-1 set
Using 2000 HVGs
Computed 2000 highly variable genes
PCA


  adata.var["highly_variable"] = np.in1d(adata.var_names, hvg_list)


Nearest Neigbours


  from .autonotebook import tqdm as notebook_tqdm


UMAP




AnnData object with n_obs × n_vars = 1800 × 2000
    obs: 'cell_id', 'batch', 'cell_type'
    var: 'highly_variable'
    uns: 'pca', 'neighbors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'

In [7]:
# Keep only the genes flagged as highly variable. In this scenario all 2000
# genes carry the flag, so the shape of `adata` does not change.
adata = adata[:, adata.var["highly_variable"]].copy()

adata

AnnData object with n_obs × n_vars = 1800 × 2000
    obs: 'cell_id', 'batch', 'cell_type'
    var: 'highly_variable'
    uns: 'pca', 'neighbors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'

In [8]:
adata.uns["neighbors"]

{'connectivities_key': 'connectivities',
 'distances_key': 'distances',
 'params': {'n_neighbors': 15,
  'method': 'umap',
  'random_state': 0,
  'metric': 'euclidean',
  'use_rep': 'X_pca'}}

In [9]:
# Column names in `adata.obs`
label_key = "cell_type"
batch_key = "batch"

# Integration output type and the embedding used to build the neighbor graph
embed = "X_pca"
type_ = "full"

# Remaining switches
verbose = True
return_df = True
scaled = True

In [10]:
# Prepare `adata_tmp`, the object every following cell works on.
# For non-"knn" integration types with an embedding, a 50-nearest-neighbor graph
# is computed on that embedding and returned on a copy, leaving `adata` untouched.
# In every other case fall back to a plain copy of `adata`, so that `adata_tmp`
# is always defined and the following cells do not fail with a NameError.
if type_ != "knn" and embed is not None:
    adata_tmp = sc.pp.neighbors(adata, n_neighbors=50, use_rep=embed, copy=True)
else:
    adata_tmp = adata.copy()

adata_tmp

AnnData object with n_obs × n_vars = 1800 × 2000
    obs: 'cell_id', 'batch', 'cell_type'
    var: 'highly_variable'
    uns: 'pca', 'neighbors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'

In [11]:
adata_tmp.uns["neighbors"]

{'connectivities_key': 'connectivities',
 'distances_key': 'distances',
 'params': {'n_neighbors': 50,
  'method': 'umap',
  'random_state': 0,
  'metric': 'euclidean',
  'use_rep': 'X_pca'}}

In [12]:
if verbose:
    print(f"batch: {batch_key}")

batch: batch


In [13]:
# Upper bound used when checking the neighborhood size:
# the largest signed 32-bit integer.
size_max = (1 << 31) - 1
size_max

2147483647

In [14]:
# check if neighborhood size too small or only one batch in cell type

In [15]:
# counts: per cell type, the number of cells and the number of distinct batches.
# `observed=False` spells out the grouping behavior currently in effect; this is
# presumably what the warning emitted by this cell is about (pandas is changing
# the default for categorical group keys) - confirm against the warning text.
counts = adata_tmp.obs.groupby(label_key, observed=False).agg(
    {
        label_key: "count",
        batch_key: "nunique"
    }
)
counts

  counts = adata_tmp.obs.groupby(label_key).agg(


Unnamed: 0_level_0,cell_type,batch
cell_type,Unnamed: 1_level_1,Unnamed: 2_level_1
cell type 1,600,2
cell type 2,600,2
cell type 3,600,2


##### cell type 1

In [16]:
scenario

'sc_04'

In [17]:
cell_type = "1"
sc_dir = f"{scenario}/{cell_type}/original_kbet"

os.makedirs(sc_dir, exist_ok=True)

In [18]:
sc_dir

'sc_04/1/original_kbet'

In [19]:
adata_tmp

AnnData object with n_obs × n_vars = 1800 × 2000
    obs: 'cell_id', 'batch', 'cell_type'
    var: 'highly_variable'
    uns: 'pca', 'neighbors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'

In [20]:
clus = "cell type 1"

In [21]:
# Restrict to the cells of the current cell type. The column is looked up via
# `label_key` (instead of a hard-coded name) so that this cell follows the
# parameter cell, like the grouping and batch-count cells do.
adata_sub = adata_tmp[adata_tmp.obs[label_key] == clus, :].copy()

adata_sub

AnnData object with n_obs × n_vars = 600 × 2000
    obs: 'cell_id', 'batch', 'cell_type'
    var: 'highly_variable'
    uns: 'pca', 'neighbors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'

In [22]:
# mean: average batch size (number of cells per batch) within this cell type
cells_per_batch = adata_sub.obs[batch_key].value_counts()
mean = np.mean(cells_per_batch)
print(f"Average number of cells per batch in {clus}:", mean)

Average number of cells per batch in cell type 1: 300.0


In [23]:
quarter_mean = np.floor(mean / 4).astype("int")
quarter_mean

np.int64(75)

In [24]:
# Neighborhood size: a quarter of the mean batch size, kept within [10, 70].
k0_lower_bounded = np.max([10, quarter_mean])
k0 = np.min([70, k0_lower_bounded])
print(f"Using k0={k0}")

Using k0=70


In [25]:
# Shrink k0 if k0 * n_obs would reach `size_max`.
if k0 * adata_sub.n_obs >= size_max:
    # The message needs the f-prefix, otherwise "{k0}" is printed literally.
    print(f"Neighborhood size k0={k0} is not reasonable. Changing it.")
    k0 = np.floor(size_max / adata_sub.n_obs).astype("int")

if verbose:
    print(f"Use {k0} nearest neighbors.")

Use 70 nearest neighbors.


In [26]:
matrix = np.zeros(shape=(adata_sub.n_obs, k0 + 1))

matrix.shape, matrix

((600, 71),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]))

In [27]:
df = pd.DataFrame(matrix)

df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,61,62,63,64,65,66,67,68,69,70
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
596,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
598,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
sc_dir

'sc_04/1/original_kbet'

In [29]:
df.to_csv(f"{sc_dir}/df.csv", index=False, header=False)

In [30]:
# Batch label of every cell in the current cell type, read via the configured
# `batch_key` instead of a hard-coded column name.
batch = adata_sub.obs[batch_key]

batch

cell_id
cell_0      batch 1
cell_1      batch 1
cell_2      batch 1
cell_3      batch 1
cell_4      batch 1
             ...   
cell_595    batch 2
cell_596    batch 2
cell_597    batch 2
cell_598    batch 2
cell_599    batch 2
Name: batch, Length: 600, dtype: category
Categories (2, object): ['batch 1', 'batch 2']

In [31]:
batch.to_csv(f"{sc_dir}/batch.csv", index=False, header=False)

In [32]:
# Get the indices of the k0 nearest neighbors of every cell in this cell type.
# The result is cast to float; the shape shown is (n_obs, k0).
nn_index_sub = diffusion_nn(adata_sub, k=k0).astype("float")

nn_index_sub.shape

(600, 70)

In [33]:
nn_index_sub

array([[298.,  16., 442., ..., 182.,  94., 377.],
       [347., 407.,  18., ..., 183., 368., 101.],
       [410.,  30., 323., ..., 244., 186., 138.],
       ...,
       [515., 580., 569., ..., 598., 540., 495.],
       [530., 485., 543., ..., 497., 576., 484.],
       [528., 506., 507., ..., 521., 553., 487.]])

In [34]:
# Write the neighbor table without index or header. Indices are shifted by one
# (0-based -> 1-based), presumably for the consumer of knn.csv - confirm.
pd.DataFrame(nn_index_sub + 1).to_csv(f"{sc_dir}/knn.csv", index=False, header=False)

##### cell type 1 largest comp

In [53]:
scenario

'sc_04'

In [54]:
cell_type = "1"
sc_dir = f"{scenario}/{cell_type}/largest_comp"

os.makedirs(sc_dir, exist_ok=True)
sc_dir

'sc_04/1/largest_comp'

In [35]:
# Split the cell-type graph into strongly connected components.
#   n_comp: number of components found
#   labs:   array holding, for each cell, the index of its component
connectivities = adata_sub.obsp["connectivities"]
n_comp, labs = scipy.sparse.csgraph.connected_components(
    connectivities, connection="strong"
)
n_comp

2

In [36]:
labs

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [37]:
Counter(labs)

Counter({np.int32(0): 480, np.int32(1): 120})

In [38]:
if verbose:
    print(f"There are {n_comp} connected components (i.e., batches are not mixed).")

There are 2 connected components (i.e., batches are not mixed).


In [39]:
# Component sizes: number of cells per component label, largest first.
# `pd.value_counts` is deprecated (see the warning this cell used to emit);
# the Series method returns the same table.
comp_size = pd.Series(labs).value_counts()
comp_size

  comp_size = pd.value_counts(labs)


0    480
1    120
Name: count, dtype: int64

In [40]:
# threshold
comp_size_thresh = 3 * k0
comp_size_thresh

np.int64(210)

In [41]:
# Positions of the cells whose component has at least `comp_size_thresh` cells.
# `np.isin` replaces the deprecated `np.in1d`; for the 1-D `labs` array both
# return the same boolean mask.
large_components = comp_size[comp_size >= comp_size_thresh].index
idx_nonan = np.flatnonzero(np.isin(labs, large_components))
idx_nonan.shape, idx_nonan

  np.in1d(labs, comp_size[comp_size >= comp_size_thresh].index)


((480,),
 array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
         26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
         39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
         52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
         65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
         78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
         91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
        104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
        117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
        130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
        143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
        156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
        169, 170, 171, 172, 173, 174, 175,

In [42]:
len(idx_nonan), len(labs)

(480, 600)

In [43]:
print(f"{len(idx_nonan)} cells are in components of size >= 3*k0.")
print(f"{len(labs) - len(idx_nonan)} cells are in components of size < 3*k0 and will be ignored.")

480 cells are in components of size >= 3*k0.
120 cells are in components of size < 3*k0 and will be ignored.


In [44]:
len(idx_nonan) / len(labs)

0.8

In [45]:
# Keep only the cells that belong to components passing the size threshold.
adata_sub_sub = adata_sub[idx_nonan, :].copy()
adata_sub_sub   # here only component 0 (480 cells) passes the threshold

AnnData object with n_obs × n_vars = 480 × 2000
    obs: 'cell_id', 'batch', 'cell_type'
    var: 'highly_variable'
    uns: 'pca', 'neighbors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'

In [46]:
# Allocate the (n_obs, k0) neighbor-index table. np.empty leaves the memory
# uninitialized, so the values displayed by this cell are arbitrary.
nn_index_tmp = np.empty(shape=(adata_sub.n_obs, k0))
nn_index_tmp.shape, nn_index_tmp

((600, 70),
 array([[6.40783711e-310, 6.40783711e-310, 5.15939089e-310, ...,
         3.03101420e+000, 3.06540394e+000, 3.06568575e+000],
        [3.07262087e+000, 3.07597089e+000, 3.07689834e+000, ...,
         3.61631083e+000, 3.62086511e+000, 3.62399912e+000],
        [3.62826157e+000, 3.62884283e+000, 3.63332891e+000, ...,
         2.94693494e+000, 2.96366310e+000, 2.98333788e+000],
        ...,
        [1.14799972e-311, 1.15436571e-311, 1.15860970e-311, ...,
         1.18195166e-311, 1.18619565e-311, 1.19043964e-311],
        [1.19468363e-311, 1.19892762e-311, 1.20317161e-311, ...,
         1.17134168e-311, 1.17982966e-311, 1.20529361e-311],
        [1.21165960e-311, 1.22226958e-311, 1.23500155e-311, ...,
         1.24136754e-311, 1.03765594e-311, 1.15012172e-311]]))

In [47]:
# Mark every entry as missing; the rows of the kept cells are filled in next.
nn_index_tmp[...] = np.nan
nn_index_tmp

array([[nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]])

In [48]:
# Fill the rows of the kept cells with their k0 neighbors computed on the
# filtered object; rows of the dropped cells stay NaN.
# NOTE(review): the neighbors come from `adata_sub_sub`, so the stored indices
# presumably refer to positions in `adata_sub_sub`, not in `adata_sub`, while
# batch.csv is written for all cells of `adata_sub` - confirm this is the
# behavior the test is meant to capture.
nn_index_tmp[idx_nonan] = diffusion_nn(adata_sub_sub, k=k0).astype("float")
nn_index_tmp

array([[298.,  16., 442., ..., 182.,  94., 377.],
       [347., 407.,  18., ..., 183., 368., 101.],
       [410.,  30., 323., ..., 244., 186., 138.],
       ...,
       [ nan,  nan,  nan, ...,  nan,  nan,  nan],
       [ nan,  nan,  nan, ...,  nan,  nan,  nan],
       [ nan,  nan,  nan, ...,  nan,  nan,  nan]])

In [49]:
# Row positions whose neighbor entries are all NaN, i.e. the cells left out above.
print("Indices of nan rows:")
all_nan_rows = np.all(np.isnan(nn_index_tmp), axis=1)
(idx_all_nan,) = np.where(all_nan_rows)
idx_all_nan

Indices of nan rows:


array([480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492,
       493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505,
       506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518,
       519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531,
       532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544,
       545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557,
       558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570,
       571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583,
       584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596,
       597, 598, 599])

In [50]:
# All-zero placeholder table with one row per cell and k0 columns.
# NOTE(review): the original_kbet section of this notebook builds the same table
# with k0 + 1 columns - confirm the difference is intentional.
matrix = np.zeros(shape=(adata_sub.n_obs, k0))
matrix.shape

(600, 70)

In [51]:
df = pd.DataFrame(matrix)

df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,60,61,62,63,64,65,66,67,68,69
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
596,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
598,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [55]:
sc_dir

'sc_04/1/largest_comp'

In [56]:
df.to_csv(f"{sc_dir}/df.csv", index=False, header=False)

In [57]:
# Batch label of every cell in the current cell type (all cells, including those
# outside the kept components), read via the configured `batch_key` instead of a
# hard-coded column name.
batch = adata_sub.obs[batch_key]

batch

cell_id
cell_0      batch 1
cell_1      batch 1
cell_2      batch 1
cell_3      batch 1
cell_4      batch 1
             ...   
cell_595    batch 2
cell_596    batch 2
cell_597    batch 2
cell_598    batch 2
cell_599    batch 2
Name: batch, Length: 600, dtype: category
Categories (2, object): ['batch 1', 'batch 2']

In [58]:
Counter(batch)

Counter({'batch 1': 480, 'batch 2': 120})

In [59]:
batch.to_csv(f"{sc_dir}/batch.csv", index=False, header=False)

In [60]:
# Write the neighbor table without index or header. Indices are shifted by one
# (0-based -> 1-based, presumably for the consumer of knn.csv - confirm);
# the NaN rows of the dropped cells remain NaN after the shift.
pd.DataFrame(nn_index_tmp + 1).to_csv(f"{sc_dir}/knn.csv", index=False, header=False)

##### cell type 2

In [61]:
scenario

'sc_04'

In [62]:
cell_type = "2"
sc_dir = f"{scenario}/{cell_type}/original_kbet"

os.makedirs(sc_dir, exist_ok=True)

In [63]:
sc_dir

'sc_04/2/original_kbet'

In [64]:
adata_tmp

AnnData object with n_obs × n_vars = 1800 × 2000
    obs: 'cell_id', 'batch', 'cell_type'
    var: 'highly_variable'
    uns: 'pca', 'neighbors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'

In [65]:
clus = "cell type 2"

In [66]:
# Restrict to the cells of the current cell type. The column is looked up via
# `label_key` (instead of a hard-coded name) so that this cell follows the
# parameter cell, like the grouping and batch-count cells do.
adata_sub = adata_tmp[adata_tmp.obs[label_key] == clus, :].copy()

adata_sub

AnnData object with n_obs × n_vars = 600 × 2000
    obs: 'cell_id', 'batch', 'cell_type'
    var: 'highly_variable'
    uns: 'pca', 'neighbors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'

In [67]:
# mean: average batch size (number of cells per batch) within this cell type
cells_per_batch = adata_sub.obs[batch_key].value_counts()
mean = np.mean(cells_per_batch)
print(f"Average number of cells per batch in {clus}:", mean)

Average number of cells per batch in cell type 2: 300.0


In [68]:
quarter_mean = np.floor(mean / 4).astype("int")
quarter_mean

np.int64(75)

In [69]:
# Neighborhood size: a quarter of the mean batch size, kept within [10, 70].
k0_lower_bounded = np.max([10, quarter_mean])
k0 = np.min([70, k0_lower_bounded])
print(f"Using k0={k0}")

Using k0=70


In [70]:
# Shrink k0 if k0 * n_obs would reach `size_max`.
if k0 * adata_sub.n_obs >= size_max:
    # The message needs the f-prefix, otherwise "{k0}" is printed literally.
    print(f"Neighborhood size k0={k0} is not reasonable. Changing it.")
    k0 = np.floor(size_max / adata_sub.n_obs).astype("int")

if verbose:
    print(f"Use {k0} nearest neighbors.")

Use 70 nearest neighbors.


In [71]:
matrix = np.zeros(shape=(adata_sub.n_obs, k0 + 1))

matrix.shape, matrix

((600, 71),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]))

In [72]:
df = pd.DataFrame(matrix)

df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,61,62,63,64,65,66,67,68,69,70
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
596,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
598,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [73]:
sc_dir

'sc_04/2/original_kbet'

In [74]:
df.to_csv(f"{sc_dir}/df.csv", index=False, header=False)

In [75]:
# Batch label of every cell in the current cell type, read via the configured
# `batch_key` instead of a hard-coded column name.
batch = adata_sub.obs[batch_key]

batch

cell_id
cell_600     batch 1
cell_601     batch 1
cell_602     batch 1
cell_603     batch 1
cell_604     batch 1
              ...   
cell_1195    batch 2
cell_1196    batch 2
cell_1197    batch 2
cell_1198    batch 2
cell_1199    batch 2
Name: batch, Length: 600, dtype: category
Categories (2, object): ['batch 1', 'batch 2']

In [76]:
batch.to_csv(f"{sc_dir}/batch.csv", index=False, header=False)

In [77]:
# Get the indices of the k0 nearest neighbors of every cell in this cell type.
# The result is cast to float; the shape shown is (n_obs, k0).
nn_index_sub = diffusion_nn(adata_sub, k=k0).astype("float")

nn_index_sub.shape

(600, 70)

In [78]:
nn_index_sub

array([[ 10., 114.,  91., ...,  72., 118.,  14.],
       [ 23.,  37.,  57., ...,  33.,  14.,  15.],
       [ 80.,  96., 105., ...,  86.,  88.,  30.],
       ...,
       [166., 162., 445., ..., 144., 573., 134.],
       [128., 539., 417., ..., 503., 124., 307.],
       [532., 400., 575., ..., 304., 571., 226.]])

In [79]:
# Write the neighbor table without index or header. Indices are shifted by one
# (0-based -> 1-based), presumably for the consumer of knn.csv - confirm.
pd.DataFrame(nn_index_sub + 1).to_csv(f"{sc_dir}/knn.csv", index=False, header=False)

##### cell type 2 largest comp

In [81]:
scenario

'sc_04'

In [82]:
cell_type = "2"
sc_dir = f"{scenario}/{cell_type}/largest_comp"

os.makedirs(sc_dir, exist_ok=True)
sc_dir

'sc_04/2/largest_comp'

In [83]:
# Split the cell-type graph into strongly connected components.
#   n_comp: number of components found
#   labs:   array holding, for each cell, the index of its component
connectivities = adata_sub.obsp["connectivities"]
n_comp, labs = scipy.sparse.csgraph.connected_components(
    connectivities, connection="strong"
)
n_comp

2

In [84]:
labs

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [85]:
Counter(labs)

Counter({np.int32(1): 480, np.int32(0): 120})

In [86]:
if verbose:
    print(f"There are {n_comp} connected components (i.e., batches are not mixed).")

There are 2 connected components (i.e., batches are not mixed).


In [87]:
# Component sizes: number of cells per component label, largest first.
# `pd.value_counts` is deprecated (see the warning this cell used to emit);
# the Series method returns the same table.
comp_size = pd.Series(labs).value_counts()
comp_size

  comp_size = pd.value_counts(labs)


1    480
0    120
Name: count, dtype: int64

In [88]:
# threshold
comp_size_thresh = 3 * k0
comp_size_thresh

np.int64(210)

In [89]:
# Positions of the cells whose component has at least `comp_size_thresh` cells.
# `np.isin` replaces the deprecated `np.in1d`; for the 1-D `labs` array both
# return the same boolean mask.
large_components = comp_size[comp_size >= comp_size_thresh].index
idx_nonan = np.flatnonzero(np.isin(labs, large_components))
idx_nonan.shape, idx_nonan

  np.in1d(labs, comp_size[comp_size >= comp_size_thresh].index)


((480,),
 array([120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132,
        133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145,
        146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158,
        159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171,
        172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184,
        185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197,
        198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210,
        211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
        224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236,
        237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
        250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262,
        263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275,
        276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288,
        289, 290, 291, 292, 293, 294, 295,

In [90]:
len(idx_nonan), len(labs)

(480, 600)

In [91]:
print(f"{len(idx_nonan)} cells are in components of size >= 3*k0.")
print(f"{len(labs) - len(idx_nonan)} cells are in components of size < 3*k0 and will be ignored.")

480 cells are in components of size >= 3*k0.
120 cells are in components of size < 3*k0 and will be ignored.


In [92]:
len(idx_nonan) / len(labs)

0.8

In [93]:
# Keep only the cells that belong to components passing the size threshold.
adata_sub_sub = adata_sub[idx_nonan, :].copy()
adata_sub_sub   # here only component 1 (480 cells) passes the threshold

AnnData object with n_obs × n_vars = 480 × 2000
    obs: 'cell_id', 'batch', 'cell_type'
    var: 'highly_variable'
    uns: 'pca', 'neighbors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'

In [94]:
# Allocate the (n_obs, k0) neighbor-index table. np.empty leaves the memory
# uninitialized, so the values displayed by this cell are arbitrary.
nn_index_tmp = np.empty(shape=(adata_sub.n_obs, k0))
nn_index_tmp.shape, nn_index_tmp

((600, 70),
 array([[6.40783711e-310, 6.40783711e-310, 5.15939514e-310, ...,
         3.90447226e-312, 3.94691217e-312, 3.98935209e-312],
        [4.03179200e-312, 4.09545188e-312, 4.37131133e-312, ...,
         1.07797386e-311, 1.08221785e-311, 1.09070584e-311],
        [1.10555981e-311, 1.10980380e-311, 1.12253577e-311, ...,
         7.38454535e-312, 7.80894451e-312, 8.10602392e-312],
        ...,
        [1.33690832e-006, 1.43406086e-008, 1.75344143e-006, ...,
         5.57626234e-011, 2.39151862e-010, 5.52060725e-010],
        [9.10507122e-013, 7.14732763e-014, 2.47131179e-014, ...,
         5.73115467e-007, 1.88770421e-011, 5.49943816e-007],
        [7.81250180e-003, 3.72405615e-009, 7.53530840e-015, ...,
         1.56397028e-008, 1.23521887e-016, 5.30466347e-016]]))

In [95]:
# Mark every entry as missing; the rows of the kept cells are filled in next.
nn_index_tmp[...] = np.nan
nn_index_tmp

array([[nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]])

In [96]:
# Fill the rows of the kept cells with their k0 neighbors computed on the
# filtered object; rows of the dropped cells stay NaN.
# NOTE(review): the neighbors come from `adata_sub_sub`, so the stored indices
# presumably refer to positions in `adata_sub_sub`, not in `adata_sub`. Here the
# kept cells start at position 120 of `adata_sub`, and the values shown by this
# cell are 120 lower than the corresponding ones in `nn_index_sub`, while
# batch.csv is written for all cells of `adata_sub` - confirm this offset is the
# behavior the test is meant to capture.
nn_index_tmp[idx_nonan] = diffusion_nn(adata_sub_sub, k=k0).astype("float")
nn_index_tmp

array([[ nan,  nan,  nan, ...,  nan,  nan,  nan],
       [ nan,  nan,  nan, ...,  nan,  nan,  nan],
       [ nan,  nan,  nan, ...,  nan,  nan,  nan],
       ...,
       [ 46.,  42., 325., ...,  24., 320., 221.],
       [  8., 419., 297., ..., 383.,   4., 187.],
       [412., 280., 455., ..., 342., 184., 106.]])

In [97]:
# Row positions whose neighbor entries are all NaN, i.e. the cells left out above.
print("Indices of nan rows:")
all_nan_rows = np.all(np.isnan(nn_index_tmp), axis=1)
(idx_all_nan,) = np.where(all_nan_rows)
idx_all_nan

Indices of nan rows:


array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119])

In [98]:
# All-zero placeholder table with one row per cell and k0 columns.
# NOTE(review): the original_kbet section of this notebook builds the same table
# with k0 + 1 columns - confirm the difference is intentional.
matrix = np.zeros(shape=(adata_sub.n_obs, k0))
matrix.shape

(600, 70)

In [99]:
df = pd.DataFrame(matrix)

df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,60,61,62,63,64,65,66,67,68,69
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
596,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
598,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [100]:
sc_dir

'sc_04/2/largest_comp'

In [101]:
df.to_csv(f"{sc_dir}/df.csv", index=False, header=False)

In [102]:
# Batch label of every cell in the current cell type (all cells, including those
# outside the kept components), read via the configured `batch_key` instead of a
# hard-coded column name.
batch = adata_sub.obs[batch_key]

batch

cell_id
cell_600     batch 1
cell_601     batch 1
cell_602     batch 1
cell_603     batch 1
cell_604     batch 1
              ...   
cell_1195    batch 2
cell_1196    batch 2
cell_1197    batch 2
cell_1198    batch 2
cell_1199    batch 2
Name: batch, Length: 600, dtype: category
Categories (2, object): ['batch 1', 'batch 2']

In [103]:
Counter(batch)

Counter({'batch 2': 480, 'batch 1': 120})

In [104]:
batch.to_csv(f"{sc_dir}/batch.csv", index=False, header=False)

In [105]:
# Write the neighbor table without index or header. Indices are shifted by one
# (0-based -> 1-based, presumably for the consumer of knn.csv - confirm);
# the NaN rows of the dropped cells remain NaN after the shift.
pd.DataFrame(nn_index_tmp + 1).to_csv(f"{sc_dir}/knn.csv", index=False, header=False)

In [106]:
print("End of the notebook")

End of the notebook
