### scib kBET for sc_04 cell type 1
- run with metric-dev

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
import rpy2.robjects as ro
import scib
import anndata2ri
import rpy2
from collections import Counter

In [2]:
# Add the parent directory to sys.path
sys.path.append(os.path.abspath('../..'))

In [3]:
from kbet_exceptions import OptionalDependencyNotInstalled, RLibraryNotFound
from kbet_utils import NeighborsError, diffusion_nn

In [4]:
def kBET_single_with_plot(matrix, batch, k0=10, knn=None, verbose=True, scenario=None, sc_dir=None):
    """
    Single run of kBET using the implementation sourced from ``kbet_with_plot.R``

    Parameters
    ----------
    matrix : zero matrix
        np zero matrix (cells x k0+1 features)
    batch : list-like
        batch labels
    k0 : int, optional
        number of nearest neighbors, by default 10
    knn : array-like, optional
        precomputed knn indices for each cell, by default None
    verbose : bool, optional
        accepted for signature parity with ``kBET_single``; it is currently
        not forwarded to the R function
    scenario : str
        name of the scenario
    sc_dir : str
        directory to save plots

    Returns
    -------
    float
        kBET score (i.e. observed rejection rate), or ``np.nan`` if the score
        cannot be read from the R result
    """

    # load the local kBET implementation; the path is resolved against the
    # current working directory
    ro.r.source('kbet_with_plot.R')

    # activate automatic conversion between anndata and R
    anndata2ri.activate()

    try:
        # export all inputs to the R global environment
        ro.globalenv["data_mtrx"] = matrix
        ro.globalenv["batch"] = batch

        ro.globalenv["knn_graph"] = knn
        ro.globalenv["k0"] = k0
        ro.globalenv["scenario"] = scenario
        ro.globalenv["sc_dir"] = sc_dir

        # sc_dir is passed through the R variable exported above instead of
        # being spliced into the R source, so quotes or backslashes in the
        # path cannot break the call and None is not turned into 'None'
        ro.r(
            "batch.estimate <- kBET("
            "  data_mtrx,"
            "  batch,"
            "  knn=knn_graph,"
            "  k0=k0,"
            "  scenario=scenario,"
            "  sc_dir=sc_dir"
            ")"
        )

        try:
            score = ro.r("batch.estimate$summary$kBET.observed")[0]
        except rpy2.rinterface_lib.embedded.RRuntimeError as ex:
            print(f"Error computing kBET: {ex}\nSetting value to np.nan")
            score = np.nan
    finally:
        # always restore the default conversion, even if the R call failed
        anndata2ri.deactivate()

    return score

In [5]:
def kBET_single(matrix, batch, k0=10, knn=None, verbose=True):
    """
    Single run of kBET using the installed kBET R package

    Parameters
    ----------
    matrix : zero matrix
        np zero matrix (cells x k0+1 features)
    batch : list-like
        batch labels
    k0 : int, optional
        number of nearest neighbors, by default 10
    knn : array-like, optional
        precomputed knn indices for each cell, by default None
    verbose : bool, optional
        whether to print progress, by default True

    Returns
    -------
    float
        kBET score (i.e. observed rejection rate), or ``np.nan`` if the score
        cannot be read from the R result

    Raises
    ------
    RLibraryNotFound
        if the kBET R package cannot be loaded
    """

    try:
        ro.r("library(kBET)")
    except rpy2.rinterface_lib.embedded.RRuntimeError as ex:
        # the exception was previously only constructed, never raised, so a
        # missing R package was silently ignored
        # NOTE(review): assumes RLibraryNotFound is an exception class - confirm in kbet_exceptions
        raise RLibraryNotFound(ex) from ex

    # activate automatic conversion between anndata and R
    anndata2ri.activate()

    try:
        # export all inputs to the R global environment
        ro.globalenv["data_mtrx"] = matrix
        ro.globalenv["batch"] = batch

        ro.globalenv["knn_graph"] = knn
        ro.globalenv["k0"] = k0

        # run kBET on the precomputed knn graph; plotting, PCA, the k0
        # heuristic and k0 adaptation are all switched off
        ro.r(
            "batch.estimate <- kBET("
            "  data_mtrx,"
            "  batch,"
            "  knn=knn_graph,"
            "  k0=k0,"
            "  plot=FALSE,"
            "  do.pca=FALSE,"
            "  heuristic=FALSE,"
            "  adapt=FALSE,"
            f"  verbose={str(verbose).upper()}"
            ")"
        )

        try:
            score = ro.r("batch.estimate$summary$kBET.observed")[0]
        except rpy2.rinterface_lib.embedded.RRuntimeError as ex:
            print(f"Error computing kBET: {ex}\nSetting value to np.nan")
            score = np.nan
    finally:
        # always restore the default conversion, even if the R call failed
        anndata2ri.deactivate()

    return score


In [6]:
scenario = "2025-10-28_sc_04.h5ad"

scenario

'2025-10-28_sc_04.h5ad'

In [7]:
adata = sc.read_h5ad(f"../../scenarios/{scenario}")

adata

AnnData object with n_obs × n_vars = 1800 × 2000
    obs: 'cell_id', 'batch', 'cell_type'

In [8]:
print("Reducing data...")
scib.preprocessing.reduce_data(adata, batch_key="batch", umap=True)

adata

Reducing data...
HVG
Using 2000 HVGs from full intersect set
Using 0 HVGs from n_batch-1 set
Using 2000 HVGs
Computed 2000 highly variable genes
PCA


  adata.var["highly_variable"] = np.in1d(adata.var_names, hvg_list)


Nearest Neigbours


  from .autonotebook import tqdm as notebook_tqdm


UMAP




AnnData object with n_obs × n_vars = 1800 × 2000
    obs: 'cell_id', 'batch', 'cell_type'
    var: 'highly_variable'
    uns: 'pca', 'neighbors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'

In [9]:
# keep only HVGs
adata = adata[:, adata.var["highly_variable"]].copy()

adata

AnnData object with n_obs × n_vars = 1800 × 2000
    obs: 'cell_id', 'batch', 'cell_type'
    var: 'highly_variable'
    uns: 'pca', 'neighbors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'

In [10]:
adata.uns["neighbors"]

{'connectivities_key': 'connectivities',
 'distances_key': 'distances',
 'params': {'n_neighbors': 15,
  'method': 'umap',
  'random_state': 0,
  'metric': 'euclidean',
  'use_rep': 'X_pca'}}

In [11]:
batch_key = "batch"
label_key = "cell_type"
type_ = "full"
embed = "X_pca"
scaled = True
return_df = True
verbose = True

In [12]:
# compute connectivities for non-knn type data integrations
if type_ != "knn" and embed is not None:
    # compute nearest neighbors distance matrix and a neighborhood graph of cells
    # 50 neighbors for each cell
    adata_tmp = sc.pp.neighbors(adata, n_neighbors=50, use_rep=embed, copy=True)
else:
    # knn-type output or no embedding: fall back to the neighbor graph already
    # stored in adata so that adata_tmp is always defined for the cells below
    adata_tmp = adata.copy()

adata_tmp

AnnData object with n_obs × n_vars = 1800 × 2000
    obs: 'cell_id', 'batch', 'cell_type'
    var: 'highly_variable'
    uns: 'pca', 'neighbors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'

In [13]:
adata_tmp.uns["neighbors"]

{'connectivities_key': 'connectivities',
 'distances_key': 'distances',
 'params': {'n_neighbors': 50,
  'method': 'umap',
  'random_state': 0,
  'metric': 'euclidean',
  'use_rep': 'X_pca'}}

In [14]:
if verbose:
    print(f"batch: {batch_key}")

batch: batch


In [15]:
# set upper bound for k0
size_max = 2**31 - 1
size_max

2147483647

In [16]:
# check if neighborhood size too small or only one batch in cell type

In [17]:
# counts: number of cells and number of distinct batches per cell type
# observed=False is passed explicitly: it keeps the current behaviour (every
# category of the categorical key gets a row) and avoids the pandas
# FutureWarning about the changing default
counts = adata_tmp.obs.groupby(label_key, observed=False).agg(
    {
        label_key: "count",
        batch_key: "nunique"
    }
)
counts

  counts = adata_tmp.obs.groupby(label_key).agg(


Unnamed: 0_level_0,cell_type,batch
cell_type,Unnamed: 1_level_1,Unnamed: 2_level_1
cell type 1,600,2
cell type 2,600,2
cell type 3,600,2


In [18]:
# labels: cell types with at least 2 batches and at least 10 cells (we will run kBET on these)
labels = counts.query(f"{label_key}>=10 and {batch_key} > 1").index
labels

CategoricalIndex(['cell type 1', 'cell type 2', 'cell type 3'], categories=['cell type 1', 'cell type 2', 'cell type 3'], ordered=False, dtype='category', name='cell_type')

In [19]:
# skipped: cell types with only one batch or less than 10 cells to skip
skipped = counts.index.difference(labels)
skipped

CategoricalIndex([], categories=['cell type 1', 'cell type 2', 'cell type 3'], ordered=False, dtype='category', name='cell_type')

In [20]:
print(f"{len(skipped)} labels consist of a single batch or is too small (less than 10 cells). Skip.")

0 labels consist of a single batch or is too small (less than 10 cells). Skip.


In [21]:
# assign NaN to skipped cell types
kBET_scores = {
    "cluster": list(skipped),
    "kBET": [np.nan] * len(skipped)
}
kBET_scores

{'cluster': [], 'kBET': []}

In [22]:
labels

CategoricalIndex(['cell type 1', 'cell type 2', 'cell type 3'], categories=['cell type 1', 'cell type 2', 'cell type 3'], ordered=False, dtype='category', name='cell_type')

### cell type 1

In [23]:
clus = "cell type 1"

In [24]:
if verbose:
    print(f"\nProcessing {clus}...")


Processing cell type 1...


In [25]:
sc_title = "Sc_04 cell type 1"
sc_title

'Sc_04 cell type 1'

In [26]:
sc_dir = "1/scib_kbet/"
type(sc_dir), sc_dir

(str, '1/scib_kbet/')

In [27]:
os.makedirs(sc_dir, exist_ok=True)

In [28]:
# keep only the part of the AnnData object that belongs to the current cell type
# (.copy() materialises the subset as an independent object rather than a view)
adata_sub = adata_tmp[adata_tmp.obs[label_key] == clus, :].copy()
adata_sub

AnnData object with n_obs × n_vars = 600 × 2000
    obs: 'cell_id', 'batch', 'cell_type'
    var: 'highly_variable'
    uns: 'pca', 'neighbors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'

In [29]:
# mean: avg number of cells per batch in cell type
mean = np.mean(adata_sub.obs[batch_key].value_counts())
print(f"Average number of cells per batch in {clus}:", mean)

Average number of cells per batch in cell type 1: 300.0


In [30]:
quarter_mean = np.floor(mean / 4).astype("int")
quarter_mean

np.int64(75)

In [31]:
# clamp the neighborhood size to the range [10, 70]:
# first enforce the lower bound, then the upper bound
k0_lower_bounded = np.max([10, quarter_mean])
k0 = np.min([70, k0_lower_bounded])
print(f"Using k0={k0}")

Using k0=70


In [32]:
# check k0 for reasonability: k0 * n_obs must stay below the upper bound size_max
if k0 * adata_sub.n_obs >= size_max:
    # f-string so that the actual k0 value is shown in the message
    print(f"Neighborhood size k0={k0} is not reasonable. Changing it.")
    k0 = np.floor(size_max / adata_sub.n_obs).astype("int")

if verbose:
    print(f"Use {k0} nearest neighbors.")

Use 70 nearest neighbors.


In [33]:
matrix = np.zeros(shape=(adata_sub.n_obs, k0 + 1))
matrix.shape

(600, 71)

In [34]:
# find the strongly connected components of the connectivity graph
# restricted to the cells of the current cell type
# n_comp: number of strongly connected components
# labs: np array holding, for each cell, the index of the component it belongs to
n_comp, labs = scipy.sparse.csgraph.connected_components(
    csgraph=adata_sub.obsp["connectivities"],
    connection="strong"
)
n_comp

2

In [35]:
labs

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [36]:
Counter(labs)

Counter({np.int32(0): 480, np.int32(1): 120})

In [37]:
if verbose:
    print(f"There are {n_comp} connected components (i.e., batches are not mixed).")

There are 2 connected components (i.e., batches are not mixed).


In [38]:
# get component sizes (number of cells per connected component)
# Series.value_counts replaces the deprecated top-level pd.value_counts
comp_size = pd.Series(labs).value_counts()
comp_size

  comp_size = pd.value_counts(labs)


0    480
1    120
Name: count, dtype: int64

In [39]:
# threshold
comp_size_thresh = 3 * k0
comp_size_thresh

np.int64(210)

In [40]:
# positions of the cells that belong to components with at least
# comp_size_thresh cells (np.isin replaces the deprecated np.in1d)
idx_nonan = np.flatnonzero(
    np.isin(labs, comp_size[comp_size >= comp_size_thresh].index)
)
idx_nonan.shape, idx_nonan

  np.in1d(labs, comp_size[comp_size >= comp_size_thresh].index)


((480,),
 array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
         26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
         39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
         52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
         65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
         78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
         91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
        104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
        117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
        130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
        143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
        156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
        169, 170, 171, 172, 173, 174, 175,

In [41]:
len(idx_nonan), len(labs)

(480, 600)

In [42]:
print(f"{len(idx_nonan)} cells are in components of size >= 3*k0.")
print(f"{len(labs) - len(idx_nonan)} cells are in components of size < 3*k0 and will be ignored.")

480 cells are in components of size >= 3*k0.
120 cells are in components of size < 3*k0 and will be ignored.


In [43]:
len(idx_nonan) / len(labs)

0.8

In [44]:
# run kBET only if at least 75% of cells (in current cell type) are in components of size >= 3*k0
# NOTE(review): this cell only assigns `score` when the check fails; the kBET
# calls in the later cells assign `score` unconditionally, so a value set here
# would be overwritten - guard those calls if this notebook is reused on other data
if len(idx_nonan) / len(labs) < 0.75:
    score = 1   # 100% rejection rate

In [45]:
# get part of adata object related to the cells in these valid components
adata_sub_sub = adata_sub[idx_nonan, :].copy()
adata_sub_sub   # only component 0 is selected

AnnData object with n_obs × n_vars = 480 × 2000
    obs: 'cell_id', 'batch', 'cell_type'
    var: 'highly_variable'
    uns: 'pca', 'neighbors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'

In [46]:
nn_index_tmp = np.empty(shape=(adata_sub.n_obs, k0))
nn_index_tmp.shape, nn_index_tmp

((600, 70),
 array([[6.08931954e-310, 5.13958606e-310, 5.13959039e-310, ...,
         3.03101420e+000, 3.06540394e+000, 3.06568575e+000],
        [3.07262087e+000, 3.07597089e+000, 3.07689834e+000, ...,
         3.61631083e+000, 3.62086511e+000, 3.62399912e+000],
        [3.62826157e+000, 3.62884283e+000, 3.63332891e+000, ...,
         2.94693494e+000, 2.96366310e+000, 2.98333788e+000],
        ...,
        [1.08433985e-311, 1.08858384e-311, 1.09282783e-311, ...,
         1.18592059e-318, 6.08931954e-310, 6.08931954e-310],
        [5.13958627e-310, 5.13958627e-310, 8.40898387e-021, ...,
         2.76268683e-033, 1.09957686e-015, 2.86786176e-014],
        [3.24453177e-012, 5.63514328e-020, 1.70054082e-030, ...,
         1.51329447e-014, 1.22648827e-023, 1.86408997e-023]]))

In [47]:
nn_index_tmp[:] = np.nan
nn_index_tmp

array([[nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]])

In [48]:
nn_index_tmp[idx_nonan] = diffusion_nn(adata_sub_sub, k=k0).astype("float")
nn_index_tmp

array([[298.,  16., 442., ..., 182.,  94., 377.],
       [347., 407.,  18., ..., 183., 368., 101.],
       [410.,  30., 323., ..., 244., 186., 138.],
       ...,
       [ nan,  nan,  nan, ...,  nan,  nan,  nan],
       [ nan,  nan,  nan, ...,  nan,  nan,  nan],
       [ nan,  nan,  nan, ...,  nan,  nan,  nan]])

In [49]:
print("Indices of nan rows:")
# a row is flagged when every neighbor index in it is NaN,
# i.e. the cell was excluded from the valid components
row_is_all_nan = np.isnan(nn_index_tmp).all(axis=1)
idx_all_nan = np.flatnonzero(row_is_all_nan)
idx_all_nan

Indices of nan rows:


array([480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492,
       493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505,
       506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518,
       519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531,
       532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544,
       545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557,
       558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570,
       571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583,
       584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596,
       597, 598, 599])

In [50]:
matrix.shape

(600, 71)

In [51]:
batch = adata_sub.obs[batch_key]
batch

cell_id
cell_0      batch 1
cell_1      batch 1
cell_2      batch 1
cell_3      batch 1
cell_4      batch 1
             ...   
cell_595    batch 2
cell_596    batch 2
cell_597    batch 2
cell_598    batch 2
cell_599    batch 2
Name: batch, Length: 600, dtype: category
Categories (2, object): ['batch 1', 'batch 2']

In [52]:
Counter(batch)

Counter({'batch 1': 480, 'batch 2': 120})

In [53]:
sc_dir

'1/scib_kbet/'

In [54]:
# call kBET through the installed kBET R package (no plots)
score = kBET_single(
    matrix=matrix,
    batch=batch, # batch labels of all cells in the current cell type
    knn=nn_index_tmp + 1,  # shift to 1-based indices for R (Python is 0-based); all-NaN rows stay NaN
    verbose=verbose,
    k0=k0
)

score

Number of kBET tests is set to 60.


  anndata2ri.activate()


np.float64(1.0)

In [55]:
# call kBET through the local kbet_with_plot.R implementation, passing the
# scenario title and the output directory
score = kBET_single_with_plot(
    matrix=matrix,
    batch=batch, # batch labels of all cells in the current cell type
    knn=nn_index_tmp + 1,  # shift to 1-based indices for R (Python is 0-based); all-NaN rows stay NaN
    verbose=verbose,
    k0=k0,
    scenario=sc_title,
    sc_dir=sc_dir
)

score

R[write to console]: Keep up to date with changes at https://tidyverse.org/blog/



Number of kBET tests is set to 60.


  anndata2ri.activate()


np.float64(1.0)

In [89]:
print("End of the notebook")

End of the notebook
