# Multicellular coordination networks

In [6]:
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedGroupKFold
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score
import scanpy as sc
import pandas as pd

## Load normalized data

In [7]:
# AnnData object holding the pre-computed features used throughout this notebook
features_path = "../../data/top_features_multi_pT.h5ad"
features = sc.read_h5ad(features_path)



## Prepare cross-validation

In [8]:
# Per-cell table; the first CSV column becomes the row index
cell_table_path = "../../data/cell_table_with_types_stage.csv"
cell_table = pd.read_csv(cell_table_path, index_col=0)

  cell_table = pd.read_csv("../../data/cell_table_with_types_stage.csv",


In [9]:
# Marker columns selected from the cell table
metab_markers = [
    'CA9', 'CD98', 'CytC', 'MCT1', 'ASCT2',
    'LDH', 'GS', 'GLS', 'ATP5A', 'CS',
    'PKM2', 'GLUT1', 'ARG1', 'CPT1A', 'Ki67',
]

# Restrict to cancer/epithelial cells: marker values go to `df`,
# stage and FOV labels of the same cells go to `meta`
is_cancer_cell = cell_table.consensus == "Cancer_cell"
df = cell_table.loc[is_cancer_cell, metab_markers]
meta = cell_table.loc[is_cancer_cell, ["Stage", "fov"]]

# FOVs with 20 or fewer epithelial cells might only add noise to the analysis, so drop them.
# The keep-mask is computed once and applied to both tables so they stay row-aligned.
fov_cell_counts = meta.fov.value_counts()
sparse_epi_fovs = fov_cell_counts[fov_cell_counts <= 20].index
in_dense_fov = ~meta.fov.isin(sparse_epi_fovs)
df = df.loc[in_dense_fov]
meta = meta.loc[in_dense_fov]

# Boolean array (one entry per remaining cell) flagging the well-annotated stages
epithelial_subset = meta["Stage"].isin(["pT1", "pT2", "pT3", "pT4"]).values

In [10]:
# Outer loop: hold out 20% of the FOVs (stratified by stage) for validation.
# The split is done on unique rows of the staged metadata, not on cells.
fov_stage_table = (
    meta.loc[epithelial_subset]
    .drop_duplicates()
    .reset_index(drop=True)
)
fov_inner, fov_val, y_inner, y_val = train_test_split(
    fov_stage_table["fov"],
    fov_stage_table["Stage"],
    test_size=0.2,
    random_state=0,
    stratify=fov_stage_table["Stage"],
)
# Flag every cell whose FOV belongs to the inner (non-validation) set
meta["inner"] = meta["fov"].isin(fov_inner)
# Sanity check: this list of validation FOVs should be identical on every run
" ".join(sorted(fov_val))

'A1a A1f A1h A2g A2i A2q A2r A3m A4e A4n A5a A5q A6b A6c A6g A6m A6p A6q A6r A7f A7p A8a A8m A9o A9q B1h B1k B2a B2b B2k B2o B3c B3g B4b B4g B4m B6i B6q B7c B7p B7r B8b B8d B8g B8i B8l B9c B9d B9h B9m B9n B9o C1i C1l C2a C2f C2k C3c C3h C4a C4g C4k C5a C5k C5l C6d C8a C8h D1i D1l D1m D2e D2k D4c D5b D5d D5h D5k D5l D6c D6k D7a D8d D8h'

In [11]:
n_splits = 4
cv_folds = StratifiedGroupKFold(n_splits=n_splits)

# Cell-level tables restricted to well-annotated stages and inner FOVs
inner_cell_meta = meta.loc[epithelial_subset].loc[meta["inner"]]
inner_cell_df = df.loc[epithelial_subset].loc[meta.loc[epithelial_subset]["inner"]]

for train, test in cv_folds.split(inner_cell_df,
                                  inner_cell_meta["Stage"],
                                  groups=inner_cell_meta["fov"]):
    # All four stages should be present in both test and train
    for fold_part in (test, train):
        stage_per_fov = inner_cell_meta.iloc[fold_part].groupby("fov")["Stage"].first()
        assert len(stage_per_fov.unique()) == 4
    # Print the first test FOVs
    print(inner_cell_meta.iloc[test].groupby("fov").first().index[:5])

Index(['A1e', 'A1n', 'A3b', 'A3d', 'A3e'], dtype='object', name='fov')
Index(['A1c', 'A1m', 'A1o', 'A1p', 'A1q'], dtype='object', name='fov')
Index(['A1l', 'A1r', 'A2b', 'A2c', 'A2l'], dtype='object', name='fov')
Index(['A1d', 'A1i', 'A1k', 'A2a', 'A2d'], dtype='object', name='fov')


We can split the data at the cell level to allow applications that involve processing single cells. The grouping ensures that the data is split per FOV without contamination (i.e. cells from a given FOV present both in training and testing).

In [12]:
# One metadata row per FOV: the first cell's values among the pT1-pT4 cells
staged_cell_meta = meta.loc[epithelial_subset]
meta_per_fov = staged_cell_meta.groupby("fov").first()

In [13]:
# Mean marker value per FOV, computed over the pT1-pT4 cells only
df_per_fov = (
    df.assign(fov=meta["fov"])
    .loc[epithelial_subset]
    .groupby("fov")
    .mean()
)
# Matching per-FOV metadata (first cell's values), same row order as df_per_fov
meta_per_fov = meta.loc[epithelial_subset].groupby("fov").first()

In [14]:
def conv_traintest_cells_to_fov():
    """Translate the cell-level CV folds into FOV-level folds.

    Reads the notebook globals `cv_folds`, `df`, `meta`, `epithelial_subset`
    and `meta_per_fov`.

    Yields:
        (train_fovs_ind, test_fovs_ind): two integer arrays of row positions
        into `meta_per_fov.loc[meta_per_fov["inner"]]`, one pair per fold of
        `cv_folds`.
    """
    # Loop-invariant selections, computed once instead of several times per fold
    inner_cell_meta = meta.loc[epithelial_subset].loc[meta["inner"]]
    inner_cell_df = df.loc[epithelial_subset].loc[meta.loc[epithelial_subset]["inner"]]
    inner_fov_index = meta_per_fov.loc[meta_per_fov["inner"]].index

    for train, test in cv_folds.split(inner_cell_df,
                                      inner_cell_meta["Stage"],
                                      groups=inner_cell_meta["fov"]):
        # FOV names present in each half of the cell-level split
        test_fovs = inner_cell_meta.iloc[test].groupby("fov").first().index
        train_fovs = inner_cell_meta.iloc[train].groupby("fov").first().index
        # Convert FOV names to positions in the per-FOV (inner) table
        test_fovs_ind = np.where(inner_fov_index.isin(test_fovs))[0]
        train_fovs_ind = np.where(inner_fov_index.isin(train_fovs))[0]
        yield (train_fovs_ind, test_fovs_ind)

# Materialise the generator: a list of (train, test) index-array pairs,
# i.e. the same shape of output as a `split` method
cv_folds_fov = list(conv_traintest_cells_to_fov())

meta_per_fov_inner = meta_per_fov.loc[meta_per_fov["inner"]]
for train, test in cv_folds_fov:
    # All four stages should be present in both test and train
    for fold_rows in (test, train):
        assert len(meta_per_fov_inner.iloc[fold_rows]["Stage"].unique()) == 4
    # Print the first test FOVs
    print(meta_per_fov_inner.iloc[test].index[:5])

Index(['A1e', 'A1n', 'A3b', 'A3d', 'A3e'], dtype='object', name='fov')
Index(['A1c', 'A1m', 'A1o', 'A1p', 'A1q'], dtype='object', name='fov')
Index(['A1l', 'A1r', 'A2b', 'A2c', 'A2l'], dtype='object', name='fov')
Index(['A1d', 'A1i', 'A1k', 'A2a', 'A2d'], dtype='object', name='fov')


In [15]:
# Fit the stage label encoder on the inner FOVs only
inner_stage_labels = meta_per_fov.loc[meta_per_fov["inner"], "Stage"]
le = LabelEncoder()
le.fit(inner_stage_labels)

## Test prediction

Note on missing values: Some of the samples had only a few cells of some cell types, or none at all, resulting in missing values in the table. Imputation for prediction would not be very helpful, as it is likely that all the values for a cell type are missing for a patient, and thus predictions would be compared to all-inferred values. It seems better to simply exclude these specific patient + cell-type combinations from the respective testing / validation sets.

In [16]:
ref_ctype = "Endothelial_cell"
# Rows of `features` for the inner FOVs
inner_fov_names = meta_per_fov.index[meta_per_fov["inner"]]
features_subset = features[inner_fov_names]
# Targets (Y) are the reference cell type's features; predictors (X) are all the others
is_ref_feature = features_subset.var["Cell type"] == ref_ctype
X = features_subset[:, ~is_ref_feature].X
Y = features_subset[:, is_ref_feature].X

In [17]:
# Row positions where Y contains at least one NaN; those rows are removed from X and Y
rows_with_nan = np.isnan(Y).any(axis=1)
idx_exclude = np.flatnonzero(rows_with_nan)
X = np.delete(X, idx_exclude, axis=0)
Y = np.delete(Y, idx_exclude, axis=0)

In [18]:
# Row positions that survived the NaN exclusion, and a lookup from the
# original row position to the position in the filtered arrays
n_inner_fovs = features_subset.shape[0]
valid_indices = np.setdiff1d(np.arange(n_inner_fovs), idx_exclude)
old_to_new = dict(zip(valid_indices, range(len(valid_indices))))

# Per-FOV metadata for exactly the rows kept in X and Y
meta_filtered = meta_per_fov.loc[meta_per_fov["inner"]].iloc[valid_indices]

In [19]:
# Re-express every FOV-level fold in the row numbering of the NaN-filtered arrays
excluded_rows = set(idx_exclude.tolist())
cv_folds_fov_filtered = []
for train, test in cv_folds_fov:
    # Drop excluded rows, then map the remaining old positions to new positions
    train_filtered = np.array([old_to_new[idx] for idx in train if idx not in excluded_rows])
    test_filtered = np.array([old_to_new[idx] for idx in test if idx not in excluded_rows])
    cv_folds_fov_filtered.append((train_filtered, test_filtered))

    # All four stages should still be present in both test and train
    for fold_rows in (test_filtered, train_filtered):
        assert len(meta_filtered.iloc[fold_rows]["Stage"].unique()) == 4
    # Print the first test FOVs
    print(meta_filtered.iloc[test_filtered].index[:5])

Index(['A1e', 'A1n', 'A3b', 'A3d', 'A3e'], dtype='object', name='fov')
Index(['A1c', 'A1m', 'A1o', 'A1p', 'A1q'], dtype='object', name='fov')
Index(['A1l', 'A1r', 'A2b', 'A2c', 'A2l'], dtype='object', name='fov')
Index(['A1d', 'A1i', 'A1k', 'A2a', 'A2d'], dtype='object', name='fov')


In [20]:
# Fold sizes before and after the NaN exclusion, as (train shape, test shape) pairs
for fold_list in (cv_folds_fov, cv_folds_fov_filtered):
    print([(train_idx.shape, test_idx.shape) for train_idx, test_idx in fold_list])

[((251,), (83,)), ((249,), (85,)), ((250,), (84,)), ((252,), (82,))]
[((248,), (81,)), ((244,), (85,)), ((247,), (82,)), ((248,), (81,))]


In [21]:
# Fit once on all filtered inner FOVs (no held-out data).
# eval_metric was 'aucpr', which is a classification/ranking metric and does not
# apply to a regressor; 'mae' is a regression metric.
clf = XGBRegressor(eval_metric="mae",
                   n_estimators=250,
                   max_depth=3,
                   device="cuda",
                   random_state=0,
                   tree_method="hist")
clf.fit(X, Y)

In [22]:
# Cross-validate a squared-error regressor on the filtered FOV-level folds
regressor = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=250,
    max_depth=3,
    device="cuda",
    random_state=0,
    tree_method="hist",
)
scores = cross_val_score(regressor, X, Y,
                         cv=cv_folds_fov_filtered,
                         scoring="neg_median_absolute_error")
pd.Series(scores).describe()

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




count    4.000000
mean    -0.303540
std      0.043066
min     -0.342085
25%     -0.340128
50%     -0.304181
75%     -0.267593
max     -0.263714
dtype: float64

In [23]:
# Same cross-validation, with the absolute-error objective
regressor = XGBRegressor(
    objective="reg:absoluteerror",
    n_estimators=250,
    max_depth=3,
    device="cuda",
    random_state=0,
    tree_method="hist",
)
scores = cross_val_score(regressor, X, Y,
                         cv=cv_folds_fov_filtered,
                         scoring="neg_median_absolute_error")
pd.Series(scores).describe()

count    4.000000
mean    -0.278457
std      0.020214
min     -0.305887
25%     -0.286860
50%     -0.274185
75%     -0.265783
max     -0.259572
dtype: float64

In [None]:
# Squared-error objective with multi-output trees, scored by R2
regressor = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=250,
    max_depth=3,
    random_state=0,
    tree_method="hist",
    multi_strategy="multi_output_tree",
)
scores = cross_val_score(regressor, X, Y,
                         cv=cv_folds_fov_filtered,
                         scoring="r2")
pd.Series(scores).describe()

count    4.000000
mean     0.257928
std      0.128966
min      0.097373
25%      0.187790
50%      0.270716
75%      0.340853
max      0.392905
dtype: float64

In [None]:
# Predict the reference block Y from one cell type's features at a time
# (squared-error objective) and report the median R2 across folds
cell_type_of_feature = features_subset.var["Cell type"]
for ctype in cell_type_of_feature.unique():
    # Features of this cell type, with the same NaN rows removed as in Y
    Xc = np.delete(features_subset[:, cell_type_of_feature == ctype].X, idx_exclude, 0)

    regressor = XGBRegressor(
        objective="reg:squarederror",
        n_estimators=250,
        max_depth=3,
        random_state=0,
        tree_method="hist",
        device="cuda",
    )
    scores = cross_val_score(regressor, Xc, Y,
                             cv=cv_folds_fov_filtered,
                             scoring="r2")
    print(ctype, np.median(scores))

Endothelial_cell 0.9777576260964527
CD4_lymphocyte 0.06363645339847468
Fibroblast 0.16439431943625973
Other_immune_cell -0.19218553594400511
Monocyte -0.06277772133831981
Macrophage 0.12496416236366392
Epithelial_cell -0.15127024131033334
Cytotoxic_lymphocyte -0.10484204989412726


In [None]:
# Predict the reference block Y from one cell type's features at a time
# (absolute-error objective) and report the median R2 across folds
cell_type_of_feature = features_subset.var["Cell type"]
for ctype in cell_type_of_feature.unique():
    # Features of this cell type, with the same NaN rows removed as in Y
    Xc = np.delete(features_subset[:, cell_type_of_feature == ctype].X, idx_exclude, 0)

    regressor = XGBRegressor(
        objective="reg:absoluteerror",
        n_estimators=250,
        max_depth=3,
        random_state=0,
        tree_method="hist",
        device="cuda",
    )
    scores = cross_val_score(regressor, Xc, Y,
                             cv=cv_folds_fov_filtered,
                             scoring="r2")
    print(ctype, np.median(scores))

Endothelial_cell 0.9703451007377246
CD4_lymphocyte 0.1508324862899236
Fibroblast 0.19694528016637208
Other_immune_cell -0.03856352499755084
Monocyte 0.13181669653633832
Macrophage 0.1692838781141337
Epithelial_cell 0.04547868976477555
Cytotoxic_lymphocyte -0.07607204746417749


In [None]:
# Predict the reference block Y from one cell type's features at a time
# (squared-error objective, multi-output trees) and report the median R2 across folds
cell_type_of_feature = features_subset.var["Cell type"]
for ctype in cell_type_of_feature.unique():
    # Features of this cell type, with the same NaN rows removed as in Y
    Xc = np.delete(features_subset[:, cell_type_of_feature == ctype].X, idx_exclude, 0)

    regressor = XGBRegressor(
        objective="reg:squarederror",
        n_estimators=250,
        max_depth=3,
        random_state=0,
        tree_method="hist",
        multi_strategy="multi_output_tree",
    )
    scores = cross_val_score(regressor, Xc, Y,
                             cv=cv_folds_fov_filtered,
                             scoring="r2")
    print(ctype, np.median(scores))

In [None]:
# Pairwise screen: for every reference cell type, test how well each single cell
# type's features predict the reference features (squared error, multi-output trees).
# Only pairs with a median R2 across folds above 0.15 are printed.
# NOTE(review): every name assigned here is a notebook global and is overwritten
# on each iteration.
for ref_ctype in features_subset.var["Cell type"].unique():
    # Same inner-FOV selection on every iteration
    features_subset = features[meta_per_fov.index[meta_per_fov["inner"]]]
    X = features_subset[:,features_subset.var["Cell type"] != ref_ctype].X
    Y = features_subset[:,features_subset.var["Cell type"] == ref_ctype].X

    # Row positions where Y contains at least one NaN; these rows are removed
    idx_exclude = np.where(np.sum(np.isnan(Y), axis=1))[0]
    X = np.delete(X, idx_exclude, 0)
    Y = np.delete(Y, idx_exclude, 0)

    # Create a mapping from old to new indices after exclusion
    valid_indices = np.setdiff1d(np.arange(features_subset.shape[0]),
                                idx_exclude)
    old_to_new = {old: new for new, old in enumerate(valid_indices)}

    # Per-FOV metadata for exactly the rows kept in Y
    meta_filtered = meta_per_fov.loc[meta_per_fov["inner"]]
    meta_filtered = meta_filtered.iloc[valid_indices]

    # Also exclude from the CV folds
    cv_folds_fov_filtered = []
    for train, test in cv_folds_fov:
        # Map old indices to new indices in the filtered array
        train_filtered = np.array([old_to_new[idx] for idx in train if idx not in idx_exclude])
        test_filtered = np.array([old_to_new[idx] for idx in test if idx not in idx_exclude])
        cv_folds_fov_filtered.append((train_filtered, test_filtered))

        # All stages should be present in both train and test
        assert len(meta_filtered.iloc[test_filtered]["Stage"].unique()) == 4
        assert len(meta_filtered.iloc[train_filtered]["Stage"].unique()) == 4
        # Print the first test FOVs
        # print(meta_filtered.iloc[test_filtered].index[:5])

    for ctype in features_subset.var["Cell type"].unique():
        # Query cell type's features, with the same NaN rows removed as in Y
        Xc = features_subset[:,features_subset.var["Cell type"] == ctype].X
        Xc = np.delete(Xc, idx_exclude, 0)

        scores = cross_val_score(
            XGBRegressor(  
                        objective="reg:squarederror",
                        n_estimators=250, 
                        max_depth=3, 
                        random_state=0,
                        tree_method="hist", 
                        multi_strategy="multi_output_tree"),
            Xc,
            Y,
            cv=cv_folds_fov_filtered,
            scoring="r2"
            )
        # Report only cross-cell-type pairs above the 0.15 median-R2 threshold
        if (np.median(scores) > 0.15) and (ref_ctype != ctype):
            print(ref_ctype, ctype, np.median(scores))

Endothelial_cell Fibroblast 0.1669610350178599
Endothelial_cell Macrophage 0.20462846573014626
Other_immune_cell Monocyte 0.15060545358904176
Monocyte Other_immune_cell 0.22036084404054013
Monocyte Macrophage 0.21564650640109978
Macrophage Endothelial_cell 0.17165495181855114
Macrophage CD4_lymphocyte 0.18977357468358416
Macrophage Fibroblast 0.15432508451535004


In [None]:
# Pairwise screen: for every reference cell type, test how well each single cell
# type's features predict the reference features (squared error, device="cuda").
# Only pairs with a median R2 across folds above 0.15 are printed.
# NOTE(review): every name assigned here is a notebook global and is overwritten
# on each iteration.
for ref_ctype in features_subset.var["Cell type"].unique():
    # Same inner-FOV selection on every iteration
    features_subset = features[meta_per_fov.index[meta_per_fov["inner"]]]
    X = features_subset[:,features_subset.var["Cell type"] != ref_ctype].X
    Y = features_subset[:,features_subset.var["Cell type"] == ref_ctype].X

    # Row positions where Y contains at least one NaN; these rows are removed
    idx_exclude = np.where(np.sum(np.isnan(Y), axis=1))[0]
    X = np.delete(X, idx_exclude, 0)
    Y = np.delete(Y, idx_exclude, 0)

    # Create a mapping from old to new indices after exclusion
    valid_indices = np.setdiff1d(np.arange(features_subset.shape[0]),
                                idx_exclude)
    old_to_new = {old: new for new, old in enumerate(valid_indices)}

    # Per-FOV metadata for exactly the rows kept in Y
    meta_filtered = meta_per_fov.loc[meta_per_fov["inner"]]
    meta_filtered = meta_filtered.iloc[valid_indices]

    # Also exclude from the CV folds
    cv_folds_fov_filtered = []
    for train, test in cv_folds_fov:
        # Map old indices to new indices in the filtered array
        train_filtered = np.array([old_to_new[idx] for idx in train if idx not in idx_exclude])
        test_filtered = np.array([old_to_new[idx] for idx in test if idx not in idx_exclude])
        cv_folds_fov_filtered.append((train_filtered, test_filtered))

        # All stages should be present in both train and test
        assert len(meta_filtered.iloc[test_filtered]["Stage"].unique()) == 4
        assert len(meta_filtered.iloc[train_filtered]["Stage"].unique()) == 4
        # Print the first test FOVs
        # print(meta_filtered.iloc[test_filtered].index[:5])

    for ctype in features_subset.var["Cell type"].unique():
        # Query cell type's features, with the same NaN rows removed as in Y
        Xc = features_subset[:,features_subset.var["Cell type"] == ctype].X
        Xc = np.delete(Xc, idx_exclude, 0)

        scores = cross_val_score(
            XGBRegressor(  
                        objective="reg:squarederror",
                        n_estimators=250, 
                        max_depth=3, 
                        random_state=0,
                        tree_method="hist", 
                        device="cuda"),
            Xc,
            Y,
            cv=cv_folds_fov_filtered,
            scoring="r2"
            )
        # Report only cross-cell-type pairs above the 0.15 median-R2 threshold
        if (np.median(scores) > 0.15) and (ref_ctype != ctype):
            print(ref_ctype, ctype, np.median(scores))

Endothelial_cell Fibroblast 0.16439431943625973
Fibroblast Macrophage 0.16149396527168652
Other_immune_cell Monocyte 0.1646420425464486
Monocyte Other_immune_cell 0.1669861312537248
Monocyte Macrophage 0.16464264510136895
Macrophage Fibroblast 0.15899045440484255


In [None]:
# Pairwise screen: for every reference cell type, test how well each single cell
# type's features predict the reference features (absolute error, device="cuda").
# Only pairs with a median R2 across folds above 0.15 are printed.
# NOTE(review): every name assigned here is a notebook global. The later
# process_fold(...) and test_parameters(...) cells read Xc, Y, train_filtered,
# test_filtered and cv_folds_fov_filtered, presumably as left by the final
# iteration of this loop - confirm before reordering or refactoring.
for ref_ctype in features_subset.var["Cell type"].unique():
    # Same inner-FOV selection on every iteration
    features_subset = features[meta_per_fov.index[meta_per_fov["inner"]]]
    X = features_subset[:,features_subset.var["Cell type"] != ref_ctype].X
    Y = features_subset[:,features_subset.var["Cell type"] == ref_ctype].X

    # Row positions where Y contains at least one NaN; these rows are removed
    idx_exclude = np.where(np.sum(np.isnan(Y), axis=1))[0]
    X = np.delete(X, idx_exclude, 0)
    Y = np.delete(Y, idx_exclude, 0)

    # Create a mapping from old to new indices after exclusion
    valid_indices = np.setdiff1d(np.arange(features_subset.shape[0]),
                                idx_exclude)
    old_to_new = {old: new for new, old in enumerate(valid_indices)}

    # Per-FOV metadata for exactly the rows kept in Y
    meta_filtered = meta_per_fov.loc[meta_per_fov["inner"]]
    meta_filtered = meta_filtered.iloc[valid_indices]

    # Also exclude from the CV folds
    cv_folds_fov_filtered = []
    for train, test in cv_folds_fov:
        # Map old indices to new indices in the filtered array
        train_filtered = np.array([old_to_new[idx] for idx in train if idx not in idx_exclude])
        test_filtered = np.array([old_to_new[idx] for idx in test if idx not in idx_exclude])
        cv_folds_fov_filtered.append((train_filtered, test_filtered))

        # All stages should be present in both train and test
        assert len(meta_filtered.iloc[test_filtered]["Stage"].unique()) == 4
        assert len(meta_filtered.iloc[train_filtered]["Stage"].unique()) == 4
        # Print the first test FOVs
        # print(meta_filtered.iloc[test_filtered].index[:5])

    for ctype in features_subset.var["Cell type"].unique():
        # Query cell type's features, with the same NaN rows removed as in Y
        Xc = features_subset[:,features_subset.var["Cell type"] == ctype].X
        Xc = np.delete(Xc, idx_exclude, 0)

        scores = cross_val_score(
            XGBRegressor(  
                        objective="reg:absoluteerror",
                        n_estimators=250, 
                        max_depth=3, 
                        random_state=0,
                        tree_method="hist", 
                        device="cuda"),
            Xc,
            Y,
            cv=cv_folds_fov_filtered,
            scoring="r2"
            )
        # Report only cross-cell-type pairs above the 0.15 median-R2 threshold
        if (np.median(scores) > 0.15) and (ref_ctype != ctype):
            print(ref_ctype, ctype, np.median(scores))

Endothelial_cell CD4_lymphocyte 0.1508324862899236
Endothelial_cell Fibroblast 0.19694528016637208
Endothelial_cell Macrophage 0.1692838781141337
CD4_lymphocyte Endothelial_cell 0.2111337366578061
CD4_lymphocyte Fibroblast 0.19090438042737123
CD4_lymphocyte Macrophage 0.2496096337795684
Fibroblast Endothelial_cell 0.18428768371359266
Fibroblast CD4_lymphocyte 0.17922579469274835
Fibroblast Macrophage 0.1573880283306985
Other_immune_cell Monocyte 0.20162066752988284
Monocyte Other_immune_cell 0.27939617407730777
Monocyte Macrophage 0.2352695816361389
Monocyte Cytotoxic_lymphocyte 0.15347203232659767
Macrophage Endothelial_cell 0.2549663448616054
Macrophage CD4_lymphocyte 0.21556828441147202
Macrophage Fibroblast 0.22015675401407248


## Main model selection loop

In [None]:
def process_fold(X, Y, train, test, n_estimators = 250, n_depth = 3):
    """Fit an XGBoost regressor on one CV fold and return its test R2.

    Args:
        X: predictor array, indexable by the `train` / `test` index arrays.
        Y: target array, indexed the same way as `X`.
        train: indices of the training rows.
        test: indices of the test rows.
        n_estimators: number of boosting rounds passed to XGBRegressor.
        n_depth: passed to XGBRegressor as `max_depth`.

    Returns:
        The value of `r2_score(Y[test], predictions on X[test])`.
    """
    # Absolute-error regression with fixed seed; hyper-parameters come from the caller
    model_params = dict(
        objective="reg:absoluteerror",
        n_estimators=n_estimators,
        max_depth=n_depth,
        random_state=0,
        tree_method="hist",
        device="cuda",
    )
    regressor = XGBRegressor(**model_params)

    # Train on the fold's training rows only
    regressor.fit(X[train], Y[train])

    # Score the held-out rows
    return r2_score(Y[test], regressor.predict(X[test]))

In [None]:
def test_parameters(X, Y, cv_folds, file_path, estimators, depth, verbose = False):
    scores = [0 for _ in cv_folds]
    for i, (train, test) in enumerate(cv_folds):
        scores[i] = process_fold(X, Y, train, test, n_estimators=estimators, n_depth=depth)
        if scores[i] <= 0:
            if verbose:
                print("Bad performance, skipping.", file_path, estimators, depth)
            return -1
    mean_score = np.median(scores)
    # Append to f1 score log file
    with open(file_path, "a") as f:
        f.write(f"{estimators},{depth},{mean_score}\n")
    
    if verbose:
        print(file_path, estimators, depth, mean_score)
    return mean_score

In [None]:
# Smoke test on a single fold, using the arrays and fold indices currently in the kernel
process_fold(X=Xc, Y=Y, train=train_filtered, test=test_filtered)

0.9623933669951212

In [None]:
# Smoke test of the full fold loop; the score is appended to a throw-away log file
test_parameters(
    Xc,
    Y,
    cv_folds_fov_filtered,
    "../../data/ignoreme.txt",
    estimators=250,
    depth=3,
    verbose=True,
)

../../data/ignoreme.txt 250 3 0.9785541503630105


0.9785541503630105

In [None]:
def test_celltype_pair(ref_ctype, query_ctype, n_jobs=28):
    """Grid-search XGBoost settings for predicting one cell type's features from another's.

    Reads the notebook globals `features_subset`, `meta_per_fov` and
    `cv_folds_fov`. Rows whose reference features contain NaN are removed from
    both the arrays and the CV folds. Every (n_estimators, max_depth)
    combination is then evaluated with `test_parameters`, which appends its
    result to `../../data/model_celltypes_{ref_ctype}_{query_ctype}.csv`.

    Args:
        ref_ctype: cell type whose features are the prediction target.
        query_ctype: cell type whose features are the predictors.
        n_jobs: number of joblib workers (default 28, the previous fixed value).

    Returns:
        None. Results are only written to the CSV log.
    """
    # joblib is imported in a later cell of this notebook; importing here keeps
    # the function independent of cell execution order
    from joblib import Parallel, delayed

    cell_type_of_feature = features_subset.var["Cell type"]
    Y = features_subset[:, cell_type_of_feature == ref_ctype].X

    # Row positions where Y contains at least one NaN; these rows are removed
    idx_exclude = np.where(np.sum(np.isnan(Y), axis=1))[0]
    Y = np.delete(Y, idx_exclude, 0)

    # Create a mapping from old to new indices after exclusion
    valid_indices = np.setdiff1d(np.arange(features_subset.shape[0]),
                                 idx_exclude)
    old_to_new = {old: new for new, old in enumerate(valid_indices)}

    # Per-FOV metadata for exactly the rows kept in Y
    meta_filtered = meta_per_fov.loc[meta_per_fov["inner"]]
    meta_filtered = meta_filtered.iloc[valid_indices]

    # Also exclude from the CV folds
    excluded_rows = set(idx_exclude.tolist())
    cv_folds_fov_filtered = []
    for train, test in cv_folds_fov:
        # Map old indices to new indices in the filtered array
        train_filtered = np.array([old_to_new[idx] for idx in train if idx not in excluded_rows])
        test_filtered = np.array([old_to_new[idx] for idx in test if idx not in excluded_rows])
        cv_folds_fov_filtered.append((train_filtered, test_filtered))

        # All stages should be present in both train and test
        assert len(meta_filtered.iloc[test_filtered]["Stage"].unique()) == 4
        assert len(meta_filtered.iloc[train_filtered]["Stage"].unique()) == 4

    # Query cell type's features, with the same NaN rows removed as in Y
    Xc = features_subset[:, cell_type_of_feature == query_ctype].X
    Xc = np.delete(Xc, idx_exclude, 0)

    # Full grid: n_estimators in 15, 25, ..., 505 crossed with max_depth in 3, 5, 7, 9
    estimator_range = np.arange(15, 511, 10)
    depth_range = np.arange(3, 10, 2)
    estimator_range, depth_range = np.meshgrid(estimator_range, depth_range)
    Parallel(n_jobs=n_jobs)(delayed(test_parameters)(Xc, Y, cv_folds_fov_filtered,
                    f"../../data/model_celltypes_{ref_ctype}_{query_ctype}.csv",
                     estimators = r_estimators, depth = r_depth,
                     verbose = False)
                    for r_estimators, r_depth in zip(estimator_range.ravel(), depth_range.ravel()))
    

In [None]:
from joblib import Parallel, delayed
import warnings, logging

In [None]:
# Run the parameter sweep for every ordered (reference, query) pair, including a cell type with itself
all_cell_types = features.var["Cell type"].unique()
for c1 in all_cell_types:
    for c2 in all_cell_types:
        test_celltype_pair(c1, c2)

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device or