## Prepare MuData object
We want to prepare a MuData object where the different views represent the different cell types profiled, each row is one sample (patient) and each column is one feature derived from the MIBI images and previously used to predict cancer stage (see *StageXGB.ipynb*).

In [1]:
import pandas as pd
import numpy as np
import muon as mu
import scanpy as sc
from time import ctime

  from .autonotebook import tqdm as notebook_tqdm


## 1 – Cell type composition
As we now stratify the data per cell type, this leads to a single value per FOV in each view: the proportion of cells in that FOV belonging to this particular type.

In [2]:
# Load the per-cell table; the first CSV column is used as the row index.
cell_table = pd.read_csv(
    "../../data/cell_table_with_types_stage.csv",
    index_col=0,
)

  cell_table = pd.read_csv("../../data/cell_table_with_types_stage.csv",


In [3]:
metab_markers = [
    'CA9', 'CD98', 'CytC', 'MCT1', 'ASCT2', 'LDH', 'GS', 'GLS',
    'ATP5A', 'CS', 'PKM2', 'GLUT1', 'ARG1', 'CPT1A', 'Ki67',
]

# Restrict to cancer/epithelial cells: marker intensities in `df`,
# stage and FOV labels for the same rows in `meta`.
is_cancer = cell_table.consensus == "Cancer_cell"
df = cell_table.loc[is_cancer, metab_markers]
meta = cell_table.loc[is_cancer, ["Stage", "fov"]]

# FOVs with 20 or fewer epithelial cells are dropped, as they might only add noise.
cells_per_fov = meta.fov.value_counts()
sparse_epi_fovs = cells_per_fov[cells_per_fov <= 20].index
in_dense_fov = ~meta.fov.isin(sparse_epi_fovs)
df = df.loc[in_dense_fov]
meta = meta.loc[in_dense_fov]

# Boolean mask (NumPy array, positionally aligned with `df`/`meta`) of cells
# whose stage is one of the well-annotated ones.
epithelial_subset = meta["Stage"].isin(["Colon-no.", "pT1", "pT2", "pT3", "pT4"]).values

# Per-FOV mean of every marker, and the first Stage value seen in each FOV.
df_per_fov = (
    df.assign(fov=meta["fov"])
    .loc[epithelial_subset]
    .groupby("fov")
    .mean()
)
meta_per_fov = meta.loc[epithelial_subset].groupby("fov").first()

In [4]:
# Number of cells of every consensus type in each FOV (types absent from a FOV -> 0)
cells_per_type = (
    cell_table.groupby("fov")["consensus"]
    .value_counts()
    .unstack()
    .fillna(0)
)
# Divide each row by the total number of cells in that FOV
fov_totals = cells_per_type.sum(axis=1)
cell_type_proportions = cells_per_type.div(fov_totals, axis=0)
# Keep only the retained FOVs, in the order of the per-FOV metadata
cell_type_proportions = cell_type_proportions.loc[meta_per_fov.index]
# Fraction of retained FOVs in which each type occurs at all
# (some types are only present in a minority of FOVs)
n_fovs = len(cell_type_proportions)
(cell_type_proportions > 0).sum(axis="rows") / n_fovs

consensus
APC                  0.801418
B_cell               0.189125
CAF                  0.959811
CD163_Macrophage     0.879433
CD4_Tcell            0.943262
CD68_Macrophage      0.950355
CD8_Tcell            0.869976
Cancer_cell          1.000000
Endothelial_cell     0.983452
Monocyte             0.992908
NK_cell              0.782506
Neutrophil           0.808511
Other_immune_cell    0.983452
T_reg_cell           0.652482
Unclear              1.000000
dtype: float64

In [5]:
# Merge the consensus annotations into broader, more balanced classes.
# Keys are the merged class names (one view each); values list the
# `consensus` labels pooled into that class.
types_of_interest = {
    "Other_immune_cell": ["APC", "B_cell", "Neutrophil", "Other_immune_cell"],
    "Fibroblast": ["CAF"],
    "Macrophage": ["CD163_Macrophage", "CD68_Macrophage"],
    "CD4_lymphocyte": ["CD4_Tcell", "T_reg_cell"],
    "Epithelial_cell": ["Cancer_cell"],
    "Endothelial_cell": ["Endothelial_cell"],
    "Monocyte": ["Monocyte"],
    "Cytotoxic_lymphocyte": ["NK_cell", "CD8_Tcell"],
}

Could be stored as AnnData directly:
```Python
features_ad = {}
for type in types_of_interest.keys():
    features_ad[type] = mu.AnnData(cell_type_proportions.loc[:,types_of_interest[type]].sum(axis=1).to_numpy().reshape(-1, 1))
    # Keep track of the name of the features and observations
    features_ad[type].obs.index = cell_type_proportions.index.to_list()
    features_ad[type].var.index = ["Proportion"]
```
Alternatively, we keep each cell-type dataset as a `DataFrame` to easily join additional features in the following sections.

In [6]:
# One single-column DataFrame per merged cell type: the summed proportion of
# its member consensus types in each FOV. Despite the name, `features_ad`
# holds plain DataFrames at this point.
features_ad = {}
# The loop variable is deliberately not called `type`: at notebook scope that
# would rebind the builtin for every later cell.
for cell_type, members in types_of_interest.items():
    merged_proportion = cell_type_proportions.loc[:, members].sum(axis=1)
    features_ad[cell_type] = pd.DataFrame(
        {"proportion": merged_proportion.to_numpy()},
        # Plain list of FOV labels, so the index carries no name
        index=merged_proportion.index.to_list(),
    )

## 2 – Metabolic clusters

Note that the metabolic clusters are defined per cell type and are computed over the whole dataset, rather than being limited to the inner cross-validation folds of the predictive models. Even for cancer cells, the resulting clusters will therefore differ from the clusters previously described.

In [7]:
# Use class to define local scope and run the code directly
class Scope:
    """Throwaway namespace for the metabolic-cluster features.

    The class body executes immediately; its only lasting effect is to add
    `metabolic_cluster_<k>` columns (per-FOV fraction of cells in Leiden
    cluster k) to every DataFrame in `features_ad`.
    """

    # Step 0: Define parameters
    n_neighbors = 5
    resolution = 0.23
    cluster_prefix = "metabolic_cluster_"

    for cell_type, members in types_of_interest.items():
        print(f"{ctime()} – Processing {cell_type}")
        # Step 1: Extract relevant cell data (cells of this type, in retained FOVs only)
        X = cell_table.loc[cell_table.consensus.isin(members), metab_markers + ["fov"]]
        X = X.loc[X.fov.isin(df_per_fov.index)]

        # Step 2: Define metabolic clusters
        ad = mu.AnnData(X.drop(columns=["fov"]))
        sc.pp.neighbors(ad, n_neighbors=n_neighbors)
        # scanpy warns that the default Leiden backend will change; pin the
        # backend currently in use so cluster assignments stay comparable
        # across scanpy upgrades.
        sc.tl.leiden(ad, resolution=resolution, flavor="leidenalg")
        # Cluster labels as integers, in the same row order as X
        labels = ad.obs["leiden"].astype(int).to_numpy()

        # Step 3: Aggregate per FOV (cluster counts; clusters absent from a FOV -> 0)
        fov_cluster_composition = (
            pd.DataFrame({"fov": X["fov"].to_numpy(), "Cluster": labels})
            .groupby("fov")["Cluster"]
            .value_counts()
            .unstack()
            .fillna(0)
        )
        # Normalize by the number of cells (of this type) in each FOV
        fov_cluster_composition = fov_cluster_composition.div(
            fov_cluster_composition.sum(axis=1), axis=0
        ).add_prefix(cluster_prefix)

        # Step 4: Store results in multi-feature DataFrame object.
        # Cluster columns left over from a previous run of this cell are dropped
        # first; otherwise `join` raises on the overlapping column names.
        # FOVs without any cell of this type get NaN in the cluster columns.
        features = features_ad[cell_type]
        stale = features.columns[features.columns.astype(str).str.startswith(cluster_prefix)]
        features_ad[cell_type] = features.drop(columns=stale).join(fov_cluster_composition)

Fri Jan 31 17:52:35 2025 – Processing Other_immune_cell



 To achieve the future defaults please pass: flavor="igraph" and n_iterations=2.  directed must also be False to work with igraph's implementation.
  sc.tl.leiden(ad, resolution=resolution)


Fri Jan 31 17:53:00 2025 – Processing Fibroblast




Fri Jan 31 17:53:33 2025 – Processing Macrophage




Fri Jan 31 17:53:36 2025 – Processing CD4_lymphocyte




Fri Jan 31 17:53:39 2025 – Processing Epithelial_cell




Fri Jan 31 17:56:54 2025 – Processing Endothelial_cell




Fri Jan 31 17:57:01 2025 – Processing Monocyte




Fri Jan 31 17:57:06 2025 – Processing Cytotoxic_lymphocyte




## 3 – MISTy features
Within a FOV, is one cell type a good predictor of another cell type?

In [8]:
misty_features_raw = pd.read_csv("../../data/misty_lineage_features.csv", index_col=0)
# Subset to the (up to) 100 most common features, i.e. the columns that are
# strictly positive in the largest number of rows.
# The count uses an explicit column-wise `.sum(axis=0)`: calling `np.sum` on a
# DataFrame relies on a deprecated pandas reduction that is slated to return a
# single scalar instead of one value per column.
n_positive = (misty_features_raw > 0).sum(axis=0)
top_columns = np.argsort(-n_positive.to_numpy())[:100]
misty_features = misty_features_raw.iloc[:, top_columns]

  return reduction(axis=axis, out=out, **passkwargs)


In [9]:
# Attach to each merged cell type the MISTy columns that mention one of its subtypes.
for cell_type, subtypes in types_of_interest.items():
    # Collect matching columns in the same order as before (per subtype:
    # "_l." matches first, then "_p." matches), but each column only once.
    # A column that matches several patterns or subtypes would otherwise be
    # joined twice and `join` would raise on the overlapping name.
    selected = []
    for subtype in subtypes:
        # NOTE(review): these are regular expressions matched as substrings, so
        # "." matches any character — confirm this is intended for the MISTy names.
        for pattern in (f"_l.{subtype}", f"_p.{subtype}"):
            hits = misty_features.columns[misty_features.columns.str.contains(pattern)]
            for column in hits:
                if column not in selected:
                    selected.append(column)

    # Drop copies left over from a previous run of this cell so it can be re-run.
    features = features_ad[cell_type]
    stale = features.columns.intersection(selected)
    features_ad[cell_type] = features.drop(columns=stale).join(misty_features.loc[:, selected])

## 4 – Kasumi features
Kasumi results are not defined at the level of individual cells nor stratified by cell types and are not included in the factor analysis. 

## 5 – Morphological features

In [10]:
# Use class to define local scope and run the code directly
class Scope:
    """Throwaway namespace for the morphological features.

    The class body executes immediately; its only lasting effect is to add the
    per-FOV mean and standard deviation (`*_std`) of each morphological
    measurement to every DataFrame in `features_ad`.
    """

    # Step 0: Define parameters
    morpho_features = ['eccentricity', 'perimeter', 'area']

    for cell_type, members in types_of_interest.items():
        print(f"{ctime()} – Processing {cell_type}")
        # Step 1: Extract relevant cell data (cells of this type, in retained FOVs only)
        X = cell_table.loc[cell_table.consensus.isin(members), morpho_features + ["fov"]]
        X = X.loc[X.fov.isin(df_per_fov.index)]

        # Step 2: Aggregate per FOV; mean columns keep the feature name,
        # standard-deviation columns get the "_std" suffix.
        cells_by_fov = X.groupby("fov")
        df_morpho_per_fov = cells_by_fov.mean().join(cells_by_fov.std(), rsuffix="_std")

        # Step 3: Store results in multi-feature DataFrame object.
        # Columns left over from a previous run of this cell are dropped first;
        # otherwise `join` raises on the overlapping column names.
        features = features_ad[cell_type]
        stale = features.columns.intersection(df_morpho_per_fov.columns)
        features_ad[cell_type] = features.drop(columns=stale).join(df_morpho_per_fov)

Fri Jan 31 17:57:08 2025 – Processing Other_immune_cell
Fri Jan 31 17:57:08 2025 – Processing Fibroblast
Fri Jan 31 17:57:08 2025 – Processing Macrophage
Fri Jan 31 17:57:08 2025 – Processing CD4_lymphocyte
Fri Jan 31 17:57:08 2025 – Processing Epithelial_cell
Fri Jan 31 17:57:08 2025 – Processing Endothelial_cell
Fri Jan 31 17:57:08 2025 – Processing Monocyte
Fri Jan 31 17:57:08 2025 – Processing Cytotoxic_lymphocyte


## 6 – Additional features
More cell-type-level features could be included, such as mean and variance of functional markers relevant for a cell type.  
This would be a good place to include PD1 (T cells), as well as STING1 and PDL1 (all cell types?).  
For all cell types, we could include Ki67, MSH2, MSH6.
What about DCN?

In [11]:
# Mean PD1 value over all cells of each merged cell type (whole cell table).
# The loop variable is not called `type`, so the builtin is not rebound at notebook scope.
for cell_type, members in types_of_interest.items():
    print(f"{ctime()} – Expression in {cell_type}")
    print(cell_table.loc[cell_table.consensus.isin(members), "PD1"].mean())

Fri Jan 31 17:57:08 2025 – Expression in Other_immune_cell
0.07829730542952233
Fri Jan 31 17:57:08 2025 – Expression in Fibroblast
0.02036398851607919
Fri Jan 31 17:57:08 2025 – Expression in Macrophage
0.07754504306914248
Fri Jan 31 17:57:08 2025 – Expression in CD4_lymphocyte
0.21412038656954321
Fri Jan 31 17:57:08 2025 – Expression in Epithelial_cell
0.09137567799882322
Fri Jan 31 17:57:08 2025 – Expression in Endothelial_cell
0.06979465511900515
Fri Jan 31 17:57:08 2025 – Expression in Monocyte
0.05747557888268267
Fri Jan 31 17:57:08 2025 – Expression in Cytotoxic_lymphocyte
0.1618491228117907


In [12]:
# Mean PDL1 value over all cells of each merged cell type (whole cell table).
# The loop variable is not called `type`, so the builtin is not rebound at notebook scope.
for cell_type, members in types_of_interest.items():
    print(f"{ctime()} – Expression in {cell_type}")
    print(cell_table.loc[cell_table.consensus.isin(members), "PDL1"].mean())

Fri Jan 31 17:57:08 2025 – Expression in Other_immune_cell
0.12091111463322718
Fri Jan 31 17:57:08 2025 – Expression in Fibroblast
0.030203571452527413
Fri Jan 31 17:57:08 2025 – Expression in Macrophage
0.16249753025669383
Fri Jan 31 17:57:08 2025 – Expression in CD4_lymphocyte
0.11350441258574195
Fri Jan 31 17:57:08 2025 – Expression in Epithelial_cell
0.06635339089230838
Fri Jan 31 17:57:08 2025 – Expression in Endothelial_cell
0.08904043214021373
Fri Jan 31 17:57:08 2025 – Expression in Monocyte
0.12649067323902882
Fri Jan 31 17:57:08 2025 – Expression in Cytotoxic_lymphocyte
0.09267511686315565


In [13]:
# Standard deviation of PDL1 over all cells of each merged cell type (whole cell table).
# The loop variable is not called `type`, so the builtin is not rebound at notebook scope.
for cell_type, members in types_of_interest.items():
    print(f"{ctime()} – Expression in {cell_type}")
    print(cell_table.loc[cell_table.consensus.isin(members), "PDL1"].std())

Fri Jan 31 17:57:08 2025 – Expression in Other_immune_cell
0.14291770399929662
Fri Jan 31 17:57:09 2025 – Expression in Fibroblast
0.046669879899428376
Fri Jan 31 17:57:09 2025 – Expression in Macrophage
0.1779816841313718
Fri Jan 31 17:57:09 2025 – Expression in CD4_lymphocyte
0.1330325918971477
Fri Jan 31 17:57:09 2025 – Expression in Epithelial_cell
0.07340557715989628
Fri Jan 31 17:57:09 2025 – Expression in Endothelial_cell
0.1021410369765828
Fri Jan 31 17:57:09 2025 – Expression in Monocyte
0.17149666072296801
Fri Jan 31 17:57:09 2025 – Expression in Cytotoxic_lymphocyte
0.10927458079883498


In [14]:
# Mean STING1 value over all cells of each merged cell type (whole cell table).
# The loop variable is not called `type`, so the builtin is not rebound at notebook scope.
for cell_type, members in types_of_interest.items():
    print(f"{ctime()} – Expression in {cell_type}")
    print(cell_table.loc[cell_table.consensus.isin(members), "STING1"].mean())

Fri Jan 31 17:57:09 2025 – Expression in Other_immune_cell
0.22799981269187136
Fri Jan 31 17:57:09 2025 – Expression in Fibroblast
0.1344997992812595
Fri Jan 31 17:57:09 2025 – Expression in Macrophage
0.36416741189387775
Fri Jan 31 17:57:09 2025 – Expression in CD4_lymphocyte
0.3708812559006673
Fri Jan 31 17:57:09 2025 – Expression in Epithelial_cell
0.1997073624666894
Fri Jan 31 17:57:09 2025 – Expression in Endothelial_cell
0.3575580239285345
Fri Jan 31 17:57:09 2025 – Expression in Monocyte
0.2982399147082395
Fri Jan 31 17:57:09 2025 – Expression in Cytotoxic_lymphocyte
0.221863044902187


## 7 – Compile and export

In [15]:
# Visualize the results: first rows of the feature table of every cell type.
# The loop variable is not called `type`, so the builtin is not rebound at notebook scope.
for cell_type in types_of_interest:
    print(f"{ctime()} – Visualizing {cell_type}")
    print(features_ad[cell_type].head())

Fri Jan 31 17:57:09 2025 – Visualizing Other_immune_cell
     proportion  metabolic_cluster_0  metabolic_cluster_1  \
A1a    0.016150             0.040000             0.000000   
A1c    0.042797             0.042254             0.774648   
A1d    0.065589             0.028986             0.115942   
A1e    0.031785             0.230769             0.134615   
A1f    0.041812             0.020833             0.270833   

     metabolic_cluster_2  metabolic_cluster_3  metabolic_cluster_4  \
A1a             0.000000             0.600000             0.000000   
A1c             0.014085             0.084507             0.000000   
A1d             0.043478             0.420290             0.000000   
A1e             0.384615             0.076923             0.000000   
A1f             0.083333             0.333333             0.041667   

     metabolic_cluster_5  metabolic_cluster_6  metabolic_cluster_7  \
A1a             0.360000             0.000000             0.000000   
A1c            

In [16]:
# Wrap every per-cell-type feature table in an AnnData object and bundle them
# into a single MuData container, keyed by cell type.
views = {}
for view_name, feature_table in features_ad.items():
    views[view_name] = mu.AnnData(feature_table)
mdata = mu.MuData(views)

  self._update_attr("var", axis=0, join_common=join_common)
  self._update_attr("obs", axis=1, join_common=join_common)


In [17]:
# Keep track of clinical metadata
# Check the lengths before the elementwise comparison: comparing two indexes of
# different length raises a ValueError instead of failing the assertion.
fov_ids_match = (
    len(meta_per_fov.index) == len(mdata.obs.index)
    and bool(np.all(meta_per_fov.index == mdata.obs.index))
)
assert fov_ids_match, "FOVs in meta_per_fov do not match mdata.obs (content or order); cannot attach Stage"
mdata.obs["Stage"] = meta_per_fov["Stage"]

In [18]:
# Persist the multi-view object (features per cell type plus Stage in `.obs`)
output_path = "../../data/celltype_features.h5mu"
mdata.write_h5mu(output_path)

  self._update_attr("var", axis=0, join_common=join_common)
  self._update_attr("obs", axis=1, join_common=join_common)
