## Prepare MuData object
We want to prepare a MuData object where the different views represent the different cell types profiled, each row is one sample (patient) and each column is one feature derived from the MIBI images and previously used to predict cancer stage (see *StageXGB.ipynb*).

In [1]:
import pandas as pd
import numpy as np
import muon as mu
import scanpy as sc
from time import ctime

  from .autonotebook import tqdm as notebook_tqdm


## 1 – Cell type composition
As we now stratify data per cell type, this will lead to a single value per FOV in each view: the proportion of cells in that FOV belonging to the corresponding cell type.

In [10]:
# Load the per-cell table; the first CSV column becomes the row index
cell_table_path = "../../data/cell_table_with_types_stage.csv"
cell_table = pd.read_csv(cell_table_path, index_col=0)

  cell_table = pd.read_csv("../../data/cell_table_with_types_stage.csv",


In [3]:
# Marker columns aggregated throughout the notebook
metab_markers = [
    'CA9', 'CD98', 'CytC', 'MCT1', 'ASCT2', 'LDH', 'GS', 'GLS',
    'ATP5A', 'CS', 'PKM2', 'GLUT1', 'ARG1', 'CPT1A', 'Ki67',
]

# Restrict to cells annotated as "Cancer_cell": marker values and per-cell metadata
is_cancer_cell = cell_table["consensus"] == "Cancer_cell"
df = cell_table.loc[is_cancer_cell, metab_markers]
meta = cell_table.loc[is_cancer_cell, ["Stage", "fov"]]

# FOVs with 20 or fewer epithelial cells are dropped, as they might only add noise
epi_cells_per_fov = meta["fov"].value_counts()
sparse_epi_fovs = epi_cells_per_fov[epi_cells_per_fov <= 20].index
in_dense_fov = ~meta["fov"].isin(sparse_epi_fovs)
df = df.loc[in_dense_fov]
meta = meta.loc[in_dense_fov]

# Boolean array (one entry per remaining cell) flagging well-annotated stages
well_annotated_stages = ["Colon-no.", "pT1", "pT2", "pT3", "pT4"]
epithelial_subset = meta["Stage"].isin(well_annotated_stages).values

# Per-FOV summaries: mean marker value, and the first Stage entry seen in each FOV
df_per_fov = (
    df.assign(fov=meta["fov"])
    .loc[epithelial_subset]
    .groupby("fov")
    .mean()
)
meta_per_fov = meta.loc[epithelial_subset].groupby("fov").first()

In [5]:
# Cells of each consensus type per FOV; types absent from a FOV count as 0
cell_type_counts = (
    cell_table.groupby("fov")["consensus"]
    .value_counts()
    .unstack()
    .fillna(0)
)
# Normalize by the number of cells in each FOV
total_cells_per_fov = cell_type_counts.sum(axis=1)
cell_type_proportions = cell_type_counts.div(total_cells_per_fov, axis=0)
# Match to metadata and kept FOVs
cell_type_proportions = cell_type_proportions.loc[meta_per_fov.index]
# Fraction of kept FOVs in which each type is present (some types are rare)
(cell_type_proportions > 0).sum(axis=0) / cell_type_proportions.shape[0]

consensus
APC                  0.801418
B_cell               0.189125
CAF                  0.959811
CD163_Macrophage     0.879433
CD4_Tcell            0.943262
CD68_Macrophage      0.950355
CD8_Tcell            0.869976
Cancer_cell          1.000000
Endothelial_cell     0.983452
Monocyte             0.992908
NK_cell              0.782506
Neutrophil           0.808511
Other_immune_cell    0.983452
T_reg_cell           0.652482
Unclear              1.000000
dtype: float64

In [6]:
# We redefine more balanced classes
types_of_interest = dict(
    Other_immune_cell = ["APC", "B_cell", "Neutrophil", "Other_immune_cell"],
    Fibroblast = ["CAF"],
    Macrophage = ["CD163_Macrophage", "CD68_Macrophage"],
    CD4_lymphocyte = ["CD4_Tcell", "T_reg_cell"],
    Epithelial_cell = ["Cancer_cell"],
    Endothelial_cell = ["Endothelial_cell"],
    Monocyte = ["Monocyte"],
    Cytotoxic_lymphocyte = ["NK_cell", "CD8_Tcell"],
)

Could be stored as AnnData directly:
```Python
features_ad = {}
for type in types_of_interest.keys():
    features_ad[type] = mu.AnnData(cell_type_proportions.loc[:,types_of_interest[type]].sum(axis=1).to_numpy().reshape(-1, 1))
    # Keep track of the name of the features and observations
    features_ad[type].obs.index = cell_type_proportions.index.to_list()
    features_ad[type].var.index = ["Proportion"]
```
Alternatively, we keep each cell-type dataset as a `DataFrame` to easily join additional features in the following sections.

In [7]:
# One single-column DataFrame per grouped cell type, indexed by FOV.
# The loop variable is deliberately not called `type`, which would shadow the
# builtin for every later cell of the notebook.
features_ad = {}
for group_name, subtypes in types_of_interest.items():
    # Proportion of the grouped class = sum of the proportions of its subtypes
    group_proportion = cell_type_proportions.loc[:, subtypes].sum(axis=1)
    features_ad[group_name] = pd.DataFrame(
        group_proportion.to_numpy().reshape(-1, 1),
        index=cell_type_proportions.index.to_list(),
        columns=["proportion"],
    )

## 2 – Metabolic markers

In [13]:
# Append, for every grouped cell type, the per-FOV median of each marker column.
# `join` aligns on the FOV index, so FOVs without any cell of the group get NaN.
# (Loop variable renamed from `type` to stop shadowing the builtin.)
for group_name, subtypes in types_of_interest.items():
    in_group = cell_table["consensus"].isin(subtypes)
    marker_medians = (
        cell_table.loc[in_group, metab_markers]
        .groupby(cell_table["fov"])
        .median()
    )
    features_ad[group_name] = features_ad[group_name].join(marker_medians)

## 3 – MISTy features
Within a FOV, is one cell type a good predictor of another cell type?

In [14]:
misty_features_raw = pd.read_csv("../../data/misty_lineage_features.csv", index_col=0)
# Subset to top 100 most common features.
# Count, per column, the rows holding a value > 0. The reduction axis is given
# explicitly: `np.sum` on a DataFrame goes through an implicit-axis reduction
# that recent pandas versions warn about.
nonzero_counts = (misty_features_raw > 0).sum(axis=0)
# Column positions ordered by decreasing count; keep the first 100
top_feature_positions = np.argsort(-nonzero_counts.to_numpy())[:100]
misty_features = misty_features_raw.iloc[:, top_feature_positions]

  return reduction(axis=axis, out=out, **passkwargs)


In [15]:
# Attach to each grouped cell type the MISTy columns that mention one of its
# subtypes right after an "_l." or "_p." marker.
# NOTE(review): the meaning of the "_l." / "_p." markers is not visible in this
# notebook — confirm against the MISTy feature export.
for group_name, subtypes in types_of_interest.items():
    for subtype in subtypes:
        for marker in ("_l.", "_p."):
            # regex=False: the "." must be matched literally, not as "any character"
            is_match = misty_features.columns.str.contains(f"{marker}{subtype}", regex=False)
            features_ad[group_name] = features_ad[group_name].join(misty_features.loc[:, is_match])

## 4 – Kasumi features
Kasumi results are not defined at the level of individual cells nor stratified by cell types and are not included in the factor analysis. 

## 5 – Morphological features

In [16]:
# Use class to define local scope and run the code directly
class Scope:
    # The class body runs immediately; it is only used to keep the temporary
    # names below out of the notebook's global namespace.

    # Step 0: Define parameters
    morpho_features = ['eccentricity', 'perimeter', 'area']

    # Loop variable is not named `type`, to avoid shadowing the builtin
    for group_name, subtypes in types_of_interest.items():
        print(f"{ctime()} – Processing {group_name}")
        # Step 1: Extract relevant cell data (cells of this group, in kept FOVs only)
        X = cell_table.loc[cell_table.consensus.isin(subtypes), morpho_features + ["fov"]]
        X = X.loc[X.fov.isin(df_per_fov.index)]

        # Step 2: Aggregate per FOV; std columns get the "_std" suffix
        per_fov = X.groupby("fov")
        df_morpho_per_fov = per_fov.mean().join(per_fov.std(), rsuffix="_std")

        # Step 3: Store results in multi-feature DataFrame object
        features_ad[group_name] = features_ad[group_name].join(df_morpho_per_fov)

Tue Jun 10 15:41:31 2025 – Processing Other_immune_cell
Tue Jun 10 15:41:31 2025 – Processing Fibroblast
Tue Jun 10 15:41:31 2025 – Processing Macrophage
Tue Jun 10 15:41:31 2025 – Processing CD4_lymphocyte
Tue Jun 10 15:41:31 2025 – Processing Epithelial_cell
Tue Jun 10 15:41:31 2025 – Processing Endothelial_cell
Tue Jun 10 15:41:31 2025 – Processing Monocyte
Tue Jun 10 15:41:31 2025 – Processing Cytotoxic_lymphocyte


In [26]:
# Percentage of samples with missing values in each cell type
# Fraction of samples (rows) whose `area` feature is missing, per cell type
for typ, type_features in features_ad.items():
    n_missing = type_features["area"].isna().sum()
    print(typ, n_missing / len(type_features))

Other_immune_cell 0.0070921985815602835
Fibroblast 0.04018912529550828
Macrophage 0.01182033096926714
CD4_lymphocyte 0.04964539007092199
Epithelial_cell 0.0
Endothelial_cell 0.016548463356973995
Monocyte 0.0070921985815602835
Cytotoxic_lymphocyte 0.0591016548463357


## 6 – Compile and export

In [11]:
# Visualize the results
# Preview the first rows of every per-cell-type feature table.
# (Loop variable renamed from `type` to stop shadowing the builtin.)
for group_name in types_of_interest:
    print(f"{ctime()} – Visualizing {group_name}")
    print(features_ad[group_name].head())

Wed Apr 23 13:09:20 2025 – Visualizing Other_immune_cell
     proportion       CA9      CD98      CytC      MCT1     ASCT2       LDH  \
A1a    0.016150  0.029697  0.153992  0.003456  0.000967  0.052040  0.014728   
A1c    0.042797  0.241808  0.029671  0.005882  0.199735  0.035947  0.156672   
A1d    0.065589  0.080167  0.188568  0.003726  0.039115  0.122691  0.068015   
A1e    0.031785  0.071631  0.211533  0.072472  0.028754  0.091640  0.100465   
A1f    0.041812  0.070783  0.116316  0.004614  0.051783  0.048986  0.151857   

           GS       GLS     ATP5A  ...  \
A1a  0.108009  0.013618  0.018946  ...   
A1c  0.056366  0.027748  0.054056  ...   
A1d  0.117775  0.030195  0.022767  ...   
A1e  0.166585  0.053973  0.074810  ...   
A1f  0.074441  0.031381  0.036152  ...   

     paraview.120_p.Other_immune_cell_Monocyte  \
A1a                                        0.0   
A1c                                        0.0   
A1d                                        0.0   
A1e            

In [12]:
# Wrap each per-cell-type feature table in an AnnData, then bundle them as the
# named views of a single MuData object
views = {}
for view_name, view_features in features_ad.items():
    views[view_name] = mu.AnnData(view_features)
mdata = mu.MuData(views)

  self._update_attr("var", axis=0, join_common=join_common)
  self._update_attr("obs", axis=1, join_common=join_common)


In [13]:
# Keep track of clinical metadata
# Keep track of clinical metadata.
# Check the length first: comparing indexes of different lengths raises a
# ValueError rather than failing the assertion with a readable message.
assert len(meta_per_fov.index) == len(mdata.obs.index) and np.all(
    meta_per_fov.index == mdata.obs.index
), "FOV order in meta_per_fov does not match mdata.obs"
mdata.obs["Stage"] = meta_per_fov["Stage"]

In [14]:
# Write the MuData object to an .h5mu file
celltype_features_path = "../../data/celltype_features.h5mu"
mdata.write_h5mu(celltype_features_path)

  self._update_attr("var", axis=0, join_common=join_common)
  self._update_attr("obs", axis=1, join_common=join_common)


## 7 – Additional features
More cell-type-level features could be included, such as mean and variance of functional markers relevant for a cell type.  
This could be a good opportunity to include PD1 (T cells), STING1 and PDL1 (all cell types?).  
For all cell types, we could include Ki67, MSH2, MSH6.
What about DCN? Images are low-intensity and noisy and considered uninformative.

In [15]:
# Mean of the PD1 column over all cells of each grouped cell type (all FOVs pooled).
# (Loop variable renamed from `type` to stop shadowing the builtin.)
for group_name, subtypes in types_of_interest.items():
    print(f"{ctime()} – Expression in {group_name}")
    print(cell_table.loc[cell_table["consensus"].isin(subtypes), "PD1"].mean())

Wed Apr 23 13:09:25 2025 – Expression in Other_immune_cell
0.07829730542952233
Wed Apr 23 13:09:25 2025 – Expression in Fibroblast
0.02036398851607919
Wed Apr 23 13:09:25 2025 – Expression in Macrophage
0.07754504306914248
Wed Apr 23 13:09:25 2025 – Expression in CD4_lymphocyte
0.21412038656954321
Wed Apr 23 13:09:25 2025 – Expression in Epithelial_cell
0.09137567799882322
Wed Apr 23 13:09:25 2025 – Expression in Endothelial_cell
0.06979465511900515
Wed Apr 23 13:09:25 2025 – Expression in Monocyte
0.05747557888268267
Wed Apr 23 13:09:25 2025 – Expression in Cytotoxic_lymphocyte
0.1618491228117907


In [16]:
# PD1 is only added to the two lymphocyte groups, as a per-FOV median.
# (Loop variable renamed from `type` to stop shadowing the builtin.)
for group_name in ["CD4_lymphocyte", "Cytotoxic_lymphocyte"]:
    in_group = cell_table["consensus"].isin(types_of_interest[group_name])
    pd1_median = cell_table.loc[in_group, "PD1"].groupby(cell_table["fov"]).median()
    features_ad[group_name] = features_ad[group_name].join(pd1_median)

In [17]:
# Mean of the PDL1 column over all cells of each grouped cell type (all FOVs pooled).
# (Loop variable renamed from `type` to stop shadowing the builtin.)
for group_name, subtypes in types_of_interest.items():
    print(f"{ctime()} – Expression in {group_name}")
    print(cell_table.loc[cell_table["consensus"].isin(subtypes), "PDL1"].mean())

Wed Apr 23 13:09:26 2025 – Expression in Other_immune_cell
0.12091111463322718
Wed Apr 23 13:09:26 2025 – Expression in Fibroblast
0.030203571452527413
Wed Apr 23 13:09:26 2025 – Expression in Macrophage
0.16249753025669383
Wed Apr 23 13:09:26 2025 – Expression in CD4_lymphocyte
0.11350441258574195
Wed Apr 23 13:09:26 2025 – Expression in Epithelial_cell
0.06635339089230838
Wed Apr 23 13:09:26 2025 – Expression in Endothelial_cell
0.08904043214021373
Wed Apr 23 13:09:26 2025 – Expression in Monocyte
0.12649067323902882
Wed Apr 23 13:09:26 2025 – Expression in Cytotoxic_lymphocyte
0.09267511686315565


In [18]:
# Standard deviation of the PDL1 column over all cells of each grouped cell type.
# (Loop variable renamed from `type` to stop shadowing the builtin.)
for group_name, subtypes in types_of_interest.items():
    print(f"{ctime()} – Expression in {group_name}")
    print(cell_table.loc[cell_table["consensus"].isin(subtypes), "PDL1"].std())

Wed Apr 23 13:09:27 2025 – Expression in Other_immune_cell
0.14291770399929662
Wed Apr 23 13:09:27 2025 – Expression in Fibroblast
0.046669879899428376
Wed Apr 23 13:09:27 2025 – Expression in Macrophage
0.1779816841313718
Wed Apr 23 13:09:27 2025 – Expression in CD4_lymphocyte
0.1330325918971477
Wed Apr 23 13:09:27 2025 – Expression in Epithelial_cell
0.07340557715989628
Wed Apr 23 13:09:27 2025 – Expression in Endothelial_cell
0.1021410369765828
Wed Apr 23 13:09:27 2025 – Expression in Monocyte
0.17149666072296801
Wed Apr 23 13:09:27 2025 – Expression in Cytotoxic_lymphocyte
0.10927458079883498


In [19]:
# PDL1 is added as a per-FOV median to every group except the two lymphocyte groups.
# (Loop variable renamed from `type` to stop shadowing the builtin.)
for group_name, subtypes in types_of_interest.items():
    if group_name in ["CD4_lymphocyte", "Cytotoxic_lymphocyte"]:
        # Skip T cells
        continue
    in_group = cell_table["consensus"].isin(subtypes)
    pdl1_median = cell_table.loc[in_group, "PDL1"].groupby(cell_table["fov"]).median()
    features_ad[group_name] = features_ad[group_name].join(pdl1_median)

In [20]:
# Mean of the STING1 column over all cells of each grouped cell type (all FOVs pooled).
# (Loop variable renamed from `type` to stop shadowing the builtin.)
for group_name, subtypes in types_of_interest.items():
    print(f"{ctime()} – Expression in {group_name}")
    print(cell_table.loc[cell_table["consensus"].isin(subtypes), "STING1"].mean())

Wed Apr 23 13:09:30 2025 – Expression in Other_immune_cell
0.22799981269187136
Wed Apr 23 13:09:30 2025 – Expression in Fibroblast
0.1344997992812595
Wed Apr 23 13:09:30 2025 – Expression in Macrophage
0.36416741189387775
Wed Apr 23 13:09:30 2025 – Expression in CD4_lymphocyte
0.3708812559006673
Wed Apr 23 13:09:30 2025 – Expression in Epithelial_cell
0.1997073624666894
Wed Apr 23 13:09:30 2025 – Expression in Endothelial_cell
0.3575580239285345
Wed Apr 23 13:09:30 2025 – Expression in Monocyte
0.2982399147082395
Wed Apr 23 13:09:30 2025 – Expression in Cytotoxic_lymphocyte
0.221863044902187


In [21]:
# STING1, MSH2 and MSH6 are added to every group, as per-FOV medians.
# (Loop variable renamed from `type` to stop shadowing the builtin.)
for group_name, subtypes in types_of_interest.items():
    in_group = cell_table["consensus"].isin(subtypes)
    functional_medians = (
        cell_table.loc[in_group, ["STING1", "MSH2", "MSH6"]]
        .groupby(cell_table["fov"])
        .median()
    )
    features_ad[group_name] = features_ad[group_name].join(functional_medians)

In [22]:
# Assemble into MuData
mdata = mu.MuData({k:mu.AnnData(v) for k,v in features_ad.items()})
# Keep track of clinical metadata
assert np.all(meta_per_fov.index == mdata.obs.index)
mdata.obs["Stage"] = meta_per_fov.Stage
# Export to h5mu, including functional markers
mdata.write_h5mu("../../data/celltype_features_with_functional_markers.h5mu")

  self._update_attr("var", axis=0, join_common=join_common)
  self._update_attr("obs", axis=1, join_common=join_common)
  self._update_attr("var", axis=0, join_common=join_common)
  self._update_attr("obs", axis=1, join_common=join_common)
