# Stage modelling from spatial features

In [1]:
import os
import numpy as np
import anndata as ad
import pandas as pd
from plotnine import *

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedGroupKFold, StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

from sklearn.neighbors import KNeighborsClassifier
import scanpy as sc
from joblib import Parallel, delayed
import warnings, logging

## Overall cell features

In [2]:
# Per-cell table (cell types, stage and marker columns); the first CSV column is used as the index
CELL_TABLE_CSV = "../../data/cell_table_with_types_stage.csv"
cell_table = pd.read_csv(CELL_TABLE_CSV, index_col=0)



In [3]:
metab_markers = ['CA9', 'CD98', 'CytC', 'MCT1', 'ASCT2', 'LDH', 'GS', 'GLS', 'ATP5A', 'CS', 'PKM2', 'GLUT1', 'ARG1', 'CPT1A', 'Ki67']

# Restrict to cancer/epithelial cells: metabolic markers as features, stage and FOV as metadata
is_cancer_cell = cell_table.consensus == "Cancer_cell"
df = cell_table.loc[is_cancer_cell, metab_markers]
meta = cell_table.loc[is_cancer_cell, ["Stage", "fov"]]

# FOVs with 20 or fewer epithelial cells might only add noise to the analysis: drop them
epi_cells_per_fov = meta.fov.value_counts()
sparse_epi_fovs = epi_cells_per_fov[epi_cells_per_fov <= 20].index
in_dense_fov = ~meta.fov.isin(sparse_epi_fovs)
df = df.loc[in_dense_fov]
meta = meta.loc[in_dense_fov]

# Boolean mask (numpy array, one entry per remaining cell) selecting well-annotated stages
well_annotated_stages = ["Colon-no.", "pT1", "pT2", "pT3", "pT4"]
epithelial_subset = meta["Stage"].isin(well_annotated_stages).values

We perform the prediction task only on well-annotated FOVs that contain sufficient epithelial cells (more than 20).
```python
# Note: 13 FOVs are not annotated as healthy or to a specific cancer stage
cell_table.loc[cell_table.fov.isin(meta.loc[~epithelial_subset].fov), "fov"].unique()

# Additionally, 7 FOVs do not contain epithelial cells and are therefore lost
set(cell_table.fov) - set(meta.fov)

# Finally, we also exclude 27 FOVs that contain only few epithelial cells
# The rationale is to compare on the cellular organization around the colorectal epithelium
set(sparse_epi_fovs)
```

In [4]:
# The least represented conditions are not affected
# First print: number of FOVs per stage in the full cell table.
# Second print: number of FOVs per stage after keeping only cancer cells,
# dropping sparse FOVs and restricting to the well-annotated stages.
print(cell_table.groupby("fov").first().Stage.value_counts())
print(meta.loc[epithelial_subset].groupby("fov").first().Stage.value_counts())

Stage
pT3          234
pT4          105
pT2           92
pT1           19
SCT            6
Colon-no.      5
Name: count, dtype: int64
Stage
pT3          218
pT4           97
pT2           85
pT1           18
Colon-no.      5
Name: count, dtype: int64


## Prepare cross-validation

In [5]:
# Outer loop: hold out 20% of the FOVs (stratified by stage) for validation
fov_stage_table = (
    meta.loc[epithelial_subset]
    .drop_duplicates()
    .reset_index(drop=True)
)
fov_inner, fov_val, y_inner, y_val = train_test_split(
    fov_stage_table["fov"],
    fov_stage_table["Stage"],
    test_size=0.2,
    random_state=0,
    stratify=fov_stage_table["Stage"],
)
# Flag every cell whose FOV belongs to the inner (model selection) set
meta["inner"] = meta["fov"].isin(fov_inner)
# Sanity check: this list of validation FOVs should be identical on every run
" ".join(sorted(fov_val))

'A1a A1h A2g A2i A2q A2r A3m A4e A4n A5a A5f A5q A6b A6c A6g A6m A6p A6q A6r A7f A7p A8a A8h A8m A9o B1h B1k B2a B2b B2h B2k B2o B3c B3g B4b B4g B4m B5r B6i B6q B7c B7p B7r B8b B8d B8g B8i B8l B9c B9d B9h B9m B9n B9o C1l C2a C2f C2k C3c C3h C4a C4g C4k C5a C5k C5l C6d C8a C8h D1i D1l D1m D4c D4h D5b D5h D5k D5l D6b D6c D6k D7a D8d D8h E4e'

In [6]:
n_splits = 4

cv_folds = StratifiedGroupKFold(n_splits=n_splits)

# Inner (non held-out) cells and their metadata, selected once for the whole check
inner_cells = df.loc[epithelial_subset].loc[meta.loc[epithelial_subset]["inner"]]
inner_cell_meta = meta.loc[epithelial_subset].loc[meta["inner"]]

for train, test in cv_folds.split(inner_cells,
                                  inner_cell_meta["Stage"],
                                  groups=inner_cell_meta["fov"]):
    # All stages should be present in both test and train
    for fold_part in (test, train):
        stages_in_part = inner_cell_meta.iloc[fold_part].groupby("fov")["Stage"].first().unique()
        assert len(stages_in_part) == 5
    # Print the first test FOVs
    print(inner_cell_meta.iloc[test].groupby("fov").first().index[:5])

Index(['A1d', 'A1e', 'A1l', 'A1n', 'A2e'], dtype='object', name='fov')
Index(['A1c', 'A1m', 'A1o', 'A1p', 'A1q'], dtype='object', name='fov')
Index(['A1i', 'A1r', 'A2b', 'A2c', 'A2l'], dtype='object', name='fov')
Index(['A1f', 'A1k', 'A2a', 'A2d', 'A2m'], dtype='object', name='fov')


We can split the data at the cell level to allow applications that involve processing single cells. Grouping by FOV ensures that the split is free of contamination, i.e. no FOV contributes cells to both the training and the testing set.

In [7]:
# Mean marker value per FOV, over the cells of the well-annotated subset
df_per_fov = (
    df.assign(fov=meta["fov"])
    .loc[epithelial_subset]
    .groupby("fov")
    .mean()
)
# One metadata row per FOV (first cell of each FOV)
meta_per_fov = meta.loc[epithelial_subset].groupby("fov").first()

Alternatively, we directly split the FOVs, for applications that involve FOV-level features. 
Note: we could generate independent folds, but the FOV distribution would not be identical.
We choose to directly define the folds to be the same to make results more comparable.
```Python
cv_folds_fov = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2020)
for train, test in cv_folds_fov.split(df_per_fov.loc[meta_per_fov["inner"]],
                                      meta_per_fov.loc[meta_per_fov["inner"]]["Stage"]
                                    ):
    # All stages should be present in both train and test
    assert len(meta_per_fov.loc[meta_per_fov["inner"]].iloc[test]["Stage"].unique()) == 5
    assert len(meta_per_fov.loc[meta_per_fov["inner"]].iloc[train]["Stage"].unique()) == 5
    # Print the first test FOVs
    print(meta_per_fov.loc[meta_per_fov["inner"]].iloc[test].index[:5]) 

```

In [8]:
def conv_traintest_cells_to_fov():
    """Translate the cell-level CV folds into FOV-level index arrays.

    Re-runs the grouped split of ``cv_folds`` on the inner cells and, for each
    fold, yields a ``(train, test)`` pair of positional indices into the inner
    rows of ``meta_per_fov``, so that FOV-level models can be evaluated on the
    same folds as cell-level models.
    """
    inner_cells = df.loc[epithelial_subset].loc[meta.loc[epithelial_subset]["inner"]]
    inner_cell_meta = meta.loc[epithelial_subset].loc[meta["inner"]]
    inner_fov_names = meta_per_fov.loc[meta_per_fov["inner"]].index

    cell_level_folds = cv_folds.split(inner_cells,
                                      inner_cell_meta["Stage"],
                                      groups=inner_cell_meta["fov"])
    for cell_train, cell_test in cell_level_folds:
        # FOVs that own the cells of each side of the fold
        held_out_fovs = inner_cell_meta.iloc[cell_test].groupby("fov").first().index
        fitted_fovs = inner_cell_meta.iloc[cell_train].groupby("fov").first().index
        # Positions of those FOVs among the inner rows of meta_per_fov
        fov_test_pos = np.flatnonzero(inner_fov_names.isin(held_out_fovs))
        fov_train_pos = np.flatnonzero(inner_fov_names.isin(fitted_fovs))
        yield (fov_train_pos, fov_test_pos)

# Materialise the folds as a list of (train, test) index arrays, i.e. the same
# shape of output as a `split` method, so the list can be passed as `cv=`
cv_folds_fov = list(conv_traintest_cells_to_fov())

inner_fov_meta = meta_per_fov.loc[meta_per_fov["inner"]]
for train, test in cv_folds_fov:
    # All stages should be present in both test and train
    for fold_part in (test, train):
        assert len(inner_fov_meta.iloc[fold_part]["Stage"].unique()) == 5
    # Print the first test FOVs
    print(inner_fov_meta.iloc[test].index[:5])

Index(['A1d', 'A1e', 'A1l', 'A1n', 'A2e'], dtype='object', name='fov')
Index(['A1c', 'A1m', 'A1o', 'A1p', 'A1q'], dtype='object', name='fov')
Index(['A1i', 'A1r', 'A2b', 'A2c', 'A2l'], dtype='object', name='fov')
Index(['A1f', 'A1k', 'A2a', 'A2d', 'A2m'], dtype='object', name='fov')


## Model 0: Baseline (most abundant label)

In [43]:
# Majority-class baseline: for every fold, always predict the stage that is most
# frequent among the training FOVs and score it on the test FOVs.
# The majority label is derived from the data instead of being hard-coded as the
# integer 3, which was only valid as long as pT3 was both the most abundant stage
# and the 4th label in alphabetical order. Macro F1 does not depend on how the
# labels are encoded, so the stage names are scored directly.
inner_fov_stage = meta_per_fov.loc[meta_per_fov["inner"], "Stage"]

np.mean([f1_score(inner_fov_stage.iloc[test],
                  [inner_fov_stage.iloc[train].mode().iloc[0]] * len(test),
                  average="macro")
         for train, test in cv_folds_fov])

0.13593358969663677

## Model 1: Cell type composition

In [9]:
# Number of cells of each consensus type per FOV (absent types count as 0)
fov_type_counts = (
    cell_table.groupby("fov")["consensus"]
    .value_counts()
    .unstack()
    .fillna(0)
)
# Normalize by the number of cells in each FOV
cells_per_fov = fov_type_counts.sum(axis=1)
# Match to metadata and kept FOVs
cell_type_proportions = fov_type_counts.div(cells_per_fov, axis=0).loc[meta_per_fov.index]

In [10]:
# Cross-validated macro F1 of XGBoost on the cell type proportions of the inner FOVs
inner_fovs = meta_per_fov["inner"]
inner_stage_codes = LabelEncoder().fit_transform(meta_per_fov.loc[inner_fovs]["Stage"])
composition_model = XGBClassifier(n_estimators=250, max_depth=3, device="cuda", random_state=0)

cross_val_score(composition_model,
                cell_type_proportions.loc[inner_fovs],
                inner_stage_codes,
                cv=cv_folds_fov,
                scoring='f1_macro')

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




array([0.21614339, 0.24350877, 0.19701984, 0.22308611])

For cell type composition, XGBoost seems appropriate.
```Python
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Define models and their hyperparameters
models = {
    'RandomForest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(),
    'SVM': SVC(),
    'LogisticRegression': LogisticRegression()
}

params = {
    'RandomForest': {
        'n_estimators': [50, 100, 200, 300],
        'max_depth': [3, 5, 7]
    },
    'XGBoost': {
        'n_estimators': [50, 100, 200, 300],
        'max_depth': [3, 5],
        'device': ['cuda'],
        'random_state': [0]
    },
    'SVM': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf']
    },
    'LogisticRegression': {
        'C': [0.1, 1, 10],
        'solver': ['lbfgs']
    }
}

# Perform GridSearchCV for each model
best_models = {}
best_scores = {}
for model_name in models:
    grid_search = GridSearchCV(models[model_name], params[model_name], cv=cv_folds_fov, scoring='f1_macro')
    grid_search.fit(cell_type_proportions.loc[meta_per_fov["inner"]], LabelEncoder().fit_transform(meta_per_fov.loc[meta_per_fov["inner"]]["Stage"]))
    best_models[model_name] = grid_search.best_estimator_
    best_scores[model_name] = grid_search.best_score_

# Print the best models and their parameters
for model in best_models.keys():
    print(model, best_models[model])
    print("Best score:", best_scores[model])

```

RandomForest RandomForestClassifier(max_depth=7, n_estimators=50)
Best score: 0.18921113795228467
XGBoost XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device='cuda', early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=50, n_jobs=None,
              num_parallel_tree=None, objective='multi:softprob', ...)
Best score: 0.22902662111995187
SVM SVC(C=10)
Best score: 0.16942377871729714
LogisticRegression LogisticRegression(C=10)
Best sco

In [45]:
def process_fold_composition(train, test, n_estimators = 250):
    """Score an XGBoost stage classifier on cell type composition for one CV fold.

    Parameters
    ----------
    train, test : array-like of int
        Positional indices into the inner rows of ``cell_type_proportions`` /
        ``meta_per_fov`` (as produced by ``cv_folds_fov``).
    n_estimators : int
        Number of boosting rounds of the classifier.

    Returns
    -------
    float
        Macro-averaged F1 score of the predicted stages on the test FOVs.
    """
    logging.getLogger('tensorflow').setLevel(logging.ERROR)
    with warnings.catch_warnings(action="ignore"):
        # Step 1: Define train and test subsets of the compositional data
        inner_fovs = meta_per_fov["inner"]
        inner_composition = cell_type_proportions.loc[inner_fovs]
        inner_stage = meta_per_fov.loc[inner_fovs]["Stage"]
        train_composition = inner_composition.iloc[train]
        test_composition = inner_composition.iloc[test]
        train_meta = inner_stage.iloc[train]
        test_meta = inner_stage.iloc[test]

        # A single encoder is fitted on the training labels and reused for the
        # test labels. Fitting a second encoder on the test labels would silently
        # assign different integer codes whenever a stage is missing from one side.
        stage_encoder = LabelEncoder().fit(train_meta)

        # Step 2: Train a classifier on the training data composition to predict the stage of each FOV
        xgb = XGBClassifier(
            n_estimators=n_estimators,
            max_depth=3,
            device="cuda",
            random_state=0)
        xgb.fit(train_composition, stage_encoder.transform(train_meta))

        # Step 3: Predict stage of each FOV in the test data
        preds = xgb.predict(test_composition)

        # Step 4: Compute f1_score
        score = f1_score(stage_encoder.transform(test_meta),
                         preds,
                         average="macro")
        return score

In [48]:
def test_estimator_composition(estimators):
    """Evaluate the composition model for one number of estimators.

    Runs ``process_fold_composition`` on every fold of ``cv_folds_fov``,
    appends ``estimators,mean_score`` to the model 1 score log, prints the
    pair and returns the mean macro F1 across folds.
    """
    fold_scores = []
    for train, test in cv_folds_fov:
        fold_scores.append(process_fold_composition(train, test, n_estimators=estimators))
    mean_score = np.mean(fold_scores)

    # Append to f1 score log file (no header row is written)
    with open("../../data/model1_cell_composition_f1_scores.txt", "a") as log_file:
        log_file.write(f"{estimators},{mean_score}\n")

    print(estimators, mean_score)
    return mean_score

In [53]:
# Parallelize the hyperparameter loop | ~81mn
estimator_grid = np.arange(50, 1001, 10)
Parallel(n_jobs=28)(
    delayed(test_estimator_composition)(r_estimators)
    for r_estimators in estimator_grid
)

# Ten best settings logged so far (the log file has no header row)
composition_score_log = pd.read_csv(
    "../../data/model1_cell_composition_f1_scores.txt",
    header=None,
    names=["estimators", "score"],
)
composition_score_log.sort_values("score", ascending=False).head(10)

50 0.22646412466862742
60 0.23237111790051587
70 0.22481340528160249
80 0.22185852425545932
90 0.22652141523648733
100 0.22402519230548182
110 0.22149567220861638
120 0.2361872495572352
130 0.2347837120697448
140 0.2352441551717641
150 0.23573434499117085
160 0.2428207545948951
170 0.22214746394352586
180 0.22200453777508428
190 0.22108205101633283
200 0.2199281668690276
210 0.22177034262198053
220 0.21963307242887645
230 0.21963307242887645
240 0.21544310609445325
250 0.21993952678294126
260 0.22110051885712914
270 0.21993952678294126
280 0.21743320607600433
290 0.21662121815978286
300 0.21662121815978286
310 0.21851115554192702
320 0.21772180512993505
330 0.21851115554192702
340 0.21851115554192702
350 0.21641236551696796
360 0.21751295248712016
370 0.21641236551696796
380 0.21751295248712016
390 0.21751295248712016
400 0.21751295248712016
410 0.21672360207512825
420 0.21672360207512825
430 0.21672360207512825
440 0.2165444964133374
450 0.2165444964133374
460 0.2165444964133374
470 0

Unnamed: 0,estimators,score
11,160,0.242821
7,120,0.236187
10,150,0.235734
9,140,0.235244
8,130,0.234784
1,60,0.232371
71,760,0.230606
4,90,0.226521
0,50,0.226464
2,70,0.224813


## Model 2: Metabolic clusters
See *MetabViz.ipynb* for details.

In [12]:
# Cell-level reference: predict the stage of every inner cell from its metabolic
# markers, with folds grouped by FOV
inner_cells = df.loc[epithelial_subset].loc[meta.loc[epithelial_subset]["inner"]]
inner_cell_meta = meta.loc[epithelial_subset].loc[meta["inner"]]
cell_level_model = XGBClassifier(n_estimators=250, max_depth=3, device="cuda", random_state=0)

cross_val_score(cell_level_model,
                inner_cells,
                LabelEncoder().fit_transform(inner_cell_meta["Stage"]),
                groups=inner_cell_meta["fov"],
                cv=cv_folds,
                scoring='f1_macro')

array([0.21137964, 0.31287193, 0.25688164, 0.19952781])

df_per_fov = df.copy()
df_per_fov["fov"] = meta["fov"] 
df_per_fov = df_per_fov.loc[epithelial_subset].loc[meta.loc[epithelial_subset]["inner"]].groupby("fov").mean()

In [14]:
# Ten best hyperparameter combinations recorded in the cluster model score log
cluster_score_log = pd.read_csv("../../data/cluster_f1_scores.txt")
cluster_score_log.sort_values("score", ascending=False).head(10)

Unnamed: 0,resolution,neighbors,estimators,score
153,0.23,5,169,0.369303
68,0.9,5,440,0.360735
56,0.13,11,31,0.355948
352,0.63,3,268,0.354892
140,0.86,4,369,0.35136
47,0.29,2,228,0.350469
48,0.29,2,242,0.348103
172,0.58,2,356,0.345388
315,0.35,9,165,0.34234
271,0.41,6,448,0.341682


In [15]:
# Hyperparameters of the best-scoring row of the cluster score log shown above
res = 0.23  # Leiden resolution passed to process_fold
neighbors = 5  # number of neighbours passed to process_fold
estimators = 169  # number of XGBoost estimators passed to process_fold

In [16]:
def process_fold(train, test, n_neighbors = 30, resolution = 0.5, n_estimators = 250):
    """Score the metabolic-cluster stage classifier on one cross-validation fold.

    Training cells are clustered (Leiden) on their metabolic markers, a kNN
    classifier propagates the clusters to the test cells, and an XGBoost model
    predicts the stage of each FOV from its per-FOV cluster proportions.

    Parameters
    ----------
    train, test : array-like of int
        Positional indices into the inner cells (rows of ``df`` restricted to
        ``epithelial_subset`` and ``inner``), as produced by ``cv_folds.split``.
    n_neighbors : int
        Number of neighbours, used both for the neighbour graph and for the
        kNN cluster propagation.
    resolution : float
        Leiden clustering resolution.
    n_estimators : int
        Number of boosting rounds of the stage classifier.

    Returns
    -------
    float
        Macro-averaged F1 score of the predicted stages on the test FOVs.
    """
    logging.getLogger('tensorflow').setLevel(logging.ERROR)
    with warnings.catch_warnings(action="ignore"):
        # Inner cells and their metadata are selected once and reused below
        inner_cells = df.loc[epithelial_subset].loc[meta.loc[epithelial_subset]["inner"]]
        inner_meta = meta.loc[epithelial_subset].loc[meta["inner"]]
        stage_per_fov = inner_meta.groupby("fov").first()["Stage"]
        train_cells = inner_cells.iloc[train]
        test_cells = inner_cells.iloc[test]

        def fov_cluster_composition(clusters, fovs):
            # Proportion of cells of each cluster within each FOV (rows sum to 1)
            assignments = pd.DataFrame(clusters, columns=["Cluster"])
            assignments["fov"] = fovs
            counts = assignments.groupby("fov")["Cluster"].value_counts().unstack().fillna(0)
            return counts.div(counts.sum(axis=1), axis=0)

        # Step 1: Define metabolic clusters on training data
        # (named `adata` so the `anndata as ad` module alias is not shadowed)
        adata = sc.AnnData(train_cells)
        sc.pp.neighbors(adata, n_neighbors=n_neighbors)
        sc.tl.leiden(adata, resolution=resolution)
        adata.obs["leiden"] = adata.obs["leiden"].values.astype(int)
        train_clusters = adata.obs["leiden"].values

        # Step 2: Define a classifier to propagate the clusters to the test data
        neigh = KNeighborsClassifier(n_neighbors=n_neighbors, n_jobs=-1)
        neigh.fit(train_cells.values, train_clusters)

        # Step 3: Compute proportion of cells in each cluster for each FOV in the training data
        train_fov_cluster_composition = fov_cluster_composition(
            train_clusters, inner_meta.iloc[train]["fov"].values
        )

        # Step 4: Train a classifier on the training data composition to predict the stage of each FOV
        # A single encoder is fitted on the training stages and reused for the
        # test stages, so both sides share the same integer codes even when a
        # stage is missing from one of them.
        train_stage = stage_per_fov.loc[train_fov_cluster_composition.index]
        stage_encoder = LabelEncoder().fit(train_stage)
        xgb = XGBClassifier(
            n_estimators=n_estimators,
            max_depth=3,
            device="cuda",
            random_state=0)
        xgb.fit(train_fov_cluster_composition, stage_encoder.transform(train_stage))

        # Step 5: Predict clusters on test data
        test_clusters = neigh.predict(test_cells.values)

        # Step 6: Compute proportion of cells in each cluster for each FOV in the test data
        test_fov_cluster_composition = fov_cluster_composition(
            test_clusters, inner_meta.iloc[test]["fov"].values
        )

        # Clusters never predicted for the test cells get a proportion of 0, and
        # the columns are put in the same order as in the training set
        test_fov_cluster_composition = test_fov_cluster_composition.reindex(
            columns=train_fov_cluster_composition.columns, fill_value=0
        )

        # Step 7: Predict stage of each FOV in the test data
        preds = xgb.predict(test_fov_cluster_composition)

        # Step 8: Compute f1_score
        test_stage = stage_per_fov.loc[test_fov_cluster_composition.index]
        score = f1_score(stage_encoder.transform(test_stage),
                         preds,
                         average="macro")
        return score
    
# Function to process a single set of hyperparameters
def process_hyperparameters(r_resolution, r_neighbors, r_estimators):
    """Cross-validate one hyperparameter combination of the cluster model and log it.

    Parameters
    ----------
    r_resolution : float
        Leiden resolution forwarded to ``process_fold``.
    r_neighbors : int or float
        Number of neighbours; cast to ``int`` before use.
    r_estimators : int
        Number of XGBoost estimators forwarded to ``process_fold``.

    Returns
    -------
    float
        Mean macro F1 across the folds of ``cv_folds``. The same value is
        appended to ``../../data/cluster_f1_scores.txt``.
    """
    r_neighbors = int(r_neighbors)
    print(r_resolution, r_neighbors, r_estimators)

    inner_cells = df.loc[epithelial_subset].loc[meta.loc[epithelial_subset]["inner"]]
    inner_meta = meta.loc[epithelial_subset].loc[meta["inner"]]

    scores = Parallel(n_jobs=1)(delayed(process_fold)(train,
                                                       test,
                                                       n_neighbors=r_neighbors,
                                                       resolution=r_resolution,
                                                       n_estimators=r_estimators)
                                 for train, test in cv_folds.split(
        inner_cells,
        inner_meta["Stage"],
        groups=inner_meta["fov"]
    ))
    mean_score = np.mean(scores)
    print("[Results]  ", r_resolution, r_neighbors, r_estimators, mean_score)

    # The log is only ever opened in append mode. Creating the header with mode
    # "w" could truncate rows that another worker appended between the existence
    # check and the write.
    log_path = "../../data/cluster_f1_scores.txt"
    needs_header = not os.path.isfile(log_path) or os.path.getsize(log_path) == 0
    with open(log_path, "a") as f:
        if needs_header:
            f.write("resolution,neighbors,estimators,score\n")
        # Append to f1 score log file
        f.write(f"{r_resolution},{r_neighbors},{r_estimators},{mean_score}\n")

    # Returned for consistency with test_estimator_composition
    return mean_score

In [17]:
# Re-run the selected hyperparameters; this should confirm the model selection estimates
inner_cells = df.loc[epithelial_subset].loc[meta.loc[epithelial_subset]["inner"]]
inner_cell_meta = meta.loc[epithelial_subset].loc[meta["inner"]]

fold_jobs = (
    delayed(process_fold)(train, test, n_neighbors=neighbors, resolution=res, n_estimators=estimators)
    for train, test in cv_folds.split(inner_cells,
                                      inner_cell_meta["Stage"],
                                      groups=inner_cell_meta["fov"])
)
scores = Parallel(n_jobs=-1)(fold_jobs)

print(np.mean(scores))

2025-01-15 17:29:30.000384: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-15 17:29:30.022882: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-01-15 17:29:30.174898: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-15 17:29:30.195231: I tensorflow/core/platform/cpu_feature_g

0.36930323414085386


## Model 3: MISTy cell type features
Will need to be re-run with `zoi` set up.

In [19]:
# MISTy lineage features; the first CSV column is used as the index
misty_features = pd.read_csv("../../data/misty_lineage_features.csv", index_col=0)
# Subset to top 100 most common features
# (columns ranked by the number of rows with a value > 0)
# NOTE(review): the subset is only displayed, it is not assigned, so
# `misty_features` keeps all of its columns for the cells below.
misty_features.iloc[:,np.argsort(-np.sum(misty_features > 0))[:100]]

Unnamed: 0_level_0,juxtaview.40_l.Cancer_cell_Cancer_cell,juxtaview.40_l.Endothelial_cell_Endothelial_cell,paraview.100_p.Endothelial_cell_Endothelial_cell,paraview.100_p.CAF_CAF,paraview.100_p.Cancer_cell_Cancer_cell,juxtaview.40_l.CAF_CAF,paraview.100_p.Monocyte_Monocyte,juxtaview.40_l.Monocyte_Monocyte,paraview.100_p.Other_immune_cell_Other_immune_cell,paraview.100_p.CD68_Macrophage_CD68_Macrophage,...,paraview.100_p.Endothelial_cell_CD68_Macrophage,paraview.100_p.Other_immune_cell_CD163_Macrophage,paraview.100_p.CAF_Monocyte,juxtaview.40_l.Other_immune_cell_CD8_Tcell,paraview.100_p.CAF_CD4_Tcell,paraview.100_p.Monocyte_Endothelial_cell,paraview.100_p.CD4_Tcell_Monocyte,paraview.100_p.CAF_CD68_Macrophage,paraview.100_p.Cancer_cell_CD4_Tcell,paraview.100_p.CD4_Tcell_Other_immune_cell
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1a,3.241289,1.869391,1.735802,1.214244,1.510572,2.906951,0.000000,0.000000,1.686409,0.000000,...,0.000000,0.0,0.0,0.000000,0.0,0.000000,1.588098,0.0,0.000000,0.000000
A1c,3.372725,3.330195,1.714173,1.193790,2.344175,3.171733,0.937414,0.000000,2.217521,1.431484,...,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000
A1d,3.140940,2.861029,2.328312,1.404295,2.072094,2.471668,1.100926,0.000000,2.132901,0.000000,...,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000
A1e,2.847571,3.204835,1.515047,1.959971,1.997507,3.088748,3.116230,1.486980,0.000000,0.000000,...,1.383975,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000
A1f,2.902234,3.088106,0.000000,2.361394,1.301067,1.574763,1.309559,1.584863,0.000000,2.593865,...,0.000000,0.0,0.0,0.942197,0.0,1.442211,1.223243,0.0,1.210359,2.400164
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
E3c,3.314948,3.053053,1.040383,2.578936,2.574410,3.453385,1.671570,2.109663,3.133494,2.190319,...,0.000000,0.0,0.0,0.000000,0.0,1.916585,0.000000,0.0,0.000000,0.000000
E3e,1.312875,1.099634,0.000000,0.000000,0.000000,0.000000,1.214136,2.492264,2.139975,0.000000,...,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000
E4a,3.246438,2.904361,1.910934,1.010343,0.000000,1.716840,2.164491,2.737081,1.948477,0.000000,...,0.000000,0.0,0.0,0.973520,0.0,0.000000,0.000000,0.0,0.000000,0.000000
E4d,2.924364,1.503004,1.256012,0.000000,0.984475,0.000000,1.809912,2.678005,0.000000,0.000000,...,2.294258,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000


In [None]:
def process_fold_misty(train, test, n_estimators = 250):
    """Score an XGBoost stage classifier on the rows of ``df_misty`` for one CV fold.

    NOTE(review): ``df_misty`` is built in a later cell ("MISTy duplicated per
    cell"), which must be run before this function is called. Its rows are
    cells, so the classifier is trained and scored per cell, not per FOV.

    Parameters
    ----------
    train, test : array-like of int
        Positional indices into the inner rows, as produced by ``cv_folds.split``.
    n_estimators : int
        Number of boosting rounds of the classifier.

    Returns
    -------
    float
        Macro-averaged F1 score on the test rows.
    """
    logging.getLogger('tensorflow').setLevel(logging.ERROR)
    with warnings.catch_warnings(action="ignore"):
        # Inner rows of the feature table and their stage labels
        inner_features = df_misty.loc[epithelial_subset].loc[meta.loc[epithelial_subset]["inner"].to_list()]
        inner_stage = meta.loc[epithelial_subset].loc[meta["inner"]]["Stage"]

        # `train_df` was previously never defined in this function (NameError);
        # it is built the same way as `test_df`
        train_df = inner_features.iloc[train]
        test_df = inner_features.iloc[test]

        # One encoder fitted on the training labels and reused for the test
        # labels, so both sides share the same integer codes
        stage_encoder = LabelEncoder().fit(inner_stage.iloc[train])

        # Train a classifier on the training rows to predict the stage
        xgb = XGBClassifier(
            n_estimators=n_estimators,
            max_depth=3,
            device="cuda",
            random_state=0)
        xgb.fit(train_df, stage_encoder.transform(inner_stage.iloc[train]))

        # Predict the stage of the test rows
        preds = xgb.predict(test_df)

        # Compute f1_score
        score = f1_score(stage_encoder.transform(inner_stage.iloc[test]),
                         preds,
                         average="macro")
        return score

## Model 4: Kasumi metabolic features

## Model 5: Morphological features

## Other tests

### MISTy duplicated per cell

In [20]:
# Join meta and misty features with FOV alignment: every cell receives the
# MISTy features of its FOV (NaN when the FOV has no MISTy features)
misty_features_per_cell = (
    meta
    .set_index("fov")
    .join(misty_features, how="left")
    .drop(["Stage", "inner"], axis="columns")
)

# The concatenation below is positional (both indices are reset), so the join
# must yield exactly one row per cell. A duplicated FOV in `misty_features`
# would multiply rows and silently misalign cells and features.
assert len(misty_features_per_cell) == len(df), \
    "misty_features_per_cell must have exactly one row per cell of df"

# Combine with original dataframe
df_misty = pd.concat(
    [df.reset_index(drop=True),
     misty_features_per_cell.reset_index(drop=True)],
    axis=1
    )

In [21]:
# Preview of the combined table: metabolic markers followed by the per-cell MISTy features
df_misty.head()

Unnamed: 0,CA9,CD98,CytC,MCT1,ASCT2,LDH,GS,GLS,ATP5A,CS,...,juxtaview.40_l.NK_cell_B_cell,juxtaview.40_l.CD68_Macrophage_B_cell,juxtaview.40_l.CD8_Tcell_B_cell,juxtaview.40_l.Endothelial_cell_B_cell,juxtaview.40_l.CD4_Tcell_B_cell,juxtaview.40_l.Monocyte_B_cell,juxtaview.40_l.CD163_Macrophage_B_cell,juxtaview.40_l.APC_B_cell,juxtaview.40_l.T_reg_cell_B_cell,juxtaview.40_l.B_cell_B_cell
0,0.021289,0.006393,0.015713,0.000193,0.045864,0.013713,0.02407,0.04426,0.089495,0.097397,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.014578,0.008831,0.106513,0.002774,0.028734,0.002855,0.024809,0.061641,0.063325,0.079685,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.113299,0.034138,0.011429,0.001506,0.02826,0.013809,0.009312,0.022929,0.006431,0.04586,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.128277,0.040913,0.030368,0.00528,0.034091,0.032231,0.008779,0.032329,0.026225,0.063046,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.107682,0.07013,0.098551,0.020924,0.117576,0.035627,0.028709,0.05507,0.125172,0.104851,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# (rows, columns) of the combined marker + MISTy table
df_misty.shape

(262188, 398)

In [None]:
# (rows, columns) of the MISTy feature table as loaded from the CSV
misty_features.shape

In [None]:
# (rows, columns) after joining the MISTy features onto the rows of `meta`
misty_features_per_cell.shape

In [None]:
# FOVs present in `meta` that have no row in `misty_features`
set(meta.fov.unique()) - set(misty_features.index.to_list()) # SCT samples are listed in meta

In [None]:
# FOVs that have MISTy features but are absent from `meta`
# NOTE(review): presumably the FOVs filtered out of `meta` earlier (no or too few
# epithelial cells) — TODO confirm
set(misty_features.index.to_list()) - set(meta.fov.unique()) # Why?

In [None]:
# Number of distinct FOVs in `meta`
meta.fov.unique().shape

In [None]:
# FOVs that have MISTy features but are absent from `meta` (same check as two cells earlier)
set(misty_features.index.to_list()) - set(meta.fov.unique())

In [17]:
def process_fold_misty(train, test, n_estimators = 250):
    """Score an XGBoost stage classifier on the rows of ``df_misty`` for one CV fold.

    The rows of ``df_misty`` are cells (metabolic markers plus the MISTy
    features of the cell's FOV), so the classifier is trained and scored per
    cell, not per FOV.

    NOTE(review): this redefines a function of the same name declared earlier
    in the notebook; whichever definition ran last is the one in effect.

    Parameters
    ----------
    train, test : array-like of int
        Positional indices into the inner rows, as produced by ``cv_folds.split``.
    n_estimators : int
        Number of boosting rounds of the classifier.

    Returns
    -------
    float
        Macro-averaged F1 score on the test rows.
    """
    logging.getLogger('tensorflow').setLevel(logging.ERROR)
    with warnings.catch_warnings(action="ignore"):
        # Inner rows of the feature table and their stage labels
        inner_features = df_misty.loc[epithelial_subset].loc[meta.loc[epithelial_subset]["inner"].to_list()]
        inner_stage = meta.loc[epithelial_subset].loc[meta["inner"]]["Stage"]
        train_df = inner_features.iloc[train]
        test_df = inner_features.iloc[test]

        # One encoder fitted on the training labels and reused for the test
        # labels. Two independent encoders would silently assign different
        # integer codes whenever a stage is missing from one side.
        stage_encoder = LabelEncoder().fit(inner_stage.iloc[train])

        # Train a classifier on the training rows to predict the stage
        xgb = XGBClassifier(
            n_estimators=n_estimators,
            max_depth=3,
            device="cuda",
            random_state=0)
        xgb.fit(train_df, stage_encoder.transform(inner_stage.iloc[train]))

        # Predict the stage of the test rows
        preds = xgb.predict(test_df)

        # Compute f1_score
        score = f1_score(stage_encoder.transform(inner_stage.iloc[test]),
                         preds,
                         average="macro")
        return score

In [118]:
# Run every fold once with the default settings (the scores are discarded)
misty_inner_cells = df_misty.loc[epithelial_subset].loc[meta.loc[epithelial_subset]["inner"].to_list()]
inner_cell_meta = meta.loc[epithelial_subset].loc[meta["inner"]]

misty_folds = cv_folds.split(misty_inner_cells,
                             inner_cell_meta["Stage"],
                             groups=inner_cell_meta["fov"])
for train, test in misty_folds:
    process_fold_misty(train, test)

In [None]:
# Score the marker + MISTy model on every fold in parallel, using `estimators`
misty_inner_cells = df_misty.loc[epithelial_subset].loc[meta.loc[epithelial_subset]["inner"].to_list()]
inner_cell_meta = meta.loc[epithelial_subset].loc[meta["inner"]]

misty_fold_jobs = (
    delayed(process_fold_misty)(train, test, n_estimators=estimators)
    for train, test in cv_folds.split(misty_inner_cells,
                                      inner_cell_meta["Stage"],
                                      groups=inner_cell_meta["fov"])
)
scores = Parallel(n_jobs=-1)(misty_fold_jobs)

print(np.mean(scores))

In [None]:
# Same evaluation run sequentially, with 300 estimators
misty_inner_cells = df_misty.loc[epithelial_subset].loc[meta.loc[epithelial_subset]["inner"].to_list()]
inner_cell_meta = meta.loc[epithelial_subset].loc[meta["inner"]]

scores = []
for train, test in cv_folds.split(misty_inner_cells,
                                  inner_cell_meta["Stage"],
                                  groups=inner_cell_meta["fov"]):
    scores.append(process_fold_misty(train, test, n_estimators=300))

print(np.mean(scores))

In [132]:
def process_fold_misty_features_per_cell(train, test, n_estimators = 250):
    """Train and score one cross-validation fold on `misty_features_per_cell`.

    Rows are first restricted with `epithelial_subset` and then with the
    "inner" flag of `meta`; `train` and `test` index into that restricted
    table by position.

    Args:
        train: Positional indices (used with `.iloc`) of the training rows.
        test: Positional indices (used with `.iloc`) of the held-out rows.
        n_estimators: Number of boosting rounds passed to `XGBClassifier`.

    Returns:
        Macro-averaged F1 score of the stage predictions on the test rows.
    """
    logging.getLogger('tensorflow').setLevel(logging.ERROR)
    with warnings.catch_warnings(action="ignore"):
        # Features and labels are filtered with the same two masks so that the
        # positional fold indices address the same rows in both.
        inner_rows = meta.loc[epithelial_subset]["inner"].to_list()
        features = misty_features_per_cell.loc[epithelial_subset].loc[inner_rows]
        stages = meta.loc[epithelial_subset].loc[inner_rows]["Stage"]

        # A single encoder, fitted on the training labels only, gives XGBoost the
        # consecutive integer classes it expects.
        encoder = LabelEncoder()
        train_labels = encoder.fit_transform(stages.iloc[train])

        # Train a classifier on the training rows to predict the stage
        xgb = XGBClassifier(
            n_estimators=n_estimators,
            max_depth=3,
            device="cuda",
            random_state=0)
        xgb.fit(features.iloc[train], train_labels)

        # Predict the test rows and map the integer classes back to stage names
        # with the *training* encoder. Fitting a second encoder on the test
        # labels would shift the codes whenever a stage is missing from one of
        # the two splits, silently scoring predictions against the wrong stage.
        preds = encoder.inverse_transform(xgb.predict(features.iloc[test]))

        # Macro F1 on the original stage labels; a stage that only occurs in the
        # test split simply contributes an F1 of 0 instead of misaligning codes.
        score = f1_score(stages.iloc[test].to_numpy(),
                         preds,
                         average="macro")
        return score

In [None]:
# CV run of the per-cell MISTy feature model with 220 boosting rounds; prints the mean macro F1.
epi_meta = meta.loc[epithelial_subset]
epi_inner_meta = epi_meta.loc[meta["inner"]]
per_cell_inner = misty_features_per_cell.loc[epithelial_subset].loc[epi_meta["inner"].to_list()]

scores = [
    process_fold_misty_features_per_cell(train, test, n_estimators=220)
    for train, test in cv_folds.split(
        per_cell_inner,
        epi_inner_meta["Stage"],
        groups=epi_inner_meta["fov"],
    )
]

print(np.mean(scores))

In [None]:
# Coarse sweep over the number of boosting rounds (50, 250, ..., 850).
# The feature table, labels and groups do not depend on the swept value,
# so they are built once before the loop.
epi_meta = meta.loc[epithelial_subset]
epi_inner_meta = epi_meta.loc[meta["inner"]]
per_cell_inner = misty_features_per_cell.loc[epithelial_subset].loc[epi_meta["inner"].to_list()]

for estimators in np.arange(50, 1001, 200):
    scores = [
        process_fold_misty_features_per_cell(train, test, n_estimators=estimators)
        for train, test in cv_folds.split(
            per_cell_inner,
            epi_inner_meta["Stage"],
            groups=epi_inner_meta["fov"],
        )
    ]
    mean_score = np.mean(scores)
    print(estimators, mean_score)
    # Append "<n_estimators>,<mean F1>" to the score log file
    with open("../../data/misty_f1_scores.txt", "a") as log_file:
        log_file.write(f"{estimators},{mean_score}\n")

In [164]:
def test_estimator_misty_features_per_cell(estimators):
    """Cross-validate the per-cell MISTy model for one `n_estimators` value.

    The mean of the per-fold scores is appended as "<estimators>,<mean>" to
    ../../data/misty_f1_scores.txt, printed, and returned.
    """
    epi_meta = meta.loc[epithelial_subset]
    epi_inner_meta = epi_meta.loc[meta["inner"]]
    per_cell_inner = misty_features_per_cell.loc[epithelial_subset].loc[epi_meta["inner"].to_list()]

    fold_scores = [
        process_fold_misty_features_per_cell(fold_train, fold_test, n_estimators=estimators)
        for fold_train, fold_test in cv_folds.split(
            per_cell_inner,
            epi_inner_meta["Stage"],
            groups=epi_inner_meta["fov"],
        )
    ]
    mean_score = np.mean(fold_scores)

    # Log first, then echo to the notebook output
    with open("../../data/misty_f1_scores.txt", "a") as log_file:
        log_file.write(f"{estimators},{mean_score}\n")

    print(estimators, mean_score)
    return mean_score

In [None]:
# Fine-grained n_estimators sweep (50 to 1000 in steps of 5) spread over 28 workers | ~81mn
Parallel(n_jobs=28)(
    map(delayed(test_estimator_misty_features_per_cell), np.arange(50, 1001, 5))
)

In [None]:
# Ten best n_estimators settings recorded in the score log, highest score first
(
    pd.read_csv(
        "../../data/misty_f1_scores.txt",
        header=None,
        names=["estimators", "score"],
    )
    .sort_values("score", ascending=False)
    .head(10)
)

In [149]:
def transform_features(df, threshold=0.8):
    """Greedily keep a set of columns that are not strongly correlated.

    Columns are visited in decreasing order of their coefficient of variation
    (standard deviation divided by mean). A visited column is kept unless an
    already kept column has an absolute correlation with it above `threshold`.

    Args:
        df: DataFrame whose columns are the candidate features.
        threshold: Absolute correlation above which a column is treated as
            redundant with a kept one.

    Returns:
        `df` restricted to the kept columns, in the order they were selected.
    """
    # Absolute pairwise correlations between all columns
    abs_corr = df.corr().abs()

    # Visit order: highest coefficient of variation first
    ranking = (df.std() / df.mean()).sort_values(ascending=False).index

    kept = []
    excluded = set()
    for name in ranking:
        if name in excluded:
            continue
        kept.append(name)
        # Everything too similar to the column just kept (itself included) is ruled out
        too_similar = abs_corr[name] > threshold
        excluded.update(abs_corr.index[too_similar].tolist())

    return df[kept]

# Reduced per-cell MISTy feature table: transform_features drops every column whose
# absolute correlation with an already selected column exceeds 0.5.
uncorr_misty_features_per_cell = transform_features(misty_features_per_cell, threshold=0.5)

In [151]:
def process_fold_uncorr_misty_features_per_cell(train, test, n_estimators = 250):
    """Train and score one cross-validation fold on `uncorr_misty_features_per_cell`.

    Rows are first restricted with `epithelial_subset` and then with the
    "inner" flag of `meta`; `train` and `test` index into that restricted
    table by position.

    Args:
        train: Positional indices (used with `.iloc`) of the training rows.
        test: Positional indices (used with `.iloc`) of the held-out rows.
        n_estimators: Number of boosting rounds passed to `XGBClassifier`.

    Returns:
        Macro-averaged F1 score of the stage predictions on the test rows.
    """
    logging.getLogger('tensorflow').setLevel(logging.ERROR)
    with warnings.catch_warnings(action="ignore"):
        # Features and labels are filtered with the same two masks so that the
        # positional fold indices address the same rows in both.
        inner_rows = meta.loc[epithelial_subset]["inner"].to_list()
        features = uncorr_misty_features_per_cell.loc[epithelial_subset].loc[inner_rows]
        stages = meta.loc[epithelial_subset].loc[inner_rows]["Stage"]

        # A single encoder, fitted on the training labels only, gives XGBoost the
        # consecutive integer classes it expects.
        encoder = LabelEncoder()
        train_labels = encoder.fit_transform(stages.iloc[train])

        # Train a classifier on the training rows to predict the stage
        xgb = XGBClassifier(
            n_estimators=n_estimators,
            max_depth=3,
            device="cuda",
            random_state=0)
        xgb.fit(features.iloc[train], train_labels)

        # Predict the test rows and map the integer classes back to stage names
        # with the *training* encoder. Fitting a second encoder on the test
        # labels would shift the codes whenever a stage is missing from one of
        # the two splits, silently scoring predictions against the wrong stage.
        preds = encoder.inverse_transform(xgb.predict(features.iloc[test]))

        # Macro F1 on the original stage labels; a stage that only occurs in the
        # test split simply contributes an F1 of 0 instead of misaligning codes.
        score = f1_score(stages.iloc[test].to_numpy(),
                         preds,
                         average="macro")
        return score

In [None]:
# CV run of the correlation-filtered per-cell MISTy features with 300 boosting rounds;
# prints the mean macro F1.
epi_meta = meta.loc[epithelial_subset]
epi_inner_meta = epi_meta.loc[meta["inner"]]
uncorr_inner = uncorr_misty_features_per_cell.loc[epithelial_subset].loc[epi_meta["inner"].to_list()]

scores = [
    process_fold_uncorr_misty_features_per_cell(train, test, n_estimators=300)
    for train, test in cv_folds.split(
        uncorr_inner,
        epi_inner_meta["Stage"],
        groups=epi_inner_meta["fov"],
    )
]

print(np.mean(scores))

In [4]:
# Load the AnnData object from the consensus cell type .h5ad file
adata = ad.read_h5ad('../../data/adata_consensus_cell_types.h5ad')

In [5]:
# Marker columns summarised per cell type below (order is preserved in the resulting table)
all_functional_and_metab = [
    "CA9", "CD98", "CytC", "MSH2", "MCT1", "ASCT2",
    "LDH", "STING1", "GS", "GLS", "ATP5A", "CS",
    "PKM2", "GLUT1", "MSH6", "ARG1", "CPT1A", "Ki67",
]

In [6]:
# Group rows by cell type and compute median expression
# (after the transpose: rows = markers, columns = cell types).
# The "cell_type" column is added with .assign() inside one chain, so nothing is
# ever written onto a frame sliced out of adata.obs (the chained-assignment
# pattern that triggers pandas' SettingWithCopyWarning).
df = (
    adata.obs.loc[:, all_functional_and_metab]
    .assign(cell_type=adata.obs["annotation_consensus"].values)
    .groupby("cell_type")
    .median()
    .T
    # The "Unclear" cell-type column is left out of the summary
    .drop("Unclear", axis=1)
)

In [7]:
# Get tumor stage: attach the clinical table to every cell via its FOV.
# The clinical table is indexed by its third column, which is matched against obs["fov"].
clini = pd.read_csv(
    "../../data/summary_clinical_data_modified.csv",
    index_col=2,
)
adata.obs = pd.merge(
    adata.obs,
    clini,
    how="left",
    left_on="fov",
    right_index=True,
)