# Explore annotated cell types
Corresponds to Fig. 2 and Supplementary Fig. 2 in the draft.

In [None]:
import os
import numpy as np
import anndata as ad
import pandas as pd
import scanpy as sc
from plotnine import *

In [None]:
# Load the AnnData object holding the consensus cell-type annotations
adata = ad.read_h5ad('../../data/adata_consensus_cell_types.h5ad')

In [None]:
# List the variable names (the markers plotted and ordered further below)
adata.var_names

In [None]:
# Dimensions of the expression matrix
adata.X.shape

In [None]:
# NOTE(review): hard-coded numbers; presumably total cell count minus the
# number of "Unclear" cells — confirm, and derive from adata rather than literals.
579011 - 90917

In [None]:
# Number of observations per consensus annotation label
adata.obs.annotation_consensus.value_counts()

In [None]:
# Marker list assembled from two hand-written groups; duplicates between the
# groups (e.g. CD8, CD7, MPO, CD68, CD163) are deliberately kept as-is.
markers_t = [
    *('SMA', 'CD31', 'CD163', 'CD68', 'CD8', 'CD45', 'PanCK', 'MPO', 'CD7'),
    *("CD3e", "CD8", 'CD7', "CD14", "MPO", 'CD20', 'CD68', "CD163", "HLADRa"),
]

## Display clusters

In [None]:
# Global scanpy configuration used by all plots below
sc.settings.verbosity = 1  # verbosity: errors (0), warnings (1), info (2), hints (3)
# White background and 150 dpi for every scanpy figure
sc.settings.set_figure_params(dpi=150, facecolor="white")

In [None]:
# Build the neighbourhood graph, then compute the UMAP embedding from it.
# NOTE(review): the heatmap cell below asserts 15 markers in adata.var_names,
# which is fewer than n_pcs=40 — confirm which representation scanpy actually uses.
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
sc.tl.umap(adata)

In [None]:
# Display all markers used to generate the UMAP, one panel per marker.
# Note that scanpy does not save to the exact path provided: it enforces its
# own relative directory structure and file-name prefix.
sc.pl.umap(adata, color=adata.var_names, color_map="plasma_r", save='/fig2/umap_lineage_markers.pdf')

In [None]:
# Display curated cell types (including the "Unclear" label)
sc.pl.umap(adata, color="annotation_consensus")

In [None]:
# Display curated cell types, excluding unclear cells.
# NOTE(review): the filter uses the obs column "consensus" while the colouring
# uses "annotation_consensus" — confirm both columns exist and agree on "Unclear".
sc.pl.umap(adata[adata.obs["consensus"] != "Unclear"], color="annotation_consensus", save='/fig2/umap_types.pdf')

In [None]:
# Display the expression levels in each curated cell type.
# standard_scale="var" rescales every marker across the groups.
sc.pl.dotplot(adata, adata.var_names, groupby="annotation_consensus", dendrogram=False, standard_scale="var")

In [None]:
# Heatmap of median marker expression per curated cell type.
# Group rows by cell type and compute median expression
# (assumes adata.X is a dense array — TODO confirm)
df = pd.DataFrame(adata.X, columns=adata.var_names)
df["cell_type"] = adata.obs["annotation_consensus"].values
# After the transpose: rows are markers, columns are cell types
df = df.groupby("cell_type").median().T
# Note: z-score transformations just look more noisy
# Drop unclear cells for figure (actually low for all markers)
df = df.drop("Unclear", axis=1)

# Manual rearrangement of cell types and markers
# to display positive markers on the diagonal
cell_types_ordered = [
    'T_reg_cell', 'CD4_Tcell', 'CD8_Tcell', 'NK_cell',
    'B_cell', 'CD68_Macrophage', 'CD163_Macrophage', 
    'Other_immune_cell', 'Monocyte', 'APC', 
    "Neutrophil", "Cancer_cell", "Endothelial_cell", "CAF",

]
markers_ordered = [
    "SMA", "CD31",  "PanCK", "MPO",
    'HLADRa','CD14', 'CD163', 'CD68', 'CD20',  
    'CD8', "CD4", 'FoxP3', 'CD7', "CD3e", "CD45",
]
# Guard against the manual orderings drifting out of sync with the data
assert set(df.index) == set(markers_ordered)
assert set(df.columns) == set(cell_types_ordered)
# Reorder the DataFrame: long format with one row per (marker, cell type);
# the former row index (markers) ends up in the column named "index"
df_ordered = df.melt(ignore_index=False).reset_index()
# Ordered categoricals fix the axis order in the plot
df_ordered["cell_type"] = pd.Categorical(df_ordered["cell_type"], categories=cell_types_ordered, ordered=True)
df_ordered["index"] = pd.Categorical(df_ordered["index"], categories=markers_ordered, ordered=True)

# Display as heatmap
gp = (
    ggplot(df_ordered, aes(x="index", y="cell_type")) 
    + geom_tile(aes(fill="value")) 
    + theme_classic() 
    + theme(axis_text_x=element_text(angle=90)) 
    + labs(y="Cell type", x="Lineage marker", fill="Median\nexpression") 
    # + coord_equal()
    # Use reverted plasma color palette
    + scale_fill_gradientn(colors = ["#EFF822", "#CC4977","#0F0782"])
)
ggsave(gp, "../../figures/fig2/heatmap_lineage_markers.pdf", width = 6, height = 4)
gp

In [None]:
from plotnine import *
import matplotlib as mpl
# Colours of matplotlib's default property cycle, reused as the fill palette
# for the per-cell-type plots below.
colors = [cycle_entry["color"] for cycle_entry in mpl.rcParams["axes.prop_cycle"]]

In [None]:
# One half-violin plot per marker, showing expression by cell type.
# NOTE(review): filters on obs["consensus"] but plots "annotation_consensus" — confirm.
sub_ad = adata[adata.obs["consensus"] != "Unclear"]
# From here on sub_ad is a plain DataFrame: expression columns next to obs columns
sub_ad = pd.concat([sub_ad.to_df(), sub_ad.obs], axis = 1)
for marker in adata.var_names:
    gp = (ggplot(sub_ad, aes(x = "annotation_consensus", fill = "annotation_consensus", y = marker)) 
    + geom_violin(style="right", scale="width", width = 1.25) 
    # Use stat_summary to compute and plot the median for each group
    + stat_summary(fun_y=np.median, geom='point', color='white', size=2)
    + theme_classic() 
    + coord_flip()
    # + coord_fixed()
    + scale_fill_manual(values = colors)
    + theme(legend_position = "none")
    + labs(y = f"{marker} expression", x = "Cell type"))
    # One PDF per marker
    ggsave(gp, f"../../figures/fig2/violin_{marker}_celltype.pdf", width = 3.5, height = 3.5)

In [None]:
# Observation counts per cell type (these define the ordering used in the next cell)
adata.obs.annotation_consensus.value_counts()

In [None]:
# Re-order cell types by abundance (value_counts sorts in descending order);
# relies on "annotation_consensus" being a categorical column (uses .cat)
adata.obs["ordered_annotation_consensus"] = adata.obs["annotation_consensus"].cat.reorder_categories(
    adata.obs["annotation_consensus"].value_counts().index.to_list(),
    ordered=True
)

# Display as bar chart: the x position follows the abundance ordering,
# the fill colour follows the original category order

gp = (
    ggplot(adata.obs, aes(x="ordered_annotation_consensus", fill="annotation_consensus")) 
    + geom_bar() 
    + theme_classic() 
    + scale_fill_manual(values = colors)
    + coord_flip()
    # + scale_y_log10()
    + labs(x="Cell type", y="Number of cells")
    + guides(fill=False)
)
ggsave(gp, "../../figures/fig2/absolute_cell_counts.pdf", width = 3, height = 4)
gp

In [None]:
# Create a new data frame with the proportion of each curated cell type per FOV
# (normalize="index" makes every FOV row sum to 1)
proportion_per_fov = pd.crosstab(adata.obs["fov"], adata.obs["annotation_consensus"], normalize="index")

# Sort the FOVs by the proportion of "Cancer_cell" in descending order.
# Only the category order changes; the rows themselves are not physically reordered.
proportion_per_fov.index = pd.CategoricalIndex(proportion_per_fov.index, 
                                          categories=proportion_per_fov.loc[:, "Cancer_cell"].sort_values(ascending=False).index.to_list(),
                                          ordered=True)

# Convert to long format: one row per (FOV, cell type)
proportion_per_fov_long = proportion_per_fov.melt(var_name="annotation_consensus", value_name="proportion", ignore_index=False).reset_index()

# Display the proportion of each curated cell type per FOV as stacked barplots
gp = (
    ggplot(proportion_per_fov_long, aes(x="fov", y="proportion", fill="annotation_consensus")) +
    geom_bar(stat="identity") +
    coord_flip() +
    theme_minimal() +
    labs(x="Field of view", y="Proportion", fill="Cell type", title="Cell type composition per FOV") +
    theme(axis_text_y=element_blank(), axis_title_x=element_text(margin={'t': 10}))
)

# ggsave(gp, "proportion_per_fov.pdf")
gp

In [None]:
# Convert the cell-type column to a categorical
proportion_per_fov_long.annotation_consensus = pd.Categorical(proportion_per_fov_long.annotation_consensus)
# Display only: the result is not assigned back, so the column itself is unchanged
proportion_per_fov_long.annotation_consensus.cat.remove_unused_categories()

In [None]:
# Repeat the same analysis but excluding the "Unclear" labelled cells
clear_obs = adata.obs.loc[adata.obs.annotation_consensus != "Unclear"]

# Create a new data frame with the proportion of each curated cell type per FOV
# (proportions are now relative to the non-"Unclear" cells of each FOV)
proportion_per_fov = pd.crosstab(clear_obs["fov"], clear_obs["annotation_consensus"], normalize="index")

# Sort the FOVs by the proportion of "Cancer_cell" in descending order
proportion_per_fov.index = pd.CategoricalIndex(proportion_per_fov.index, 
                                          categories=proportion_per_fov.loc[:, "Cancer_cell"].sort_values(ascending=False).index.to_list(),
                                          ordered=True)

# Convert to long format
proportion_per_fov_long = proportion_per_fov.melt(var_name="annotation_consensus", value_name="proportion", ignore_index=False).reset_index()
# Drop unused levels
proportion_per_fov_long.annotation_consensus = pd.Categorical(proportion_per_fov_long.annotation_consensus)
proportion_per_fov_long.annotation_consensus = proportion_per_fov_long.annotation_consensus.cat.remove_unused_categories()

# Display the proportion of each curated cell type per FOV as stacked barplots
# NOTE(review): xlim=[0,1] is applied to the discrete FOV axis — confirm it does
# not crop the bars to the first category.
gp = (
    ggplot(proportion_per_fov_long, aes(x="fov", y="proportion", fill="annotation_consensus")) +
    geom_bar(stat="identity") +
    coord_flip(ylim = [0, 1], xlim = [0,1]) +
    theme_minimal() +
    scale_fill_manual(values = colors) +
    labs(x="Sample", y="Proportion", fill="Cell type") +
    theme(axis_text_y=element_blank(), axis_title_x=element_text(margin={'t': 10}), panel_background = element_blank())
)

# ggsave(gp, "figures/fig2/proportion_per_fov_no_unclear.pdf")
gp



In [None]:
# Now, we want to split FOVs based on whether they correspond to patients or healthy donors
# The third CSV column becomes the index; presumably it holds the FOV identifier
# (it is matched against "fov" in the merge below) — verify against the file.
clini = pd.read_csv("../../data/summary_clinical_data_modified.csv", index_col=2)

# Convert to long format
proportion_per_fov_long = proportion_per_fov.melt(var_name="annotation_consensus", value_name="proportion", ignore_index=False).reset_index()
# Drop unused levels
proportion_per_fov_long.annotation_consensus = pd.Categorical(proportion_per_fov_long.annotation_consensus)
proportion_per_fov_long.annotation_consensus = proportion_per_fov_long.annotation_consensus.cat.remove_unused_categories()
# Store the original categorical information
proportion_per_fov_long.fov = pd.Categorical(proportion_per_fov_long.fov)
original_categories = proportion_per_fov_long.fov.cat.categories
original_ordered_status = proportion_per_fov_long.fov.cat.ordered
# Merge with clinical data (left join: FOVs without clinical data are kept, with NaN clinical columns)
proportion_per_fov_long = pd.merge(proportion_per_fov_long, clini, left_on="fov", right_index=True, how='left')
# Restore the categorical nature and order
proportion_per_fov_long['fov'] = pd.Categorical(proportion_per_fov_long['fov'], categories=original_categories, ordered=original_ordered_status)
# Categories need to be re-ordered
proportion_per_fov_long["pT"] = proportion_per_fov_long["pT group"].astype("category")
gp = (
    ggplot(proportion_per_fov_long, aes(x="fov", y="proportion", fill="annotation_consensus")) +
    geom_bar(stat="identity") +
    coord_flip(ylim = [0, 1], xlim = [0,1]) +
    theme_minimal() +
    scale_fill_manual(values = colors) +
    labs(x="Field of view", y="Proportion", fill="Cell type") +
    theme(axis_text_y=element_blank(), axis_title_x=element_text(margin={'t': 10}), panel_background = element_blank())
)
# One facet per pT group
gp + facet_wrap('~pT') + theme_classic()
# Conclusion: adding info about tumor stage would not visually contribute to the plot

In [None]:
# Flag a FOV as tumor when it has an entry in the clinical table
proportion_per_fov_long["tumor"] = [x in clini.index for x in proportion_per_fov_long.fov]
# Samples on E1 and E2 lines are SCT samples and neither normal nor tumor
# (matched by substring on the FOV name; their flag becomes NA)
proportion_per_fov_long.loc[[("E1" in x) or ("E2" in x) for x in proportion_per_fov_long["fov"]], "tumor"] = pd.NA

# Display the proportion of each curated cell type per FOV as stacked barplots
gp = (
    ggplot(proportion_per_fov_long, aes(x="fov", y="proportion", fill="annotation_consensus")) +
    geom_bar(stat="identity") +
    coord_flip() +
    theme_minimal() +
    facet_wrap("~tumor") +
    labs(x="Field of view", y="Proportion", fill="Cell type", title="Cell type composition per FOV") +
    theme(axis_text_y=element_blank(), axis_title_x=element_text(margin={'t': 10}))
)

# ggsave(gp, "proportion_per_fov_no_unclear_stratified.pdf")
gp + theme_classic()

In [None]:
# Inspect the rows flagged as non-tumor and show their cancer-cell proportions.
# The mask must have exactly one entry per row: NA flags (SCT samples) map to
# False instead of being filtered out of the list, which would make the mask
# shorter than the frame and break the boolean indexing.
non_tumor_mask = [bool(pd.notna(flag) and not flag) for flag in proportion_per_fov_long.tumor]
x = proportion_per_fov_long[non_tumor_mask]
x[x.annotation_consensus == "Cancer_cell"]

In [None]:
# Median proportion of each cell type per FOV, grouped by tumor status
# (groupby drops rows whose "tumor" key is NA by default)
proportion_per_fov_long.groupby(["tumor", "annotation_consensus"]).proportion.median()

In [None]:
# Per-FOV cell-type proportions over ALL cells (including "Unclear"), as a plain array
data = pd.crosstab(adata.obs["fov"], adata.obs["annotation_consensus"], normalize="index").values
# Replace exact zeros by one tenth of the smallest non-zero proportion so the
# log-ratios below are defined; rows are not re-normalised afterwards
data[data == 0] = np.min(data[data != 0])/10

# Function to perform ALR transformation
def alr_transform(data):
    """Return the additive log-ratio of a 2-D array, using the last column as reference.

    Every column except the last is divided row-wise by the last column and
    log-transformed, so the result has one column fewer than the input.
    """
    denominator = data[:, -1].reshape(-1, 1)
    numerators = data[:, :-1]
    return np.log(numerators / denominator)

# Function to perform inverse ALR transformation
def alr_inverse_transform(alr_data, reference):
    """Map ALR coordinates back to the original scale.

    The exponentiated coordinates are extended with a column of ones (the
    reference component relative to itself) and every row is multiplied by
    its ``reference`` value.
    """
    ratios = np.exp(alr_data)
    unit_column = np.ones((ratios.shape[0], 1))
    with_reference = np.hstack((ratios, unit_column))
    return with_reference * reference

# EM algorithm for zero imputation
def lr_em(data, max_iter=100, tol=1e-6):
    """Replace entries equal to 1e-6 by values sampled from a normal model in ALR space.

    Parameters:
        data: 2-D array; it is modified in place and also returned.
        max_iter: maximum number of iterations.
        tol: the loop stops once the norm of (data - imputed_data) falls below this.

    Returns:
        The same array object that was passed in.

    NOTE(review): only entries exactly equal to the literal 1e-6 are replaced.
    The zero replacement earlier in this file writes min(non-zero)/10 instead,
    so unless that value happens to be 1e-6 nothing is imputed here — confirm intent.
    NOTE(review): imputed_data is a fresh random draw on every iteration (no seed
    is set in the visible code), so the stopping criterion compares the data
    against a random sample and is unlikely to ever trigger — confirm intent.
    """
    for _ in range(max_iter):
        alr_data = alr_transform(data)
        # Mean and covariance of the ALR coordinates across rows
        mean = np.nanmean(alr_data, axis=0)
        cov = np.nan_to_num(np.cov(alr_data, rowvar=False))
        # One random ALR vector per row, mapped back using each row's last component
        imputed_data = np.random.multivariate_normal(mean, cov, size=data.shape[0])
        imputed_data = alr_inverse_transform(imputed_data, data[:, -1].reshape(-1, 1))
        # Overwrite only the entries that equal the 1e-6 sentinel
        data[data == 1e-6] = imputed_data[data == 1e-6]
        if np.linalg.norm(data - imputed_data) < tol:
            break
    return data

# Apply lrEM algorithm (lr_em modifies `data` in place and returns the same array)
imputed_data = lr_em(data)

# CLR transformation
def clr_transformation(data):
    """Return the centred log-ratio of each row of a 2-D array.

    Each row is divided by its geometric mean (computed as the exponential of
    the mean log value) before taking the logarithm.
    """
    row_log_means = np.mean(np.log(data), axis=1)
    geometric_mean = np.exp(row_log_means).reshape(-1, 1)
    return np.log(data / geometric_mean)

# CLR coordinates, one row per FOV
clr_data = clr_transformation(imputed_data)
# NOTE(review): proportion_per_fov_long is in long format (one row per FOV and
# cell type) while clr_data has one row per FOV. Later cells attach these labels
# by index alignment, which presumably works because the first rows of the long
# frame follow the same FOV order — confirm.
clr_labels = proportion_per_fov_long.tumor

In [None]:
# Display clr transformed data as violin plots for each cell type, comparing tumor and non-tumor FOVs
# Column names assume the crosstab columns follow the category order — TODO confirm
clr_df = pd.DataFrame(clr_data, columns=adata.obs["annotation_consensus"].cat.categories.to_list())
# Labels are attached by index alignment with the long-format frame
clr_df["tumor"] = clr_labels
# Long format: one row per (FOV, cell type)
clr_df = clr_df.melt(id_vars="tumor", var_name="cell_type", value_name="clr_value")
gp = (
    ggplot(clr_df, aes(x="cell_type", y="clr_value", fill="tumor")) +
    geom_violin(scale="width") +
    stat_summary(fun_y=np.median, geom='point', color='white', size=2) +
    coord_flip() +
    theme_classic() +
    labs(x="Cell type", y="CLR transformed value", fill="Tumor status") +
    theme(legend_position="top")
)
gp

In [None]:
# Count values and do not discard NAs
# (in a notebook only the last expression of the cell is displayed)
proportion_per_fov_long.tumor.value_counts(dropna=False)
clr_df["tumor"].value_counts(dropna=False)

In [None]:
# MANOVA of the CLR coordinates against tumor status
from statsmodels.multivariate.manova import MANOVA
# clr_data is a NumPy array here; clr_labels is a pandas Series
df = pd.DataFrame(clr_data)
# Add column names
df.columns = adata.obs["annotation_consensus"].cat.categories.to_list()
# Filter irrelevant columns (not biological interpretation)
# NOTE(review): drops the last three categories by position — confirm which
# cell types these are in the category order.
df = df.iloc[:, :-3]
# Add labels to the DataFrame
df['labels'] = clr_labels  
# Ignore SCT samples (labels = NA)
df = df.loc[df['labels'].notna()]

# Perform MANOVA: all remaining CLR columns are the dependent variables
covar = ' + '.join([x for x in df.columns if x != "labels"])
maov = MANOVA.from_formula(f'{covar} ~ labels', data=df)
print(maov.mv_test())

In [None]:
# Display clr transformed data as violin plots for each cell type, comparing tumor and non-tumor FOVs
clr_df = pd.DataFrame(clr_data, columns=adata.obs["annotation_consensus"].cat.categories.to_list())
clr_df["tumor"] = clr_labels
# Ignore SCT samples (tumor = NA)
clr_df = clr_df.loc[clr_df["tumor"].notna()]
clr_df = clr_df.melt(id_vars="tumor", var_name="cell_type", value_name="clr_value")
# Exclude uninterpretable cell types / keep only immune cells
excluded_types = ["Cancer_cell", "Unclear", "Endothelial_cell", "CAF"]
clr_df = clr_df.loc[~clr_df.cell_type.isin(excluded_types)]

# This plot object is extended with significance labels in a later cell
gp = (
    ggplot(clr_df, aes(x="cell_type", y="clr_value", fill="tumor")) +
    geom_violin(scale="width") +
    stat_summary(fun_y=np.median, geom='point', color='white', size=2) +
    coord_flip() +
    theme_classic() +
    scale_fill_brewer(type="qual") +
    labs(x="Cell type", y="CLR transformed value", fill="Tumor status") +
    theme(legend_position="top")
)
gp

In [None]:
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests

# Per cell type: Mann-Whitney U test of the CLR values between tumor and
# non-tumor FOVs, followed by Benjamini-Hochberg correction across cell types.
# clr_df is expected to contain no NA in "tumor" (filtered in the previous cell).
cell_types = clr_df.cell_type.unique()
p_values = []
for cell_type in cell_types:
    # Work on an explicit copy so the dtype cast below neither touches clr_df
    # nor triggers pandas' chained-assignment warning
    cell_type_df = clr_df.loc[clr_df.cell_type == cell_type, :].copy()
    # Boolean dtype is required to use the column as a mask and to negate it with ~
    cell_type_df["tumor"] = cell_type_df["tumor"].astype("bool")
    u, p_value = mannwhitneyu(cell_type_df.loc[cell_type_df.tumor, "clr_value"], 
                              cell_type_df.loc[~cell_type_df.tumor, "clr_value"])
    print(f"{cell_type}: Mann-Whitney U: {u}, p-value: {p_value}")
    p_values.append(p_value)
    
# BH FDR correction (p_values follow the order of cell_types)
_, p_values_corrected, _, _ = multipletests(p_values, method="fdr_bh")
for cell_type, p_value in zip(cell_types, p_values_corrected):
    print(f"Cell type: {cell_type}, FDR: {p_value}")

In [None]:
# Significance labels: one row per cell type, placed at a fixed CLR value of 5.
# p_values_corrected follows the order of clr_df.cell_type.unique().
clr_df_sig = pd.DataFrame({"cell_type": clr_df.cell_type.unique(),
              "significant": [("< 0.005" if x < 0.005 else ("< 0.05" if x < 0.05 else "")) for x in p_values_corrected],
              "clr_value": 5,
              })
# Add the labels on top of the violin plot built earlier and save the figure
gp = gp + geom_label(fill = "#ffffffaa", data=clr_df_sig, mapping = aes(label="significant"), size=8)
ggsave(gp, "../../figures/fig2/clr_violin_status.pdf")
gp

In [None]:
# CLR values per immune cell type, stratified by pT group
clr_df = pd.DataFrame(clr_data, columns=adata.obs["annotation_consensus"].cat.categories.to_list())
# Look up the pT group of each FOV in the clinical table; assignment aligns on the row index
clr_df["Annot"] = proportion_per_fov_long.loc[:,["fov"]].join(clini, on = "fov")["pT group"]
clr_df = clr_df.melt(id_vars="Annot", var_name="cell_type", value_name="clr_value")
# Only display patients with known pT status
clr_df = clr_df.loc[~pd.isna(clr_df.Annot)]
# Exclude uninterpretable cell types / keep only immune cells
excluded_types = ["Cancer_cell", "Unclear", "Endothelial_cell", "CAF"]
clr_df = clr_df.loc[~clr_df.cell_type.isin(excluded_types)]
gp = (
    ggplot(clr_df, aes(x="cell_type", y="clr_value", fill="Annot")) +
    geom_violin(scale="width") +
    stat_summary(fun_y=np.median, geom='point', color='white', size=2) +
    coord_flip() +
    theme_classic() +
    scale_fill_brewer(palette="Purples") +
    labs(x="Cell type", y="CLR transformed value", fill = "T staging") +
    theme(legend_position="top")
)
# This file is written again, with significance labels, by the next cell
ggsave(gp, "../../figures/fig2/clr_violin_pt.pdf")
gp

In [None]:
# Kendall rank correlation between CLR value and pT stage, per cell type
from scipy.stats import kendalltau
from statsmodels.stats.multitest import multipletests

p_values = []
for cell_type in clr_df.cell_type.unique():
    # The stage is taken from the third character of the label
    # (presumably labels look like "pT1" — verify against the clinical table)
    tau, p_value = kendalltau(clr_df.loc[clr_df.cell_type == cell_type, "clr_value"], 
                              [int(x[2]) for x in clr_df.loc[clr_df.cell_type == cell_type, "Annot"]])
    p_values.append(p_value)

# BH FDR correction
_, p_values_corrected, _, _ = multipletests(p_values, method="fdr_bh")
for cell_type, p_value in zip(clr_df.cell_type.unique(), p_values_corrected):
    print(f"Cell type: {cell_type}, FDR: {p_value}")

# Significance labels placed at a fixed CLR value of 5
clr_df_sig = pd.DataFrame({"cell_type": clr_df.cell_type.unique(),
              "significant": [("< 0.0005" if x < 0.0005 else "< 0.005" if x < 0.005 
                               else ("< 0.05" if x < 0.05 else "")) for x in p_values_corrected],
              "clr_value": 5,
              })
gp = gp + geom_label(fill = "#ffffffaa", data=clr_df_sig, mapping = aes(label="significant"), size=8)
# Overwrites the unlabelled version of the figure saved by the previous cell
ggsave(gp, "../../figures/fig2/clr_violin_pt.pdf")
gp

In [None]:
# Which FOVs are in each node stage?
# Derive a coarse 'pN' column from the free-text 'pN raw' column: a row gets the
# stage whose label occurs in its raw string; rows matching none stay NA.
clini['pN'] = pd.NA
n0fovs, n1fovs, n2fovs = (
    clini['pN raw'].str.contains(stage_label, na=False)
    for stage_label in ('pN0', 'pN1', 'pN2')
)
# Assign in ascending stage order, so a later stage wins if several labels match
for stage_label, stage_mask in zip(('pN0', 'pN1', 'pN2'), (n0fovs, n1fovs, n2fovs)):
    clini.loc[stage_mask, 'pN'] = stage_label
clini.pN.value_counts(dropna=False)

In [None]:
# CLR values per immune cell type, stratified by pN stage
clr_df = pd.DataFrame(clr_data, columns=adata.obs["annotation_consensus"].cat.categories.to_list())
# Look up the pN stage of each FOV in the clinical table; assignment aligns on the row index
clr_df["Annot"] = proportion_per_fov_long.loc[:,["fov"]].join(clini, on = "fov")['pN']
clr_df = clr_df.melt(id_vars="Annot", var_name="cell_type", value_name="clr_value")
# Only display patients with known pN status
clr_df = clr_df.loc[~pd.isna(clr_df.Annot)]
# Exclude uninterpretable cell types / keep only immune cells
excluded_types = ["Cancer_cell", "Unclear", "Endothelial_cell", "CAF"]
clr_df = clr_df.loc[~clr_df.cell_type.isin(excluded_types)]
gp = (
    ggplot(clr_df, aes(x="cell_type", y="clr_value", fill="Annot")) +
    geom_violin(scale="width") +
    stat_summary(fun_y=np.median, geom='point', color='white', size=2) +
    coord_flip() +
    theme_classic() +
    scale_fill_brewer(palette="Greens") +
    labs(x="Cell type", y="CLR transformed value", fill = "N staging") +
    theme(legend_position="top")
)
# This file is written again, with significance labels, by the next cell
ggsave(gp, "../../figures/fig2/clr_violin_pn.pdf")
gp

In [None]:
# Kendall rank correlation between CLR value and pN stage, per cell type
# (kendalltau and multipletests are imported in an earlier cell)
p_values = []
for cell_type in clr_df.cell_type.unique():
    # Third character of "pN0"/"pN1"/"pN2" is the numeric stage
    tau, p_value = kendalltau(clr_df.loc[clr_df.cell_type == cell_type, "clr_value"], 
                              [int(x[2]) for x in clr_df.loc[clr_df.cell_type == cell_type, "Annot"]])
    p_values.append(p_value)

# BH FDR correction
_, p_values_corrected, _, _ = multipletests(p_values, method="fdr_bh")
for cell_type, p_value in zip(clr_df.cell_type.unique(), p_values_corrected):
    print(f"Cell type: {cell_type}, FDR: {p_value}")


# Significance labels placed at a fixed CLR value of 5
clr_df_sig = pd.DataFrame({"cell_type": clr_df.cell_type.unique(),
              "significant": [("< 0.0005" if x < 0.0005 else "< 0.005" if x < 0.005 
                               else ("< 0.05" if x < 0.05 else "")) for x in p_values_corrected],
              "clr_value": 5,
              })
gp = gp + geom_label(fill = "#ffffffaa", data=clr_df_sig, mapping = aes(label="significant"), size=8)
# Overwrites the unlabelled version of the figure saved by the previous cell
ggsave(gp, "../../figures/fig2/clr_violin_pn.pdf")
gp

In [None]:
# CLR values per immune cell type, stratified by MSI status
clr_df = pd.DataFrame(clr_data, columns=adata.obs["annotation_consensus"].cat.categories.to_list())
# Look up the MSI status of each FOV in the clinical table; assignment aligns on the row index
clr_df["Annot"] = proportion_per_fov_long.loc[:,["fov"]].join(clini, on = "fov")['MSI gesamt RED']
# Rename MSI status levels (German to English)
clr_df["Annot"] = clr_df["Annot"].replace({"stabil": "stable", "instabil": "unstable"})
clr_df = clr_df.melt(id_vars="Annot", var_name="cell_type", value_name="clr_value")
# Only display patients with known MSI status
clr_df = clr_df.loc[~pd.isna(clr_df.Annot)]
# Exclude uninterpretable cell types / keep only immune cells
excluded_types = ["Cancer_cell", "Unclear", "Endothelial_cell", "CAF"]
clr_df = clr_df.loc[~clr_df.cell_type.isin(excluded_types)]
gp = (
    ggplot(clr_df, aes(x="cell_type", y="clr_value", fill="Annot")) +
    geom_violin(scale="width") +
    stat_summary(fun_y=np.median, geom='point', color='white', size=2) +
    coord_flip() +
    theme_classic() +
    scale_fill_brewer(palette="Reds") +
    labs(x="Cell type", y="CLR transformed value", fill = "MSI") +
    theme(legend_position="top")
)
ggsave(gp, "../../figures/fig2/clr_violin_msi.pdf")
gp

In [None]:
# Compare the log2 immune-to-cancer ratio of each FOV between MSI groups.
# Uses proportion_per_fov (per-FOV proportions) and clini (clinical table)
# from earlier cells; mannwhitneyu is imported in an earlier cell.

# Proportion of cancer cells per FOV
pc = proportion_per_fov.loc[:, "Cancer_cell"]

# Proportion of immune cells per FOV (sum over all immune cell types)
pi = proportion_per_fov.loc[:, ["APC", "B_cell", "CD4_Tcell", "CD8_Tcell", "CD68_Macrophage", "CD163_Macrophage", 
                           "Monocyte", "NK_cell", "Neutrophil", "Other_immune_cell", "T_reg_cell"]].sum(axis=1)

# Join on the "fov" index level to attach the MSI status of each FOV
immune_ratio_df = pd.DataFrame((pi / pc)).join(clini.loc[:,'MSI gesamt RED'], on = "fov")
immune_ratio_df.columns = ["Ratio", "MSI"]
# Rename MSI status levels (German to English)
immune_ratio_df["MSI"] = immune_ratio_df["MSI"].replace({"stabil": "stable", "instabil": "unstable"})
immune_ratio_df["Ratio"] = np.log2(immune_ratio_df["Ratio"])

# Drop missing MS status
immune_ratio_df = immune_ratio_df.loc[~immune_ratio_df.MSI.isna(),:]
# Drop non-finite ratios: +/-inf when a FOV lacks immune or cancer cells, and
# NaN when it lacks both (0/0). NaN would otherwise propagate into the test.
immune_ratio_df = immune_ratio_df.loc[np.isfinite(immune_ratio_df.Ratio),:]

u, p_value = mannwhitneyu(immune_ratio_df.loc[immune_ratio_df.MSI == "stable","Ratio"], 
                          immune_ratio_df.loc[immune_ratio_df.MSI == "unstable","Ratio"])

gp = (
    ggplot(immune_ratio_df, 
           aes(x="MSI", y="Ratio", fill="MSI")) +
    geom_violin(scale="width") +
    stat_summary(fun_y=np.median, geom='point', color='white', size=2) +
    coord_flip() +
    theme_classic() +
    scale_fill_brewer(palette="Reds") +
    labs(x="MS status", y="log2(immune/cancer)", fill = "MS status") +
    theme(legend_position="top") +
    ggtitle(f"U: {u}, p-value: {p_value:.2e}")
)
ggsave(gp, "../../figures/fig2/ratio_violin_msi.pdf")
gp

## Correlation endo / immune
See Keren et al. (20), Figure 2c.

In [None]:
# Collapse the cell-type labels into four coarse groups and count cells per FOV.
immune_types = ['APC', 'B_cell', 'CD4_Tcell', 'CD8_Tcell', 'CD68_Macrophage', 'CD163_Macrophage', 'Monocyte', 'NK_cell', 'Neutrophil', 'Other_immune_cell', 'T_reg_cell']
# Lookup table from label to coarse group; anything not listed falls back to "Misc"
coarse_group = dict.fromkeys(immune_types, "Immu")
coarse_group['Endothelial_cell'] = "Endo"
coarse_group["Cancer_cell"] = "Canc"
adata.obs["ImmuEndo"] = [coarse_group.get(label, "Misc") for label in adata.obs["consensus"]]
# Cell counts per FOV (rows) and coarse group (columns)
immuendo_df = pd.crosstab(adata.obs["fov"], adata.obs["ImmuEndo"])
# Replace all 0s by 0.1
immuendo_df_logSupport = immuendo_df.replace(0, 0.1)

In [None]:
# Scatter plot of endothelial vs immune cell counts per FOV, on log-log axes
# x-axis: Number of immune cells
# y-axis: Number of endothelial cells
# The Spearman correlation is computed on the raw counts (zeros included);
# the plot uses the copy in which zeros were replaced by 0.1
spear_cor = immuendo_df.corr(method="spearman").loc["Endo","Immu"]
# _, p_value = mannwhitneyu(cell_type_df.loc[cell_type_df.tumor, "clr_value"], 
#                               cell_type_df.loc[~cell_type_df.tumor, "clr_value"])

gp = (
    ggplot(immuendo_df_logSupport, aes(x = "Immu", y = "Endo")) 
    + geom_text(label = f"SCC: {spear_cor:.2f}", x = 1.1, y = 2.5, size = 10)
    + geom_point()
    + theme_classic()
    + scale_x_log10()
    + scale_y_log10()
    + labs(x = "Number of immune cells", y = "Number of endothelial cells")
)
ggsave(gp, "../../figures/fig2/endo_immu_correlation.pdf", width = 4, height = 4)
gp

In [None]:
# Cancer vs endothelial cell counts per FOV (Spearman on raw counts, log-log plot)
spear_cor = immuendo_df.corr(method="spearman").loc["Endo","Canc"]
gp = (
    ggplot(immuendo_df_logSupport, aes(y = "Canc", x = "Endo")) 
    + geom_text(label = f"SCC: {spear_cor:.2f}", x = 0.05, y = 1.5, size = 10)
    + geom_point()
    + theme_classic()
    + scale_x_log10()
    + scale_y_log10()
    + labs(x = "Number of endothelial cells", y = "Number of cancer cells")
)
ggsave(gp, "../../figures/fig2/endo_canc_correlation.pdf", width = 4, height = 4)
# print() so that both figures of this cell are rendered, not only the last one
print(gp)

# Cancer vs immune cell counts per FOV (Spearman on raw counts, log-log plot)
spear_cor = immuendo_df.corr(method="spearman").loc["Immu","Canc"]
gp = (
    ggplot(immuendo_df_logSupport, aes(y = "Canc", x = "Immu")) 
    + geom_text(label = f"SCC: {spear_cor:.2f}", x = 1.1, y = 1.5, size = 10)
    + geom_point()
    + theme_classic()
    + scale_x_log10()
    + scale_y_log10()
    + labs(x = "Number of immune cells", y = "Number of cancer cells")
)
ggsave(gp, "../../figures/fig2/immu_canc_correlation.pdf", width = 4, height = 4)
print(gp)