## Imports

In [None]:
import os
import numpy as np
import anndata as ad
import pandas as pd
import pytorch_lightning as pl
from plotnine import *
from lightning_lite import seed_everything
from pytorch_lightning.callbacks import EarlyStopping  # ModelCheckpoint
import scanpy as sc

from starling import starling, utility

In [None]:
# Seed the random number generators up front so that reruns are repeatable.
SEED = 10
seed_everything(SEED, workers=True)

In [None]:
# Show the working directory: the relative data paths used below are resolved against it.
os.getcwd()

## Prepare data

In [None]:
# Per-cell table; the first CSV column is used as the row index.
cells = pd.read_csv("cell_table_transformed.csv", index_col=0)
# Drop FOV "C6h", where most cells are badly segmented.
cells = cells[cells["fov"] != "C6h"]

In [None]:
# Channels used as features. The aim here is to spot spillover,
# not to obtain a definitive phenotype.
lineage_channels = [
    "SMA",
    "CD4",
    "CD31",
    "CD7",
    "CD163",
    "CD68",
    "CD8",
    "CD3e",
    "MPO",
    "CD20",
    "HLADRa",
    "CD14",
    "CD45",
    "PanCK",
    "FoxP3",
]

In [None]:
# Lineage channels form the data matrix; every other column of the cell table
# is kept as per-cell metadata in .obs.
adata = ad.AnnData(
    cells[lineage_channels],
    obs=cells.drop(columns=lineage_channels),
)

## Explore clustering results

In [None]:
# Global scanpy configuration: log warnings and errors only, and render figures
# at 150 dpi on a white background.
sc.settings.verbosity = 1  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.set_figure_params(dpi=150, facecolor="white")

In [None]:
# Build the k-nearest-neighbour graph (k=10) and compute the UMAP embedding from it.
# NOTE(review): adata holds only the 15 lineage channels, so n_pcs=40 exceeds the
# number of features — confirm how scanpy handles this (it may use X directly).
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
sc.tl.umap(adata)

In [None]:
# One UMAP panel per marker. "DCN" and "Vimentin" are not among the lineage channels,
# so they are presumably resolved from adata.obs — verify that the columns exist.
umap_markers = [
    "PanCK", "CD3e", "CD45", "SMA",
    "CD4", "CD31", "CD7", "CD14",
    "CD163", "DCN", "CD68", "CD8",
    "Vimentin", "MPO", "CD20", "HLADRa",
]
sc.pl.umap(adata, color=umap_markers)

## Confusion matrix
### Load Scyan output

In [None]:
# Load the Scyan run and make sure it covers exactly as many cells as adata,
# since its labels are attached to adata by position further down.
scyan_results = ad.read_h5ad("../../../.scyan_data/scyan_run_mini_noUnclear/default.h5ad")
n_rows_scyan, n_rows_adata = scyan_results.X.shape[0], adata.X.shape[0]
assert n_rows_scyan == n_rows_adata
assert scyan_results.n_obs == adata.n_obs

### Compare to Teresa's gating annotation

In [None]:
# Gating-based annotation, restricted to the same FOVs as the cell table (FOV "C6h" removed).
annotation_t = pd.read_csv("./cell_table_with_annotation_final.csv")
annotation_t = annotation_t[annotation_t["fov"] != "C6h"]

In [None]:
# The gating labels are attached to adata by position, so both tables must list
# the same cells in the same order. Fail loudly instead of silently mislabelling.
assert len(annotation_t) == adata.n_obs, (
    "gating annotation and adata contain a different number of cells"
)
assert np.array_equal(
    annotation_t["fov"].astype(str).to_numpy(),
    adata.obs["fov"].astype(str).to_numpy(),
), "gating annotation rows are not in the same FOV order as adata"

gating_labels = annotation_t["cell_type_final"]
# Alphabetically ordered categories; dropna() keeps sorted() from choking on missing labels.
adata.obs["annotation_t"] = pd.Categorical(
    gating_labels,
    categories=sorted(gating_labels.dropna().unique()),
)
sc.pl.umap(adata, color="annotation_t")

In [None]:
# Attach the Scyan populations (by position) as an alphabetically ordered categorical.
scyan_pop = scyan_results.obs["scyan_pop"]
# astype(str) lets sorted() cope with missing values; the resulting placeholder
# category is never used and is dropped again right below.
scyan_categories = sorted(scyan_pop.astype(str).unique())
scyan_categorical = pd.Categorical(scyan_pop, categories=scyan_categories)
adata.obs["annotation_scyan"] = scyan_categorical.remove_unused_categories()
sc.pl.umap(adata, color="annotation_scyan")

### Compute and display confusion

In [None]:
# Cells without a Scyan population are relabelled "Unclear".
scyan_labels = adata.obs["annotation_scyan"]
# Register the category only once so that this cell can be re-run safely.
if "Unclear" not in scyan_labels.cat.categories:
    scyan_labels = scyan_labels.cat.add_categories("Unclear")
# Assign the filled column back: an in-place fillna on a selected column is chained
# assignment and is not guaranteed to reach adata.obs on recent pandas versions.
adata.obs["annotation_scyan"] = scyan_labels.fillna("Unclear")

# Confusion: scyan vs gating
gating_classes = adata.obs["annotation_t"].unique()
scyan_classes = adata.obs["annotation_scyan"].unique()

# One row per (gating, Scyan) label pair; counts are filled in afterwards.
confmat = pd.DataFrame({
    "Gating": np.repeat(gating_classes, len(scyan_classes)),
    "Scyan": np.tile(scyan_classes, len(gating_classes)),
    "Count": 0
})

In [None]:
# Count the cells of every (gating, Scyan) label pair in a single grouped pass instead
# of looping over cells. Counts are assigned rather than incremented, so re-running
# this cell gives the same result. Cells with a missing label are not counted.
pair_counts = (
    adata.obs.groupby(["annotation_t", "annotation_scyan"], observed=True)
    .size()
    .to_dict()
)
# Label pairs that never occur keep a count of 0.
confmat["Count"] = [
    pair_counts.get((gating_lab, scyan_lab), 0)
    for gating_lab, scyan_lab in zip(confmat["Gating"], confmat["Scyan"])
]

In [None]:
# Show any remaining missing Scyan label as "Unclear". The result is assigned back
# because an in-place fillna on a selected column does not reliably update the frame;
# the object cast keeps this working if the column is already categorical (re-run).
confmat["Scyan"] = confmat["Scyan"].astype(object).fillna("Unclear")
# Order the Scyan axis alphabetically
confmat["Scyan"] = pd.Categorical(
    confmat["Scyan"],
    categories=sorted(confmat["Scyan"].unique()),
)
confmat["Gating"] = confmat["Gating"].astype("category")
# Fraction of each gating class that falls into each Scyan class
# (the fractions of one gating class sum to 1)
gating_totals = confmat.groupby("Gating", observed=True)["Count"].transform("sum")
confmat["Fraction"] = confmat["Count"] / gating_totals
gp = (
    ggplot(confmat, aes(x="Gating", y="Scyan"))
    + geom_tile(aes(fill="Fraction"))
    + coord_flip()
    + theme_minimal()
    + geom_text(aes(label="Count"), color="#ffffff", size=8)
)
gp

In [None]:
# Save the current confusion-matrix figure as a PDF.
# NOTE(review): the file name says "consensus", but when the notebook is run top to
# bottom `gp` is the Scyan-vs-gating plot at this point — confirm which one is intended.
fig_path = "figures/fig2/confusion_matrix_consensus.pdf"
# Create the output folder if needed so the export also works on a fresh checkout.
os.makedirs(os.path.dirname(fig_path), exist_ok=True)
gp.save(fig_path, dpi=300, height=6, width=10)

### Inspect the distribution of the Scyan confidence scores

In [None]:
# Histogram of the Scyan log-probability score over all cells.
(
    ggplot(scyan_results.obs, aes(x="scyan_log_probs"))
    + geom_histogram(bins=50)
    + theme_classic()
)

In [None]:
# Cells gated as NK cells but called "Cancer" by Scyan.
gated_as_nk = adata.obs["annotation_t"] == "NK_cell"
scyan_says_cancer = adata.obs["annotation_scyan"] == "Cancer"
# Both the mask and the Scyan table get a fresh 0..n-1 index so they line up by position.
unclear_index = (gated_as_nk & scyan_says_cancer).reset_index(drop=True)
scyan_obs_by_position = scyan_results.obs.reset_index(drop=True)
(
    ggplot(scyan_obs_by_position.loc[unclear_index], aes(x="scyan_log_probs"))
    + geom_histogram(bins=50)
    + geom_vline(xintercept=-3, color="grey")
    + theme_classic()
)

In [None]:
# Number of cells selected by the mask
unclear_index.sum()

In [None]:
# Number of cells gated as cancer cells that Scyan labels "Endo"
(
    (adata.obs["annotation_t"] == "Cancer_cell")
    & (adata.obs["annotation_scyan"] == "Endo")
).sum()

While some combinations of labels show recurring error patterns, only a few thousand cells are affected, so handling them in a way that gives the majority a correct label should be sufficient.

In [None]:
# Summary statistics (count, mean, quartiles, min/max) of the score
# for cells that Scyan left without a population
scyan_unlabelled = scyan_results.obs["scyan_pop"].isna()
scyan_results.obs.loc[scyan_unlabelled, "scyan_log_probs"].describe()

In [None]:
# Same summary for the cells that did receive a Scyan population
scyan_labelled = scyan_results.obs["scyan_pop"].notna()
scyan_results.obs.loc[scyan_labelled, "scyan_log_probs"].describe()

It is unclear how the absolute values of the probability scores should be interpreted. However, this score is what Scyan uses as its assignment threshold: in this run, all cells with a score < -20 were not given a cell type label. If cells just above that threshold are indeed problematic, the threshold could be raised.

In [None]:
# Export both annotations stacked in one long table (fov, label, annotation).
# .copy() makes p1/p2 independent frames, so editing p2 below neither warns
# (SettingWithCopyWarning) nor risks touching adata.obs.
p1 = adata.obs[["fov", "label", "annotation_t"]].copy()
p2 = adata.obs[["fov", "label", "annotation_scyan"]].copy()
# Prefix every Scyan label with "s_" so it can be told apart from the gating labels
p2["annotation_scyan"] = p2["annotation_scyan"].apply(lambda x: "s_" + x)
# Give both frames the same column names so that concat stacks them row-wise
p2.columns = p1.columns
pd.concat([p1, p2]).to_csv("annotation_complete_scyan_gating.csv", index=False, header=False)

In [None]:
# p2's "annotation_t" column holds the prefixed Scyan labels (its columns were renamed
# to match p1). Show the cells Scyan left unclear, skipping the first five rows.
p2[p2["annotation_t"] == "s_Unclear"].iloc[5:]

## Make consensus annotation

In [None]:
# Consensus cell type for each pair of annotations.
# Outer key = gating label, inner key = Scyan label, value = consensus label.
# Any pair that is not listed resolves to "Unclear".
from collections import defaultdict

consensus_rules = {
    "T_reg_cell": {"Lympho": "T_reg_cell"},
    "Other_immune_cell": {"Lympho": "Other_immune_cell", "Myelo": "Other_immune_cell"},
    "Neutrophil": {"Lympho": "Neutrophil"},
    "NK_cell": {"Lympho": "NK_cell", "Cancer": "Cancer_cell"},
    "Monocyte": {"Myelo": "Monocyte", "Lympho": "Monocyte"},
    "Endothelial_cell": {"Endo": "Endothelial_cell", "Cancer": "Cancer_cell"},
    "DN_Tcells": {"Lympho": "Other_immune_cell"},
    "Cancer_cell_pot": {"Cancer": "Cancer_cell"},
    "Cancer_cell": {"Cancer": "Cancer_cell"},
    "CD8_Tcell": {"Lympho": "CD8_Tcell", "Myelo": "Other_immune_cell"},
    "CD68+_macrophage": {"Myelo": "CD68_Macrophage", "Lympho": "Other_immune_cell"},
    "CD4_Tcell": {"Lympho": "CD4_Tcell"},
    "CD163+_macrophage": {"Myelo": "CD163_Macrophage", "Lympho": "Other_immune_cell"},
    "CAF": {"CAF": "CAF"},
    "B_cell": {"Lympho": "B_cell"},
    "APC": {"Lympho": "APC", "Myelo": "APC"},
}

# Start with an "Unclear" default for every gating label present in the data
consensus_dict = {
    gating_key: defaultdict(lambda: "Unclear")
    for gating_key in adata.obs.annotation_t.unique()
}

# Plain indexing on purpose: a rule for a gating label absent from the data raises KeyError
for rule_gating, scyan_to_consensus in consensus_rules.items():
    consensus_dict[rule_gating].update(scyan_to_consensus)

In [None]:
# Look up the consensus label of every cell from its (gating, Scyan) label pair.
adata.obs["annotation_consensus"] = [
    consensus_dict[gating_key][scyan_key]
    for gating_key, scyan_key in zip(
        adata.obs["annotation_t"], adata.obs["annotation_scyan"]
    )
]

In [None]:
# Confusion matrix for validation: consensus vs gating
consensus_classes = adata.obs["annotation_consensus"].unique()

# One row per (gating, consensus) label pair, including pairs that never occur
confmat = pd.DataFrame({
    "Gating": np.repeat(gating_classes, len(consensus_classes)),
    "Consensus": np.tile(consensus_classes, len(gating_classes)),
    "Count": 0
})

# Count the cells of every label pair in a single grouped pass instead of looping over
# cells; unlike incrementing in a loop, this gives the same counts on every re-run.
pair_counts = (
    adata.obs.groupby(["annotation_t", "annotation_consensus"], observed=True)
    .size()
    .to_dict()
)
# Label pairs that never occur keep a count of 0.
confmat["Count"] = [
    pair_counts.get((gating_lab, consensus_lab), 0)
    for gating_lab, consensus_lab in zip(confmat["Gating"], confmat["Consensus"])
]

In [None]:
# Turn both label columns into categoricals (categories end up in sorted order)
confmat = confmat.astype({"Consensus": "category", "Gating": "category"})
# Fraction of each gating class that ends up in each consensus class
gating_totals = confmat.groupby('Gating')['Count'].transform('sum')
confmat["Fraction"] = confmat["Count"] / gating_totals
gp = (
    ggplot(confmat, aes(x="Gating", y="Consensus"))
    + geom_tile(aes(fill="Fraction"))
    + coord_flip()
    + geom_text(aes(label="Count"), color="#ffffff", size=8)
)
gp

In [None]:
# Number of cells per consensus label (including "Unclear")
adata.obs['annotation_consensus'].value_counts()

In [None]:
# Save the fully annotated AnnData object
adata.write_h5ad("adata_consensus_cell_types.h5ad")

# Headerless (fov, label, consensus annotation) table for Mantis Viewer
mantis_columns = ["fov", "label", "annotation_consensus"]
adata.obs.loc[:, mantis_columns].to_csv("annotation_consensus.csv", index=False, header=False)