# Visualize performance of different versions of
- BayesPrism
- DWLS
- MuSiC
- hspe
- bisque

For each method, we present results at major (50% tumour purity) - minor - subset lineage levels

In [None]:
import numpy as np
import pandas as pd
import anndata as adata

import math
import random
import itertools

from tqdm import tqdm
from pathlib import Path
from typing import Tuple, List, Dict

import plotly.graph_objects as go
import plotly as plotly
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity as skl_cosine

from scipy.stats import pearsonr
from scipy.spatial.distance import cosine as scipy_cosine
from math import sqrt
from sklearn.metrics import confusion_matrix
from scipy.spatial.distance import braycurtis, cdist

%load_ext blackcellmagic

## Set up paths and load setting files

#### Path to subset/minor/major experiments

In [None]:
# Root folder of the subset-level immune lineage experiment
subset_prefix = "???/deconvolution_benchmarking/03_immune_lineages_experiment/subset_level"

# Root folder of the minor-level immune lineage experiment
minor_prefix = "???/deconvolution_benchmarking/03_immune_lineages_experiment/minor_level"

# Root folder of the major-level results (purity levels experiment, normal epithelial included)
major_prefix = "???/deconvolution_benchmarking/01_purity_levels_experiment/include_normal_epithelial"

# Root folder for visualization inputs/outputs
viz_prefix = "???/deconvolution_benchmarking/visualizations"

#### List major/minor/subset cell types

In [None]:
# Cell-type labels per lineage level. Each list is used further down to select
# (and therefore order) the columns of the matching ground-truth table, so the
# labels must match the column names of the corresponding truth.csv exactly.

# Subset level (29 labels)
subset_c_types = [
    "Endothelial",
    "CAFs",
    "PVL",
    "B cells Memory",
    "B cells Naive",
    "T_cells_c4_CD8+_ZFP36",
    "T_cells_c6_IFIT1",
    "T_cells_c7_CD8+_IFNG",
    "T_cells_c8_CD8+_LAG3",
    "T_cells_c0_CD4+_CCR7",
    "T_cells_c1_CD4+_IL7R",
    "T_cells_c2_CD4+_T-regs_FOXP3",
    "T_cells_c3_CD4+_Tfh_CXCL13",
    "T_cells_c9_NK_cells_AREG",
    "T_cells_c11_MKI67",
    "T_cells_c10_NKT_cells_FCGR3A",
    "Myeloid_c10_Macrophage_1_EGR1",
    "Myeloid_c12_Monocyte_1_IL1B",
    "Myeloid_c2_LAM2_APOE",
    "Myeloid_c1_LAM1_FABP5",
    "Cycling_Myeloid",
    "Myeloid_c4_DCs_pDC_IRF7",
    "Normal Epithelial",
    "Plasmablasts",
    "Myeloid_c8_Monocyte_2_S100A9",
    "Myeloid_c9_Macrophage_2_CXCL10",
    "Myeloid_c11_cDC2_CD1C",
    "Cancer Epithelial",
    "Myeloid_c3_cDC1_CLEC9A",
]
# Minor level (17 labels)
minor_c_types = [
    "Cancer Epithelial",
    "T cells CD4+",
    "T cells CD8+",
    "Endothelial",
    "CAFs",
    "Macrophage",
    "PVL",
    "Normal Epithelial",
    "Plasmablasts",
    "B cells Memory",
    "Monocyte",
    "Cycling_Myeloid",
    "Cycling T-cells",
    "NK cells",
    "NKT cells",
    "DCs",
    "B cells Naive",
]
# Major level (9 labels)
major_c_types = [
    "B-cells",
    "CAFs",
    "Cancer Epithelial",
    "Endothelial",
    "Myeloid",
    "Normal Epithelial",
    "PVL",
    "Plasmablasts",
    "T-cells",
]

#### Load groundtruth

In [None]:
# We only use tumour purity = 50%.
# The purity level is expressed as a fraction (0.5 == 50%).
pur_lvl = 0.5

In [None]:
# Ground-truth tables, one per lineage level. Each is read from the
# tab-separated truth.csv of its experiment folder (first column as index) and
# restricted/reordered to the cell types listed for that level.

# Subset-level ground truth
subset_truth_df = pd.read_csv(
    Path(subset_prefix) / "data/results/truth.csv", sep="\t", index_col=0
)[subset_c_types]

# Minor-level ground truth
minor_truth_df = pd.read_csv(
    Path(minor_prefix) / "data/results/truth.csv", sep="\t", index_col=0
)[minor_c_types]

# Major-level ground truth
major_truth_df = pd.read_csv(
    Path(major_prefix) / "data/results/truth.csv", sep="\t", index_col=0
)[major_c_types]

#### Extract lineages metadata

In [None]:
# Load lineage mapping of all cell types in the data.
# NOTE(review): the path is rooted at `viz_prefix`, the only otherwise-unused
# prefix defined in this notebook; confirm that the lineage table lives under
# the visualizations folder.
lineages_df = pd.read_csv(
    Path(viz_prefix).joinpath("data/Whole_miniatlas_immune_lineages.tsv"),
    sep="\t",
)

# Replace raw column names with human-readable ones
lineages_df = lineages_df.rename(
    columns={
        "celltype_major": "Major Cell Type",
        "celltype_major_short": "Annotated Major Cell Type",
        "celltype_minor": "Minor Cell Type",
        "celltype_minor_short": "Annotated Minor Cell Type",
        "celltype_subset": "Subset Cell Type",
        "celltype_subset_short": "Annotated Subset Cell Type",
        "marker_genes": "Marker Genes",
        "counts": "Cell Counts",
        "num_of_patients": "Patient Counts",
    },
)

In [None]:
# Drop the subset cell types that are not part of this analysis:
# - Myeloid_c7_Monocyte_3_FCGR3A, which we removed
# - the 3 cell types that MuSiC dropped
lineages_df = lineages_df.loc[
    ~lineages_df["Subset Cell Type"].isin(
        [
            "Myeloid_c7_Monocyte_3_FCGR3A",
            "Myeloid_c0_DC_LAMP3",
            "Myeloid_c5_Macrophage_3_SIGLEC1",
            "T_cells_c5_CD8+_GZMK",
        ]
    )
]

#### Extract colour palette

In [None]:
# Load the colour palette table (one row per cell type).
# NOTE(review): the path is rooted at `viz_prefix`, the only otherwise-unused
# prefix defined in this notebook; confirm that the palette file lives under
# the visualizations folder.
colour_pallete_df = pd.read_csv(
    Path(viz_prefix).joinpath("data/Whole_miniatlas_colour_pallete.csv"), sep="\t"
)

# Convert to dictionary: cell type -> {"fill": ..., "line": ...}
colour_pallete_d = {
    row["all_celltype"]: {"fill": row["fill"], "line": row["line"]}
    for _, row in colour_pallete_df.iterrows()
}

## Plot boxplots of all methods together across lineage levels (50% tumour purity)

In [None]:
# Map each lineage level to its experiment folder prefix and ground-truth table
lineage_prefix_mappings = {
    lvl: {"prefix": lvl_prefix, "truth_df": lvl_truth_df}
    for lvl, lvl_prefix, lvl_truth_df in (
        ("major", major_prefix, major_truth_df),
        ("minor", minor_prefix, minor_truth_df),
        ("subset", subset_prefix, subset_truth_df),
    )
}

### Calculate Bray-Curtis dissimilarity

In [None]:
# One record per (mixture, model version, lineage level)
bray_curtis_l = []

# Iterate over lineage levels and different model versions to collect predictions
for lineage_lvl in tqdm(["major", "minor", "subset"]):
    tmp_prefix = lineage_prefix_mappings[lineage_lvl]["prefix"]
    lvl_truth_df = lineage_prefix_mappings[lineage_lvl]["truth_df"]

    # We only use mixtures where tumour purity level == pur_lvl (50%).
    # This does not depend on the model version, so compute it once per level.
    purity_truth_df = lvl_truth_df[lvl_truth_df["Cancer Epithelial"] == pur_lvl]

    for model_ver in [
        "bisque",
        "bisque_scaled_marker",
        "bprism_v2",
        "bprism_v2_marker_genes_cell_states",
        "music",
        "music_marker_genes",
        "dwls",
        "dwls_mast",
        "hspe",
        "hspe_marker_genes",
    ]:
        # Load predictions
        res_df = pd.read_csv(
            Path(tmp_prefix).joinpath(f"data/results/{model_ver}.csv"),
            sep="\t",
            index_col=0,
        )

        # Clip tiny negative numbers to 0
        res_df = res_df.clip(lower=0)

        # Keep only predictions for the selected mixtures
        tmp_res_df = res_df[res_df.index.isin(purity_truth_df.index)].sort_index()

        # Align truth rows and columns to the predictions, then verify that
        # both tables really describe the same mixtures and cell types
        tmp_truth_df = purity_truth_df.loc[tmp_res_df.index, tmp_res_df.columns]
        assert tmp_truth_df.index.equals(tmp_res_df.index)
        assert tmp_truth_df.columns.equals(tmp_res_df.columns)

        # Iterate over predictions and calculate Bray-Curtis dissimilarity
        for sample_id in tmp_res_df.index:
            bray_curtis_dissi = braycurtis(
                tmp_res_df.loc[sample_id], tmp_truth_df.loc[sample_id]
            )
            bray_curtis_l.append(
                (
                    sample_id,
                    bray_curtis_dissi,
                    tmp_truth_df.loc[sample_id, "Cancer Epithelial"],
                    model_ver,
                    # Parent method = first "_"-separated token of the version
                    model_ver.split("_")[0],
                    lineage_lvl,
                    # presumably mixture IDs start with the patient ID - confirm
                    sample_id.split("_")[0],
                )
            )

# Collect all Bray-Curtis records into one long-format DataFrame
bray_curtis_df = pd.DataFrame(
    bray_curtis_l,
    columns=[
        "Mixture ID",
        "Bray-Curtis Dissi",
        "Purity Level",
        "Method",
        "Parent Method",
        "Lineage Level",
        "Patient",
    ],
)

# Rename methods ("hspe" has no mapping and keeps its name)
bray_curtis_df["Parent Method"] = bray_curtis_df["Parent Method"].replace(
    {"bisque": "Bisque", "bprism": "BayesPrism", "music": "MuSiC", "dwls": "DWLS"}
)

# Infer marker genes setting (or signature method for DWLS) from the model version
bray_curtis_df["Marker Genes"] = bray_curtis_df["Method"].replace(
    {
        "bisque": "No marker genes",
        "bisque_scaled_marker": "With marker genes",
        "bprism_v2": "No marker genes",
        "bprism_v2_marker_genes_cell_states": "With marker genes",
        "music": "No marker genes",
        "music_marker_genes": "With marker genes",
        "dwls": "Seurat",
        "dwls_mast": "MAST",
        "hspe": "No marker genes",
        "hspe_marker_genes": "With marker genes",
    }
)

In [None]:
# Save source data: DWLS rows go to panel c, every other method to panel a
_bc_dwls_rows = bray_curtis_df["Parent Method"] == "DWLS"
bray_curtis_df.loc[~_bc_dwls_rows].to_csv("source_data/supp_figure_18a.tsv", sep="\t")
bray_curtis_df.loc[_bc_dwls_rows].to_csv("source_data/supp_figure_18c.tsv", sep="\t")

In [None]:
# Box plots of Bray-Curtis dissimilarity: one row of panels per parent method,
# one column per lineage level, coloured by marker genes setting
fig = px.box(
    bray_curtis_df,
    x="Lineage Level",
    y="Bray-Curtis Dissi",
    color="Marker Genes",
    facet_col="Lineage Level",
    facet_row="Parent Method",
    category_orders={
        "Lineage Level": ["major", "minor", "subset"],
        "Parent Method": [
            "BayesPrism",
            "Bisque",
            "MuSiC",
            "hspe",
            "DWLS",
        ],
    },
    color_discrete_map={
        "No marker genes": "blue",
        "With marker genes": "green",
        "Seurat": "purple",
        "MAST": "teal",
    },
)

# Update trace properties for main plots
fig.update_traces(marker=dict(size=2.5), line=dict(width=0.8), opacity=1)

# Update axes of main plot
fig.update_yaxes(
    title="Bray-Curtis Dissimilarity",
    linecolor="black",
    linewidth=0.8,
    ticks="outside",
    # showticklabels=True, # Hide this so subplot share the same ticklabels
    ticklen=3,
    tickfont_size=9,
    title_font_size=10,
    title_standoff=5,
    showgrid=True,
    gridwidth=0.75,
    gridcolor="lightgray",
    range=[0, 1],
    dtick=0.2,
    matches=None,
)
fig.update_xaxes(
    # title="Tumour purity levels (%)",
    linecolor="black",
    linewidth=0.8,
    ticks="outside",
    ticklen=3,
    # showticklabels=True, # Hide this so subplot share the same ticklabels
    tickfont_size=9,
    title_font_size=10,
    title_standoff=5,
    matches=None,
)
# Format annotations: keep only the value part of "<facet>=<value>" labels
fig.for_each_annotation(
    lambda a: a.update(
        text=a.text.split("=")[-1],
    )
)

fig.update_layout(
    margin=dict(t=11, l=0, r=3.5, b=0),
    plot_bgcolor="rgba(0,0,0,0)",
    showlegend=True,
    font_size=9,
)

# Write the figure under the visualizations folder.
# NOTE(review): `viz_prefix` is used as the output root; confirm this is the
# intended location for supplementary figures.
fig.write_image(
    Path(viz_prefix).joinpath("figures/supp_figures/bray_curtis").with_suffix(".svg"),
    width=500,
    height=600,
    scale=5,
)

### Calculate Root Mean Square Error

In [None]:
# One record per (mixture, model version, lineage level)
rmse_l = []

# Iterate over lineage levels and different model versions to collect predictions
for lineage_lvl in tqdm(["major", "minor", "subset"]):
    tmp_prefix = lineage_prefix_mappings[lineage_lvl]["prefix"]
    lvl_truth_df = lineage_prefix_mappings[lineage_lvl]["truth_df"]

    # We only use mixtures where tumour purity level == pur_lvl (50%).
    # This does not depend on the model version, so compute it once per level.
    purity_truth_df = lvl_truth_df[lvl_truth_df["Cancer Epithelial"] == pur_lvl]

    for model_ver in [
        "bisque",
        "bisque_scaled_marker",
        "bprism_v2",
        "bprism_v2_marker_genes_cell_states",
        "music",
        "music_marker_genes",
        "dwls",
        "dwls_mast",
        "hspe",
        "hspe_marker_genes",
    ]:
        # Load predictions
        res_df = pd.read_csv(
            Path(tmp_prefix).joinpath(f"data/results/{model_ver}.csv"),
            sep="\t",
            index_col=0,
        )

        # Clip tiny negative numbers to 0
        res_df = res_df.clip(lower=0)

        # Keep only predictions for the selected mixtures
        tmp_res_df = res_df[res_df.index.isin(purity_truth_df.index)].sort_index()

        # Align truth rows and columns to the predictions, then verify that
        # both tables really describe the same mixtures and cell types
        tmp_truth_df = purity_truth_df.loc[tmp_res_df.index, tmp_res_df.columns]
        assert tmp_truth_df.index.equals(tmp_res_df.index)
        assert tmp_truth_df.columns.equals(tmp_res_df.columns)

        # Iterate over predictions and calculate RMSE on the percentage scale
        for sample_id in tmp_res_df.index:
            rmse = (
                mean_squared_error(
                    tmp_res_df.loc[sample_id] * 100, tmp_truth_df.loc[sample_id] * 100
                )
                ** 0.5
            )
            rmse_l.append(
                (
                    sample_id,
                    rmse,
                    tmp_truth_df.loc[sample_id, "Cancer Epithelial"],
                    model_ver,
                    # Parent method = first "_"-separated token of the version
                    model_ver.split("_")[0],
                    lineage_lvl,
                    # presumably mixture IDs start with the patient ID - confirm
                    sample_id.split("_")[0],
                )
            )

# Collect all RMSE records into one long-format DataFrame
rmse_df = pd.DataFrame(
    rmse_l,
    columns=[
        "Mixture ID",
        "RMSE",
        "Purity Level",
        "Method",
        "Parent Method",
        "Lineage Level",
        "Patient",
    ],
)

# Rename methods ("hspe" has no mapping and keeps its name).
# Read from rmse_df itself so this cell does not depend on bray_curtis_df.
rmse_df["Parent Method"] = rmse_df["Parent Method"].replace(
    {"bisque": "Bisque", "bprism": "BayesPrism", "music": "MuSiC", "dwls": "DWLS"}
)


# Infer marker genes setting (or signature method for DWLS) from the model version
rmse_df["Marker Genes"] = rmse_df["Method"].replace(
    {
        "bisque": "No marker genes",
        "bisque_scaled_marker": "With marker genes",
        "bprism_v2": "No marker genes",
        "bprism_v2_marker_genes_cell_states": "With marker genes",
        "music": "No marker genes",
        "music_marker_genes": "With marker genes",
        "dwls": "Seurat",
        "dwls_mast": "MAST",
        "hspe": "No marker genes",
        "hspe_marker_genes": "With marker genes",
    }
)

In [None]:
# Save source data: DWLS rows go to panel d, every other method to panel b
_rmse_dwls_rows = rmse_df["Parent Method"] == "DWLS"
rmse_df.loc[~_rmse_dwls_rows].to_csv("source_data/supp_figure_18b.tsv", sep="\t")
rmse_df.loc[_rmse_dwls_rows].to_csv("source_data/supp_figure_18d.tsv", sep="\t")

In [None]:
# Box plots of RMSE: one row of panels per parent method, one column per
# lineage level, coloured by marker genes setting
fig = px.box(
    rmse_df,
    x="Lineage Level",
    y="RMSE",
    color="Marker Genes",
    facet_col="Lineage Level",
    facet_row="Parent Method",
    category_orders={
        "Lineage Level": ["major", "minor", "subset"],
        "Parent Method": [
            "BayesPrism",
            "Bisque",
            "MuSiC",
            "hspe",
            "DWLS",
        ],
    },
    color_discrete_map={
        "No marker genes": "blue",
        "With marker genes": "green",
        "Seurat": "purple",
        "MAST": "teal",
    },
)

# Update trace properties for main plots
fig.update_traces(marker=dict(size=2.5), line=dict(width=0.8), opacity=1)

# Update axes of main plot
fig.update_yaxes(
    title="RMSE (%)",
    linecolor="black",
    linewidth=0.8,
    ticks="outside",
    # showticklabels=True, # Hide this so subplot share the same ticklabels
    ticklen=3,
    tickfont_size=9,
    title_font_size=10,
    title_standoff=5,
    showgrid=True,
    gridwidth=0.75,
    gridcolor="lightgray",
    range=[0, 30],
    dtick=5,
    matches=None,
)
fig.update_xaxes(
    # title="Tumour purity levels (%)",
    linecolor="black",
    linewidth=0.8,
    ticks="outside",
    ticklen=3,
    # showticklabels=True, # Hide this so subplot share the same ticklabels
    tickfont_size=9,
    title_font_size=10,
    title_standoff=5,
    matches=None,
)
# Format annotations: keep only the value part of "<facet>=<value>" labels
fig.for_each_annotation(
    lambda a: a.update(
        text=a.text.split("=")[-1],
    )
)

fig.update_layout(
    margin=dict(t=11, l=0, r=3.5, b=0),
    plot_bgcolor="rgba(0,0,0,0)",
    showlegend=True,
    font_size=9,
)

# Write the figure under the visualizations folder.
# NOTE(review): `viz_prefix` is used as the output root; confirm this is the
# intended location for supplementary figures.
fig.write_image(
    Path(viz_prefix).joinpath("figures/supp_figures/rmse").with_suffix(".svg"),
    width=500,
    height=600,
    scale=5,
)