# Visualization code for Extended Data Figure 10

In [None]:
import numpy as np
import pandas as pd
import anndata as adata

import random
import itertools

from tqdm import tqdm
from pathlib import Path
from typing import Tuple, List, Dict

import plotly.graph_objects as go
import plotly as plotly
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity as skl_cosine

from scipy.stats import pearsonr
from scipy.spatial.distance import braycurtis
from math import sqrt
from sklearn.metrics import confusion_matrix

%load_ext blackcellmagic

## Set up paths and load setting files

#### Path to subset/minor/major experiments

In [None]:
# Every experiment folder lives under the same benchmarking root; "???" is the
# placeholder for the local checkout location.
_bench_root = "???/deconvolution_benchmarking"

# Immune-lineage experiment, subset level
subset_prefix = f"{_bench_root}/03_immune_lineages_experiment/subset_level"

# Immune-lineage experiment, minor level
minor_prefix = f"{_bench_root}/03_immune_lineages_experiment/minor_level"

# Major level is taken from the purity-levels experiment (normal epithelial included)
major_prefix = f"{_bench_root}/01_purity_levels_experiment/include_normal_epithelial"

# Folder that receives the visualization outputs
viz_prefix = f"{_bench_root}/visualizations"

#### List major/minor/subset cell types

In [None]:
# Cell type names at each lineage level. Each list is used below to select (and
# order) the columns of the corresponding ground-truth table, so every name must
# match a column header of that level's truth.csv exactly.

# Subset level (finest annotation)
subset_c_types = [
    "Endothelial",
    "CAFs",
    "PVL",
    "B cells Memory",
    "B cells Naive",
    "T_cells_c4_CD8+_ZFP36",
    "T_cells_c6_IFIT1",
    "T_cells_c7_CD8+_IFNG",
    "T_cells_c8_CD8+_LAG3",
    "T_cells_c0_CD4+_CCR7",
    "T_cells_c1_CD4+_IL7R",
    "T_cells_c2_CD4+_T-regs_FOXP3",
    "T_cells_c3_CD4+_Tfh_CXCL13",
    "T_cells_c9_NK_cells_AREG",
    "T_cells_c11_MKI67",
    "T_cells_c10_NKT_cells_FCGR3A",
    "Myeloid_c10_Macrophage_1_EGR1",
    "Myeloid_c12_Monocyte_1_IL1B",
    "Myeloid_c2_LAM2_APOE",
    "Myeloid_c1_LAM1_FABP5",
    "Cycling_Myeloid",
    "Myeloid_c4_DCs_pDC_IRF7",
    "Normal Epithelial",
    "Plasmablasts",
    "Myeloid_c8_Monocyte_2_S100A9",
    "Myeloid_c9_Macrophage_2_CXCL10",
    "Myeloid_c11_cDC2_CD1C",
    "Cancer Epithelial",
    "Myeloid_c3_cDC1_CLEC9A",
]
# Minor level (intermediate annotation)
minor_c_types = [
    "Cancer Epithelial",
    "T cells CD4+",
    "T cells CD8+",
    "Endothelial",
    "CAFs",
    "Macrophage",
    "PVL",
    "Normal Epithelial",
    "Plasmablasts",
    "B cells Memory",
    "Monocyte",
    "Cycling_Myeloid",
    "Cycling T-cells",
    "NK cells",
    "NKT cells",
    "DCs",
    "B cells Naive",
]
# Major level (coarsest annotation)
major_c_types = [
    "B-cells",
    "CAFs",
    "Cancer Epithelial",
    "Endothelial",
    "Myeloid",
    "Normal Epithelial",
    "PVL",
    "Plasmablasts",
    "T-cells",
]

#### Load groundtruth

In [None]:
# We only use tumour purity = 50%. The value is compared against the
# "Cancer Epithelial" ground-truth column to select samples.
pur_lvl = 0.5

In [None]:
# Ground-truth proportions are stored at the same relative location in every
# experiment folder (tab-separated, first column used as the index). Each table
# is restricted to, and ordered by, the cell types listed for its lineage level.
truth_relpath = "data/results/truth.csv"

# Subset
subset_truth_df = pd.read_csv(
    Path(subset_prefix) / truth_relpath, sep="\t", index_col=0
)[subset_c_types]

# Minor
minor_truth_df = pd.read_csv(
    Path(minor_prefix) / truth_relpath, sep="\t", index_col=0
)[minor_c_types]

# Major
major_truth_df = pd.read_csv(
    Path(major_prefix) / truth_relpath, sep="\t", index_col=0
)[major_c_types]

#### Extract lineages metadata

In [None]:
# Human-readable headers for the raw columns of the lineage table
lineage_column_labels = {
    "celltype_major": "Major Cell Type",
    "celltype_major_short": "Annotated Major Cell Type",
    "celltype_minor": "Minor Cell Type",
    "celltype_minor_short": "Annotated Minor Cell Type",
    "celltype_subset": "Subset Cell Type",
    "celltype_subset_short": "Annotated Subset Cell Type",
    "marker_genes": "Marker Genes",
    "counts": "Cell Counts",
    "num_of_patients": "Patient Counts",
}

# Load the lineage mapping of all cell types in the data and relabel its columns.
# NOTE(review): `prefix` is not assigned anywhere in the visible notebook (only
# subset_prefix / minor_prefix / major_prefix / viz_prefix are) - confirm which
# root this file lives under, otherwise this cell raises NameError.
lineages_df = pd.read_csv(
    Path(prefix) / "data/Whole_miniatlas_immune_lineages.tsv",
    sep="\t",
).rename(columns=lineage_column_labels)

In [None]:
# Subset cell types that must not appear in the lineages DataFrame:
# - Myeloid_c7_Monocyte_3_FCGR3A, which we removed
# - the 3 cell types that MuSiC dropped
excluded_subset_c_types = [
    "Myeloid_c7_Monocyte_3_FCGR3A",
    "Myeloid_c0_DC_LAMP3",
    "Myeloid_c5_Macrophage_3_SIGLEC1",
    "T_cells_c5_CD8+_GZMK",
]

is_excluded = lineages_df["Subset Cell Type"].isin(excluded_subset_c_types)
lineages_df = lineages_df[~is_excluded]

#### Extract colour palette

In [None]:
# Load the colour palette table (one row per cell type, tab-separated).
# NOTE(review): `prefix` is not assigned anywhere in the visible notebook -
# confirm which root this file lives under, otherwise this cell raises NameError.
colour_pallete_df = pd.read_csv(
    Path(prefix) / "data/Whole_miniatlas_colour_pallete.csv", sep="\t"
)

# Lookup: cell type -> {"fill": ..., "line": ...} colour codes
colour_pallete_d = {
    c_type_name: {"fill": fill_code, "line": line_code}
    for c_type_name, fill_code, line_code in zip(
        colour_pallete_df["all_celltype"],
        colour_pallete_df["fill"],
        colour_pallete_df["line"],
    )
}

## [Fig] Box plot of all immune cell types for all methods

In [None]:
# Specify methods and tumour purity level.
# Each method name is the stem of a per-method results file, read below as
# `data/results/{method}.csv` under each lineage-level prefix.
methods = ["bprism_v2", "dwls"]
# Samples are selected where the "Cancer Epithelial" ground truth equals this value
pur_lvl = 0.5

In [None]:
# Convert the colour palette table into a lookup:
# cell type -> {"fill": ..., "line": ...} colour codes
colour_pallete_d = {
    c_type_name: {"fill": fill_code, "line": line_code}
    for c_type_name, fill_code, line_code in zip(
        colour_pallete_df["all_celltype"],
        colour_pallete_df["fill"],
        colour_pallete_df["line"],
    )
}

In [None]:
# Create a list to hold raw prediction errors at each lineage level for source data.
# One DataFrame is appended per lineage level (major, minor, subset) by the cells
# below; the list is concatenated and written out at the end of the notebook.
all_preds_truth_l = []

#### Major cell types: B-cells, T-Cells, Myeloid

In [None]:
# Build a long table with, for every method and major cell type, the
# ground-truth and predicted proportion of each sample at the selected purity.
major_preds_truth_l = []

pivot_method_metrics_l = []

for method in tqdm(methods):
    metrics_series_l = []

    # Predictions of this method (tab-separated, first column used as the index)
    res_df = pd.read_csv(
        Path(major_prefix) / f"data/results/{method}.csv", sep="\t", index_col=0
    )

    # Some predictions can be like -0.000001; raise anything negative to 0
    res_df = res_df.mask(res_df < 0, 0)

    # Keep only the samples whose "Cancer Epithelial" truth equals pur_lvl,
    # and the predictions for those same samples, both sorted by sample index
    purity_truth_df = major_truth_df[
        major_truth_df["Cancer Epithelial"] == pur_lvl
    ].sort_index()
    purity_res_df = res_df[res_df.index.isin(purity_truth_df.index)].sort_index()

    for c_type in purity_truth_df.columns:
        # Truth and prediction of one cell type side by side (both already sorted)
        pair_df = pd.concat([purity_truth_df[c_type], purity_res_df[c_type]], axis=1)
        pair_df.columns = ["truth", "preds"]
        pair_df = pair_df.round(2).assign(
            **{
                # Patient is the part of the sample index before the first "_"
                "Patient": [sample_id.split("_", 1)[0] for sample_id in pair_df.index],
                "Cell Type": c_type,
                "Method": method,
            }
        )

        major_preds_truth_l.append(pair_df)

major_preds_truth_df = pd.concat(major_preds_truth_l, axis=0)

# True errors: prediction minus truth, expressed in percentage points
major_preds_truth_df["diff"] = major_preds_truth_df["preds"].sub(
    major_preds_truth_df["truth"]
)
major_preds_truth_df["True Errors"] = major_preds_truth_df["diff"] * 100

# Attach the colour palette columns of each cell type; the inner join drops
# cell types that have no palette entry
palette_by_c_type_df = colour_pallete_df.rename(columns={"all_celltype": "Cell Type"})
major_preds_truth_df = major_preds_truth_df.merge(
    palette_by_c_type_df, on=["Cell Type"], how="inner"
)

In [None]:
# Only the major immune cell types are plotted, in this x-axis order
major_xaxis_order = ["T-cells", "B-cells", "Myeloid"]
is_major_immune = major_preds_truth_df["Cell Type"].isin(major_xaxis_order)
major_preds_truth_df = major_preds_truth_df[is_major_immune]

# Column labels and column selection used for the source-data export.
# NOTE(review): no cell in this notebook creates a "lineage" column; presumably
# it arrives with the colour palette table merged in earlier - confirm, because
# selecting "Lineage level" fails if it is absent.
source_data_labels = {
    "truth": "Actual proportion",
    "preds": "Predicted proportion",
    "True Errors": "Raw prediction error",
    "lineage": "Lineage level",
}
source_data_columns = [
    "Cell Type",
    "Patient",
    "Method",
    "Lineage level",
    "Actual proportion",
    "Predicted proportion",
    "Raw prediction error",
]

# Append to all_preds_truth_l for source data
major_source_df = major_preds_truth_df.rename(columns=source_data_labels)
all_preds_truth_l.append(major_source_df[source_data_columns])

In [None]:
# Notched box plot of raw prediction errors for the major immune cell types,
# one facet row per method, written out as an SVG.

# Fill colour of every cell type, built once instead of on every loop iteration
fill_by_c_type = {name: codes["fill"] for name, codes in colour_pallete_d.items()}

fig = px.box(
    major_preds_truth_df,
    x="Cell Type",
    y="True Errors",
    facet_row="Method",
    color="Cell Type",
    color_discrete_map=fill_by_c_type,
    notched=True,
    # The key must be the "Method" column and the values must be the method
    # names present in the data, otherwise the requested order is ignored
    category_orders={"Method": methods},
)

# Style the boxes of each plotted cell type. Only the cell types in
# major_xaxis_order have traces, so only those need a palette lookup.
for c_type in major_xaxis_order:
    fig.update_traces(
        fillcolor=fill_by_c_type[c_type],
        line=dict(width=1),
        marker=dict(size=2),
        opacity=1,
        selector=dict(name=c_type),
    )


# Update axes
fig.update_xaxes(
    title_standoff=4,
    title_font_size=14,
    linecolor="black",
    linewidth=0.5,
    ticks="outside",
    tickangle=90,
    tickfont_size=13,
    tickwidth=0.5,
    ticklen=3,
    # Force the x-axis category order
    categoryorder="array",
    categoryarray=np.array(major_xaxis_order),
)
fig.update_yaxes(
    title="Raw prediction errors (%)",
    title_standoff=4,
    title_font_size=14,
    linecolor="black",
    linewidth=0.5,
    ticks="outside",
    showticklabels=True,
    tickfont_size=13,
    tickwidth=0.5,
    ticklen=3,
    # Same fixed y-range in every facet, with a tick every 10 percentage points
    range=[-31, 61],
    dtick=10,
    showgrid=True,
    gridwidth=0.5,
    gridcolor="lightgray",
)

# Update layout
fig["layout"].update(
    margin=dict(t=0, l=0, r=0, b=0),  # Tight margin
    plot_bgcolor="rgba(0,0,0,0)",
    font_size=14,
    font_color="black",
    showlegend=False,
)

# Shorten facet labels from "Method=<name>" to "<name>"
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
# Turn this on if we don't want to display subplot annotations
for anno in fig["layout"]["annotations"]:
    anno["text"] = ""

# Add straight 0% line
fig.add_hline(y=0, line_dash="dot", line_width=0.5)

# Make the line of each box thinner (overrides the width set per cell type above)
fig.update_traces(
    line_width=0.5,
)

fig.write_image(
    Path("figures/supp_figures/supp_fig_14_major_lineages").with_suffix(".svg"),
    scale=5,
    width=150,
    height=500,
)

#### Minor cell types
B-cells: B cells memory, B cells naive <br/>
T-Cells: CD4+ T Cells, CD8+ T Cells, NK cells, NKT cells, Cycling T cells <br/>
Myeloid: Macrophage, DCs, Monocyte, Cycling Myeloid <br/>

In [None]:
# Build a long table with, for every method and minor cell type, the
# ground-truth and predicted proportion of each sample at the selected purity.
minor_preds_truth_l = []

for method in tqdm(methods):
    # Predictions of this method (tab-separated, first column used as the index)
    res_df = pd.read_csv(
        Path(minor_prefix) / f"data/results/{method}.csv", sep="\t", index_col=0
    )

    # Some predictions can be like -0.000001; raise anything negative to 0
    res_df = res_df.mask(res_df < 0, 0)

    # Keep only the samples whose "Cancer Epithelial" truth equals pur_lvl,
    # and the predictions for those same samples, both sorted by sample index
    purity_truth_df = minor_truth_df[
        minor_truth_df["Cancer Epithelial"] == pur_lvl
    ].sort_index()
    purity_res_df = res_df[res_df.index.isin(purity_truth_df.index)].sort_index()

    for c_type in purity_truth_df.columns:
        # Truth and prediction of one cell type side by side (both already sorted)
        pair_df = pd.concat([purity_truth_df[c_type], purity_res_df[c_type]], axis=1)
        pair_df.columns = ["truth", "preds"]
        pair_df = pair_df.round(2).assign(
            **{
                # Patient is the part of the sample index before the first "_"
                "Patient": [sample_id.split("_", 1)[0] for sample_id in pair_df.index],
                "Cell Type": c_type,
                "Method": method,
            }
        )

        minor_preds_truth_l.append(pair_df)

minor_preds_truth_df = pd.concat(minor_preds_truth_l, axis=0)

# True errors: prediction minus truth, expressed in percentage points
minor_preds_truth_df["diff"] = minor_preds_truth_df["preds"].sub(
    minor_preds_truth_df["truth"]
)
minor_preds_truth_df["True Errors"] = minor_preds_truth_df["diff"] * 100

# Re-annotate Cycling_Myeloid to Cycling Myeloid
minor_preds_truth_df = minor_preds_truth_df.replace(
    {"Cycling_Myeloid": "Cycling Myeloid"}
)

# Attach the colour palette columns of each cell type; the inner join drops
# cell types that have no palette entry
palette_by_c_type_df = colour_pallete_df.rename(columns={"all_celltype": "Cell Type"})
minor_preds_truth_df = minor_preds_truth_df.merge(
    palette_by_c_type_df, on=["Cell Type"], how="inner"
)

In [None]:
# Only the minor immune cell types are plotted, in this x-axis order
minor_xaxis_order = [
    "NK cells",
    "Cycling T-cells",
    "NKT cells",
    "T cells CD4+",
    "T cells CD8+",
    "B cells Memory",
    "B cells Naive",
    "Cycling Myeloid",
    "Macrophage",
    "Monocyte",
    "DCs",
]
is_minor_immune = minor_preds_truth_df["Cell Type"].isin(minor_xaxis_order)
plot_minor_preds_truth_df = minor_preds_truth_df[is_minor_immune]


# Column labels and column selection used for the source-data export.
# NOTE(review): no cell in this notebook creates a "lineage" column; presumably
# it arrives with the colour palette table merged in earlier - confirm, because
# selecting "Lineage level" fails if it is absent.
source_data_labels = {
    "truth": "Actual proportion",
    "preds": "Predicted proportion",
    "True Errors": "Raw prediction error",
    "lineage": "Lineage level",
}
source_data_columns = [
    "Cell Type",
    "Patient",
    "Method",
    "Lineage level",
    "Actual proportion",
    "Predicted proportion",
    "Raw prediction error",
]

# Append to all_preds_truth_l for source data
minor_source_df = plot_minor_preds_truth_df.rename(columns=source_data_labels)
all_preds_truth_l.append(minor_source_df[source_data_columns])

In [None]:
# Notched box plot of raw prediction errors for the minor immune cell types,
# one facet row per method, written out as an SVG.

# Fill colour of every cell type, built once instead of on every loop iteration
fill_by_c_type = {name: codes["fill"] for name, codes in colour_pallete_d.items()}

fig = px.box(
    plot_minor_preds_truth_df,
    x="Cell Type",
    y="True Errors",
    facet_row="Method",
    color="Cell Type",
    color_discrete_map=fill_by_c_type,
    notched=True,
    # The key must be the "Method" column and the values must be the method
    # names present in the data, otherwise the requested order is ignored
    category_orders={"Method": methods},
)

# Style the boxes of each plotted cell type
for c_type in minor_xaxis_order:
    fig.update_traces(
        fillcolor=fill_by_c_type[c_type],
        line=dict(width=1),
        marker=dict(size=3),
        opacity=1,
        selector=dict(name=c_type),
    )

# Update axes
fig.update_xaxes(
    title_standoff=5,
    title_font_size=12,
    linecolor="black",
    linewidth=0.5,
    ticks="outside",
    tickangle=90,
    tickfont_size=11,
    tickwidth=0.5,
    ticklen=3,
    # Force the x-axis category order
    categoryorder="array",
    categoryarray=np.array(minor_xaxis_order),
)
fig.update_yaxes(
    title="Raw prediction errors (%)",
    title_standoff=5,
    title_font_size=12,
    linecolor="black",
    linewidth=0.5,
    ticks="outside",
    showticklabels=True,
    tickfont_size=11,
    tickwidth=0.5,
    ticklen=3,
    # Same fixed y-range in every facet, with a tick every 10 percentage points
    range=[-31, 61],
    dtick=10,
    showgrid=True,
    gridwidth=0.5,
    gridcolor="lightgray",
)

# Update layout
fig["layout"].update(
    margin=dict(t=0, l=0, r=0, b=0),  # Tight margin
    plot_bgcolor="rgba(0,0,0,0)",
    font_size=12,
    font_color="black",
    showlegend=False,
)

# Shorten facet labels from "Method=<name>" to "<name>"
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
# Turn this on if we don't want to display subplot annotations
for anno in fig["layout"]["annotations"]:
    anno["text"] = ""

# Add straight 0% line
fig.add_hline(y=0, line_dash="dot", line_width=0.5)

# Make the line of each box thinner (overrides the width set per cell type above)
fig.update_traces(
    line_width=0.5,
)

fig.write_image(
    Path("figures/supp_figures/supp_fig_14_minor_lineages").with_suffix(".svg"),
    scale=5,
    width=350,
    height=450,
)

#### Subset cell types
B-cells: 
T-Cells: 
Myeloid: 

In [None]:
# Build a long table with, for every method and subset cell type, the
# ground-truth and predicted proportion of each sample at the selected purity.
subset_preds_truth_l = []

for method in tqdm(methods):
    # Predictions of this method (tab-separated, first column used as the index)
    res_df = pd.read_csv(
        Path(subset_prefix) / f"data/results/{method}.csv",
        sep="\t",
        index_col=0,
    )

    # Some predictions can be like -0.000001; raise anything negative to 0
    res_df = res_df.mask(res_df < 0, 0)

    # Keep only the samples whose "Cancer Epithelial" truth equals pur_lvl,
    # and the predictions for those same samples, both sorted by sample index
    purity_truth_df = subset_truth_df[
        subset_truth_df["Cancer Epithelial"] == pur_lvl
    ].sort_index()
    purity_res_df = res_df[res_df.index.isin(purity_truth_df.index)].sort_index()

    for c_type in purity_truth_df.columns:
        # Truth and prediction of one cell type side by side (both already sorted)
        pair_df = pd.concat([purity_truth_df[c_type], purity_res_df[c_type]], axis=1)
        pair_df.columns = ["truth", "preds"]
        pair_df = pair_df.round(2).assign(
            **{
                # Patient is the part of the sample index before the first "_"
                "Patient": [sample_id.split("_", 1)[0] for sample_id in pair_df.index],
                "Cell Type": c_type,
                "Method": method,
            }
        )

        subset_preds_truth_l.append(pair_df)

subset_preds_truth_df = pd.concat(subset_preds_truth_l, axis=0)

# True errors: prediction minus truth, expressed in percentage points
subset_preds_truth_df["diff"] = subset_preds_truth_df["preds"].sub(
    subset_preds_truth_df["truth"]
)
subset_preds_truth_df["True Errors"] = subset_preds_truth_df["diff"] * 100

# Get pretty cell type annotations: swap the raw subset name for its annotated
# name from the lineages table. The inner join drops subset cell types that are
# not listed in lineages_df.
subset_annotation_df = lineages_df[["Subset Cell Type", "Annotated Subset Cell Type"]]
subset_preds_truth_df = subset_preds_truth_df.rename(
    columns={"Cell Type": "Subset Cell Type"}
)
subset_preds_truth_df = subset_preds_truth_df.merge(
    subset_annotation_df, on=["Subset Cell Type"], how="inner"
)
subset_preds_truth_df = subset_preds_truth_df.drop(["Subset Cell Type"], axis=1)
subset_preds_truth_df = subset_preds_truth_df.rename(
    columns={"Annotated Subset Cell Type": "Cell Type"}
)

# Attach the colour palette columns of each cell type; the inner join drops
# cell types that have no palette entry
palette_by_c_type_df = colour_pallete_df.rename(columns={"all_celltype": "Cell Type"})
subset_preds_truth_df = subset_preds_truth_df.merge(
    palette_by_c_type_df, on=["Cell Type"], how="inner"
)

In [None]:
# Only the subset immune cell types are plotted, in this x-axis order
# (names are the annotated subset names)
subset_xaxis_order = [
    "Naive/central Memory T Cells",
    "Effector Memory T Cells",
    "T-regs",
    "Tfh",
    "Chemokine-expressing T Cells",
    "IFN-I Signature T Cells",
    "T-cells:IFNG",
    "T-cells:LAG3",
    "M2-like Macrophage:EGR1",
    "LAM2*",
    "LAM1*",
    "M2-like Macrophage:CXCL10",
    "Mono:IL1B",
    "Mono:FCGR3A",
    "Myeloid:pDC/IRF7",
    "Myeloid:cDC2/CD1C",
    "Myeloid:cDC1/CLEC9A",
]
is_subset_immune = subset_preds_truth_df["Cell Type"].isin(subset_xaxis_order)
plot_subset_preds_truth_df = subset_preds_truth_df[is_subset_immune]


# Column labels and column selection used for the source-data export.
# NOTE(review): no cell in this notebook creates a "lineage" column; presumably
# it arrives with the colour palette table merged in earlier - confirm, because
# selecting "Lineage level" fails if it is absent.
source_data_labels = {
    "truth": "Actual proportion",
    "preds": "Predicted proportion",
    "True Errors": "Raw prediction error",
    "lineage": "Lineage level",
}
source_data_columns = [
    "Cell Type",
    "Patient",
    "Method",
    "Lineage level",
    "Actual proportion",
    "Predicted proportion",
    "Raw prediction error",
]

# Append to all_preds_truth_l for source data
subset_source_df = plot_subset_preds_truth_df.rename(columns=source_data_labels)
all_preds_truth_l.append(subset_source_df[source_data_columns])

In [None]:
# Notched box plot of raw prediction errors for the subset immune cell types,
# one facet row per method, written out as an SVG.

# Fill colour of every cell type
fill_by_c_type = {name: codes["fill"] for name, codes in colour_pallete_d.items()}

fig = px.box(
    plot_subset_preds_truth_df,
    x="Cell Type",
    y="True Errors",
    facet_row="Method",
    # facet_row_spacing=0.005,
    color="Cell Type",
    color_discrete_map=fill_by_c_type,
    notched=True,
)

# Style the boxes of each plotted cell type
for c_type in subset_xaxis_order:
    fig.update_traces(
        selector=dict(name=c_type),
        fillcolor=fill_by_c_type[c_type],
        line=dict(width=1),
        marker=dict(size=3),
        opacity=1,
    )

# Axis line and tick styling shared by both axes
axis_line_style = dict(
    linecolor="black",
    linewidth=0.5,
    ticks="outside",
    tickfont_size=9,
    tickwidth=0.5,
    ticklen=3,
    title_standoff=5,
    title_font_size=10,
)

# x-axis: vertical tick labels in the forced category order
fig.update_xaxes(
    tickangle=90,
    categoryorder="array",
    categoryarray=np.array(subset_xaxis_order),
    **axis_line_style,
)
# y-axis: fixed range with a tick every 10 percentage points and light grid lines
fig.update_yaxes(
    title="Raw prediction errors (%)",
    showticklabels=True,
    range=[-31, 61],
    dtick=10,
    showgrid=True,
    gridwidth=0.5,
    gridcolor="lightgray",
    **axis_line_style,
)

# Layout: tight margin, transparent plot background, no legend
fig.update_layout(
    margin=dict(t=0, l=0, r=0, b=0),
    plot_bgcolor="rgba(0,0,0,0)",
    font_size=10,
    font_color="black",
    showlegend=False,
)

# Shorten facet labels from "Method=<name>" to "<name>"
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
# Turn this on if we don't want to display subplot annotations
for facet_label in fig.layout.annotations:
    facet_label.text = ""

# Add straight 0% line
fig.add_hline(y=0, line_dash="dot", line_width=0.5)

# Make the line of each box thinner (overrides the width set per cell type above)
fig.update_traces(line_width=0.5)

subset_figure_path = Path(
    "figures/supp_figures/supp_fig_14_subset_lineages"
).with_suffix(".svg")
fig.write_image(subset_figure_path, scale=5, width=450, height=450)

### Save source data

In [None]:
# Stack the per-lineage-level tables (major, minor, subset) into one table
all_preds_truth_df = pd.concat(all_preds_truth_l, axis=0)

# Replace the internal method identifiers with their display names
method_display_names = {"bprism_v2": "BayesPrism", "dwls": "DWLS"}
all_preds_truth_df = all_preds_truth_df.replace(method_display_names)

# Write the source data as a tab-separated file under the visualizations folder
source_data_path = Path(viz_prefix) / "source_data/supp_figure_14.tsv"
all_preds_truth_df.to_csv(source_data_path, sep="\t")