# Visualization code for Extended Data Figure 6

In [None]:
import math
import numpy as np
import pandas as pd

from tqdm import tqdm
from pathlib import Path
from typing import Tuple, List, Dict

import plotly.graph_objects as go
import plotly as plotly
import plotly.express as px
import plotly.figure_factory as ff

from sklearn.metrics import mean_squared_error
from math import sqrt

%load_ext blackcellmagic

In [None]:
# Root folders of the three experiments.
# NOTE(review): the leading "???" looks like a placeholder for the local root
# directory - confirm and replace before running.
wu_et_al_prefix = "???/deconvolution_benchmarking/01_purity_levels_experiment/include_normal_epithelial"
bassez_et_al_prefix = "???/deconvolution_benchmarking/05_external_scrna_validation/bassez_et_al"
pal_et_al_prefix = "???/deconvolution_benchmarking/05_external_scrna_validation/pal_et_al"

# Folder that receives figures and source data
viz_prefix = "???/deconvolution_benchmarking/visualizations"

# Tumour purity levels expressed as fractions: the full grid (step 0.05) and
# a coarser subset (step 0.15). Rounding removes floating-point noise from
# np.arange so the values compare equal to their decimal literals.
purity_levels = np.round(np.arange(0.05, 1, 0.05), 3).tolist()
reduced_purity_levels = np.round(np.arange(0.05, 1, 0.15), 3).tolist()

In [None]:
# Load the ground-truth table of each dataset.
# Every file is parsed as tab-separated with its first column as the index
# (the Wu et al file carries a .csv extension but is read with sep="\t" too).
wu_et_al_truth_df = pd.read_csv(
    Path(wu_et_al_prefix) / "data/results/truth.csv",
    sep="\t",
    index_col=0,
)
bassez_et_al_truth_df = pd.read_csv(
    Path(bassez_et_al_prefix) / "data/results/truth.tsv",
    sep="\t",
    index_col=0,
)
pal_et_al_truth_df = pd.read_csv(
    Path(pal_et_al_prefix) / "data/results/truth.tsv",
    sep="\t",
    index_col=0,
)

## Plot histogram of cancer prediction

In [None]:
# Per-dataset configuration: where the results live, which method result
# files to read, the name of the cancer-cell column, and the ground truth.

# The two external validation datasets are evaluated with the same methods.
_external_validation_methods = [
    "bprism_v2_no_marker_genes",
    "scaden",
    "music",
    "cbx",
    "hspe",
    "epic",
    "bisque_scaled",
    "dwls_seurat",
]

datasets = {
    "wu_et_al": {
        "prefix": wu_et_al_prefix,
        "methods": [
            "hspe",
            "music",
            "cbx",
            "scaden",
            "epic",
            "bisque",
            "dwls",
            "bprism_v2",
            "cpm",
        ],
        "cancer_cell_label": "Cancer Epithelial",
        "truth_df": wu_et_al_truth_df,
    },
    "bassez_et_al": {
        "prefix": bassez_et_al_prefix,
        # Independent copy so the per-dataset lists stay separate objects
        "methods": list(_external_validation_methods),
        "cancer_cell_label": "Cancer_cell",
        "truth_df": bassez_et_al_truth_df,
    },
    "pal_et_al": {
        "prefix": pal_et_al_prefix,
        "methods": list(_external_validation_methods),
        "cancer_cell_label": "Cancer_epithelial",
        "truth_df": pal_et_al_truth_df,
    },
}

# Display order of methods (by display name) used for the facet rows
methods_order = [
    "BayesPrism",
    "Scaden",
    "MuSiC",
    "hspe",
    "CBX",
    "DWLS",
    "Bisque",
    "EPIC",
    "CPM",
]

In [None]:
# Build one table per (dataset, method) pair holding the true and predicted
# cancer proportion of every mixture, tagged with its dataset and method.
cancer_pred_truth_l = []

for expt, expt_cfg in tqdm(datasets.items()):
    results_dir = Path(expt_cfg["prefix"]).joinpath("data/results")
    cancer_cell_label = expt_cfg["cancer_cell_label"]

    # The ground truth is identical for every method of a dataset, so extract
    # the cancer column once per dataset and rename it to "truth"
    cancer_truth_df = (
        expt_cfg["truth_df"][cancer_cell_label]
        .to_frame()
        .rename(columns={cancer_cell_label: "truth"})
    )

    for method in tqdm(expt_cfg["methods"]):
        # Results are stored as <method>.csv or <method>.tsv; .csv wins if both exist
        candidate_paths = [
            results_dir.joinpath(f"{method}.csv"),
            results_dir.joinpath(f"{method}.tsv"),
        ]
        res_path = next((p for p in candidate_paths if p.exists()), None)
        if res_path is None:
            # A missing file is a "not found" condition, not "already exists"
            raise FileNotFoundError(
                f"Results for {method} under experiment {expt} do not exist"
            )

        # Both extensions are parsed as tab-separated with the first column as index
        tmp_res_df = pd.read_csv(
            res_path,
            index_col=0,
            sep="\t",
        )
        # Negative predictions are floored at zero
        tmp_res_df = tmp_res_df.clip(lower=0)

        # Keep only the cancer prediction and rename its column to "pred"
        tmp_cancer_res_df = (
            tmp_res_df[cancer_cell_label]
            .to_frame()
            .rename(columns={cancer_cell_label: "pred"})
        )

        # Merge prediction and ground truth on the shared index
        tmp_cancer_pred_truth_df = cancer_truth_df.merge(
            tmp_cancer_res_df, left_index=True, right_index=True, how="inner"
        )

        # Assign experiment and method
        tmp_cancer_pred_truth_df["expt"] = expt
        tmp_cancer_pred_truth_df["method"] = method

        cancer_pred_truth_l.append(tmp_cancer_pred_truth_df)

In [None]:
# Concatenate all experiments and methods
cancer_pred_truth_df = pd.concat(cancer_pred_truth_l, axis=0)

# Map internal method / experiment identifiers to their display names.
# Several identifiers map to the same display name (e.g. all BayesPrism variants).
cancer_pred_truth_df = cancer_pred_truth_df.replace(
    {
        "music": "MuSiC",
        "cbx": "CBX",
        "scaden": "Scaden",
        "epic": "EPIC",
        "bisque": "Bisque",
        "bisque_scaled": "Bisque",
        "dwls": "DWLS",
        "bprism": "BayesPrism",
        "bprism_v2": "BayesPrism",
        "bprism_v2_no_marker_genes": "BayesPrism",
        "dwls_seurat": "DWLS",
        "wu_et_al": "Wu et al",
        "bassez_et_al": "Bassez et al",
        "pal_et_al": "Pal et al",
        "cpm": "CPM",
    }
)

# Keep only mixtures whose true purity is in the reduced list of purity levels.
# Take an explicit copy so the column assignments below write to an independent
# frame rather than to a slice of the concatenated one (avoids
# SettingWithCopyWarning).
cancer_pred_truth_df = cancer_pred_truth_df[
    cancer_pred_truth_df["truth"].isin(reduced_purity_levels)
].copy()

# Convert pred and truth to percentages. Round before the integer cast:
# astype(int) truncates, and a float product such as 0.29 * 100 lands just
# below the intended whole number.
cancer_pred_truth_df["truth"] = (
    (cancer_pred_truth_df["truth"] * 100).round().astype(int)
)
cancer_pred_truth_df["pred"] = cancer_pred_truth_df["pred"] * 100

In [None]:
# Grayscale palette, light to dark, one shade per reduced purity level.
# Each intensity is used for all three RGB channels:
# 211 light gray, 169 medium gray, 128 dark gray, 105 dim gray,
# 80 gray, 50 darker gray, 0 black.
_gray_intensities = (211, 169, 128, 105, 80, 50, 0)
gray_shades = [
    f"rgb({level}, {level}, {level})" for level in _gray_intensities
]

In [None]:
# Save source data as a tab-separated file.
# "truth" is the ground-truth (actual) proportion and "pred" is the method's
# prediction, so the exported column labels must follow that mapping.
cancer_pred_truth_df.reset_index().rename(
    columns={
        "index": "Mixture ID",
        "truth": "Actual cancer proportion (%)",
        "pred": "Predicted cancer proportion (%)",
        "expt": "Dataset",
        "method": "Method",
    }
).to_csv(Path(viz_prefix).joinpath("source_data/supp_figure_9.tsv"), sep="\t")

In [None]:
# Histogram of predicted cancer proportion: one column per dataset, one row
# per method, bars coloured by the true tumour purity level.
fig = px.histogram(
    cancer_pred_truth_df,
    x="pred",
    facet_col="expt",
    facet_row="method",
    # NOTE: per the Plotly Express docs, facet_col_wrap is forced to 0 when
    # facet_row is set, so this argument has no effect here.
    facet_col_wrap=3,
    facet_row_spacing=0.015,
    facet_col_spacing=0.025,
    color="truth",
    color_discrete_sequence=gray_shades,
    category_orders={
        # The "truth" column holds integer percentages at this point, so the
        # ordering must be given in the same unit; fractions would never match
        # and the shades would be assigned in order of appearance instead.
        "truth": [int(round(level * 100)) for level in reduced_purity_levels],
        "method": methods_order,
    },
    nbins=100,
)

# Axis styling. The option dicts keep the same keyword order as a direct call
# would, since they are expanded into keyword arguments unchanged.

# x axis: percentage scale with one tick per reduced purity level
x_axis_style = dict(
    title="",
    title_font_size=7,
    title_standoff=3,
    linecolor="black",
    linewidth=0.5,
    ticks="outside",
    tickfont_size=6,
    ticklen=2,
    range=[-1, 101],
    tickvals=[int(level * 100) for level in reduced_purity_levels],
    matches=None,
)
fig.update_xaxes(**x_axis_style)

# y axis: fixed count range with light horizontal grid lines every 500
y_axis_style = dict(
    title="",
    title_font_size=1,
    title_standoff=3,
    linecolor="black",
    linewidth=0.5,
    ticks="outside",
    tickfont_size=6,
    ticklen=2,
    range=[0, 2501],
    dtick=500,
    matches=None,
    showgrid=True,
    gridwidth=0.25,
    gridcolor="lightgray",
)
fig.update_yaxes(**y_axis_style)


# Figure-wide layout: small black font, transparent plot background, tight
# margins and a horizontal legend titled with the purity level.
legend_style = dict(
    title="Tumour purity level (%)",
    title_font_size=7,
    font_size=6,
    orientation="h",
    yanchor="bottom",
    xanchor="center",
)
fig.update_layout(
    font=dict(size=7, color="black"),
    plot_bgcolor="rgba(0,0,0,0)",
    margin=dict(t=10, l=0, r=5, b=0),
    legend=legend_style,
    showlegend=True,
)

# Format subplot annotations: keep only the text after the last "="
def _strip_facet_prefix(annotation):
    return annotation.update(text=annotation.text.rpartition("=")[2])


fig.for_each_annotation(_strip_facet_prefix)

# Export the figure as SVG.
# The 700 x 700 size is the one marked for rendering with the legend; the
# alternative size noted alongside it was height=700, width=450.
output_path = (Path(viz_prefix) / "figures/supp_figures/supp_fig_9").with_suffix(
    ".svg"
)
fig.write_image(
    output_path,
    scale=5,
    height=700,
    width=700,
)