# Visualize model performance

In [None]:
import numpy as np
import pandas as pd
import anndata as adata

from tqdm import tqdm
from pathlib import Path
from typing import Tuple, List, Dict

import plotly.graph_objects as go
import plotly as plotly
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

from sklearn.metrics import mean_squared_error, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity as skl_cosine

from scipy.stats import pearsonr
from scipy.spatial.distance import cosine as scipy_cosine
from scipy.spatial.distance import braycurtis, cdist
from math import sqrt

%load_ext blackcellmagic

In [None]:
# Folder that receives the figures and their source data
viz_prefix = "???/deconvolution_benchmarking/visualizations"

# Root folder of the experiment whose results are plotted
prefix = "???/deconvolution_benchmarking/01_purity_levels_experiment/include_normal_epithelial"

# Tumour purity levels on a 0.05 grid; rounding strips the float noise
# that np.arange accumulates
purity_levels = np.round(np.arange(0.05, 1, 0.05), 3).tolist()

# Major cell types
c_types = [
    "Cancer Epithelial",
    "Normal Epithelial",
    "T-cells",
    "B-cells",
    "Myeloid",
    "CAFs",
    "Endothelial",
    "PVL",
    "Plasmablasts",
]

# Display order of the methods, shared by every figure
methods_order = [
    "BayesPrism",
    "Scaden",
    "MuSiC",
    "hspe",
    "DWLS",
    "CBX",
    "Bisque",
    "EPIC",
    "CPM",
]

### Load ground truth

In [None]:
# Read the ground-truth table (tab-separated, first column is the index)
# and keep only the major cell-type columns, in the order given by c_types
truth_path = Path(prefix) / "data/results/truth.csv"
truth_df = pd.read_csv(truth_path, sep="\t", index_col=0)[c_types]

# Reproducible 5% subsample; purity_level mirrors the "Cancer Epithelial" column
truth_copy_df = truth_df.copy().sample(frac=0.05, random_state=41)
truth_copy_df["purity_level"] = truth_copy_df["Cancer Epithelial"]

# Long format: one row per (mixture_id, cell_type) holding the true value
long_truth_df = truth_copy_df.reset_index().melt(
    id_vars=["index", "purity_level"], value_vars=c_types
)
long_truth_df = long_truth_df.rename(
    columns={"index": "mixture_id", "variable": "cell_type", "value": "truth"}
)
pivot_truth_df = long_truth_df.set_index(["mixture_id", "cell_type"])

### Extract colour palette

In [None]:
# Read the colour palette table (tab-separated)
palette_path = Path(prefix) / "data/Whole_miniatlas_colour_pallete.csv"
ctype_colour_pallete_df = pd.read_csv(palette_path, sep="\t")

# Map each cell type to its fill and line colours
ctype_colour_pallete_d = {
    cell_type: {"fill": fill_colour, "line": line_colour}
    for cell_type, fill_colour, line_colour in zip(
        ctype_colour_pallete_df["all_celltype"],
        ctype_colour_pallete_df["fill"],
        ctype_colour_pallete_df["line"],
    )
}

## [Fig] True Errors over tumour purity levels

In [None]:
# Plotting every purity level for each tool would be too crowded, so keep
# only every third one: a 0.15 step starting from 0.05
reduced_purity_levels = np.round(np.arange(0.05, 1, 0.15), 3).tolist()

# Result-file stems of the methods to load (data/results/<stem>.csv)
methods = [
    "bprism_v2",
    "scaden",
    "music",
    "cbx",
    "bisque",
    "epic",
    "dwls",
    "hspe",
    "cpm",
]

In [None]:
# Build a long-format table of per-mixture prediction errors
# (prediction - truth) for every method, at each reduced purity level.

# Display names for methods whose result-file stem differs from the label used
# in the figures; stems missing from this map (e.g. "hspe") are kept as-is
method_labels = {
    "scaden": "Scaden",
    "dwls": "DWLS",
    "cbx": "CBX",
    "epic": "EPIC",
    "music": "MuSiC",
    "cpm": "CPM",
    "bprism_v2": "BayesPrism",
    "bisque": "Bisque",
}

pivot_diff_l = []

for method in tqdm(methods):
    res_df = pd.read_csv(
        Path(prefix).joinpath(f"data/results/{method}.csv"), sep="\t", index_col=0
    )

    # Some values can be -0.00000001. Floor them at 0
    res_df = res_df.clip(lower=0)

    # Calculate True Error for each tumour purity level
    for pur_lvl in reduced_purity_levels:
        # Select the mixtures at this purity level. A tolerant comparison is
        # used because an exact float "==" would silently select nothing if
        # the stored value carries rounding noise (e.g. 0.35000000000000003)
        at_level = np.isclose(
            truth_df["Cancer Epithelial"], pur_lvl, rtol=0, atol=1e-6
        )
        subset_truth_df = truth_df.loc[at_level, c_types].sort_index()
        subset_res_df = res_df.loc[
            res_df.index.isin(subset_truth_df.index), c_types
        ].sort_index()

        # Predictions and ground truth must cover exactly the same mixtures
        if not subset_res_df.index.equals(subset_truth_df.index):
            raise ValueError(
                f"Mixtures in {method}.csv do not match truth.csv "
                f"at purity level {pur_lvl}"
            )

        # True Error
        diff_df = subset_res_df - subset_truth_df
        # The patient is the part of the mixture id before the first "_"
        diff_df["Patient"] = [mixture_id.split("_")[0] for mixture_id in diff_df.index]

        pivot_diff_df = diff_df.melt(id_vars=["Patient"], value_vars=c_types).rename(
            columns={"variable": "Cell Type", "value": "Error"}
        )

        # Record method (by display name) and purity level
        pivot_diff_df["Method"] = method_labels.get(method, method)
        pivot_diff_df["Purity Level"] = pur_lvl
        pivot_diff_l.append(pivot_diff_df)

# Concatenate
boxplot_diff_df = pd.concat(pivot_diff_l, axis=0)
boxplot_diff_df["Error Pct"] = (boxplot_diff_df["Error"].astype(float) * 100).round(4)

In [None]:
# Cell types to include in the plot; edit this list to show only a subset
plot_c_types = [
    "Cancer Epithelial", "Normal Epithelial",
    "T-cells", "B-cells", "Myeloid",
    "CAFs", "Endothelial", "PVL",
    "Plasmablasts",
]

In [None]:
# Express the purity level as an integer percentage. Round before casting:
# astype(int) truncates, so a product such as 28.999999999999996 would
# otherwise become 28 instead of 29.
# NOTE: this cell is not idempotent; re-running it multiplies by 100 again.
boxplot_diff_df["Purity Level"] = (
    (boxplot_diff_df["Purity Level"] * 100).round().astype(int)
)

In [None]:
# Save source data: rows for the plotted cell types, leaving out the methods
# listed below.
# NOTE(review): the figure cell plots every method, while this export drops
# five of them; confirm the subset is intended.
excluded_methods = ["BayesPrism", "Scaden", "MuSiC", "CBX", "DWLS"]
is_plotted_cell_type = boxplot_diff_df["Cell Type"].isin(plot_c_types)
is_kept_method = ~boxplot_diff_df["Method"].isin(excluded_methods)

source_data_path = Path(viz_prefix) / "source_data/supp_figure_6.txt"
boxplot_diff_df[is_plotted_cell_type & is_kept_method].to_csv(
    source_data_path, sep="\t"
)

In [None]:
# Box plots of the prediction error (%) against tumour purity level (%),
# one facet per method and one box colour per cell type.

# One x-axis title for every labelled subplot, so they cannot drift apart
x_axis_title = "Tumour purity levels (%)"

fig = px.box(
    boxplot_diff_df[boxplot_diff_df["Cell Type"].isin(plot_c_types)],
    x="Purity Level",
    y="Error Pct",
    facet_col="Method",
    facet_col_wrap=2,
    facet_col_spacing=0.035,
    facet_row_spacing=0.04,
    color="Cell Type",
    color_discrete_map={
        c_type: colours["line"] for c_type, colours in ctype_colour_pallete_d.items()
    },
    notched=True,
    category_orders={"Method": methods_order},
)

# Style the traces of each plotted cell type: palette fill colour, thin
# outline, small outlier markers
for c_type in plot_c_types:
    fig.update_traces(
        fillcolor=ctype_colour_pallete_d[c_type]["fill"],
        line=dict(width=0.5),
        marker=dict(size=1),
        opacity=1,
        selector=dict(name=c_type),
    )

# Update axes
fig.update_xaxes(
    title_standoff=4,
    ticks="outside",
    tickmode="array",
    tickfont_size=6,
    # round() rather than int(): int() truncates, so a product such as
    # 28.999999999999996 would give a tick at 28 instead of 29
    tickvals=[round(lvl * 100) for lvl in reduced_purity_levels],
    ticklen=3,
    tickwidth=0.5,
    linecolor="black",
    linewidth=0.5,
    side="bottom",
)
fig.update_yaxes(
    linecolor="black",
    linewidth=0.5,
    range=[-101, 101],
    tickfont_size=6,
    dtick=25,
    ticks="outside",
    ticklen=3,
    tickwidth=0.5,
    showgrid=True,
    gridwidth=0.5,
    gridcolor="lightgray",
)

# Axis titles: x-axis title on facet row 1, y-axis title on facet column 1
fig.update_xaxes(
    title=x_axis_title,
    row=1,
    title_font_size=8,
    title_standoff=3,
)
fig.update_yaxes(
    title="Raw prediction error (%)",
    col=1,
    title_font_size=8,
    title_standoff=2,
)

# The legend is hidden (showlegend=False); its styling is kept so that it is
# already formatted if it gets switched back on
fig.update_layout(
    margin_pad=10,
    boxmode="group",
    boxgroupgap=0.05,
    showlegend=False,
    plot_bgcolor="rgba(0,0,0,0)",
    legend=dict(
        title="Cell type",
        font_size=8,
        orientation="h",
        yanchor="bottom",
        y=-0.2,
        xanchor="center",
        x=0.5,
    ),
    margin=dict(t=10, l=0, r=0, b=0),
    font=dict(size=7, color="black"),
)

# Zero line
fig.add_hline(y=0, line_dash="dot", line_width=0.5)

# Facet titles read "Method=<name>"; keep only the name
fig.for_each_annotation(
    lambda a: a.update(
        text=a.text.split("=")[-1],
    )
)

# Add x-axis tick labels and title for the EPIC facet.
# NOTE: (row=2, col=2) is expected to be EPIC; that position depends on
# methods_order and facet_col_wrap, so re-check it if either changes
fig.update_xaxes(
    col=2,
    row=2,
    showticklabels=True,
    title=x_axis_title,
    title_font_size=8,
    title_standoff=3,
)

# Save as SVG
fig.write_image(
    Path(viz_prefix).joinpath("figures/supp_figures/supp_fig_6").with_suffix(".svg"),
    width=500,
    height=750,
    scale=1,
)