# Visualize model performance

In [None]:
import numpy as np
import pandas as pd
import anndata as adata

from tqdm import tqdm
from pathlib import Path
from typing import Tuple, List, Dict

import plotly.graph_objects as go
import plotly as plotly
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

from sklearn.metrics import mean_squared_error, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity as skl_cosine

from scipy.stats import pearsonr
from scipy.spatial.distance import cosine as scipy_cosine
from scipy.spatial.distance import braycurtis, cdist
from math import sqrt

%load_ext blackcellmagic

In [None]:
# Prefix to visualizations folder
viz_prefix = "???/deconvolution_benchmarking/visualizations"

# Prefix to the experiment we're plotting
prefix = "???/deconvolution_benchmarking/01_purity_levels_experiment/include_normal_epithelial"

# Tumour purity levels: 0.05, 0.10, ..., 0.95 (19 levels).
# An explicit point count (linspace) is used instead of a float-step arange,
# whose number of elements depends on floating-point rounding of
# (stop - start) / step and can silently gain or lose the last level.
purity_levels = np.linspace(0.05, 0.95, 19).round(3).tolist()

# Cell types; also used as the column selection (and column order) when
# loading the truth and per-method result tables
c_types = [
    "Cancer Epithelial",
    "Normal Epithelial",
    "T-cells",
    "B-cells",
    "Myeloid",
    "CAFs",
    "Endothelial",
    "PVL",
    "Plasmablasts",
]

# Methods order is universal across figures (display names, left to right)
methods_order = [
    "BayesPrism",
    "Scaden",
    "MuSiC",
    "hspe",
    "DWLS",
    "CBX",
    "Bisque",
    "EPIC",
    "CPM",
]

### Load ground truth

In [None]:
# Load truth.csv (tab-separated despite the extension); the first column
# becomes the index and only the known cell-type columns are kept, in
# c_types order
truth_df = pd.read_csv(
    Path(prefix).joinpath("data/results/truth.csv"), sep="\t", index_col=0
)
truth_df = truth_df[c_types]

# Reproducible 5% subsample, with the cancer fraction duplicated as
# "purity_level". sample() already returns a new frame, and assign() adds the
# column without mutating it in place.
truth_copy_df = truth_df.sample(frac=0.05, random_state=41).assign(
    purity_level=lambda df: df["Cancer Epithelial"]
)

# Pivot longer for when we need it.
# The index axis is named explicitly so the melt does not depend on
# reset_index() emitting a column called "index" (which only happens when the
# index is unnamed).
pivot_truth_df = (
    truth_copy_df.rename_axis("mixture_id")
    .reset_index()
    .melt(
        id_vars=["mixture_id", "purity_level"],
        value_vars=c_types,
        var_name="cell_type",
        value_name="truth",
    )
    .set_index(["mixture_id", "cell_type"])
)

### Extract colour palette

In [None]:
# Read the cell-type colour palette table (tab-separated)
ctype_colour_pallete_df = pd.read_csv(
    Path(prefix) / "data/Whole_miniatlas_colour_pallete.csv", sep="\t"
)

# Build a lookup: cell type -> {"fill": ..., "line": ...}.
# Walk the three columns in lockstep; a later row with the same cell type
# overwrites an earlier one.
ctype_colour_pallete_d = {
    cell_type: {"fill": fill_colour, "line": line_colour}
    for cell_type, fill_colour, line_colour in zip(
        ctype_colour_pallete_df["all_celltype"],
        ctype_colour_pallete_df["fill"],
        ctype_colour_pallete_df["line"],
    )
}

## [Fig 4a]. Confusion matrix of each method for each cell type

In [None]:
# Internal method identifiers. Each one is interpolated into
# "data/results/{method}.csv" when the per-method predictions are loaded, and
# most are later mapped to display names (e.g. "bprism_v2" -> "BayesPrism");
# "hspe" is used as-is.
# The list order determines the order in which result files are read and
# concatenated; it is independent of the plotting order in methods_order.
methods = [
    "bprism_v2",
    "scaden",
    "music",
    "cpm",
    "cbx",
    "hspe",
    "epic",
    "bisque",
    "dwls",
]

In [None]:
# Build one long table of (mixture, cell type, prediction, truth, method) for
# the mixtures at 50% tumour purity, then bin predictions and truth into
# "absent" (<0.1%) and "present" (>=0.1%).

# We only use samples at 50% tumour purity level. This selection does not
# depend on the method, so it is done once outside the loop. np.isclose is
# used instead of == so float representation noise in the stored purity
# cannot drop rows.
tmp_truth_df = (
    truth_df[np.isclose(truth_df["Cancer Epithelial"], 0.5)]
    .sort_index()
    .rename_axis("mixture_id")  # index name is needed for the melt below
)
tmp_truth_long_df = tmp_truth_df.reset_index().melt(
    id_vars=["mixture_id"], var_name="cell_type", value_name="truth"
)

pred_truth_l = []

for method in tqdm(methods):
    res_df = pd.read_csv(
        Path(prefix).joinpath(f"data/results/{method}.csv"), sep="\t", index_col=0
    )

    # Keep the cell-type columns in c_types order and clip tiny negative
    # numbers to 0
    res_df = res_df[c_types].clip(lower=0)

    # Predictions for the same mixtures as the 50% purity ground truth
    tmp_res_df = (
        res_df[res_df.index.isin(tmp_truth_df.index)]
        .sort_index()
        .rename_axis("mixture_id")
    )

    # Check that predictions and ground truth cover exactly the same mixtures
    # and cell types. Index.equals also handles differing lengths, where an
    # element-wise == would raise an unrelated "Lengths must match" error.
    if not tmp_res_df.index.equals(tmp_truth_df.index):
        raise ValueError(
            f"Mixture ids of '{method}' predictions do not match the ground truth"
        )
    if not tmp_res_df.columns.equals(tmp_truth_df.columns):
        raise ValueError(
            f"Cell-type columns of '{method}' predictions do not match the ground truth"
        )

    # Merge prediction and groundtruth (inner join on mixture and cell type)
    tmp_pred_truth_df = (
        tmp_res_df.reset_index()
        .melt(id_vars=["mixture_id"], var_name="cell_type", value_name="pred")
        .merge(tmp_truth_long_df, on=["mixture_id", "cell_type"])
    )

    # Assign method
    tmp_pred_truth_df["method"] = method

    # Append to pred_truth_l
    pred_truth_l.append(tmp_pred_truth_df)

# Concatenate the per-method tables; ignore_index avoids repeated row labels
pred_truth_df = pd.concat(pred_truth_l, axis=0, ignore_index=True)

# Rename method identifiers to display names. Only the "method" column is
# touched so values in other columns can never be rewritten by accident.
# "hspe" has no entry and keeps its name.
method_display_names = {
    "scaden": "Scaden",
    "music": "MuSiC",
    "cbx": "CBX",
    "bisque": "Bisque",
    "dwls": "DWLS",
    "epic": "EPIC",
    "cpm": "CPM",
    "bprism_v2": "BayesPrism",
}
pred_truth_df["method"] = pred_truth_df["method"].replace(method_display_names)

# Bin predictions and truth, both expressed in percent.
# right=False gives the intervals [0, 0.1) and [0.1, inf), which is what the
# labels state: a value of exactly 0.1 belongs to ">=0.1". The open upper edge
# keeps values above 100 (e.g. from float noise) in ">=0.1" instead of turning
# them into NaN.
bins = [0, 0.1, np.inf]
labels = ["<0.1", ">=0.1"]
for value_col in ("pred", "truth"):
    pred_truth_df[f"{value_col}_binned"] = pd.cut(
        pred_truth_df[value_col] * 100, bins=bins, labels=labels, right=False
    )

#### Plot all cell types combined

In [None]:
# Figure 4a: one confusion-matrix heatmap per method, pooled over all cell
# types, laid out in a single row following methods_order.

# Bin labels, in the order requested from confusion_matrix. Passing them
# explicitly guarantees a 2x2 matrix in this order even if one bin never
# occurs for a method.
bin_labels = ["<0.1", ">=0.1"]

# One column per method, one row
fig = make_subplots(rows=1, cols=len(methods_order), vertical_spacing=0.025)

# Iterate over each method and create confusion matrix
for i, method in enumerate(methods_order):
    # Create confusion for all cell types
    method_pred_truth_df = pred_truth_df[(pred_truth_df["method"] == method)]

    # Rows are truth bins, columns are prediction bins
    confusion = confusion_matrix(
        method_pred_truth_df["truth_binned"],
        method_pred_truth_df["pred_binned"],
        labels=bin_labels,
    )

    # Transposed so that x is the actual bin and y the predicted bin.
    # Counts are divided by 1000, so the cell text is in thousands.
    fig.add_trace(
        px.imshow(
            confusion.transpose() / 1000,
            text_auto=True,
            color_continuous_scale="Purples",
            x=bin_labels,
            y=bin_labels,
        ).data[0],
        1,
        i + 1,
    )

# x axes show actual proportions; no axis title is set.
# title_font_size replaces the deprecated titlefont_size spelling.
fig.update_xaxes(
    title_font_size=10,
    title_standoff=1,
    ticks="outside",
    showticklabels=True,
    tickmode="array",
    tickwidth=0.75,
    ticklen=2,
    tickfont_size=12,
    linecolor="black",
    linewidth=0.75,
    side="bottom",
)

# y axes show predicted proportions; tick labels are hidden by default and
# re-enabled for the first column only (below)
fig.update_yaxes(
    title_font_size=10,
    title_standoff=1,
    linecolor="black",
    linewidth=0.75,
    ticks="outside",
    showticklabels=False,
    tickwidth=0.75,
    ticklen=2,
    tickfont_size=12,
)

# Show y tick labels on the first column only
fig.update_yaxes(
    col=1,
    ticks="outside",
    showticklabels=True,
    tickwidth=0.75,
    ticklen=2,
    tickfont_size=12,
)

# Shared colour axis for all heatmaps, fixed to 0-10 (thousands of counts).
# The colorbar is configured but hidden (showscale=False).
fig.update_layout(
    coloraxis=dict(
        colorscale="Purples",
        showscale=False,
        cmax=10,
        cmin=0,
        colorbar=dict(
            ticks="outside",
            ticksuffix="",
            tickfont_size=16,
            dtick=1,
            orientation="h",
            yanchor="bottom",
            y=-0.5,
            xanchor="center",
            x=0.5,
        ),
    ),
    margin=dict(t=0, l=0, r=0, b=0),
    plot_bgcolor="rgba(0,0,0,0)",
    font=dict(size=10, color="black"),
)
fig.write_image(
    "figures/main_figures/main_fig_4a.svg",
    width=750,
    height=100,
    # # Uncomment this to display colorbar
    # height=300,
    scale=5,
)

In [None]:
# Save confusion matrices into source data: one row per method (in
# methods_order), pooled over all cell types.

# Explicit label order, "<0.1" as negative and ">=0.1" as positive. With
# rows = truth and columns = prediction, the row-major flattening of the 2x2
# matrix is then always TN, FP, FN, TP, even if one bin never occurs for a
# method.
bin_labels = ["<0.1", ">=0.1"]
confusion_columns = [
    "True negatives",
    "False positives",
    "False negatives",
    "True positives",
]

all_confusion_l = []

# Iterate over each method and create confusion matrix
for method in methods_order:
    # Create confusion for all cell types
    method_pred_truth_df = pred_truth_df[(pred_truth_df["method"] == method)]

    confusion = confusion_matrix(
        method_pred_truth_df["truth_binned"],
        method_pred_truth_df["pred_binned"],
        labels=bin_labels,
    )

    # Flatten and append to all_confusion_l
    all_confusion_l.append(
        pd.DataFrame(
            [confusion.flatten()],
            index=[method],
            columns=confusion_columns,
        )
    )

all_confusion_df = pd.concat(all_confusion_l, axis=0)
all_confusion_df.to_csv(
    Path(viz_prefix).joinpath("source_data/figure_4a.tsv"), sep="\t"
)