# Visualize model performance

In [None]:
import numpy as np
import pandas as pd
import anndata as adata

import random
import itertools

from tqdm import tqdm
from pathlib import Path
from typing import Tuple, List, Dict

import plotly.graph_objects as go
import plotly as plotly
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity as skl_cosine

from scipy.stats import pearsonr
from scipy.spatial.distance import braycurtis
from math import sqrt
from sklearn.metrics import confusion_matrix

%load_ext blackcellmagic

## Set up paths and load setting files

#### Path to subset/minor/major experiments

In [None]:
# Every experiment folder hangs off the same benchmarking root
_benchmark_root = "???/deconvolution_benchmarking"
_immune_lineages_root = f"{_benchmark_root}/03_immune_lineages_experiment"

# Subset-level immune lineage experiment
subset_prefix = f"{_immune_lineages_root}/subset_level"

# Minor-level immune lineage experiment
minor_prefix = f"{_immune_lineages_root}/minor_level"

# Major level comes from the purity-levels experiment
major_prefix = (
    f"{_benchmark_root}/01_purity_levels_experiment/include_normal_epithelial"
)

# Folder holding visualization inputs and outputs
viz_prefix = f"{_benchmark_root}/visualizations"

#### List major/minor/subset cell types

In [None]:
# Cell type names at the subset lineage level.
# The list is used to select (and therefore order) the truth table columns.
subset_c_types = [
    "Endothelial",
    "CAFs",
    "PVL",
    "B cells Memory",
    "B cells Naive",
    "T_cells_c4_CD8+_ZFP36",
    "T_cells_c6_IFIT1",
    "T_cells_c7_CD8+_IFNG",
    "T_cells_c8_CD8+_LAG3",
    "T_cells_c0_CD4+_CCR7",
    "T_cells_c1_CD4+_IL7R",
    "T_cells_c2_CD4+_T-regs_FOXP3",
    "T_cells_c3_CD4+_Tfh_CXCL13",
    "T_cells_c9_NK_cells_AREG",
    "T_cells_c11_MKI67",
    "T_cells_c10_NKT_cells_FCGR3A",
    "Myeloid_c10_Macrophage_1_EGR1",
    "Myeloid_c12_Monocyte_1_IL1B",
    "Myeloid_c2_LAM2_APOE",
    "Myeloid_c1_LAM1_FABP5",
    "Cycling_Myeloid",
    "Myeloid_c4_DCs_pDC_IRF7",
    "Normal Epithelial",
    "Plasmablasts",
    "Myeloid_c8_Monocyte_2_S100A9",
    "Myeloid_c9_Macrophage_2_CXCL10",
    "Myeloid_c11_cDC2_CD1C",
    "Cancer Epithelial",
    "Myeloid_c3_cDC1_CLEC9A",
]
# Cell type names at the minor lineage level (same role as above)
minor_c_types = [
    "Cancer Epithelial",
    "T cells CD4+",
    "T cells CD8+",
    "Endothelial",
    "CAFs",
    "Macrophage",
    "PVL",
    "Normal Epithelial",
    "Plasmablasts",
    "B cells Memory",
    "Monocyte",
    "Cycling_Myeloid",
    "Cycling T-cells",
    "NK cells",
    "NKT cells",
    "DCs",
    "B cells Naive",
]
# Cell type names at the major lineage level (same role as above)
major_c_types = [
    "B-cells",
    "CAFs",
    "Cancer Epithelial",
    "Endothelial",
    "Myeloid",
    "Normal Epithelial",
    "PVL",
    "Plasmablasts",
    "T-cells",
]

#### Load groundtruth

In [None]:
# Only mixtures with a tumour purity of 50% are analysed: rows of the truth
# tables are kept when their "Cancer Epithelial" value equals this number
pur_lvl = 0.5

# Display names of the methods, in the order shared by all figures
methods_order = [
    "BayesPrism",
    "Scaden",
    "MuSiC",
    "hspe",
    "DWLS",
    "CBX",
    "Bisque",
    "EPIC",
    "CPM",
]

In [None]:
# Ground-truth tables, one per lineage level. Each file is tab-separated with
# the mixture identifier in the first column; columns are restricted to (and
# ordered by) the matching *_c_types list.

# Subset
subset_truth_df = pd.read_csv(
    Path(subset_prefix) / "data/results/truth.csv", sep="\t", index_col=0
).loc[:, subset_c_types]

# Minor
minor_truth_df = pd.read_csv(
    Path(minor_prefix) / "data/results/truth.csv", sep="\t", index_col=0
).loc[:, minor_c_types]

# Major
major_truth_df = pd.read_csv(
    Path(major_prefix) / "data/results/truth.csv", sep="\t", index_col=0
).loc[:, major_c_types]

#### Extract lineages metadata

In [None]:
# Raw column name -> human-readable column name
_lineage_column_labels = {
    "celltype_major": "Major Cell Type",
    "celltype_major_short": "Annotated Major Cell Type",
    "celltype_minor": "Minor Cell Type",
    "celltype_minor_short": "Annotated Minor Cell Type",
    "celltype_subset": "Subset Cell Type",
    "celltype_subset_short": "Annotated Subset Cell Type",
    "marker_genes": "Marker Genes",
    "counts": "Cell Counts",
    "num_of_patients": "Patient Counts",
}

# Read the lineage mapping of the cell types and relabel its columns
lineages_df = pd.read_csv(
    Path(viz_prefix) / "data/Whole_miniatlas_immune_lineages.tsv",
    sep="\t",
).rename(columns=_lineage_column_labels)

In [None]:
# Subset cell types that must not appear in the lineages DataFrame
_excluded_subset_ctypes = [
    # Removed from the experiment
    "Myeloid_c7_Monocyte_3_FCGR3A",
    # The 3 cell types that MuSiC dropped
    "Myeloid_c0_DC_LAMP3",
    "Myeloid_c5_Macrophage_3_SIGLEC1",
    "T_cells_c5_CD8+_GZMK",
]

lineages_df = lineages_df[
    ~lineages_df["Subset Cell Type"].isin(_excluded_subset_ctypes)
]

#### Extract colour palette

In [None]:
# Read the colour palette table (tab-separated, despite the .csv extension)
colour_pallete_df = pd.read_csv(
    Path(viz_prefix) / "data/Whole_miniatlas_colour_pallete.csv", sep="\t"
)

# Map every cell type to its fill and line colours
colour_pallete_d = {
    cell_type: {"fill": fill_colour, "line": line_colour}
    for cell_type, fill_colour, line_colour in zip(
        colour_pallete_df["all_celltype"],
        colour_pallete_df["fill"],
        colour_pallete_df["line"],
    )
}

## [Fig] Stacked bar charts of false positives/negatives at major lineage level

In [None]:
# Method identifiers as used in the result file names
# (each one is read from "data/results/<method>.csv")
# Shorter list kept for quick runs:
# methods = ["dwls", "scaden"]
methods = [
    "scaden",
    "bprism_v2",
    "dwls",
    "cbx",
    "music",
    "bisque",
    "hspe",
    "epic",
    "cpm",
]

#### Major cell types


In [None]:
# Ground truth of the mixtures at the selected tumour purity. It does not
# depend on the method, so it is extracted once before the loop.
tmp_truth_df = major_truth_df[
    major_truth_df["Cancer Epithelial"] == pur_lvl
].sort_index()

major_preds_truth_l = []

for method in tqdm(methods):
    # Predictions of this method (tab-separated, mixtures in the first column)
    res_df = pd.read_csv(
        Path(major_prefix) / f"data/results/{method}.csv",
        sep="\t",
        index_col=0,
    )

    # Tiny negative predictions (e.g. -0.00000000013) are floored at 0
    res_df = res_df.clip(lower=0)

    # Predictions of the same mixtures, in the same (sorted) order
    tmp_res_df = res_df[res_df.index.isin(tmp_truth_df.index)].sort_index()

    # One long-format table per cell type: truth and prediction in percent
    for c_type in tmp_truth_df.columns.tolist():
        paired_df = pd.concat([tmp_truth_df[c_type], tmp_res_df[c_type]], axis=1)
        paired_df.columns = ["truth", "preds"]
        paired_df = paired_df * 100

        preds_truth_df = paired_df.assign(
            **{
                # Mixture identifiers start with the patient name
                "Patient": [mixture.split("_")[0] for mixture in paired_df.index],
                "Cell Type": c_type,
                "Method": method,
            }
        )
        major_preds_truth_l.append(preds_truth_df)

major_preds_truth_df = pd.concat(major_preds_truth_l, axis=0)

# Assign lineage level
major_preds_truth_df["lineage"] = "major"

#### Minor cell types

In [None]:
# Ground truth of the mixtures at the selected tumour purity. It does not
# depend on the method, so it is extracted once before the loop.
tmp_truth_df = minor_truth_df[
    minor_truth_df["Cancer Epithelial"] == pur_lvl
].sort_index()

minor_preds_truth_l = []

for method in tqdm(methods):
    # Predictions of this method (tab-separated, mixtures in the first column)
    res_df = pd.read_csv(
        Path(minor_prefix) / f"data/results/{method}.csv", sep="\t", index_col=0
    )

    # Tiny negative predictions (e.g. -0.00000000013) are floored at 0
    res_df = res_df.clip(lower=0)

    # Predictions of the same mixtures, in the same (sorted) order
    tmp_res_df = res_df[res_df.index.isin(tmp_truth_df.index)].sort_index()

    # One long-format table per cell type: truth and prediction in percent
    for c_type in tmp_truth_df.columns.tolist():
        paired_df = pd.concat([tmp_truth_df[c_type], tmp_res_df[c_type]], axis=1)
        paired_df.columns = ["truth", "preds"]
        paired_df = paired_df * 100

        preds_truth_df = paired_df.assign(
            **{
                # Mixture identifiers start with the patient name
                "Patient": [mixture.split("_")[0] for mixture in paired_df.index],
                "Cell Type": c_type,
                "Method": method,
            }
        )
        minor_preds_truth_l.append(preds_truth_df)

minor_preds_truth_df = pd.concat(minor_preds_truth_l, axis=0)

In [None]:
# Minor cell types that belong to the immune major lineages
minor_immune_ctypes = lineages_df[
    lineages_df["Major Cell Type"].isin(["T-cells", "B-cells", "Myeloid"])
]["Minor Cell Type"].unique()

# Keep only the immune cell types. The explicit copy makes the frame
# independent of the one it was filtered from, so the column assignments
# below neither raise SettingWithCopyWarning nor depend on copy/view rules.
minor_preds_truth_df = minor_preds_truth_df[
    minor_preds_truth_df["Cell Type"].isin(minor_immune_ctypes.tolist())
].copy()

# Fix Cycling_Myeloid (only the "Cell Type" column can hold this label,
# so the replacement is limited to it instead of scanning every column)
minor_preds_truth_df["Cell Type"] = minor_preds_truth_df["Cell Type"].replace(
    {"Cycling_Myeloid": "Cycling Myeloid"}
)

# Assign lineage level
minor_preds_truth_df["lineage"] = "minor"

#### Subset cell types

In [None]:
# Ground truth of the mixtures at the selected tumour purity. It does not
# depend on the method, so it is extracted once before the loop.
tmp_truth_df = subset_truth_df[
    subset_truth_df["Cancer Epithelial"] == pur_lvl
].sort_index()

subset_preds_truth_l = []

for method in tqdm(methods):
    # Predictions of this method (tab-separated, mixtures in the first column)
    res_df = pd.read_csv(
        Path(subset_prefix) / f"data/results/{method}.csv",
        sep="\t",
        index_col=0,
    )

    # Tiny negative predictions (e.g. -0.00000000013) are floored at 0
    res_df = res_df.clip(lower=0)

    # Predictions of the same mixtures, in the same (sorted) order
    tmp_res_df = res_df[res_df.index.isin(tmp_truth_df.index)].sort_index()

    # One long-format table per cell type: truth and prediction in percent
    for c_type in tmp_truth_df.columns.tolist():
        paired_df = pd.concat([tmp_truth_df[c_type], tmp_res_df[c_type]], axis=1)
        paired_df.columns = ["truth", "preds"]
        paired_df = paired_df * 100

        preds_truth_df = paired_df.assign(
            **{
                # Mixture identifiers start with the patient name
                "Patient": [mixture.split("_")[0] for mixture in paired_df.index],
                "Cell Type": c_type,
                "Method": method,
            }
        )
        subset_preds_truth_l.append(preds_truth_df)

subset_preds_truth_df = pd.concat(subset_preds_truth_l, axis=0)

In [None]:
# Minor lineages whose subset cell types are kept
_immune_minor_lineages = [
    "T cells CD4+",
    "T cells CD8+",
    "Macrophage",
    "Monocyte",
    "DCs",
]

# Subset cell types that descend from those minor lineages
subset_immune_ctypes = lineages_df.loc[
    lineages_df["Minor Cell Type"].isin(_immune_minor_lineages),
    "Subset Cell Type",
].unique()

_is_immune_subset = subset_preds_truth_df["Cell Type"].isin(
    subset_immune_ctypes.tolist()
)

# Filter, then swap the raw subset names for their annotated (pretty) names.
# The index is moved into a column for the merge and restored afterwards.
subset_preds_truth_df = (
    subset_preds_truth_df[_is_immune_subset]
    .rename(columns={"Cell Type": "Subset Cell Type"})
    .reset_index()
    .merge(
        lineages_df[["Subset Cell Type", "Annotated Subset Cell Type"]],
        how="left",
        on="Subset Cell Type",
    )
    .set_index("index")
    .drop(["Subset Cell Type"], axis=1)
    .rename(columns={"Annotated Subset Cell Type": "Cell Type"})
)

# Assign lineage level
subset_preds_truth_df["lineage"] = "subset"

#### Concatenate and bin preds/truth

In [None]:
# Concatenate across lineage levels
all_preds_truth_df = pd.concat(
    [major_preds_truth_df, minor_preds_truth_df, subset_preds_truth_df], axis=0
)

# Bin edges (in percent) and labels shared by predictions and ground truth.
# The lowest edge sits just below 0 so that exact zeros land in the first bin.
# NOTE(review): pd.cut intervals are right-closed by default, so a value of
# exactly 0.1 is labelled "<0.1" (and exactly 1 "0.1-1", exactly 10 "1-10"),
# and any value above 100 gets no bin (NaN) - confirm this is intended given
# the strict "< 0.1" thresholds used when false positives/negatives are selected.
bins = [-0.0000000001, 0.1, 1, 10, 100]
labels = ["<0.1", "0.1-1", "1-10", ">10"]

# Adds "preds_binned" first, then "truth_binned"
for _value_col in ("preds", "truth"):
    all_preds_truth_df[f"{_value_col}_binned"] = pd.cut(
        all_preds_truth_df[_value_col], bins=bins, labels=labels
    )

# Replace method identifiers with their display names. Only the "Method"
# column holds these identifiers, so the replacement is limited to it rather
# than scanning every column (including the categorical bin columns).
# "hspe" needs no entry: its identifier already equals its display name.
all_preds_truth_df["Method"] = all_preds_truth_df["Method"].replace(
    {
        "scaden": "Scaden",
        "music": "MuSiC",
        "cbx": "CBX",
        "dwls": "DWLS",
        "cpm": "CPM",
        "epic": "EPIC",
        "bprism_v2": "BayesPrism",
        "bisque": "Bisque",
    }
)

#### Plot stacked bar chart of all cell types only

In [None]:
# X-axis order of the cell types for each lineage level.
# "All Cell Type" is the label given to the aggregate over all cell types.
xaxis_order_d = {
    "major": [
        "All Cell Type",
        "Cancer Epithelial",
        "Normal Epithelial",
        "T-cells",
        "B-cells",
        "Myeloid",
        "Endothelial",
        "CAFs",
        "PVL",
        "Plasmablasts",
    ],
    "minor": [
        "All Cell Type",
        "NK cells",
        "Cycling T-cells",
        "NKT cells",
        "T cells CD4+",
        "T cells CD8+",
        "B cells Memory",
        "B cells Naive",
        "Cycling Myeloid",
        "Macrophage",
        "Monocyte",
        "DCs",
    ],
    # Subset entries are annotated names - presumably the values of the
    # "Annotated Subset Cell Type" column; verify against the lineages table
    "subset": [
        "All Cell Type",
        "Naive/central Memory T Cells",
        "Effector Memory T Cells",
        "T-regs",
        "Tfh",
        "Chemokine-expressing T Cells",
        "IFN-I Signature T Cells",
        "T-cells:IFNG",
        "T-cells:LAG3",
        "M2-like Macrophage:EGR1",
        "LAM2*",
        "LAM1*",
        "M2-like Macrophage:CXCL10",
        "Mono:IL1B",
        "Mono:FCGR3A",
        "Myeloid:pDC/IRF7",
        "Myeloid:cDC2/CD1C",
        "Myeloid:cDC1/CLEC9A",
    ],
}

In [None]:
# Image dimensions and x-domain widths for each lineage level
# (false-positive figures)
f_pos_d = {
    "major": dict(width=850, height=2000, x_domains_all=0.105, x_domains_ctype=0.13),
    "minor": dict(width=800, height=650, x_domains_all=0.0925, x_domains_ctype=0.105),
    "subset": dict(width=1200, height=825, x_domains_all=0.062, x_domains_ctype=0.07),
}

# The false-negative figures use the same numbers; the dictionaries are
# copied so the two settings stay independent of each other
f_neg_d = {level: dict(dims) for level, dims in f_pos_d.items()}

In [None]:
# Lineage level to plot. The rows of all_preds_truth_df are filtered on this
# value; the levels assigned in this file are "major", "minor" and "subset".
lineage_lvl = "major"

False positives

In [None]:
# Colour of each prediction bin in the false-positive bar charts
# (keys are the bin labels, values are plotly colour strings)
fp_tn_color_map = {
    "0.1-1": "rgb(221, 136, 172)",  # px.colors.sequential.Magenta[::2][1:]
    "1-10": "rgb(177, 77, 142)",
    ">10": "rgb(108, 33, 103)",
    "<0.1": "rgb(185, 219, 244)",
}

In [None]:
def plot_false_positives(
    df: pd.DataFrame,
    y_axis_val: str,
    y_axis_title: str,
    y_axis_range: List,
    d_tick: int,
    color_var: str,
    bins: List,
    lineage_lvl: str,
    color_map: Dict,
    counts_df: pd.DataFrame,
    methods_order: List,
    row_spacing: float = 0.025,
    col_spacing: float = 0.025,
) -> None:
    """Draw one stacked bar per method and save the figure as an SVG file.

    Each method gets its own facet column. The bar of a method is stacked by
    the values of `color_var` and annotated with the method's count.
    The image is written below the module-level `viz_prefix` folder.

    Args:
        df: Long-format table with a "Method" column plus the `y_axis_val`
            and `color_var` columns.
        y_axis_val: Column plotted on the y axis. Its lower-cased name is also
            part of the output file name.
        y_axis_title: Currently unused (the axis titles are left empty).
        y_axis_range: Range applied to every y axis.
        d_tick: Distance between two y-axis ticks.
        color_var: Column whose values define the stacked segments.
        bins: Order in which the `color_var` values are stacked.
        lineage_lvl: Lineage level, used in the output file name.
        color_map: Colour of each `color_var` value.
        counts_df: Table with "Method", "Cell Type", "Percentage" and "Count"
            columns. For each method, the "All Cell Type" row supplies the
            annotation height ("Percentage") and text ("Count").
        methods_order: Methods in facet order.
        row_spacing: Spacing between the facet columns (it is the value
            forwarded to `facet_col_spacing`).
        col_spacing: Currently unused.

    Raises:
        IndexError: If a method of `methods_order` has no "All Cell Type" row
            in `counts_df`.
    """

    fig = px.bar(
        df,
        x="Method",
        y=y_axis_val,
        color=color_var,
        facet_col="Method",
        facet_col_spacing=row_spacing,
        category_orders={
            # Keyed by the colour column actually plotted, so `bins` is
            # honoured whatever that column is called
            color_var: bins,
            "Method": methods_order,
        },
        color_discrete_map=color_map,
    )

    # Update axes
    fig.update_xaxes(
        ticks="outside",
        tickangle=90,
        tickfont_size=12,
        showticklabels=False,
        linecolor="black",
        side="bottom",
        matches=None,
        title="",
        title_font_size=1,
    )
    fig.update_yaxes(
        ticks="outside",
        linecolor="black",
        range=y_axis_range,
        dtick=d_tick,
        tickfont_size=12,
        showgrid=True,
        gridwidth=0.5,
        gridcolor="lightgray",
        title="",
        title_font_size=1,
    )

    # Update layout
    fig["layout"].update(
        barmode="relative",
        plot_bgcolor="rgba(0,0,0,0)",
        showlegend=False,
        margin=dict(t=10, l=0, r=0, b=0),
        font=dict(color="black", size=12),
    )

    # Blank out the facet titles that plotly express adds
    fig.for_each_annotation(lambda a: a.update(text=""))

    # Rows holding the totals over all cell types, selected once
    all_ctype_counts_df = counts_df[counts_df["Cell Type"] == "All Cell Type"]

    # Write each method's count just above its bar.
    # Facet column i + 1 is assumed to belong to methods_order[i].
    for i, method in enumerate(methods_order):
        method_counts_df = all_ctype_counts_df[all_ctype_counts_df["Method"] == method]
        pct = method_counts_df["Percentage"].values[0]
        count = method_counts_df["Count"].values[0]

        fig.add_annotation(
            x=method,
            y=pct,
            text=f"<i>{int(count)}</i>",
            showarrow=False,
            yshift=7,
            font_size=10,
            row=1,
            col=i + 1,
        )

    # Save image
    fig.write_image(
        Path(viz_prefix)
        .joinpath(
            f"./figures/main_figures/main_fig_4b_false_positive_{lineage_lvl}_{y_axis_val.lower()}"
        )
        .with_suffix(".svg"),
        width=750,
        height=150,
        scale=5,
    )

In [None]:
# Keep the rows of the selected lineage whose ground truth is below 0.1%
false_positive_df = all_preds_truth_df[
    (all_preds_truth_df["lineage"] == lineage_lvl) & (all_preds_truth_df["truth"] < 0.1)
]

# Count the mixtures per (cell type, method, prediction bin).
# The counts Series is named before reset_index() so that the count column is
# called "Mixtures Count" on every pandas version: older versions call it 0,
# pandas >= 2.0 calls it "count", which made the former rename of 0 a no-op.
counts_false_positive_df = (
    false_positive_df[["Cell Type", "Method", "preds_binned"]]
    .value_counts()
    .rename("Mixtures Count")
    .reset_index()
    .rename(columns={"preds_binned": "Prediction bins"})
)
counts_false_positive_df.columns.name = None

# Pivot to get [method, cell type] as columns and [bins] as rows
pivot_counts_false_positive_df = pd.pivot_table(
    counts_false_positive_df,
    values="Mixtures Count",
    columns=["Method", "Cell Type"],
    index=["Prediction bins"],
).fillna(0)

### Calculate false positive for each cell type ####
# Normalize so that each column sums to 1 (divide each cell by its column total)
normalized_false_positive_df = pivot_counts_false_positive_df.div(
    pivot_counts_false_positive_df.sum(axis=0), axis=1
)

# "Melt" DataFrame from wide to long
normalized_false_positive_df = (
    normalized_false_positive_df.reset_index()
    .melt(id_vars=["Prediction bins"])
    .rename(columns={"value": "Percentage"})
)

# Merge normalized with counts to have normalized and counts in one DataFrame
# Also rename "Mixtures Count" to "Count"
per_ctype_false_positive_df = counts_false_positive_df.merge(
    normalized_false_positive_df,
    on=["Cell Type", "Method", "Prediction bins"],
    how="left",
).rename(columns={"Mixtures Count": "Count"})

# Assign column type (for plotting)
per_ctype_false_positive_df["Col Type"] = "Each Cell Type"

In [None]:
### Calculate false positive across all cell type ####
# Sum the counts over the cell types, leaving one column per method.
# DataFrame.sum(level=...) was removed in pandas 2.0; grouping the transposed
# frame on its first index level gives the same table on every version.
all_ctype_counts_false_positive_df = (
    pivot_counts_false_positive_df.T.groupby(level=0).sum().T
)

# Share of each prediction bin within a method (each column sums to 1)
all_ctype_normalized_false_positive_df = all_ctype_counts_false_positive_df.div(
    all_ctype_counts_false_positive_df.sum(axis=0), axis=1
)

# Long format with counts and shares side by side
all_ctype_false_positive_df = pd.merge(
    left=all_ctype_counts_false_positive_df.reset_index()
    .melt(id_vars=["Prediction bins"])
    .rename(columns={"value": "Count"}),
    right=all_ctype_normalized_false_positive_df.reset_index()
    .melt(id_vars=["Prediction bins"])
    .rename(columns={"value": "Percentage"}),
    on=["Prediction bins", "Method"],
    how="inner",
)
all_ctype_false_positive_df["Cell Type"] = "All Cell Type"

# Assign column type (for plotting)
all_ctype_false_positive_df["Col Type"] = "All Cell Type"

# Concatenate per-cell-type and all-cell-type false positives
total_false_positive_df = pd.concat(
    [per_ctype_false_positive_df, all_ctype_false_positive_df], axis=0
)

# Shares become percentages
total_false_positive_df["Percentage"] = total_false_positive_df["Percentage"] * 100

In [None]:
# Method-specific totals: counts and percentages summed over every
# prediction bin except "<0.1"
counts_df = (
    total_false_positive_df[total_false_positive_df["Prediction bins"] != "<0.1"][
        ["Method", "Cell Type", "Count", "Percentage"]
    ]
    .groupby(by=["Method", "Cell Type"], dropna=False)
    .agg("sum")
    .reset_index()
)

# Totals per cell type over all bins, taken from the BayesPrism rows.
# The numeric columns are selected explicitly: pandas >= 2.0 no longer drops
# the text/categorical columns silently when summing a group.
total_counts_df = (
    total_false_positive_df.loc[
        total_false_positive_df["Method"] == "BayesPrism",
        ["Cell Type", "Count", "Percentage"],
    ]
    .groupby(["Cell Type"])
    .sum()
    .reset_index()
)
total_counts_df["Method"] = "All methods"

# Concatenate all methods counts back into counts_df
# (with a fresh index, so no row label is duplicated)
counts_df = pd.concat([counts_df, total_counts_df], axis=0, ignore_index=True)

# Every (method, cell type) pair must exist: missing pairs get a zero row
existing_pairs = set(zip(counts_df["Method"], counts_df["Cell Type"]))
missing_rows = []
for method in methods_order + ["All methods"]:
    for c_type in xaxis_order_d[lineage_lvl]:
        if (method, c_type) not in existing_pairs:
            missing_rows.append(
                {"Method": method, "Cell Type": c_type, "Count": 0, "Percentage": 0}
            )

if missing_rows:
    counts_df = pd.concat(
        [counts_df, pd.DataFrame(missing_rows)], axis=0, ignore_index=True
    )

In [None]:
# Rows to draw: aggregate ("All Cell Type") rows of the known methods,
# without the "<0.1" prediction bin
_fp_rows_to_plot = (
    (total_false_positive_df["Prediction bins"] != "<0.1")
    & (total_false_positive_df["Method"].isin(methods_order))
    & (total_false_positive_df["Col Type"] == "All Cell Type")
)

# Counts used for the bar annotations
_fp_counts_to_show = (counts_df["Method"].isin(["All methods"] + methods_order)) & (
    counts_df["Cell Type"] == "All Cell Type"
)

# Plot percentages for All Cell Type only
# (pass row_spacing=0.05 when the lineage level is not "major")
plot_false_positives(
    df=total_false_positive_df[_fp_rows_to_plot],
    y_axis_val="Percentage",
    y_axis_title="FPR (%)",
    y_axis_range=[0, 101],
    d_tick=20,
    color_var="Prediction bins",
    bins=["0.1-1", "1-10", ">10", "<0.1"],
    lineage_lvl=lineage_lvl,
    color_map=fp_tn_color_map,
    counts_df=counts_df[_fp_counts_to_show],
    methods_order=methods_order,
)

In [None]:
# Save source data: the aggregate ("All Cell Type") rows of the known methods,
# without the "<0.1" prediction bin, as a tab-separated file
_fp_source_rows = (
    (total_false_positive_df["Prediction bins"] != "<0.1")
    & (total_false_positive_df["Method"].isin(methods_order))
    & (total_false_positive_df["Col Type"] == "All Cell Type")
)
total_false_positive_df[_fp_source_rows].to_csv(
    Path(viz_prefix) / "source_data/figure_4b.tsv", sep="\t"
)

False negatives

In [None]:
# Colour of each ground-truth bin in the false-negative bar charts
# (keys are the bin labels plus "tp", the label given to true positives;
# values are plotly colour strings)
fn_tp_color_map = {
    "0.1-1": "rgb(108, 192, 139)",  # px.colors.sequential.Emrld[::2][1:]
    "1-10": "rgb(33, 122, 121)",
    ">10": "rgb(7, 64, 80)",
    "tp": "rgb(222, 219, 238)",
}

In [None]:
def plot_false_negatives(
    df: pd.DataFrame,
    y_axis_val: str,
    y_axis_title: str,
    y_axis_range: List,
    d_tick: int,
    color_var: str,
    bins: List,
    lineage_lvl: str,
    color_map: Dict,
    counts_df: pd.DataFrame,
    methods_order: List,
    row_spacing: float = 0.025,
    col_spacing: float = 0.025,
) -> None:
    """Draw one stacked bar per method and save the figure as an SVG file.

    Each method gets its own facet column. The bar of a method is stacked by
    the values of `color_var` and annotated with the method's count.
    The image is written below the module-level `viz_prefix` folder.

    Args:
        df: Long-format table with a "Method" column plus the `y_axis_val`
            and `color_var` columns.
        y_axis_val: Column plotted on the y axis. Its lower-cased name is also
            part of the output file name.
        y_axis_title: Currently unused (the axis titles are left empty).
        y_axis_range: Range applied to every y axis.
        d_tick: Distance between two y-axis ticks.
        color_var: Column whose values define the stacked segments.
        bins: Order in which the `color_var` values are stacked.
        lineage_lvl: Lineage level, used in the output file name.
        color_map: Colour of each `color_var` value.
        counts_df: Table with "Method", "Cell Type", "Percentage" and "Count"
            columns. For each method, the "All Cell Type" row supplies the
            annotation height ("Percentage") and text ("Count").
        methods_order: Methods in facet order.
        row_spacing: Spacing between the facet columns (it is the value
            forwarded to `facet_col_spacing`).
        col_spacing: Currently unused.

    Raises:
        IndexError: If a method of `methods_order` has no "All Cell Type" row
            in `counts_df`.
    """

    fig = px.bar(
        df,
        x="Method",
        y=y_axis_val,
        color=color_var,
        facet_col="Method",
        facet_col_spacing=row_spacing,
        category_orders={
            # Keyed by the colour column actually plotted. The former fixed
            # key "Prediction bins" matched no column when the colour column
            # was "Groundtruth bins", so `bins` was silently ignored.
            color_var: bins,
            "Method": methods_order,
        },
        color_discrete_map=color_map,
    )

    # Update axes
    fig.update_xaxes(
        ticks="outside",
        tickangle=90,
        tickfont_size=12,
        showticklabels=False,
        linecolor="black",
        side="bottom",
        matches=None,
        title="",
        title_font_size=1,
    )
    fig.update_yaxes(
        ticks="outside",
        linecolor="black",
        range=y_axis_range,
        dtick=d_tick,
        tickfont_size=12,
        showgrid=True,
        gridwidth=0.5,
        gridcolor="lightgray",
        title="",
        title_font_size=1,
    )

    # Update layout
    fig["layout"].update(
        barmode="relative",
        plot_bgcolor="rgba(0,0,0,0)",
        showlegend=False,
        margin=dict(t=10, l=0, r=0, b=0),
        font=dict(color="black", size=12),
    )

    # Blank out the facet titles that plotly express adds
    fig.for_each_annotation(lambda a: a.update(text=""))

    # Rows holding the totals over all cell types, selected once
    all_ctype_counts_df = counts_df[counts_df["Cell Type"] == "All Cell Type"]

    # Write each method's count just above its bar.
    # Facet column i + 1 is assumed to belong to methods_order[i].
    for i, method in enumerate(methods_order):
        method_counts_df = all_ctype_counts_df[all_ctype_counts_df["Method"] == method]
        pct = method_counts_df["Percentage"].values[0]
        count = method_counts_df["Count"].values[0]

        fig.add_annotation(
            x=method,
            y=pct,
            text=f"<i>{int(count)}</i>",
            showarrow=False,
            yshift=7,
            font_size=10,
            row=1,
            col=i + 1,
        )

    # Save image
    # NOTE(review): the file name says "main_fig_4b" while the matching source
    # data is saved as figure_4c - confirm which panel this figure belongs to.
    fig.write_image(
        Path(viz_prefix)
        .joinpath(
            f"./figures/main_figures/main_fig_4b_false_negative_{lineage_lvl}_{y_axis_val.lower()}"
        )
        .with_suffix(".svg"),
        width=750,
        height=150,
        scale=5,
    )

In [None]:
# False negatives: rows of the selected lineage predicted below 0.1% although
# their ground truth is not in the "<0.1" bin
false_negative_df = all_preds_truth_df[
    (all_preds_truth_df["lineage"] == lineage_lvl)
    & (all_preds_truth_df["preds"] < 0.1)
    & (all_preds_truth_df["truth_binned"] != "<0.1")
]

# Count the mixtures per (cell type, method, ground-truth bin).
# The counts Series is named before reset_index() so that the count column is
# called "Mixtures Count" on every pandas version: older versions call it 0,
# pandas >= 2.0 calls it "count", which made the former rename of 0 a no-op.
counts_false_negative_df = (
    false_negative_df[["Cell Type", "Method", "truth_binned"]]
    .value_counts()
    .rename("Mixtures Count")
    .reset_index()
    .rename(columns={"truth_binned": "Groundtruth bins"})
)
counts_false_negative_df.columns.name = None

# True positives: same ground-truth condition, but predicted at 0.1% or more
tp_df = all_preds_truth_df[
    (all_preds_truth_df["lineage"] == lineage_lvl)
    & (all_preds_truth_df["preds"] >= 0.1)
    & (all_preds_truth_df["truth_binned"] != "<0.1")
]

# Count the true positives per (cell type, method)
tp_df = (
    tp_df[["Cell Type", "Method"]]
    .value_counts()
    .rename("Mixtures Count")
    .reset_index()
)
tp_df.columns.name = None

# True positives form their own "bin"
tp_df["Groundtruth bins"] = "tp"

# Concatenate true positives back into the false negative counts
counts_false_negative_df = pd.concat([counts_false_negative_df, tp_df], axis=0)

# Pivot to get [method, cell type] as columns and [bins] as rows
pivot_counts_false_negative_df = pd.pivot_table(
    counts_false_negative_df,
    values="Mixtures Count",
    columns=["Method", "Cell Type"],
    index=["Groundtruth bins"],
).fillna(0)

### Calculate false negative for each cell type ####
# Normalize so that each column sums to 1 (divide each cell by its column total)
normalized_false_negative_df = pivot_counts_false_negative_df.div(
    pivot_counts_false_negative_df.sum(axis=0), axis=1
)

# "Melt" DataFrame from wide to long
normalized_false_negative_df = (
    normalized_false_negative_df.reset_index()
    .melt(id_vars=["Groundtruth bins"])
    .rename(columns={"value": "Percentage"})
)

# Merge normalized with counts to have normalized and counts in one DataFrame
# Also rename "Mixtures Count" to "Count"
per_ctype_false_negative_df = counts_false_negative_df.merge(
    normalized_false_negative_df,
    on=["Cell Type", "Method", "Groundtruth bins"],
    how="left",
).rename(columns={"Mixtures Count": "Count"})

# Assign column type (for plotting)
per_ctype_false_negative_df["Col Type"] = "Each Cell Type"

In [None]:
### Calculate false negative across all cell type ####
# Sum the counts over the cell types, leaving one column per method.
# DataFrame.sum(level=...) was removed in pandas 2.0; grouping the transposed
# frame on its first index level gives the same table on every version.
all_ctype_counts_false_negative_df = (
    pivot_counts_false_negative_df.T.groupby(level=0).sum().T
)

# Share of each ground-truth bin within a method (each column sums to 1)
all_ctype_normalized_false_negative_df = all_ctype_counts_false_negative_df.div(
    all_ctype_counts_false_negative_df.sum(axis=0), axis=1
)

# Long format with counts and shares side by side
all_ctype_false_negative_df = pd.merge(
    left=all_ctype_counts_false_negative_df.reset_index()
    .melt(id_vars=["Groundtruth bins"])
    .rename(columns={"value": "Count"}),
    right=all_ctype_normalized_false_negative_df.reset_index()
    .melt(id_vars=["Groundtruth bins"])
    .rename(columns={"value": "Percentage"}),
    on=["Groundtruth bins", "Method"],
    how="inner",
)
all_ctype_false_negative_df["Cell Type"] = "All Cell Type"

# Assign column type (for plotting)
all_ctype_false_negative_df["Col Type"] = "All Cell Type"

# Concatenate per-cell-type and all-cell-type false negatives
total_false_negative_df = pd.concat(
    [per_ctype_false_negative_df, all_ctype_false_negative_df], axis=0
)

# Shares become percentages
total_false_negative_df["Percentage"] = total_false_negative_df["Percentage"] * 100

In [None]:
# Give Scaden and CPM one zero-valued entry so they appear.
# Cell type carrying Scaden's entry at each lineage level; levels that are
# not listed (i.e. "subset") get no Scaden entry.
_scaden_placeholder_ctype = {"major": "Cancer Epithelial", "minor": "DCs"}

_placeholder_rows = []
if lineage_lvl in _scaden_placeholder_ctype:
    _placeholder_rows.append(
        [
            _scaden_placeholder_ctype[lineage_lvl],
            "Scaden",
            "0.1-1",
            0,
            0,
            "Each Cell Type",
        ]
    )

# CPM's entry is added at every lineage level
_placeholder_rows.append(
    [
        "Cancer Epithelial",
        "CPM",
        "0.1-1",
        0,
        0,
        "Each Cell Type",
    ]
)

# Append each entry under a new row label equal to the current row count
for _placeholder_row in _placeholder_rows:
    total_false_negative_df.loc[len(total_false_negative_df.index)] = _placeholder_row

In [None]:
# Method-specific totals: counts and percentages summed over every
# ground-truth bin except the true positives ("tp")
counts_df = (
    total_false_negative_df[total_false_negative_df["Groundtruth bins"] != "tp"][
        ["Method", "Cell Type", "Count", "Percentage"]
    ]
    .groupby(by=["Method", "Cell Type"], dropna=False)
    .agg("sum")
    .reset_index()
)

# Totals per cell type over all bins, taken from the BayesPrism rows.
# The numeric columns are selected explicitly: pandas >= 2.0 no longer drops
# the text/categorical columns silently when summing a group.
total_counts_df = (
    total_false_negative_df.loc[
        total_false_negative_df["Method"] == "BayesPrism",
        ["Cell Type", "Count", "Percentage"],
    ]
    .groupby(["Cell Type"])
    .sum()
    .reset_index()
)
total_counts_df["Method"] = "All methods"

# Concatenate all methods counts back into counts_df
# (with a fresh index, so no row label is duplicated)
counts_df = pd.concat([counts_df, total_counts_df], axis=0, ignore_index=True)

# Every (method, cell type) pair must exist: missing pairs get a zero row
existing_pairs = set(zip(counts_df["Method"], counts_df["Cell Type"]))
missing_rows = []
for method in methods_order + ["All methods"]:
    for c_type in xaxis_order_d[lineage_lvl]:
        if (method, c_type) not in existing_pairs:
            missing_rows.append(
                {"Method": method, "Cell Type": c_type, "Count": 0, "Percentage": 0}
            )

if missing_rows:
    counts_df = pd.concat(
        [counts_df, pd.DataFrame(missing_rows)], axis=0, ignore_index=True
    )

In [None]:
# Rows to draw: aggregate ("All Cell Type") rows of the known methods,
# without the true positives
_fn_rows_to_plot = (
    (total_false_negative_df["Groundtruth bins"] != "tp")
    & (total_false_negative_df["Method"].isin(methods_order))
    & (total_false_negative_df["Col Type"] == "All Cell Type")
)

# Counts used for the bar annotations
_fn_counts_to_show = counts_df["Cell Type"] == "All Cell Type"

# Plot percentages
# (pass row_spacing=0.05 when the lineage level is not "major").
# plot_false_negatives has no return statement, so `fig` is bound to None.
fig = plot_false_negatives(
    df=total_false_negative_df[_fn_rows_to_plot],
    y_axis_val="Percentage",
    y_axis_title="FNR (%)",
    y_axis_range=[0, 101],
    d_tick=20,
    color_var="Groundtruth bins",
    bins=["0.1-1", "1-10", ">10", "tp"],
    lineage_lvl=lineage_lvl,
    color_map=fn_tp_color_map,
    counts_df=counts_df[_fn_counts_to_show],
    methods_order=methods_order,
)

In [None]:
# Save source data: the aggregate ("All Cell Type") rows of the known methods,
# without the true positives, as a tab-separated file
_fn_source_rows = (
    (total_false_negative_df["Groundtruth bins"] != "tp")
    & (total_false_negative_df["Method"].isin(methods_order))
    & (total_false_negative_df["Col Type"] == "All Cell Type")
)
total_false_negative_df[_fn_source_rows].to_csv(
    Path(viz_prefix) / "source_data/figure_4c.tsv", sep="\t"
)