# Visualization code for Extended Data Figure 5a + 5d + 5e

In [None]:
import numpy as np
import pandas as pd
import anndata as adata

import random
import itertools

from tqdm import tqdm
from pathlib import Path
from typing import Tuple, List, Dict

import plotly.graph_objects as go
import plotly as plotly
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity as skl_cosine

from scipy.stats import pearsonr
from scipy.spatial.distance import braycurtis
from math import sqrt
from sklearn.metrics import confusion_matrix

%load_ext blackcellmagic

## Set up paths and load setting files

#### Path to subset/minor/major experiments

In [None]:
# Root folder of each experiment, one per lineage level.
# NOTE(review): "???" looks like a placeholder for the project root - confirm
# and substitute the real location before running.

# Subset level
subset_prefix = "???/deconvolution_benchmarking/03_immune_lineages_experiment/subset_level"

# Minor level
minor_prefix = "???/deconvolution_benchmarking/03_immune_lineages_experiment/minor_level"

# Major level
major_prefix = (
    "???/deconvolution_benchmarking/01_purity_levels_experiment/include_normal_epithelial"
)

# Folder holding the inputs and outputs of the visualizations
viz_prefix = "???/deconvolution_benchmarking/visualizations"

#### List major/minor/subset cell types

In [None]:
# Cell type labels at each lineage level (subset -> minor -> major).
# Each list is used to select, and therefore order, the columns of the matching
# groundtruth table, and is the set of cell types scored at that level.

# Subset level
subset_c_types = [
    "Endothelial",
    "CAFs",
    "PVL",
    "B cells Memory",
    "B cells Naive",
    "T_cells_c4_CD8+_ZFP36",
    "T_cells_c6_IFIT1",
    "T_cells_c7_CD8+_IFNG",
    "T_cells_c8_CD8+_LAG3",
    "T_cells_c0_CD4+_CCR7",
    "T_cells_c1_CD4+_IL7R",
    "T_cells_c2_CD4+_T-regs_FOXP3",
    "T_cells_c3_CD4+_Tfh_CXCL13",
    "T_cells_c9_NK_cells_AREG",
    "T_cells_c11_MKI67",
    "T_cells_c10_NKT_cells_FCGR3A",
    "Myeloid_c10_Macrophage_1_EGR1",
    "Myeloid_c12_Monocyte_1_IL1B",
    "Myeloid_c2_LAM2_APOE",
    "Myeloid_c1_LAM1_FABP5",
    "Cycling_Myeloid",
    "Myeloid_c4_DCs_pDC_IRF7",
    "Normal Epithelial",
    "Plasmablasts",
    "Myeloid_c8_Monocyte_2_S100A9",
    "Myeloid_c9_Macrophage_2_CXCL10",
    "Myeloid_c11_cDC2_CD1C",
    "Cancer Epithelial",
    "Myeloid_c3_cDC1_CLEC9A",
]
# Minor level
minor_c_types = [
    "Cancer Epithelial",
    "T cells CD4+",
    "T cells CD8+",
    "Endothelial",
    "CAFs",
    "Macrophage",
    "PVL",
    "Normal Epithelial",
    "Plasmablasts",
    "B cells Memory",
    "Monocyte",
    "DCs",
    "Cycling_Myeloid",
    "Cycling T-cells",
    "NK cells",
    "NKT cells",
    "B cells Naive",
]
# Major level
major_c_types = [
    "T-cells",
    "Myeloid",
    "B-cells",
    "Endothelial",
    "CAFs",
    "PVL",
    "Plasmablasts",
    "Cancer Epithelial",
    "Normal Epithelial",
]

#### Load groundtruth

In [None]:
# We only use tumour purity = 50%.
# The value is compared against the "Cancer Epithelial" groundtruth column to
# select the mixtures at this purity level.
pur_lvl = 0.5

In [None]:
# Groundtruth proportions of every lineage level, restricted to (and ordered
# by) the cell types listed for that level.

# Subset level
subset_truth_df = pd.read_csv(
    Path(subset_prefix) / "data/results/truth.csv", sep="\t", index_col=0
)[subset_c_types]

# Minor level
minor_truth_df = pd.read_csv(
    Path(minor_prefix) / "data/results/truth.csv", sep="\t", index_col=0
)[minor_c_types]

# Major level
major_truth_df = pd.read_csv(
    Path(major_prefix) / "data/results/truth.csv", sep="\t", index_col=0
)[major_c_types]

#### Utilities

In [None]:
def calculate_metrics(
    source_truth_df: pd.DataFrame, source_res_df: pd.DataFrame, c_types: List
) -> pd.DataFrame:
    """Score predictions against groundtruth, one cell type at a time.

    Both tables are expected to contain every requested cell type as a column.
    Proportions are multiplied by 100 before scoring. Neither input is modified.

    Args:
        - source_truth_df:     groundtruth DataFrame, purity-level-specific
        - source_res_df:       predictions DataFrame, purity-level-specific
        - c_types:             cell types (column names) to score

    Returns:
        DataFrame with rows "RMSE", "MAE", "Pearsonr" and "RPE" and one column
        per requested cell type.
    """

    def _score_cell_type(c_type) -> pd.Series:
        """Return [RMSE, MAE, Pearson's r, RPE] for a single cell type."""
        truth_pct = source_truth_df[c_type] * 100
        preds_pct = source_res_df[c_type] * 100
        error = truth_pct - preds_pct

        # Root mean squared error
        rmse = sqrt(mean_squared_error(truth_pct, preds_pct))

        # Mean absolute error
        mae = error.abs().mean()

        # Pearson's r (only the coefficient is kept, not the p-value)
        pearson_r = pearsonr(truth_pct.to_numpy().ravel(), preds_pct.to_numpy().ravel())[
            0
        ]

        # Relative Proportion Error; a groundtruth of 0 is replaced by 0.1
        # so the division is always defined
        rpe = (error / truth_pct.replace(0, 0.1)).abs().mean()

        return pd.Series([rmse, mae, pearson_r, rpe], name=c_type)

    # One column per cell type, one row per metric
    method_metrics_df = pd.concat(
        [_score_cell_type(c_type) for c_type in c_types], axis=1
    )
    method_metrics_df.index = ["RMSE", "MAE", "Pearsonr", "RPE"]

    return method_metrics_df

#### Extract lineages metadata

In [None]:
# Human-readable names for the columns of the lineage table
lineage_column_names = {
    "celltype_major": "Major Cell Type",
    "celltype_major_short": "Annotated Major Cell Type",
    "celltype_minor": "Minor Cell Type",
    "celltype_minor_short": "Annotated Minor Cell Type",
    "celltype_subset": "Subset Cell Type",
    "celltype_subset_short": "Annotated Subset Cell Type",
    "marker_genes": "Marker Genes",
    "counts": "Cell Counts",
    "num_of_patients": "Patient Counts",
}

# Load the lineage mapping of all cell types in the data and rename its columns
lineages_df = pd.read_csv(
    Path(viz_prefix) / "data/Whole_miniatlas_immune_lineages.tsv",
    sep="\t",
).rename(columns=lineage_column_names)

In [None]:
# Subset cell types to exclude from the lineages DataFrame
excluded_subset_ctypes = [
    # We removed Myeloid_c7_Monocyte_3_FCGR3A
    "Myeloid_c7_Monocyte_3_FCGR3A",
    # We also removed 3 cell types that MuSiC dropped
    "Myeloid_c0_DC_LAMP3",
    "Myeloid_c5_Macrophage_3_SIGLEC1",
    "T_cells_c5_CD8+_GZMK",
]

is_excluded = lineages_df["Subset Cell Type"].isin(excluded_subset_ctypes)
lineages_df = lineages_df[~is_excluded]

#### Extract colour palette

In [None]:
# Load the colour palette of all cell types (tab-separated despite the .csv name)
colour_pallete_df = pd.read_csv(
    Path(viz_prefix) / "data/Whole_miniatlas_colour_pallete.csv", sep="\t"
)

# Map each cell type to its fill and line colours
colour_pallete_d = {}
for _, palette_row in colour_pallete_df.iterrows():
    colour_pallete_d[palette_row["all_celltype"]] = {
        "fill": palette_row["fill"],
        "line": palette_row["line"],
    }

## [Fig 5a,d-e] Sunburst of RMSE, RPE, and Pearson's r across cell subset lineages

In [None]:
# Methods whose predictions are scored; each name is also the stem of the
# method's results file (data/results/<method>.csv)
methods = ["bprism_v2", "dwls"]
# Major cell types other than T-cells, B-cells and Myeloid
non_immune_ctypes = [
    "CAFs",
    "Cancer Epithelial",
    "Endothelial",
    "Normal Epithelial",
    "PVL",
    "Plasmablasts",
]

#### Calculate metrics for subset cell types

In [None]:
pivot_method_metrics_l = []

# Groundtruth of the mixtures at the selected tumour purity is the same for
# every method, but is re-derived per method to keep each iteration self-contained
for method in tqdm(methods):
    # Predictions of this method at the subset level
    res_df = pd.read_csv(
        Path(subset_prefix) / f"data/results/{method}.csv",
        sep="\t",
        index_col=0,
    )

    # Keep groundtruth rows at the selected purity and the matching predictions,
    # both sorted by index so that rows line up
    at_pur_lvl = subset_truth_df["Cancer Epithelial"] == pur_lvl
    tmp_truth_df = subset_truth_df.loc[at_pur_lvl].sort_index()
    tmp_res_df = res_df.loc[res_df.index.isin(tmp_truth_df.index)].sort_index()

    # Some values can be -0.0000000001. Make them 0s
    tmp_res_df = tmp_res_df.mask(tmp_res_df < 0, 0)

    # Calculate performance metrics (metrics as rows, cell types as columns)
    method_metrics_df = calculate_metrics(
        source_truth_df=tmp_truth_df,
        source_res_df=tmp_res_df,
        c_types=subset_c_types,
    )

    # One row per cell type, tagged with the method name
    pivot_method_metrics_df = (
        method_metrics_df.T.reset_index()
        .rename(columns={"index": "Subset Cell Type"})
        .assign(Method=method)
    )
    pivot_method_metrics_l.append(pivot_method_metrics_df)

all_method_metrics_df = pd.concat(pivot_method_metrics_l, axis=0)

In [None]:
# Attach the annotated subset name and its annotated minor parent to every
# subset-level metric row, then keep only the annotated names
subset_lineage_cols = [
    "Subset Cell Type",
    "Annotated Minor Cell Type",
    "Annotated Subset Cell Type",
]
subset_ctype_metrics_df = (
    all_method_metrics_df.merge(
        lineages_df[subset_lineage_cols],
        how="inner",
        on="Subset Cell Type",
    )
    .drop(columns=["Subset Cell Type"])
    .rename(
        columns={
            "Annotated Subset Cell Type": "Cell Type",
            "Annotated Minor Cell Type": "Parent",
        }
    )
)

# Specify lineage level
subset_ctype_metrics_df["Lineage Level"] = "subset"

#### Calculate metrics for minor cell types

In [None]:
pivot_method_metrics_l = []

for method in tqdm(methods):
    # Read predictions and filter predictions and groundtruth of 50% tumour
    res_df = pd.read_csv(
        Path(minor_prefix).joinpath(f"data/results/{method}.csv"), sep="\t", index_col=0
    )
    tmp_truth_df = minor_truth_df[
        minor_truth_df["Cancer Epithelial"] == pur_lvl
    ].sort_index()
    # Both tables are sorted by index so that rows line up
    tmp_res_df = res_df[res_df.index.isin(tmp_truth_df.index)].sort_index()

    # Some values can be -0.0000000001. Make them 0s
    tmp_res_df[tmp_res_df < 0] = 0

    # Calculate performance metrics (metrics as rows, cell types as columns)
    method_metrics_df = calculate_metrics(
        source_truth_df=tmp_truth_df,
        source_res_df=tmp_res_df,
        c_types=minor_c_types,
    )

    # One row per cell type, tagged with the method name
    pivot_method_metrics_df = method_metrics_df.T.reset_index().rename(
        columns={"index": "Minor Cell Type"}
    )
    pivot_method_metrics_df["Method"] = method
    pivot_method_metrics_l.append(pivot_method_metrics_df)

all_method_metrics_df = pd.concat(pivot_method_metrics_l, axis=0)

In [None]:
# Look up, for every minor cell type, its annotated name and its major parent.
# The lineage table has one row per subset, so duplicates are dropped first.
minor_lineage_df = lineages_df[
    ["Minor Cell Type", "Major Cell Type", "Annotated Minor Cell Type"]
].drop_duplicates()

minor_ctype_metrics_df = (
    all_method_metrics_df.merge(minor_lineage_df, how="inner", on="Minor Cell Type")
    .drop(columns=["Minor Cell Type"])
    .rename(
        columns={"Annotated Minor Cell Type": "Cell Type", "Major Cell Type": "Parent"}
    )
)

# Specify lineage level
minor_ctype_metrics_df["Lineage Level"] = "minor"

#### Calculate metrics for major cell types

In [None]:
pivot_method_metrics_l = []

for method in tqdm(methods):
    # Read predictions and filter predictions and groundtruth of 50% tumour
    res_df = pd.read_csv(
        Path(major_prefix).joinpath(f"data/results/{method}.csv"), sep="\t", index_col=0
    )
    tmp_truth_df = major_truth_df[
        major_truth_df["Cancer Epithelial"] == pur_lvl
    ].sort_index()
    # Both tables are sorted by index so that rows line up
    tmp_res_df = res_df[res_df.index.isin(tmp_truth_df.index)].sort_index()

    # Some values can be -0.0000000001. Make them 0s
    tmp_res_df[tmp_res_df < 0] = 0

    # Calculate performance metrics (metrics as rows, cell types as columns)
    method_metrics_df = calculate_metrics(
        source_truth_df=tmp_truth_df,
        source_res_df=tmp_res_df,
        c_types=major_c_types,
    )

    # One row per cell type, tagged with the method name
    pivot_method_metrics_df = method_metrics_df.T.reset_index().rename(
        columns={"index": "Major Cell Type"}
    )
    pivot_method_metrics_df["Method"] = method
    pivot_method_metrics_l.append(pivot_method_metrics_df)

all_method_metrics_df = pd.concat(pivot_method_metrics_l, axis=0)

In [None]:
# Major cell types keep their own name as "Cell Type" (rename returns a new
# DataFrame, so the per-method metrics table is left untouched)
major_ctype_metrics_df = all_method_metrics_df.rename(
    columns={"Major Cell Type": "Cell Type"}
)

# Major cell types sit directly under the centre, whose label is a single space
major_ctype_metrics_df["Parent"] = " "

# Specify lineage level
major_ctype_metrics_df["Lineage Level"] = "major"

#### Concatenate metrics across the 3 lineage levels and plot the Sunbursts

In [None]:
# Stack the metrics of the 3 lineage levels, major first
lineage_level_frames = [
    major_ctype_metrics_df,
    minor_ctype_metrics_df,
    subset_ctype_metrics_df,
]
all_ctype_metrics_df = pd.concat(lineage_level_frames, axis=0)

# Add ids column: "<cell type> - <lineage level>", so that a name shared by
# several levels still gets a distinct id per level
cell_type_names = all_ctype_metrics_df["Cell Type"]
lineage_levels = all_ctype_metrics_df["Lineage Level"]
all_ctype_metrics_df["ids"] = cell_type_names + " - " + lineage_levels

#### Get min and max of Pearson's r, RMSE and RPE

In [None]:
min_max_metric_l = []

# Major lineages whose members are considered at every level
immune_major_ctypes = ["T-cells", "B-cells", "Myeloid"]

for method in methods:
    method_ctype_metrics_df = all_ctype_metrics_df[
        all_ctype_metrics_df["Method"] == method
    ].reset_index(drop=True)

    # Row selection per lineage level. Each mask is built once and used for both
    # the minimum and the maximum, so the two always cover the same rows.
    lineage_level = method_ctype_metrics_df["Lineage Level"]
    level_masks = {
        # Major level: the immune major cell types themselves
        "major": (lineage_level == "major")
        & method_ctype_metrics_df["Cell Type"].isin(immune_major_ctypes),
        # Minor level: cell types whose parent is an immune major cell type
        "minor": (lineage_level == "minor")
        & method_ctype_metrics_df["Parent"].isin(immune_major_ctypes),
        # Subset level: cell types whose parent is not a non-immune cell type
        "subset": (lineage_level == "subset")
        & ~method_ctype_metrics_df["Parent"].isin(non_immune_ctypes),
    }

    for metric in ["RMSE", "Pearsonr", "RPE"]:
        for level, level_mask in level_masks.items():
            level_values = method_ctype_metrics_df.loc[level_mask, metric]
            min_max_metric_l.append(
                [method, metric, level, level_values.min(), level_values.max()]
            )

# Concatenate
min_max_metric_df = pd.DataFrame(
    min_max_metric_l, columns=["Method", "Metric", "Lineage_level", "Min", "Max"]
)
# Display the ranges, grouped by metric and lineage level
min_max_metric_df[min_max_metric_df["Method"].isin(["dwls", "bprism_v2"])].sort_values(
    ["Metric", "Lineage_level"]
)

#### Plot Metric Sunbursts
Plot Sunburst of all methods in the same metric in one plot

In [None]:
# Attributes to attach to Sunburst plot for each metrics
metrics_mapping = {
    "RMSE": {
        "column": "RMSE",
        "colorscale": "reds",
        "cmin": 0,
        "cmax": 16,
        "title": "RMSE",
        "tick_suffix": "",
        "d_tick": 2.5,
        "line_color": "#6E0007",
        "file_suffix": "rmse",
    },
    "Pearson's r": {
        "column": "Pearsonr",
        "colorscale": "blues_r",
        "cmin": 0,
        "cmax": 1,
        "title": "Pearson's r",
        "tick_suffix": "",
        "d_tick": 0.1,
        "line_color": "#003364",
        "file_suffix": "pearsonr",
    },
    "RPE": {
        "column": "RPE",
        "colorscale": "ylorbr",  # mix of bright yellow and orange
        "cmin": 0,
        "cmax": 26,
        "title": "Relative Proportion Error",
        "tick_suffix": "",
        "d_tick": 5,
        "line_color": "#591D00",
        "file_suffix": "rpe",
    },
}

In [None]:
# For this plot, we use BayesPrism (bprism_v2) and DWLS
methods = ["bprism_v2", "dwls"]

In [None]:
def plot_metric_sunburst(
    fig: object,
    sb_attrs: Dict,
    show_scale: bool = False,
) -> None:
    """Style a figure of metric Sunbursts and save it as an SVG file.

    The figure is written to
    `<viz_prefix>/figures/main_figures/main_fig_5_<file_suffix>.svg`.

    Args:
        - fig:              Figure object containing all subplots; modified in place
        - sb_attrs:         attributes to attach to the Sunburst plot. The keys
                            "colorscale", "cmin", "cmax", "tick_suffix", "d_tick"
                            and "file_suffix" are read here
        - show_scale:       whether to draw the colour bar of the shared colour axis
    """
    fig.update_layout(
        margin=dict(t=0, l=0, r=0, b=2),
        # All traces refer to this one colour axis, so they share scale and range
        coloraxis=dict(
            colorscale=sb_attrs["colorscale"],
            cmin=sb_attrs["cmin"],
            cmax=sb_attrs["cmax"],
            showscale=show_scale,
            colorbar=dict(
                # `title.font` is the supported spelling of the deprecated
                # `titlefont` shorthand
                title=dict(font=dict(size=25)),
                ticksuffix=sb_attrs["tick_suffix"],
                ticks="outside",
                ticklen=5,
                tickwidth=2,
                dtick=sb_attrs["d_tick"],
                tickfont_size=20,
                orientation="h",
            ),
        ),
    )

    # Update traces
    fig.update_traces(
        root=dict(color="white"),
        sort=True,
    )

    # Update layout; only the title position is set, no title text
    fig["layout"].update(
        title={
            "y": 0.99,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        plot_bgcolor="rgba(0,0,0,0)",
        font=dict(size=20, color="black"),
    )

    fig.write_image(
        Path(viz_prefix)
        .joinpath(f"figures/main_figures/main_fig_5_{sb_attrs['file_suffix']}")
        .with_suffix(".svg"),
        height=300,
        width=600,
        scale=5,
    )

In [None]:
# Parent names must match the "ids" of the level above, so they get that
# level's suffix appended.
# NOTE: this rewrites "Parent" in place; running the cell twice appends the
# suffix twice.

# Parents of minor cell types should have the suffix " - major"
is_minor_level = all_ctype_metrics_df["Lineage Level"] == "minor"
minor_ctype_parents = all_ctype_metrics_df.loc[is_minor_level, "Parent"].apply(
    lambda parent: f"{parent} - major"
)
all_ctype_metrics_df.loc[is_minor_level, "Parent"] = minor_ctype_parents

# Parents of subset cell types should have the suffix " - minor"
is_subset_level = all_ctype_metrics_df["Lineage Level"] == "subset"
subset_ctype_parents = all_ctype_metrics_df.loc[is_subset_level, "Parent"].apply(
    lambda parent: f"{parent} - minor"
)
all_ctype_metrics_df.loc[is_subset_level, "Parent"] = subset_ctype_parents

In [None]:
# Save Supplementary Table 3: all metrics, with display names for the methods
# NOTE(review): the output path is relative to the working directory, unlike
# the other outputs which are written under viz_prefix - confirm this is intended
supp_table_3_df = all_ctype_metrics_df.replace(
    {"bprism_v2": "BayesPrism", "dwls": "DWLS"}
)
supp_table_3_df = supp_table_3_df.sort_values(["Method", "Lineage Level", "Cell Type"])
supp_table_3_df.to_excel(
    "publication/supp_tables/Supplementary_Table_3.xlsx", index=False
)

In [None]:
# Save source data: one file per metric, with display names for the methods
source_data_df = all_ctype_metrics_df.replace(
    {"bprism_v2": "BayesPrism", "dwls": "DWLS"}
).sort_values(["Method", "Lineage Level", "Cell Type"])

for metric_col, source_data_file in (
    ("RMSE", "source_data/figure_5d.tsv"),
    ("RPE", "source_data/figure_5e.tsv"),
):
    # "Method" is kept in the output: the table holds one row per method and
    # cell type, so without it the rows of the two methods cannot be told apart
    source_data_df[["Method", "Lineage Level", "Cell Type", metric_col]].to_csv(
        Path(viz_prefix).joinpath(source_data_file), sep="\t"
    )

In [None]:
# One subplot per method. Despite the name, the subplots are laid out as
# columns of a single row.
n_rows = len(methods)

for metric, sb_attrs in metrics_mapping.items():
    # Every subplot is of type "domain", the subplot type Sunburst traces use
    specs = [[{"type": "domain"}] * n_rows]

    fig = make_subplots(
        rows=1,
        cols=n_rows,
        specs=specs,
        vertical_spacing=0.005,
        horizontal_spacing=0.005,
    )

    for col_idx, method in enumerate(methods, start=1):
        # Method-specific metrics are the input of this method's Sunburst
        method_rows_df = all_ctype_metrics_df[
            all_ctype_metrics_df["Method"] == method
        ].reset_index(drop=True)

        # We want the center to be white, so we fake a data point for it.
        # Its id is a single space, which is the "Parent" of the major cell types.
        # NOTE(review): values are positional and rely on the column order
        # Cell Type, RMSE, MAE, Pearsonr, RPE, Method, Parent, Lineage Level, ids
        centre_row_df = pd.DataFrame(
            data=[[" ", 0, 1, 0, 0, method, "", "centre", " "]],
            columns=method_rows_df.columns,
        )

        # Centre first, then the real rows, on a fresh 0..n-1 index
        sunburst_df = pd.concat(
            [centre_row_df, method_rows_df], axis=0, ignore_index=True
        )

        metric_values = sunburst_df[sb_attrs["column"]]
        fig.add_trace(
            go.Sunburst(
                ids=sunburst_df["ids"],
                labels=sunburst_df["Cell Type"],
                parents=sunburst_df["Parent"],
                hovertext=metric_values,
                marker=dict(
                    colors=metric_values,
                    line=dict(color="white", width=1.5),
                    coloraxis="coloraxis",
                ),
            ),
            row=1,
            col=col_idx,
        )

    # Apply the shared styling and write the figure to disk
    plot_metric_sunburst(fig=fig, sb_attrs=sb_attrs, show_scale=False)

#### Plot colour wheel

In [None]:
# The hierarchy is read from the rows of a single method (bprism_v2)
is_bprism = all_ctype_metrics_df["Method"] == "bprism_v2"
colour_wheel_df = all_ctype_metrics_df.loc[is_bprism, ["Cell Type", "Parent", "ids"]]

# Merge with colour pallete to get colour codes; cell types without an entry
# in the pallete are dropped by the inner join
palette_by_cell_type_df = colour_pallete_df.rename(
    columns={"all_celltype": "Cell Type"}
)
colour_wheel_df = colour_wheel_df.merge(
    palette_by_cell_type_df,
    on=["Cell Type"],
    how="inner",
)

In [None]:
# Display the merged colour-wheel table for inspection
colour_wheel_df

In [None]:
# Create Sunburst figure with labels, parents and no values
fig = go.Figure(
    go.Sunburst(
        ids=colour_wheel_df["ids"],
        labels=colour_wheel_df["Cell Type"],
        parents=colour_wheel_df["Parent"],
        hovertext=colour_wheel_df["lineage"],
    )
)

# Update traces
fig.update_traces(
    marker=dict(
        colors=colour_wheel_df["fill"],
        line=dict(color="black", width=1),
    ),
    root=dict(color="white"),
    leaf=dict(opacity=1),
    sort=False,
)

# Tight margin
fig.update_layout(margin=dict(t=0, l=0, r=0, b=0))

# Update layout
fig["layout"].update(
    title={
        # 'text': "",
        "y": 0.99,
        "x": 0.5,
        "xanchor": "center",
        "yanchor": "top",
    },
    plot_bgcolor="rgba(0,0,0,0)",
    font=dict(
        size=20,
    ),
)

fig.write_image(
    Path(viz_prefix).joinpath(f"figures/main_figures/main_fig_5a.svg"),
    width=500,
    height=500,
    scale=5,
)