# Visualize model performance

In [None]:
import numpy as np
import pandas as pd
import anndata as adata

import random

from tqdm import tqdm
from pathlib import Path
from typing import Tuple, List, Dict

import plotly.graph_objects as go
import plotly as plotly
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity as skl_cosine

from scipy.stats import pearsonr
from scipy.spatial.distance import cosine as scipy_cosine
from math import sqrt
from sklearn.metrics import confusion_matrix

%load_ext blackcellmagic

## Set up paths and load setting files

#### Path to subset/minor/major experiments

In [None]:
# Root folder of the subset-level immune lineages experiment
subset_prefix = "???/deconvolution_benchmarking/03_immune_lineages_experiment/subset_level"

# Root folder of the minor-level immune lineages experiment
minor_prefix = "???/deconvolution_benchmarking/03_immune_lineages_experiment/minor_level"

# Root folder of the major-level results (taken from the purity levels experiment)
major_prefix = "???/deconvolution_benchmarking/01_purity_levels_experiment/include_normal_epithelial"

# Root folder for visualization inputs and outputs
viz_prefix = "???/deconvolution_benchmarking/visualizations"

#### List major/minor/subset cell types

In [None]:
# Cell type labels per lineage level. Each list is used below to select the
# columns of the matching groundtruth table, so it also fixes the column order.

# Subset level (29 labels)
subset_c_types = [
    "Endothelial",
    "CAFs",
    "PVL",
    "B cells Memory",
    "B cells Naive",
    "T_cells_c4_CD8+_ZFP36",
    "T_cells_c6_IFIT1",
    "T_cells_c7_CD8+_IFNG",
    "T_cells_c8_CD8+_LAG3",
    "T_cells_c0_CD4+_CCR7",
    "T_cells_c1_CD4+_IL7R",
    "T_cells_c2_CD4+_T-regs_FOXP3",
    "T_cells_c3_CD4+_Tfh_CXCL13",
    "T_cells_c9_NK_cells_AREG",
    "T_cells_c11_MKI67",
    "T_cells_c10_NKT_cells_FCGR3A",
    "Myeloid_c10_Macrophage_1_EGR1",
    "Myeloid_c12_Monocyte_1_IL1B",
    "Myeloid_c2_LAM2_APOE",
    "Myeloid_c1_LAM1_FABP5",
    "Cycling_Myeloid",
    "Myeloid_c4_DCs_pDC_IRF7",
    "Normal Epithelial",
    "Plasmablasts",
    "Myeloid_c8_Monocyte_2_S100A9",
    "Myeloid_c9_Macrophage_2_CXCL10",
    "Myeloid_c11_cDC2_CD1C",
    "Cancer Epithelial",
    "Myeloid_c3_cDC1_CLEC9A",
]
# Minor level (17 labels)
minor_c_types = [
    "Cancer Epithelial",
    "T cells CD4+",
    "T cells CD8+",
    "Endothelial",
    "CAFs",
    "Macrophage",
    "PVL",
    "Normal Epithelial",
    "Plasmablasts",
    "B cells Memory",
    "Monocyte",
    "Cycling_Myeloid",
    "Cycling T-cells",
    "NK cells",
    "NKT cells",
    "DCs",
    "B cells Naive",
]
# Major level (9 labels)
major_c_types = [
    "B-cells",
    "CAFs",
    "Cancer Epithelial",
    "Endothelial",
    "Myeloid",
    "Normal Epithelial",
    "PVL",
    "Plasmablasts",
    "T-cells",
]

#### Load groundtruth

In [None]:
# We only use tumour purity = 50%: the cells below keep the mixtures whose
# groundtruth "Cancer Epithelial" fraction is exactly equal to this value
pur_lvl = 0.5

In [None]:
# Groundtruth proportions for each lineage level. Every table is read from the
# experiment's results folder and immediately restricted to (and ordered by)
# the cell types listed for that level.

# Subset
subset_truth_df = pd.read_csv(
    Path(subset_prefix) / "data/results/truth.csv", sep="\t", index_col=0
)[subset_c_types]

# Minor
minor_truth_df = pd.read_csv(
    Path(minor_prefix) / "data/results/truth.csv", sep="\t", index_col=0
)[minor_c_types]

# Major
major_truth_df = pd.read_csv(
    Path(major_prefix) / "data/results/truth.csv", sep="\t", index_col=0
)[major_c_types]

#### Utilities

In [None]:
def apply_clr_transform(
    source_df: pd.DataFrame, threshold: float = 0.0001
) -> pd.DataFrame:
    """Apply the Centered Log-Ratio (CLR) transformation to each row of a DataFrame

    Every value below `threshold` (zeros included) is first raised to
    `threshold` so that the logarithm is defined. Each row is then expressed
    relative to its own geometric mean, taken over ALL columns of `source_df`:

        clr(x)_i = log(x_i / g(x)) = log(x_i) - mean_j(log(x_j))

    Args:
        - source_df:     provided DataFrame, rows as samples, cols as cell types
        - threshold:     fractions below this value are replaced by it

    Returns:
        A new DataFrame with the same index and columns as `source_df` holding
        the CLR-transformed values. `source_df` itself is not modified.
    """
    # clip() returns a new object, so the caller's DataFrame is left untouched
    floored_df = source_df.clip(lower=threshold)

    # The log of the geometric mean is the mean of the logs. Averaging over the
    # actual columns keeps the transform correct for any number of cell types
    # (the exponent used to be hard-coded to 1/9) and avoids multiplying many
    # small fractions together.
    log_df = np.log(floored_df)
    clr_df = log_df.sub(log_df.mean(axis=1), axis=0)

    return clr_df

In [None]:
def calculate_aitcs_dist(
    source_truth_df: pd.DataFrame, source_res_df: pd.DataFrame
) -> pd.DataFrame:
    """Compute the per-mixture Aitchison distance between predictions and groundtruth

    The function assumes that rows are mixtures and columns are cell types.
    It only takes the row-wise Euclidean distance between the two tables, so
    callers are expected to pass CLR-transformed values.

    Args:
        - source_truth_df:      groundtruth DataFrame
        - source_res_df:        predictions DataFrame; must contain every
                                column of `source_truth_df`

    Returns:
        DataFrame indexed by mixture with a single "Aitchison Distance" column
    """
    # Give the predictions the same column order as the groundtruth. Column
    # selection and sort_index() both return new objects, so neither input is
    # altered.
    res_df = source_res_df[source_truth_df.columns].sort_index()
    truth_df = source_truth_df.sort_index()

    # Row-wise Euclidean distance; the subtraction aligns rows on the index
    squared_diff_df = (res_df - truth_df) ** 2
    aitcs_dist_df = np.sqrt(squared_diff_df.sum(axis=1)).to_frame(
        name="Aitchison Distance"
    )

    return aitcs_dist_df

#### Extract lineages metadata

In [None]:
# Read the lineage mapping of all cell types in the data and switch its
# columns to human-readable headers in one go
lineages_df = pd.read_csv(
    Path(viz_prefix) / "data/Whole_miniatlas_immune_lineages.tsv", sep="\t"
).rename(
    columns={
        "celltype_major": "Major Cell Type",
        "celltype_major_short": "Annotated Major Cell Type",
        "celltype_minor": "Minor Cell Type",
        "celltype_minor_short": "Annotated Minor Cell Type",
        "celltype_subset": "Subset Cell Type",
        "celltype_subset_short": "Annotated Subset Cell Type",
        "marker_genes": "Marker Genes",
        "counts": "Cell Counts",
        "num_of_patients": "Patient Counts",
    }
)

In [None]:
# Subset cell types that are excluded from the lineages DataFrame
dropped_subset_ctypes = [
    # Removed from the experiment
    "Myeloid_c7_Monocyte_3_FCGR3A",
    # The 3 cell types that MuSiC dropped
    "Myeloid_c0_DC_LAMP3",
    "Myeloid_c5_Macrophage_3_SIGLEC1",
    "T_cells_c5_CD8+_GZMK",
]

is_dropped = lineages_df["Subset Cell Type"].isin(dropped_subset_ctypes)
lineages_df = lineages_df[~is_dropped]

#### Extract colour palette

In [None]:
# Load the colour palette table (tab-separated despite the .csv extension)
colour_pallete_path = Path(viz_prefix) / "data/Whole_miniatlas_colour_pallete.csv"
colour_pallete_df = pd.read_csv(colour_pallete_path, sep="\t")

## [Fig] Heatmaps of Aitchison distances across methods

In [None]:
# For this plot, we do all methods. Each identifier is also the stem of the
# method's predictions file (data/results/<method>.csv) read in the loops below
methods = [
    "epic",
    "cpm",
    "dwls",
    "cbx",
    "scaden",
    "bisque",
    "hspe",
    "music",
    "bprism_v2",
]
# Columns dropped from the CLR-transformed tables to get the "Immune Only"
# distances
non_immune_ctypes = [
    "CAFs",
    "Cancer Epithelial",
    "Endothelial",
    "Normal Epithelial",
    "PVL",
    "Plasmablasts",
]

# Set clipping threshold for Aitchison distance calculation; it is passed as
# `threshold` to apply_clr_transform for both predictions and groundtruth
aitcs_dist_threshold = 0.001

### Calculate Aitchison distance for subset cell types

In [None]:
subset_aitcs_dist_l = []

for method in tqdm(methods):
    # Load this method's predictions, then keep the groundtruth mixtures at the
    # selected tumour purity together with their matching predictions
    res_df = pd.read_csv(
        Path(subset_prefix) / f"data/results/{method}.csv", sep="\t", index_col=0
    )
    at_pur_lvl = subset_truth_df["Cancer Epithelial"] == pur_lvl
    tmp_truth_df = subset_truth_df.loc[at_pur_lvl].sort_index()
    tmp_res_df = res_df.loc[res_df.index.isin(tmp_truth_df.index)].sort_index()

    # Move both tables into CLR space using the same clipping threshold
    clr_tmp_truth_df = apply_clr_transform(tmp_truth_df, aitcs_dist_threshold)
    clr_tmp_res_df = apply_clr_transform(tmp_res_df, aitcs_dist_threshold)

    # Median distance across mixtures with every cell type included
    aitcs_dist_df = calculate_aitcs_dist(
        source_truth_df=clr_tmp_truth_df, source_res_df=clr_tmp_res_df
    )
    all_aitcs_dist = aitcs_dist_df["Aitchison Distance"].median()

    # Median distance across mixtures once the non-immune columns are removed
    aitcs_dist_df = calculate_aitcs_dist(
        source_truth_df=clr_tmp_truth_df.drop(columns=non_immune_ctypes),
        source_res_df=clr_tmp_res_df.drop(columns=non_immune_ctypes),
    )
    immune_aitcs_dist = aitcs_dist_df["Aitchison Distance"].median()

    # Two-row table for this method: its score column plus the shared labels
    method_subset_aitcs_dist_df = pd.DataFrame(
        {
            method: [all_aitcs_dist, immune_aitcs_dist],
            "expr": ["All Cell Types", "Immune Only"],
            "lineage_lvl": ["subset", "subset"],
        }
    )
    subset_aitcs_dist_l.append(method_subset_aitcs_dist_df)

# Put the per-method tables side by side. The label columns repeat once per
# method, so only the first occurrence of each column name is kept.
subset_aitcs_dist_df = pd.concat(subset_aitcs_dist_l, axis=1)
is_first_occurrence = ~subset_aitcs_dist_df.columns.duplicated()
subset_aitcs_dist_df = subset_aitcs_dist_df.loc[:, is_first_occurrence]

### Calculate Aitchison distance for minor cell types

In [None]:
minor_aitcs_dist_l = []

for method in tqdm(methods):
    # Load this method's predictions, then keep the groundtruth mixtures at the
    # selected tumour purity together with their matching predictions.
    # (tqdm already reports progress, so the method name is not printed.)
    res_df = pd.read_csv(
        Path(minor_prefix) / f"data/results/{method}.csv", sep="\t", index_col=0
    )
    at_pur_lvl = minor_truth_df["Cancer Epithelial"] == pur_lvl
    tmp_truth_df = minor_truth_df.loc[at_pur_lvl].sort_index()
    tmp_res_df = res_df.loc[res_df.index.isin(tmp_truth_df.index)].sort_index()

    # Move both tables into CLR space using the same clipping threshold
    clr_tmp_truth_df = apply_clr_transform(tmp_truth_df, aitcs_dist_threshold)
    clr_tmp_res_df = apply_clr_transform(tmp_res_df, aitcs_dist_threshold)

    # Median distance across mixtures with every cell type included
    aitcs_dist_df = calculate_aitcs_dist(
        source_truth_df=clr_tmp_truth_df, source_res_df=clr_tmp_res_df
    )
    all_aitcs_dist = aitcs_dist_df["Aitchison Distance"].median()

    # Median distance across mixtures once the non-immune columns are removed
    aitcs_dist_df = calculate_aitcs_dist(
        source_truth_df=clr_tmp_truth_df.drop(columns=non_immune_ctypes),
        source_res_df=clr_tmp_res_df.drop(columns=non_immune_ctypes),
    )
    immune_aitcs_dist = aitcs_dist_df["Aitchison Distance"].median()

    # Two-row table for this method: its score column plus the shared labels
    method_minor_aitcs_dist_df = pd.DataFrame(
        {
            method: [all_aitcs_dist, immune_aitcs_dist],
            "expr": ["All Cell Types", "Immune Only"],
            "lineage_lvl": ["minor", "minor"],
        }
    )
    minor_aitcs_dist_l.append(method_minor_aitcs_dist_df)

# Put the per-method tables side by side. The label columns repeat once per
# method, so only the first occurrence of each column name is kept.
minor_aitcs_dist_df = pd.concat(minor_aitcs_dist_l, axis=1)
is_first_occurrence = ~minor_aitcs_dist_df.columns.duplicated()
minor_aitcs_dist_df = minor_aitcs_dist_df.loc[:, is_first_occurrence]

### Calculate Aitchison distance for major cell types

In [None]:
major_aitcs_dist_l = []

for method in tqdm(methods):
    # Load this method's predictions, then keep the groundtruth mixtures at the
    # selected tumour purity together with their matching predictions
    res_df = pd.read_csv(
        Path(major_prefix) / f"data/results/{method}.csv", sep="\t", index_col=0
    )
    at_pur_lvl = major_truth_df["Cancer Epithelial"] == pur_lvl
    tmp_truth_df = major_truth_df.loc[at_pur_lvl].sort_index()
    tmp_res_df = res_df.loc[res_df.index.isin(tmp_truth_df.index)].sort_index()

    # Move both tables into CLR space using the same clipping threshold
    clr_tmp_truth_df = apply_clr_transform(tmp_truth_df, aitcs_dist_threshold)
    clr_tmp_res_df = apply_clr_transform(tmp_res_df, aitcs_dist_threshold)

    # Median distance across mixtures with every cell type included
    aitcs_dist_df = calculate_aitcs_dist(
        source_truth_df=clr_tmp_truth_df, source_res_df=clr_tmp_res_df
    )
    all_aitcs_dist = aitcs_dist_df["Aitchison Distance"].median()

    # Median distance across mixtures once the non-immune columns are removed
    aitcs_dist_df = calculate_aitcs_dist(
        source_truth_df=clr_tmp_truth_df.drop(columns=non_immune_ctypes),
        source_res_df=clr_tmp_res_df.drop(columns=non_immune_ctypes),
    )
    immune_aitcs_dist = aitcs_dist_df["Aitchison Distance"].median()

    # Two-row table for this method: its score column plus the shared labels
    method_major_aitcs_dist_df = pd.DataFrame(
        {
            method: [all_aitcs_dist, immune_aitcs_dist],
            "expr": ["All Cell Types", "Immune Only"],
            "lineage_lvl": ["major", "major"],
        }
    )
    major_aitcs_dist_l.append(method_major_aitcs_dist_df)

# Put the per-method tables side by side. The label columns repeat once per
# method, so only the first occurrence of each column name is kept.
major_aitcs_dist_df = pd.concat(major_aitcs_dist_l, axis=1)
is_first_occurrence = ~major_aitcs_dist_df.columns.duplicated()
major_aitcs_dist_df = major_aitcs_dist_df.loc[:, is_first_occurrence]

### Concatenate all Aitchison distance DataFrames

In [None]:
# Stack the three lineage levels row-wise, then switch the method columns to
# their display names ("hspe" has no entry in the mapping and keeps its name)
aitcs_dist_df = pd.concat(
    [subset_aitcs_dist_df, minor_aitcs_dist_df, major_aitcs_dist_df], axis=0
).rename(
    columns={
        "scaden": "Scaden",
        "music": "MuSiC",
        "cbx": "CBX",
        "dwls": "DWLS",
        "epic": "EPIC",
        "cpm": "CPM",
        "bisque": "Bisque",
        "bprism_v2": "BayesPrism",
    }
)

#### Create annotated heatmap

In [None]:
# Save source data: one TSV per experiment, written without the "expr" label
# column that was used to select its rows
for expr_label, rel_path in (
    ("All Cell Types", "source_data/figure_5b.tsv"),
    ("Immune Only", "source_data/figure_5c.tsv"),
):
    panel_df = aitcs_dist_df[aitcs_dist_df["expr"] == expr_label]
    panel_df.drop(columns=["expr"]).to_csv(Path(viz_prefix) / rel_path, sep="\t")

In [None]:
# Display names of the methods, in the left-to-right order used on the x axis
new_methods = [
    "BayesPrism",
    "DWLS",
    "MuSiC",
    "CBX",
    "Scaden",
    "Bisque",
    "EPIC",
    "hspe",
    "CPM",
]

# One heatmap per experiment ("All Cell Types" / "Immune Only")
for expr in aitcs_dist_df["expr"].unique():
    expr_aitcs_df = aitcs_dist_df[aitcs_dist_df["expr"] == expr]

    # The same rounded scores drive the cell colours, the printed annotations
    # and the hover text
    rounded_scores = expr_aitcs_df[new_methods].round(2).values

    fig = ff.create_annotated_heatmap(
        z=rounded_scores,
        annotation_text=rounded_scores,  # Annotate with AitcsDist values
        zmin=0,
        zmax=25,
        x=new_methods,  # Columns are methods
        y=expr_aitcs_df["lineage_lvl"].tolist(),  # Rows are lineage levels
        colorscale="teal",
        hoverinfo="text",
        text=rounded_scores,
        showscale=False,
        colorbar=dict(
            ticks="outside",
            ticksuffix="",
            dtick=5,
            orientation="h",
            yanchor="bottom",
            y=-2,
            xanchor="center",
            x=0.5,
        ),
    )

    # Axis styling
    xaxis_style = dict(
        title="Methods",
        title_standoff=2,
        title_font_size=18,
        ticks="outside",
        showticklabels=True,
        tickmode="array",
        tickvals=new_methods,
        tickfont_size=17,
        tickangle=25,
        linecolor="black",
        side="bottom",
    )
    yaxis_style = dict(
        title="Lineage Levels",
        title_standoff=5,
        title_font_size=18,
        linecolor="black",
        categoryorder="array",
        categoryarray=["subset", "minor", "major"],  # Order of the lineage levels
        tickfont_size=17,
        ticks="outside",
        ticklen=2,
    )

    # Update layout
    fig.update_layout(
        plot_bgcolor="rgba(0,0,0,0)",
        font=dict(size=18, color="black"),
        xaxis=xaxis_style,
        yaxis=yaxis_style,
        margin=dict(t=0, l=0, r=0, b=0),
    )

    # The output path is relative to the current working directory
    output_path = Path(f"figures/main_figures/main_fig_5_{expr}").with_suffix(".svg")
    fig.write_image(output_path, width=650, height=275, scale=5)