# Orthogonal validation of deconvolution prediction

In [None]:
import numpy as np
import pandas as pd
import anndata as adata
import scanpy as sc

from tqdm import tqdm
from pathlib import Path
from typing import Tuple, List, Dict

import plotly.graph_objects as go
import plotly as plotly
import plotly.express as px

from sklearn.metrics import mean_squared_error

from scipy.stats import pearsonr
from math import sqrt

%load_ext blackcellmagic

In [None]:
# Root directory of this validation experiment; every input file read in this
# notebook is resolved relative to it.
# NOTE(review): "???" looks like a placeholder for the local checkout - confirm
prefix = "???/deconvolution_benchmarking/04_tcga_bulk_validation"

# Cell-type columns present in each method's prediction table. They are dropped
# from the prediction tables once the quantity of interest has been derived.
c_types = [
    "B-cells", "CAFs", "Cancer Epithelial",
    "Endothelial", "Myeloid", "Normal Epithelial",
    "PVL", "Plasmablasts", "T-cells",
]

## Load data

#### Barcodes & PAM50 subtypes

In [None]:
# Read the sample uuid - donor uuid - patient barcode mapping
# (tab-separated; the first column becomes the index)
donor_sample_barcode_mapping_df = pd.read_csv(
    Path(prefix) / "data" / "donor_sample_barcode_mapping.tsv",
    sep="\t",
    index_col=0,
)

# The "entity" column is not used anywhere below
donor_sample_barcode_mapping_df = donor_sample_barcode_mapping_df.drop(
    columns=["entity"]
)

# sampleID is the first 16 characters of sampleBarcode; it is the key later
# matched against the "Sample ID" of the ESTIMATE / LUMP / IHC table
donor_sample_barcode_mapping_df["sampleID"] = [
    barcode[:16] for barcode in donor_sample_barcode_mapping_df["sampleBarcode"]
]
donor_sample_barcode_mapping_df.head(2)

In [None]:
# Load PAM50 subtypes (tab-separated; the first column becomes the index)
subtypes_meta_df = pd.read_csv(
    Path(prefix).joinpath("data/validation_data/TCGA_PAM50_hg38_v84.tsv"),
    index_col=0,
    sep="\t",
)
subtypes_meta_df.rename(columns={"subtype": "PAM50.RNAseq"}, inplace=True)

# Convert PAM50.RNAseq into a categorical with an explicit category order.
# reorder_categories() returns a new Series rather than modifying in place, so
# its result has to be assigned back for the ordering to take effect.
# It raises ValueError unless the observed categories are exactly these five.
subtypes_meta_df["PAM50.RNAseq"] = (
    subtypes_meta_df["PAM50.RNAseq"]
    .astype("category")
    .cat.reorder_categories(["Normal", "Her2", "LumA", "LumB", "Basal"])
)

# Attach the subtype to the barcode mapping. The join is on the index of both
# tables and is an inner join, so samples without a PAM50 call are dropped.
donor_sample_barcode_mapping_df = donor_sample_barcode_mapping_df.merge(
    subtypes_meta_df[["PAM50.RNAseq"]], left_index=True, right_index=True, how="inner"
)
donor_sample_barcode_mapping_df.head(2)

#### Saltz et al.'s TIL estimates

In [None]:
# TIL estimates table (tab-separated, default integer index). Downstream code
# reads its "ParticipantBarcode", "Study" and "til_percentage" columns.
saltz_til_df = pd.read_csv(
    Path(prefix) / "data" / "validation_data" / "TCGA_BRCA_TIL_H&E.txt", sep="\t"
)
saltz_til_df

#### Tumor purity call

ABSOLUTE

In [None]:
# ABSOLUTE tumour purity calls (tab-separated, default integer index)
absolute_tum_pur_df = pd.read_csv(
    Path(prefix) / "data" / "validation_data" / "TCGA_BRCA_purity.txt", sep="\t"
)

# The sample type is the 4th dash-separated field of the "sample" identifier
absolute_tum_pur_df["sampleType"] = [
    sample.split("-")[3] for sample in absolute_tum_pur_df["sample"]
]

# Restrict to primary tumour sample types
absolute_tum_pur_df = absolute_tum_pur_df.loc[
    lambda df: df["sampleType"].isin(["01A", "01B"])
]

absolute_tum_pur_df.head(2)

ESTIMATE, LUMP and IHC

In [None]:
# ESTIMATE / LUMP / IHC (and CPE) purity estimates, read from the
# "Supp Data 1" sheet of the supplementary workbook
aran_et_al_df = pd.read_excel(
    Path(prefix)
    / "data/validation_data/41467_2015_BFncomms9971_MOESM1236_ESM.xlsx",
    sheet_name="Supp Data 1",
)

# "Sample ID" is unique, so it serves as the index
aran_et_al_df = aran_et_al_df.set_index(["Sample ID"])

# The ABSOLUTE column of this sheet is empty; ABSOLUTE calls come from a
# separate file instead
aran_et_al_df = aran_et_al_df.drop(columns=["ABSOLUTE"])

# Split each sample id at its last dash: the trailing field is the sample type,
# everything before it is the patient barcode
aran_et_al_df["sampleType"] = [
    sample_id.split("-")[-1] for sample_id in aran_et_al_df.index
]
aran_et_al_df["patientBarcode"] = [
    sample_id[: sample_id.rfind("-")] for sample_id in aran_et_al_df.index
]

# Keep breast cancer (BRCA) rows whose sample type is a primary tumour
aran_et_al_df = aran_et_al_df.loc[
    lambda df: df["Cancer type"].eq("BRCA")
    & df["sampleType"].isin(["01A", "01B", "01C"])
]
aran_et_al_df.head(2)

## Tumour purity validation

In [None]:
# Deconvolution methods to validate. Each name is also the stem of that
# method's result file (data/results/<name>.csv).
# Two further BayesPrism variants are currently excluded:
#   "bprism_v2_marker_genes_cell_states"
#   "bprism_v2_no_marker_genes_cell_states"
models_l = [
    "bisque_linear", "cbx", "cpm", "dwls_seurat",
    "epic", "hspe", "music", "scaden",
    "bprism_v2_no_marker_genes_no_cell_states",
]

In [None]:
# Collect, for every method, the predicted tumour purity of each primary tumour
# sample together with the orthogonal purity estimates, then stack all methods
# into all_preds_df (one row per sample x method, values in %).
preds_l = []

# Right-hand sides of the merges below; they do not depend on the method
sample_annot_df = donor_sample_barcode_mapping_df[
    ["sampleBarcode", "sampleID", "patientBarcode", "sampleType", "PAM50.RNAseq"]
]
absolute_calls_df = absolute_tum_pur_df[["bcr_patient_barcode", "purity"]]
aran_calls_df = aran_et_al_df.reset_index().drop(
    columns=["sampleType", "patientBarcode"]
)

for model in models_l:
    # Predictions of this method: data/results/<model>.csv (tab-separated)
    preds_df = pd.read_csv(
        Path(prefix).joinpath(f"data/results/{model}").with_suffix(".csv"),
        sep="\t",
        index_col=0,
    )

    # The cbx output carries three extra columns that are not cell types
    if model == "cbx":
        preds_df = preds_df.drop(columns=["P-value", "Correlation", "RMSE"])

    # Some predictions can be very small negative numbers, e.g. -0.00000001;
    # set them to 0
    preds_df[preds_df < 0] = 0

    # Predicted tumour purity is the Cancer Epithelial value; the individual
    # cell type columns are not needed afterwards
    preds_df = preds_df.assign(tumour_purity=preds_df["Cancer Epithelial"]).drop(
        columns=c_types
    )

    # Annotate with barcodes / subtype (index-on-index inner join) and keep
    # primary tumour samples only
    primary_preds_df = preds_df.merge(
        sample_annot_df, how="inner", left_index=True, right_index=True
    ).loc[lambda df: df["sampleType"].isin(["01A", "01B"])]

    # Left-join ABSOLUTE (per patient barcode), then ESTIMATE / IHC / LUMP
    # (per sample id), tag the method and drop the helper key columns.
    # NOTE(review): the ABSOLUTE join is on patient barcode, so a patient with
    # several primary samples in either table would yield repeated rows - confirm
    method_purity_df = (
        primary_preds_df.merge(
            absolute_calls_df,
            how="left",
            left_on="patientBarcode",
            right_on="bcr_patient_barcode",
        )
        .drop(columns=["bcr_patient_barcode"])
        .rename(columns={"purity": "ABSOLUTE"})
        .merge(aran_calls_df, how="left", left_on="sampleID", right_on="Sample ID")
        .drop(columns=["Sample ID", "Cancer type"])
        .assign(method=model)
        .drop(columns=["patientBarcode", "sampleType"])
    )

    preds_l.append(method_purity_df)

# Concatenate results across methods
all_preds_df = pd.concat(preds_l, axis=0)

# Swap internal method ids for display names ("hspe" is left unchanged)
all_preds_df = all_preds_df.replace(
    {
        "scaden": "Scaden",
        "music": "MuSiC",
        "cbx": "CBX",
        "bisque_linear": "Bisque",
        "dwls_seurat": "DWLS",
        "epic": "EPIC",
        "cpm": "CPM",
        "bprism_v2_no_marker_genes_no_cell_states": "BayesPrism",
    }
)

# Express the predicted tumour purity on a % scale
all_preds_df["tumour_purity"] *= 100

# "IHC" is reported as "Pathology" from here on
all_preds_df = all_preds_df.rename(columns={"IHC": "Pathology"})

# Express every reference purity estimate on a % scale as well
for purity_col in ["ABSOLUTE", "ESTIMATE", "LUMP", "Pathology", "CPE"]:
    all_preds_df[purity_col] = all_preds_df[purity_col] * 100

In [None]:
# RMSE and Pearson's r between predicted tumour purity and each reference
# purity estimate, computed separately for every (metric, method) pair
stats_l = []

for metric in tqdm(["ABSOLUTE", "ESTIMATE", "LUMP", "Pathology", "CPE"]):
    # Only samples that have a value for this reference estimate are scored
    metric_preds_df = all_preds_df.dropna(subset=[metric])

    for method in metric_preds_df["method"].unique():
        # Rows of the current method, selected once and reused for both scores
        method_df = metric_preds_df[metric_preds_df["method"] == method]
        predicted = method_df["tumour_purity"]
        reference = method_df[metric]

        rmse = sqrt(mean_squared_error(predicted, reference))
        r = pearsonr(predicted, reference)[0]

        stats_l.append([metric, method, rmse, r])

stats_df = pd.DataFrame(stats_l, columns=["Metric", "Method", "RMSE", "Pearson's"])
stats_df

Plot ABSOLUTE individually

In [None]:
# Scatter of predicted tumour purity against the ABSOLUTE purity call:
# one facet per method (3 per row), points coloured by PAM50 subtype
fig = px.scatter(
    all_preds_df,
    x="ABSOLUTE",
    y="tumour_purity",
    color="PAM50.RNAseq",
    color_discrete_sequence=px.colors.qualitative.D3,
    facet_col="method",
    facet_col_wrap=3,
    facet_col_spacing=0.05,
    facet_row_spacing=0.05,
    # Disabled options kept for reference:
    #   facet_row_wrap=3, trendline="ols", trendline_scope="trace"
    category_orders={
        "method": [
            "BayesPrism", "Scaden", "MuSiC",
            "CBX", "Bisque", "DWLS",
            "EPIC", "hspe", "CPM",
        ]
    },
)

fig.update_traces(marker={"size": 5}, opacity=0.75)

# Identical styling for x- and y-axes of every facet: black axis line, outside
# ticks at 0/25/50/75/100, independent (unmatched) axes with a fixed range
percent_axis_style = {
    "linecolor": "black",
    "linewidth": 1,
    "ticks": "outside",
    "tickmode": "array",
    "tickvals": [0, 25, 50, 75, 100],
    "ticklen": 2,
    "tickwidth": 1,
    "tickfont_size": 7,
    "matches": None,
    "range": [-1, 101],
}
fig.update_xaxes(**percent_axis_style)
fig.update_yaxes(**percent_axis_style)

# Axis titles only on the left-most column (y) and on row 1 (x)
fig.update_yaxes(
    title="Predicted tumour purity (%)", col=1, title_font_size=8, title_standoff=5
)
fig.update_xaxes(
    title="ABSOLUTE purity call (%)", row=1, title_font_size=8, title_standoff=5
)

# Global layout: small black font, transparent plot background, legend hidden
# (its settings are kept so it can be switched back on)
fig.update_layout(
    font={"size": 8, "color": "black"},
    plot_bgcolor="rgba(0,0,0,0)",
    showlegend=False,
    legend_traceorder="reversed",
    legend={
        "title": "PAM50 subtypes",
        "title_font_size": 8,
        "font_size": 7,
        "orientation": "h",
        "yanchor": "bottom",
        "y": -0.5,
        "xanchor": "center",
        "x": 0.5,
    },
    newshape={"opacity": 1},
    margin={"t": 12, "l": 10, "r": 5, "b": 0},
)

# Dotted y = x reference line in every facet
fig.add_shape(
    type="line",
    x0=0,
    y0=0,
    x1=100,
    y1=100,
    line={"color": "darkgray", "dash": "dot", "width": 1},
    row="all",
    col="all",
)

# Shorten each facet title to the text after its last "="
fig.for_each_annotation(
    lambda annotation: annotation.update(text=annotation.text.split("=")[-1])
)

# Export as PNG and SVG. Original note: 550 x 325 are the dimensions to use
# with the legend; 325 x 325 was the alternative.
absolute_fig_stem = Path("./figures/tumour_purity/absolute_validation")
for suffix in [".png", ".svg"]:
    fig.write_image(
        absolute_fig_stem.with_suffix(suffix), width=550, height=325, scale=5
    )

Plot CPE, ABSOLUTE, ESTIMATE, LUMP and IHC together

In [None]:
# Reshape to long format: one row per sample x method x reference metric,
# with the metric name in "metric" and its value in "estimated_purity"
pivot_longer_all_preds_df = all_preds_df.melt(
    id_vars=["sampleBarcode", "method", "PAM50.RNAseq", "tumour_purity"],
    value_vars=["ABSOLUTE", "ESTIMATE", "LUMP", "Pathology", "CPE"],
    var_name="metric",
    value_name="estimated_purity",
)
pivot_longer_all_preds_df.head(2)

In [None]:
# Pearson's r as a metric x method table, with rows and columns arranged in a
# fixed display order
pearson_metric_order = ["CPE", "ABSOLUTE", "ESTIMATE", "LUMP", "Pathology"]
pearson_method_order = [
    "BayesPrism", "Scaden", "MuSiC", "hspe", "CBX",
    "Bisque", "DWLS", "EPIC", "CPM",
]

pearson_stats_df = stats_df.pivot(
    index="Metric", columns="Method", values="Pearson's"
)

# Ordered categorical labels make sort_index follow the display order instead
# of alphabetical order
pearson_stats_df.index = pd.CategoricalIndex(
    pearson_stats_df.index, categories=pearson_metric_order, ordered=True
)
pearson_stats_df.columns = pd.CategoricalIndex(
    pearson_stats_df.columns, categories=pearson_method_order, ordered=True
)

# Sort rows, then columns
pearson_stats_df = pearson_stats_df.sort_index(axis=0).sort_index(axis=1)

pearson_stats_df.round(2)

In [None]:
# RMSE as a metric x method table, with rows and columns arranged in a fixed
# display order. The table gets its own name (rmse_stats_df) so that it no
# longer overwrites the Pearson table held in pearson_stats_df.
rmse_stats_df = stats_df[["Metric", "Method", "RMSE"]].pivot(
    index="Metric", columns="Method", values="RMSE"
)

# Ordered categorical labels make sort_index follow the display order instead
# of alphabetical order
rmse_stats_df.index = pd.CategoricalIndex(
    rmse_stats_df.index,
    categories=["CPE", "ABSOLUTE", "ESTIMATE", "LUMP", "Pathology"],
    ordered=True,
)
rmse_stats_df.columns = pd.CategoricalIndex(
    rmse_stats_df.columns,
    categories=[
        "BayesPrism",
        "Scaden",
        "MuSiC",
        "hspe",
        "CBX",
        "Bisque",
        "DWLS",
        "EPIC",
        "CPM",
    ],
    ordered=True,
)

# Sort index and column
rmse_stats_df.sort_index(inplace=True)
rmse_stats_df.sort_index(axis=1, inplace=True)

rmse_stats_df.round(2)

In [None]:
# Grid of scatters: predicted tumour purity (y) against each reference purity
# estimate (x); rows are reference metrics, columns are methods, points are
# coloured by PAM50 subtype
fig = px.scatter(
    pivot_longer_all_preds_df,
    x="estimated_purity",
    y="tumour_purity",
    color="PAM50.RNAseq",
    color_discrete_sequence=px.colors.qualitative.D3,
    facet_row="metric",
    facet_row_spacing=0.035,
    facet_col="method",
    facet_col_spacing=0.015,
    # Disabled options kept for reference:
    #   trendline="ols", trendline_scope="group"
    category_orders={
        "metric": ["CPE", "ABSOLUTE", "ESTIMATE", "LUMP", "Pathology"],
        "method": [
            "BayesPrism", "Scaden", "MuSiC", "hspe", "CBX",
            "Bisque", "DWLS", "EPIC", "CPM",
        ],
    },
)

fig.update_traces(
    marker={"size": 1.5},
    line={"width": 1, "color": "darkgray"},
    opacity=0.75,
    textfont_size=5,
)

# Identical styling for x- and y-axes of every facet: black axis line, outside
# ticks at 0/25/50/75/100, independent (unmatched) axes with a fixed range
grid_axis_style = {
    "linecolor": "black",
    "linewidth": 1,
    "ticks": "outside",
    "tickmode": "array",
    "tickvals": [0, 25, 50, 75, 100],
    "ticklen": 2,
    "tickwidth": 1,
    "matches": None,
    "range": [-1, 101],
}
fig.update_xaxes(**grid_axis_style)
fig.update_yaxes(**grid_axis_style)

# Title font of the left-most column (y) and of row 1 (x) is set to size 1;
# explicit titles are disabled here:
#   y: "ABSOLUTE purity call (%)", x: "Predicted tumour purity (%)"
fig.update_yaxes(col=1, title_font_size=1, title_standoff=5)
fig.update_xaxes(row=1, title_font_size=1, title_standoff=5)

# Global layout: small black font, transparent plot background, legend hidden
# (its settings are kept so it can be switched back on)
fig.update_layout(
    font_size=8,
    font_color="black",
    plot_bgcolor="rgba(0,0,0,0)",
    showlegend=False,
    legend_traceorder="reversed",
    legend={
        "title": "PAM50 subtypes",
        "title_font_size": 8,
        "font_size": 7,
        "orientation": "h",
        "yanchor": "bottom",
        "y": -0.5,
        "xanchor": "center",
        "x": 0.5,
    },
    newshape={"opacity": 1},
    margin={"t": 10, "l": 0, "r": 0, "b": 0},
)

# Dotted y = x reference line in every facet
fig.add_shape(
    type="line",
    x0=0,
    y0=0,
    x1=100,
    y1=100,
    line={"color": "darkgray", "dash": "dot", "width": 1},
    row="all",
    col="all",
)

# Shorten each facet title to the text after its last "="
fig.for_each_annotation(
    lambda annotation: annotation.update(text=annotation.text.split("=")[-1])
)

# Export as PNG and SVG
all_validation_fig_stem = Path("./figures/tumour_purity/all_validation")
for suffix in [".png", ".svg"]:
    fig.write_image(
        all_validation_fig_stem.with_suffix(suffix), width=750, height=425, scale=5
    )

## TIL validation against Saltz et al.

In [None]:
# Deconvolution methods included in the TIL validation. Each name is also the
# stem of that method's result file (data/results/<name>.csv).
# Two further BayesPrism variants are currently excluded:
#   "bprism_v2_marker_genes_cell_states"
#   "bprism_v2_no_marker_genes_cell_states"
models_l = [
    "bisque_linear", "cbx", "cpm", "dwls_seurat",
    "epic", "hspe", "music", "scaden",
    "bprism_v2_no_marker_genes_no_cell_states",
]

In [None]:
# Collect, for every method, the predicted TIL percentage of each primary
# tumour sample together with the Saltz et al. TIL estimate, then stack all
# methods into all_preds_df (one row per sample x method).
preds_l = []

# Iterate over list of models and extract prediction results
for model in models_l:
    # Predictions of this method: data/results/<model>.csv (tab-separated)
    preds_df = pd.read_csv(
        Path(prefix).joinpath(f"data/results/{model}").with_suffix(".csv"),
        index_col=0,
        sep="\t",
    )

    # The cbx output carries three extra columns that are not cell types
    if model == "cbx":
        preds_df.drop(["P-value", "Correlation", "RMSE"], axis=1, inplace=True)

    # Some predictions can be very small negative numbers, e.g. -0.00000001.
    # Floor them at 0. clip() returns a new frame, so the result must be
    # assigned back for the flooring to take effect.
    preds_df = preds_df.clip(lower=0)

    # TIL = T-cells + B-cells, expressed in %
    preds_df["til"] = (preds_df["T-cells"] + preds_df["B-cells"]) * 100

    # Drop all cell types columns
    preds_df.drop(c_types, axis=1, inplace=True)

    # Merge with donor_sample_barcode_mapping_df (index-on-index inner join)
    barcode_preds_df = preds_df.merge(
        donor_sample_barcode_mapping_df[
            [
                "sampleBarcode",
                "sampleID",
                "patientBarcode",
                "sampleType",
                "PAM50.RNAseq",
            ]
        ],
        left_index=True,
        right_index=True,
        how="inner",
    )

    # Only keep primary tumour
    barcode_preds_df = barcode_preds_df[
        barcode_preds_df["sampleType"].isin(["01A", "01B"])
    ]

    # Merge with Saltz et al. TIL results. This is an inner join on the patient
    # barcode, so samples without a Saltz estimate are dropped.
    filtered_barcode_preds_df = (
        barcode_preds_df.merge(
            saltz_til_df,
            left_on="patientBarcode",
            right_on="ParticipantBarcode",
            how="inner",
        )
        .drop(["ParticipantBarcode", "Study"], axis=1)
        .rename(columns={"til_percentage": "saltz_til_percentage"})
    )

    # Assign method
    filtered_barcode_preds_df["method"] = model

    # Drop patientBarcode and sampleType
    filtered_barcode_preds_df = filtered_barcode_preds_df.drop(
        ["patientBarcode", "sampleType"], axis=1
    )

    # Append to preds_l
    preds_l.append(filtered_barcode_preds_df)

# Concatenate results across methods
all_preds_df = pd.concat(preds_l, axis=0)

# Swap internal method ids for display names ("hspe" is left unchanged)
all_preds_df.replace(
    {
        "scaden": "Scaden",
        "music": "MuSiC",
        "cbx": "CBX",
        "bisque_linear": "Bisque",
        "dwls_seurat": "DWLS",
        "epic": "EPIC",
        "cpm": "CPM",
        "bprism_v2_no_marker_genes_no_cell_states": "BayesPrism",
    },
    inplace=True,
)

# Floor both TIL columns at 0.001. "til" is already in % (see above), so the
# floor is 0.001 %; this keeps every value strictly positive for the log-log
# plot below.
# Assign the clipped column back instead of calling clip(inplace=True) on
# all_preds_df[col]: that chained in-place call acts on a temporary Series and
# is not guaranteed to modify all_preds_df (it never does under Copy-on-Write).
for til_col in ["til", "saltz_til_percentage"]:
    all_preds_df[til_col] = all_preds_df[til_col].clip(lower=0.001)

In [None]:
# RMSE and Pearson's r between the predicted TIL percentage and the Saltz et al.
# estimate, computed separately for every method
stats_l = []

for metric in tqdm(["saltz_til_percentage"]):
    # Only samples that have a value for the reference estimate are scored
    metric_preds_df = all_preds_df.dropna(subset=[metric])

    for method in metric_preds_df["method"].unique():
        # Rows of the current method, selected once and reused for both scores
        method_df = metric_preds_df[metric_preds_df["method"] == method]
        predicted = method_df["til"]
        reference = method_df[metric]

        rmse = sqrt(mean_squared_error(predicted, reference))
        r = pearsonr(predicted, reference)[0]

        stats_l.append([metric, method, rmse, r])

stats_df = pd.DataFrame(stats_l, columns=["Metric", "Method", "RMSE", "Pearson's"])
stats_df

In [None]:
# Log-log scatter of predicted TIL against the Saltz et al. TIL estimate:
# one facet per method (3 per row), points coloured by PAM50 subtype
fig = px.scatter(
    all_preds_df,
    x="saltz_til_percentage",
    y="til",
    log_x=True,
    log_y=True,
    color="PAM50.RNAseq",
    color_discrete_sequence=px.colors.qualitative.D3,
    facet_col="method",
    facet_col_wrap=3,
    facet_col_spacing=0.05,
    facet_row_spacing=0.05,
    # Disabled options kept for reference:
    #   trendline="ols", trendline_options=dict(log_x=True, log_y=True)
    category_orders={
        "method": [
            "BayesPrism", "Scaden", "MuSiC",
            "CBX", "Bisque", "DWLS",
            "EPIC", "hspe", "CPM",
        ]
    },
)

fig.update_traces(marker={"size": 1.5}, opacity=0.75)

# Styling shared by x- and y-axes; only the tick positions differ.
# On log axes plotly reads `range` as log10 exponents, so [-3.1, 2] spans
# roughly 0.0008 to 100.
log_axis_style = {
    "linecolor": "black",
    "linewidth": 1,
    "ticks": "outside",
    "tickmode": "array",
    "ticklen": 2,
    "tickwidth": 1,
    "matches": None,
    "range": [-3.1, 2],
}
# NOTE(review): the x-axis has no tick at 0.01 while the y-axis does - confirm
# this is intentional
fig.update_xaxes(tickvals=[0.001, 0.1, 1, 10, 100], **log_axis_style)
fig.update_yaxes(tickvals=[0.001, 0.01, 0.1, 1, 10, 100], **log_axis_style)

# Axis titles only on the left-most column (y) and on row 1 (x)
fig.update_yaxes(
    title="Predicted TIL (%)", col=1, title_font_size=8, title_standoff=5
)
fig.update_xaxes(
    title="Saltz et al TIL est (%)", row=1, title_font_size=8, title_standoff=5
)

# Global layout: small black font, transparent plot background, no legend
fig.update_layout(
    font={"size": 8, "color": "black"},
    font_color="black",
    plot_bgcolor="rgba(0,0,0,0)",
    showlegend=False,
    newshape={"opacity": 1},
    margin={"t": 10, "l": 0, "r": 0, "b": 0},
)

# Dotted y = x reference line in every facet, from 0.001 to 99
fig.add_shape(
    type="line",
    x0=0.001,
    y0=0.001,
    x1=99,
    y1=99,
    line={"color": "darkgray", "dash": "dot", "width": 1},
    row="all",
    col="all",
)

# Shorten each facet title to the text after its last "="
fig.for_each_annotation(
    lambda annotation: annotation.update(text=annotation.text.split("=")[-1])
)

# Export as PNG and SVG
saltz_fig_stem = Path("./figures/til/saltz_et_al/saltz_et_al_validation")
for suffix in [".png", ".svg"]:
    fig.write_image(saltz_fig_stem.with_suffix(suffix), width=325, height=325, scale=5)

# Optional interactive export (disabled):
# fig.write_html(
#     Path("./figures/til/saltz_et_al/saltz_et_al_validation").with_suffix(".html"),
#     auto_open=True
# )