# Collate model predictions from purity-level partitions

In [None]:
import numpy as np
import pandas as pd
import anndata as adata
import scanpy as sc

from tqdm import tqdm
from pathlib import Path
from typing import Tuple, List, Dict

import plotly.graph_objects as go
import plotly as plotly
import plotly.express as px

%load_ext blackcellmagic

In [None]:
# Root directory of the Bassez et al batch-effect validation experiment.
# Every path read or written in the cells below is built from this prefix.
# NOTE(review): the leading "???" looks like a placeholder for the local parent
# directory - presumably it must be replaced before running; confirm.
prefix = "???/deconvolution_benchmarking/06_batch_effect_validation/bassez_et_al"

In [None]:
# Tumour purity levels 0.05, 0.10, ..., 0.95 as plain Python floats.
# Rounding removes floating-point noise from np.arange so that each level
# formats cleanly (e.g. "0.15") when it is interpolated into result paths.
purity_levels = np.round(np.arange(0.05, 1, 0.05), 3).tolist()

### Bassez et al

In [None]:
# Bassez et al patient split and cell-type vocabulary.
# NOTE(review): none of these three lists is referenced by the cells visible in
# this section - presumably they are used elsewhere in the notebook; confirm.

# Training patient IDs, grouped by clinical subtype
train_p_ids = [
    # HER2+
    "BIOKEY_13",
    # ER+
    "BIOKEY_3",
    "BIOKEY_5",
    "BIOKEY_12",
    "BIOKEY_18",
    "BIOKEY_22",
    "BIOKEY_24",
    "BIOKEY_27",
    "BIOKEY_29",
    "BIOKEY_30",
    # NOTE(review): BIOKEY_32 is deliberately disabled; the reason is not recorded here
    # "BIOKEY_32",
    "BIOKEY_40",
    "BIOKEY_42",
    # TNBC
    "BIOKEY_2",
    "BIOKEY_9",
    "BIOKEY_10",
    "BIOKEY_11",
    "BIOKEY_14",
    "BIOKEY_15",
    "BIOKEY_33",
    "BIOKEY_35",
    "BIOKEY_36",
    "BIOKEY_41",
]
# Test patient IDs, grouped by clinical subtype
test_p_ids = [
    # HER2+
    "BIOKEY_28",
    # ER+
    "BIOKEY_4",
    "BIOKEY_6",
    "BIOKEY_7",
    "BIOKEY_17",
    "BIOKEY_21",
    "BIOKEY_37",
    # TNBC
    "BIOKEY_1",
    "BIOKEY_16",
    "BIOKEY_19",
    "BIOKEY_26",
    "BIOKEY_31",
]

# Major cell types as named in the Bassez et al annotation
bassez_et_al_c_types = [
    "Cancer_cell",
    "T_cell",
    "B_cell",
    "Myeloid_cell",
    "Endothelial_cell",
    "Fibroblast",
    "Mast_cell",
    "pDC",
]

### Wu et al

In [None]:
# Wu et al patient split and cell-type vocabulary.

# Training patient IDs
wu_et_al_train_p_ids = [
    "CID3586",
    "CID3941",
    "CID3963",
    "CID44041",
    "CID4530N",
    "CID3838",
    "CID3946",
    "CID4040",
    "CID4461",
    "CID44991",
    "CID45171",
    "CID4535",
    "CID3948",
    "CID4398",
    "CID4463",
    "CID4495",
    "CID4513",
    "CID4465",
]
# Test patient IDs
wu_et_al_test_p_ids = [
    "CID4067",
    "CID4290A",
    "CID4471",
    "CID3921",
    "CID4066",
    "CID4523",
    "CID44971",
    "CID4515",
]
# Major cell types in the Wu et al annotation.
# This list selects and orders the columns of both the ground-truth table and
# every method's predictions in the cells below.
wu_et_al_c_types = [
    "B_cells",
    "CAFs",
    "Cancer_Epithelial",
    "Endothelial",
    "Myeloid",
    "Normal_Epithelial",
    "PVL",
    "Plasmablasts",
    "T_cells",
]

## Prepare our groundtruth

Run the next two cells if the ground truth has not yet been extracted from the test AnnData object.

In [None]:
# Build the ground-truth table from the per-mixture annotations (.obs) of the
# simulated test mixtures.
test_adata = sc.read_h5ad(Path(prefix) / "data/test/filtered_test_sim_mixts.h5ad")

# Drop the "batch" annotation and treat missing proportions as 0
truth_df = test_adata.obs.drop(columns=["batch"]).fillna(0)

# These Wu et al cell types get an explicit all-zero column so that the table
# can be indexed with the full Wu et al cell-type list.
# NOTE(review): presumably they are absent from the Bassez et al mixtures - confirm.
truth_df = truth_df.assign(PVL=0, Normal_Epithelial=0, Plasmablasts=0)

# Keep only the Wu et al cell types, in canonical column order
truth_df = truth_df[wu_et_al_c_types]

In [None]:
# Persist the ground truth as a tab-separated file (index included)
truth_df.to_csv(Path(prefix) / "data/results/truth.tsv", sep="\t")

If the ground truth has already been extracted, load it from `truth.tsv` instead.

In [None]:
# Reload the saved ground truth (first column becomes the index) and keep the
# Wu et al cell types in canonical column order
truth_df = pd.read_csv(
    Path(prefix) / "data/results/truth.tsv", sep="\t", index_col=0
)[wu_et_al_c_types]
# Preview the first two rows as the cell output
truth_df.head(2)

### CIBERSORTx

In [None]:
# CIBERSORTx names its results file according to the mode it was run in:
#   - normal mode:       CIBERSORTx_Results.txt
#   - S-mode or B-mode:  CIBERSORTx_Adjusted.txt
# Adjust the filename accordingly
results_f = "CIBERSORTx_Results.txt"

In [None]:
# Collate CIBERSORTx output across purity levels:
#   preds_l    - raw predictions, one frame per purity level
#   avg_diff_l - mean absolute error per cell type, one column per purity level
avg_diff_l = []
preds_l = []

for pur_lvl in tqdm(purity_levels):
    # Load this level's predictions (first column becomes the index) and strip
    # the fit-statistic columns so that only cell-type fractions remain
    subset_preds_df = pd.read_csv(
        Path(prefix) / f"data/cbx/results/{pur_lvl}/{results_f}",
        sep="\t",
        index_col=0,
    ).drop(columns=["P-value", "Correlation", "RMSE"])
    preds_l.append(subset_preds_df)

    # Ground-truth rows whose cancer fraction equals this purity level.
    # NOTE(review): exact float comparison - relies on the stored purities being
    # the same rounded values as purity_levels; confirm.
    subset_truth_df = truth_df.loc[truth_df["Cancer_Epithelial"] == pur_lvl]

    # |prediction - truth|, with both tables sorted by index before subtracting
    diff_df = (
        subset_preds_df[wu_et_al_c_types].sort_index() - subset_truth_df.sort_index()
    ).abs()

    # Average over rows and label the resulting column with the purity level
    avg_diff = diff_df.mean().to_frame(name=pur_lvl)
    avg_diff_l.append(avg_diff)

# Cell types x purity levels
avg_diff_df = pd.concat(avg_diff_l, axis=1)
# All predictions stacked row-wise
preds_df = pd.concat(preds_l, axis=0)

In [None]:
# Write the collated CIBERSORTx predictions as a tab-separated file
preds_df.to_csv(Path(prefix) / "data/results/cbx.tsv", sep="\t")

### Scaden

In [None]:
# Collate Scaden output across purity levels:
#   preds_l    - raw predictions, one frame per purity level
#   avg_diff_l - mean absolute error per cell type, one column per purity level
avg_diff_l = []
preds_l = []

for pur_lvl in tqdm(purity_levels):
    # Load this level's predictions; the first column becomes the index
    subset_preds_df = pd.read_csv(
        Path(prefix) / f"data/scaden/{pur_lvl}/results_{pur_lvl}.tsv",
        sep="\t",
        index_col=0,
    )

    # Ground-truth rows whose cancer fraction equals this purity level.
    # NOTE(review): exact float comparison - relies on the stored purities being
    # the same rounded values as purity_levels; confirm.
    subset_truth_df = truth_df.loc[truth_df["Cancer_Epithelial"] == pur_lvl]

    # |prediction - truth|, with both tables sorted by index before subtracting
    diff_df = (
        subset_preds_df[wu_et_al_c_types].sort_index() - subset_truth_df.sort_index()
    ).abs()

    # Average over rows and label the resulting column with the purity level
    avg_diff = diff_df.mean().to_frame(name=pur_lvl)

    preds_l.append(subset_preds_df)
    avg_diff_l.append(avg_diff)

# All predictions stacked row-wise
preds_df = pd.concat(preds_l, axis=0)
# Cell types x purity levels
avg_diff_df = pd.concat(avg_diff_l, axis=1)

In [None]:
# Write the collated Scaden predictions as a tab-separated file
preds_df.to_csv(Path(prefix) / "data/results/scaden.tsv", sep="\t")

### EPIC

In [None]:
# Collate EPIC output across purity levels:
#   preds_l    - raw predictions, one frame per purity level
#   avg_diff_l - mean absolute error per cell type, one column per purity level
avg_diff_l = []
preds_l = []

for pur_lvl in tqdm(purity_levels):
    # Load this level's predictions (first column becomes the index).
    # The "otherCells" column is relabelled as the cancer compartment so the
    # table shares the Wu et al column names.
    subset_preds_df = pd.read_csv(
        Path(prefix) / f"data/epic/cbx_sig_matrix/results/{pur_lvl}/results.csv",
        sep=",",
        index_col=0,
    ).rename(columns={"otherCells": "Cancer_Epithelial"})

    # Ground-truth rows whose cancer fraction equals this purity level.
    # NOTE(review): exact float comparison - relies on the stored purities being
    # the same rounded values as purity_levels; confirm.
    subset_truth_df = truth_df.loc[truth_df["Cancer_Epithelial"] == pur_lvl]

    # |prediction - truth|, with both tables sorted by index before subtracting
    diff_df = (
        subset_preds_df[wu_et_al_c_types].sort_index() - subset_truth_df.sort_index()
    ).abs()

    # Average over rows and label the resulting column with the purity level
    avg_diff = diff_df.mean().to_frame(name=pur_lvl)

    preds_l.append(subset_preds_df)
    avg_diff_l.append(avg_diff)

# Cell types x purity levels
avg_diff_df = pd.concat(avg_diff_l, axis=1)
# All predictions stacked row-wise
preds_df = pd.concat(preds_l, axis=0)

# Write the collated EPIC predictions as a tab-separated file
preds_df.to_csv(Path(prefix) / "data/results/epic.tsv", sep="\t")

### bisque

In [None]:
# Bisque result sets to collate; each entry is the suffix appended to the
# "results" directory name and to the output file name ("" adds no suffix)
experiments = [""]  # , "_scaled"

for expt in experiments:
    # Fresh accumulators for every experiment:
    #   preds_l    - raw predictions, one frame per purity level
    #   avg_diff_l - mean absolute error per cell type, one column per level
    avg_diff_l = []
    preds_l = []

    for pur_lvl in tqdm(purity_levels):
        # Load this level's predictions and transpose them so that the
        # cell types end up as columns
        subset_preds_df = pd.read_csv(
            Path(prefix) / f"data/bisque/results{expt}/{pur_lvl}/results.csv",
            sep=",",
            index_col=0,
        ).transpose()

        # Ground-truth rows whose cancer fraction equals this purity level.
        # NOTE(review): exact float comparison - relies on the stored purities
        # being the same rounded values as purity_levels; confirm.
        subset_truth_df = truth_df.loc[truth_df["Cancer_Epithelial"] == pur_lvl]

        # |prediction - truth|, both tables sorted by index before subtracting
        diff_df = (
            subset_preds_df[wu_et_al_c_types].sort_index()
            - subset_truth_df.sort_index()
        ).abs()

        # Average over rows and label the column with the purity level
        avg_diff = diff_df.mean().to_frame(name=pur_lvl)

        preds_l.append(subset_preds_df)
        avg_diff_l.append(avg_diff)

    # All predictions stacked row-wise
    preds_df = pd.concat(preds_l, axis=0)
    # Cell types x purity levels
    avg_diff_df = pd.concat(avg_diff_l, axis=1)

    # Write this experiment's collated predictions as a tab-separated file
    preds_df.to_csv(Path(prefix) / f"data/results/bisque{expt}.tsv", sep="\t")

### DWLS

In [None]:
# Differential expression analysis methods whose DWLS results are collated;
# each entry is the suffix of a results directory and of the output file name
de_methods = ["_seurat"]  # "_mast"

for de_method in de_methods:
    # Fresh accumulators for every DE method:
    #   preds_l    - raw predictions, one frame per purity level
    #   avg_diff_l - mean absolute error per cell type, one column per level
    avg_diff_l = []
    preds_l = []

    for pur_lvl in tqdm(purity_levels):
        # Load this level's predictions and transpose them so that the
        # cell types end up as columns
        subset_preds_df = pd.read_csv(
            Path(prefix) / f"data/dwls/results{de_method}/{pur_lvl}/results.csv",
            sep=",",
            index_col=0,
        ).transpose()

        # Ground-truth rows whose cancer fraction equals this purity level.
        # NOTE(review): exact float comparison - relies on the stored purities
        # being the same rounded values as purity_levels; confirm.
        subset_truth_df = truth_df.loc[truth_df["Cancer_Epithelial"] == pur_lvl]

        # |prediction - truth|, both tables sorted by index before subtracting
        diff_df = (
            subset_preds_df[wu_et_al_c_types].sort_index()
            - subset_truth_df.sort_index()
        ).abs()

        # Average over rows and label the column with the purity level
        avg_diff = diff_df.mean().to_frame(name=pur_lvl)

        avg_diff_l.append(avg_diff)
        preds_l.append(subset_preds_df)

    # Cell types x purity levels
    avg_diff_df = pd.concat(avg_diff_l, axis=1)
    # All predictions stacked row-wise
    preds_df = pd.concat(preds_l, axis=0)

    # Write this DE method's collated predictions as a tab-separated file
    preds_df.to_csv(Path(prefix) / f"data/results/dwls{de_method}.tsv", sep="\t")

## MuSiC

In [None]:
# MuSiC result sets to collate; each entry is the suffix appended to the
# "results" directory name and to the output file name.
# The standard run uses the blank ("") suffix.
experiments = [""]

for expt in experiments:
    # Fresh accumulators for every experiment. avg_diff_l is reset here but
    # never filled: the per-level error is computed below without being kept.
    avg_diff_l = []
    preds_l = []

    for pur_lvl in tqdm(purity_levels):
        # Load this level's predictions; the first column becomes the index
        subset_preds_df = pd.read_csv(
            Path(prefix) / f"data/music/results{expt}/{pur_lvl}/results.csv",
            sep=",",
            index_col=0,
        )

        # Ground-truth rows whose cancer fraction equals this purity level.
        # NOTE(review): exact float comparison - relies on the stored purities
        # being the same rounded values as purity_levels; confirm.
        subset_truth_df = truth_df.loc[truth_df["Cancer_Epithelial"] == pur_lvl]

        # |prediction - truth|, both tables sorted by index before subtracting
        diff_df = (
            subset_preds_df[wu_et_al_c_types].sort_index()
            - subset_truth_df.sort_index()
        ).abs()

        # Mean absolute error per cell type for this level (not accumulated)
        avg_diff = diff_df.mean().to_frame(name=pur_lvl)

        preds_l.append(subset_preds_df)

    # All predictions stacked row-wise; no avg_diff_df is built for MuSiC
    preds_df = pd.concat(preds_l, axis=0)

    # Write this experiment's collated predictions as a tab-separated file
    preds_df.to_csv(Path(prefix) / f"data/results/music{expt}.tsv", sep="\t")

## hspe

In [None]:
# hspe result sets to collate; each entry is the suffix appended to the
# "results" directory name and to the output file name.
# The standard run uses the blank ("") suffix.
experiments = [""]

# Each purity level's results are read from partition sub-directories
# numbered 0 .. n_partitions - 1
n_partitions = 20

for expt in experiments:
    # Gather the predictions of every partition of every purity level
    preds_l = []
    for pur_lvl in tqdm(purity_levels):
        for partition in range(n_partitions):
            # Load one partition; the first column becomes the index
            subset_preds_df = pd.read_csv(
                Path(prefix).joinpath(
                    f"data/hspe/results{expt}/{pur_lvl}/{partition}/results.csv"
                ),
                sep=",",
                index_col=0,
            )
            preds_l.append(subset_preds_df)

    # Stack all partitions once; the same frame is used for the error
    # computation below and for the saved output
    preds_df = pd.concat(preds_l, axis=0)

    # Mean absolute error per cell type, one column per purity level
    avg_diff_l = []
    for pur_lvl in tqdm(purity_levels):
        # Ground-truth rows whose cancer fraction equals this purity level.
        # NOTE(review): exact float comparison - relies on the stored purities
        # being the same rounded values as purity_levels; confirm.
        subset_truth_df = truth_df[truth_df["Cancer_Epithelial"] == pur_lvl]
        # Predictions are matched to this level through the truth index,
        # because a level's predictions are spread over several partitions
        subset_preds_df = preds_df[preds_df.index.isin(subset_truth_df.index)]

        # |prediction - truth|, both tables sorted by index before subtracting
        diff_df = abs(
            subset_preds_df[wu_et_al_c_types].sort_index()
            - subset_truth_df.sort_index()
        )
        avg_diff = diff_df.mean().to_frame()
        avg_diff.columns = [pur_lvl]
        avg_diff_l.append(avg_diff)

    # Cell types x purity levels
    avg_diff_df = pd.concat(avg_diff_l, axis=1)

    # Write this experiment's collated predictions as a tab-separated file
    preds_df.to_csv(Path(prefix).joinpath(f"data/results/hspe{expt}.tsv"), sep="\t")

## BayesPrism v2

In [None]:
# Collate BayesPrism v2 output across purity levels:
#   preds_l    - raw predictions, one frame per purity level
#   avg_diff_l - mean absolute error per cell type, one column per purity level
avg_diff_l = []
preds_l = []

# BayesPrism v2 is run as several experiments; choose which one to pull.
# The experiment name is the suffix of both the results directory and the
# output file ("" adds no suffix).
expt = ""  # "_no_marker_genes"

for pur_lvl in tqdm(purity_levels):
    # Load this level's predictions; the first column becomes the index
    subset_preds_df = pd.read_csv(
        Path(prefix) / f"data/bprism_v2/results{expt}/{pur_lvl}/results.csv",
        sep=",",
        index_col=0,
    )

    # Ground-truth rows whose cancer fraction equals this purity level.
    # NOTE(review): exact float comparison - relies on the stored purities being
    # the same rounded values as purity_levels; confirm.
    subset_truth_df = truth_df.loc[truth_df["Cancer_Epithelial"] == pur_lvl]

    # |prediction - truth|, with both tables sorted by index before subtracting
    diff_df = (
        subset_preds_df[wu_et_al_c_types].sort_index() - subset_truth_df.sort_index()
    ).abs()

    # Average over rows and label the resulting column with the purity level
    avg_diff = diff_df.mean().to_frame(name=pur_lvl)

    avg_diff_l.append(avg_diff)
    preds_l.append(subset_preds_df)


# Cell types x purity levels
avg_diff_df = pd.concat(avg_diff_l, axis=1)
# All predictions stacked row-wise
preds_df = pd.concat(preds_l, axis=0)

# Write the collated BayesPrism v2 predictions as a tab-separated file
preds_df.to_csv(Path(prefix) / f"data/results/bprism_v2{expt}.tsv", sep="\t")