# Collate model predictions from purity-level partitions

In [None]:
import numpy as np
import pandas as pd
import anndata as adata
import scanpy as sc

from tqdm import tqdm
from pathlib import Path
from typing import Tuple, List, Dict

import plotly.graph_objects as go
import plotly as plotly
import plotly.express as px

In [None]:
# Root directory under which every input and result file of this notebook lives
prefix = "???/deconvolution_benchmarking/01_purity_levels_experiment/exclude_normal_epithelial"

# Purity grid 0.05, 0.10, ..., 0.95 as plain Python floats; rounding to 3
# decimals strips the floating-point noise that np.arange accumulates
purity_levels = np.round(np.arange(0.05, 1, 0.05), 3).tolist()

# Cell-type columns, in the order used for every table in this notebook
c_types = [
    "Cancer Epithelial", "T-cells", "B-cells", "Myeloid",
    "Endothelial", "CAFs", "PVL", "Plasmablasts",
]

## Prepare our groundtruth

If we haven't yet extracted the ground truth from the test AnnData object

In [None]:
# Load the simulated test mixtures; the ground truth is taken from their .obs table
test_adata = sc.read_h5ad(Path(prefix) / "data/test/test_sim_mixts.h5ad")
# Drop the "batch" column, replace missing values with 0, then keep only the
# cell-type columns in the order given by c_types
truth_df = test_adata.obs.drop(columns=["batch"]).fillna(0)[c_types]

In [None]:
# Create the results/ directory (and any missing parents) if it does not exist yet
(Path(prefix) / "data/results/").mkdir(parents=True, exist_ok=True)

In [None]:
# Write the ground truth to disk; the file is tab-separated despite its .csv suffix
truth_df.to_csv(Path(prefix) / "data/results/truth.csv", sep="\t")

If we have already extracted the groundtruth

In [None]:
# Reload the previously saved ground truth (tab-separated, first column is the index)
truth_df = pd.read_csv(Path(prefix) / "data/results/truth.csv", index_col=0, sep="\t")
# Relabel the columns positionally with the cell-type names in c_types
truth_df.columns = c_types
# Preview the first two rows
truth_df.head(2)

### CIBERSORTx

In [None]:
# Name of the CIBERSORTx output file to collate.
# If we run in normal mode, the results file is called CIBERSORTx_Results.txt
# If we run in S-mode or B-mode, the results file is called CIBERSORTx_Adjusted.txt
# Adjust the filename accordingly
results_f = "CIBERSORTx_Results.txt"

In [None]:
# Collate CIBERSORTx predictions over all purity levels.
#   preds_l    - one predictions table per purity level
#   avg_diff_l - per purity level, the mean of |preds - truth| for each cell type
avg_diff_l = []
preds_l = []

for pur_lvl in tqdm(purity_levels):
    # Predictions for this purity level; the first column becomes the index
    subset_preds_df = pd.read_csv(
        Path(prefix).joinpath(f"data/cbx/results/{pur_lvl}/{results_f}"),
        sep="\t",
        index_col=0,
    )
    # Remove the P-value / Correlation / RMSE columns (presumably per-sample
    # fit statistics, not cell-type proportions) before storing the table
    subset_preds_df = subset_preds_df.drop(columns=["P-value", "Correlation", "RMSE"])
    preds_l.append(subset_preds_df)

    # Ground-truth rows at this purity level. np.isclose instead of == so that
    # floating-point round-off in the stored purity cannot silently produce an
    # empty subset (which would turn every mean below into NaN)
    subset_truth_df = truth_df[np.isclose(truth_df["Cancer Epithelial"], pur_lvl)]

    # Absolute error per sample and cell type; pandas aligns rows by index label
    diff_df = (subset_preds_df[c_types].sort_index() - subset_truth_df.sort_index()).abs()
    # Mean over samples, stored as a one-column frame labelled with the purity level
    avg_diff = diff_df.mean().to_frame(name=pur_lvl)
    avg_diff_l.append(avg_diff)

# One column per purity level, one row per cell type
avg_diff_df = pd.concat(avg_diff_l, axis=1)
# Predictions of all purity levels stacked row-wise
preds_df = pd.concat(preds_l, axis=0)

In [None]:
# Write the collated CIBERSORTx predictions as a tab-separated file
preds_df.to_csv(Path(prefix) / "data/results/cbx.csv", sep="\t")

### Scaden

In [None]:
# Collate Scaden predictions over all purity levels.
#   preds_l    - one predictions table per purity level
#   avg_diff_l - per purity level, the mean of |preds - truth| for each cell type
avg_diff_l = []
preds_l = []

for pur_lvl in tqdm(purity_levels):
    # Predictions for this purity level; the first column becomes the index
    subset_preds_df = pd.read_csv(
        Path(prefix).joinpath(f"data/scaden/{pur_lvl}/results_{pur_lvl}.txt"),
        sep="\t",
        index_col=0,
    )

    # Ground-truth rows at this purity level. np.isclose instead of == so that
    # floating-point round-off in the stored purity cannot silently produce an
    # empty subset (which would turn every mean below into NaN)
    subset_truth_df = truth_df[np.isclose(truth_df["Cancer Epithelial"], pur_lvl)]

    # Absolute error per sample and cell type; pandas aligns rows by index label
    diff_df = (subset_preds_df[c_types].sort_index() - subset_truth_df.sort_index()).abs()
    # Mean over samples, stored as a one-column frame labelled with the purity level
    avg_diff = diff_df.mean().to_frame(name=pur_lvl)

    preds_l.append(subset_preds_df)
    avg_diff_l.append(avg_diff)

# Predictions of all purity levels stacked row-wise
preds_df = pd.concat(preds_l, axis=0)
# One column per purity level, one row per cell type
avg_diff_df = pd.concat(avg_diff_l, axis=1)

In [None]:
# Write the collated Scaden predictions as a tab-separated file
preds_df.to_csv(Path(prefix) / "data/results/scaden.csv", sep="\t")

### EPIC
We use CBX-derived reference profiles, with Cancer Epithelial removed from them.<br>
We treat the `otherCells` column of the EPIC output as cancer cells.

In [None]:
# Collate EPIC predictions over all purity levels.
#   preds_l    - one predictions table per purity level
#   avg_diff_l - per purity level, the mean of |preds - truth| for each cell type
avg_diff_l = []
preds_l = []

for pur_lvl in tqdm(purity_levels):
    # Predictions for this purity level; the first column becomes the index
    subset_preds_df = pd.read_csv(
        Path(prefix).joinpath(
            f"data/epic/cbx_sig_matrix/results/{pur_lvl}/results.csv"
        ),
        sep=",",
        index_col=0,
    )

    # Count otherCells as Cancer Epithelial and restore the dashed cell-type
    # names so the columns match c_types
    subset_preds_df = subset_preds_df.rename(
        columns={
            "otherCells": "Cancer Epithelial",
            "B.cells": "B-cells",
            "T.cells": "T-cells",
        }
    )

    # Ground-truth rows at this purity level. np.isclose instead of == so that
    # floating-point round-off in the stored purity cannot silently produce an
    # empty subset (which would turn every mean below into NaN)
    subset_truth_df = truth_df[np.isclose(truth_df["Cancer Epithelial"], pur_lvl)]

    # Absolute error per sample and cell type; pandas aligns rows by index label
    diff_df = (subset_preds_df[c_types].sort_index() - subset_truth_df.sort_index()).abs()
    # Mean over samples, stored as a one-column frame labelled with the purity level
    avg_diff = diff_df.mean().to_frame(name=pur_lvl)

    preds_l.append(subset_preds_df)
    avg_diff_l.append(avg_diff)

# One column per purity level, one row per cell type
avg_diff_df = pd.concat(avg_diff_l, axis=1)
# Predictions of all purity levels stacked row-wise
preds_df = pd.concat(preds_l, axis=0)

#### Save predictions

In [None]:
# Write the collated EPIC predictions as a tab-separated file
preds_df.to_csv(Path(prefix) / "data/results/epic.csv", sep="\t")

### CPM

In [None]:
# CPM experiment whose results should be collated (sub-directory of data/cpm/)
experiment = "expr_1_original_cellstate_1330_per_ctype/"

# Accumulators: per-purity mean absolute errors and per-purity prediction tables
avg_diff_l, preds_l = [], []

In [None]:
# Collate CPM predictions over all purity levels.
#   preds_l    - one predictions table per purity level
#   avg_diff_l - per purity level, the mean of |preds - truth| for each cell type
# The accumulators are reset here so that re-running this cell on its own does
# not append a second copy of every purity level to the collated tables
avg_diff_l = []
preds_l = []

for pur_lvl in tqdm(purity_levels):
    # Predictions for this purity level; the first column becomes the index
    subset_preds_df = pd.read_csv(
        Path(prefix).joinpath(f"data/cpm/{experiment}/results/{pur_lvl}/results.csv"),
        sep=",",
        index_col=0,
    )

    # Ground-truth rows at this purity level. np.isclose instead of == so that
    # floating-point round-off in the stored purity cannot silently produce an
    # empty subset (which would turn every mean below into NaN)
    subset_truth_df = truth_df[np.isclose(truth_df["Cancer Epithelial"], pur_lvl)]

    # Absolute error per sample and cell type; pandas aligns rows by index label
    diff_df = (subset_preds_df[c_types].sort_index() - subset_truth_df.sort_index()).abs()
    # Mean over samples, stored as a one-column frame labelled with the purity level
    avg_diff = diff_df.mean().to_frame(name=pur_lvl)

    avg_diff_l.append(avg_diff)
    preds_l.append(subset_preds_df)

# One column per purity level, one row per cell type
avg_diff_df = pd.concat(avg_diff_l, axis=1)
# Predictions of all purity levels stacked row-wise
preds_df = pd.concat(preds_l, axis=0)

In [None]:
# Write the collated CPM predictions as a tab-separated file
preds_df.to_csv(Path(prefix) / "data/results/cpm.csv", sep="\t")

### bisque

In [None]:
# Collate bisque predictions over all purity levels.
#   preds_l    - one predictions table per purity level
#   avg_diff_l - per purity level, the mean of |preds - truth| for each cell type
avg_diff_l = []
preds_l = []

for pur_lvl in tqdm(purity_levels):
    # Predictions for this purity level, transposed after loading so that the
    # cell types selected below via c_types end up as columns
    subset_preds_df = pd.read_csv(
        Path(prefix).joinpath(f"data/bisque/results/{pur_lvl}/results.csv"),
        sep=",",
        index_col=0,
    ).T

    # Ground-truth rows at this purity level. np.isclose instead of == so that
    # floating-point round-off in the stored purity cannot silently produce an
    # empty subset (which would turn every mean below into NaN)
    subset_truth_df = truth_df[np.isclose(truth_df["Cancer Epithelial"], pur_lvl)]

    # Absolute error per sample and cell type; pandas aligns rows by index label
    diff_df = (subset_preds_df[c_types].sort_index() - subset_truth_df.sort_index()).abs()
    # Mean over samples, stored as a one-column frame labelled with the purity level
    avg_diff = diff_df.mean().to_frame(name=pur_lvl)

    preds_l.append(subset_preds_df)
    avg_diff_l.append(avg_diff)

# Predictions of all purity levels stacked row-wise
preds_df = pd.concat(preds_l, axis=0)
# One column per purity level, one row per cell type
avg_diff_df = pd.concat(avg_diff_l, axis=1)

In [None]:
# Write the collated bisque predictions as a tab-separated file
preds_df.to_csv(Path(prefix) / "data/results/bisque.csv", sep="\t")

### DWLS

In [None]:
# Collate DWLS predictions over all purity levels.
#   preds_l    - one predictions table per purity level
#   avg_diff_l - per purity level, the mean of |preds - truth| for each cell type
avg_diff_l = []
preds_l = []

for pur_lvl in tqdm(purity_levels):
    # Predictions for this purity level, transposed after loading so that the
    # cell types selected below via c_types end up as columns
    subset_preds_df = pd.read_csv(
        Path(prefix).joinpath(f"data/dwls/results/{pur_lvl}/results.csv"),
        sep=",",
        index_col=0,
    ).T

    # Map the underscore-separated cell-type names back to the names in c_types
    subset_preds_df = subset_preds_df.rename(
        columns={
            "T_cells": "T-cells",
            "B_cells": "B-cells",
            "Cancer_Epithelial": "Cancer Epithelial",
        }
    )

    # Ground-truth rows at this purity level. np.isclose instead of == so that
    # floating-point round-off in the stored purity cannot silently produce an
    # empty subset (which would turn every mean below into NaN)
    subset_truth_df = truth_df[np.isclose(truth_df["Cancer Epithelial"], pur_lvl)]

    # Absolute error per sample and cell type; pandas aligns rows by index label
    diff_df = (subset_preds_df[c_types].sort_index() - subset_truth_df.sort_index()).abs()
    # Mean over samples, stored as a one-column frame labelled with the purity level
    avg_diff = diff_df.mean().to_frame(name=pur_lvl)

    avg_diff_l.append(avg_diff)
    preds_l.append(subset_preds_df)

# One column per purity level, one row per cell type
avg_diff_df = pd.concat(avg_diff_l, axis=1)
# Predictions of all purity levels stacked row-wise
preds_df = pd.concat(preds_l, axis=0)

In [None]:
# Write the collated DWLS predictions as a tab-separated file
preds_df.to_csv(Path(prefix) / "data/results/dwls.csv", sep="\t")

## MuSiC

In [None]:
# Collate MuSiC predictions over all purity levels.
#   preds_l    - one predictions table per purity level
#   avg_diff_l - per purity level, the mean of |preds - truth| for each cell type
avg_diff_l = []
preds_l = []

for pur_lvl in tqdm(purity_levels):
    # Predictions for this purity level; the first column becomes the index
    subset_preds_df = pd.read_csv(
        Path(prefix).joinpath(f"data/music/results/{pur_lvl}/results.csv"),
        sep=",",
        index_col=0,
    )

    # Ground-truth rows at this purity level. np.isclose instead of == so that
    # floating-point round-off in the stored purity cannot silently produce an
    # empty subset (which would turn every mean below into NaN)
    subset_truth_df = truth_df[np.isclose(truth_df["Cancer Epithelial"], pur_lvl)]

    # Absolute error per sample and cell type; pandas aligns rows by index label
    diff_df = (subset_preds_df[c_types].sort_index() - subset_truth_df.sort_index()).abs()
    # Mean over samples, stored as a one-column frame labelled with the purity level
    avg_diff = diff_df.mean().to_frame(name=pur_lvl)

    avg_diff_l.append(avg_diff)
    preds_l.append(subset_preds_df)

# One column per purity level, one row per cell type
avg_diff_df = pd.concat(avg_diff_l, axis=1)
# Predictions of all purity levels stacked row-wise
preds_df = pd.concat(preds_l, axis=0)

In [None]:
# Write the collated MuSiC predictions as a tab-separated file
preds_df.to_csv(Path(prefix) / "data/results/music.csv", sep="\t")

## hspe

In [None]:
# Collate hspe predictions. Results for each purity level are stored in 20
# partition folders (0..19), so all partitions are stacked into one table
# first and the per-purity errors are computed from it afterwards.
preds_l = []

for pur_lvl in tqdm(purity_levels):
    for partition in range(20):
        # Predictions of one partition; the first column becomes the index
        subset_preds_df = pd.read_csv(
            Path(prefix).joinpath(
                f"data/hspe/results/{pur_lvl}/{partition}/results.csv"
            ),
            sep=",",
            index_col=0,
        )

        # Map the underscore-separated cell-type names back to the names in c_types
        subset_preds_df = subset_preds_df.rename(
            columns={
                "T_cells": "T-cells",
                "B_cells": "B-cells",
                "Cancer_Epithelial": "Cancer Epithelial",
            }
        )

        preds_l.append(subset_preds_df)

# Predictions of all purity levels and partitions stacked row-wise
preds_df = pd.concat(preds_l, axis=0)

# Per purity level, the mean of |preds - truth| for each cell type
avg_diff_l = []
for pur_lvl in tqdm(purity_levels):
    # Ground-truth rows at this purity level. np.isclose instead of == so that
    # floating-point round-off in the stored purity cannot silently produce an
    # empty subset (which would turn every mean below into NaN)
    subset_truth_df = truth_df[np.isclose(truth_df["Cancer Epithelial"], pur_lvl)]
    # Predictions whose index label appears in this ground-truth subset
    subset_preds_df = preds_df[preds_df.index.isin(subset_truth_df.index)]

    # Absolute error per sample and cell type; pandas aligns rows by index label
    diff_df = (subset_preds_df[c_types].sort_index() - subset_truth_df.sort_index()).abs()
    # Mean over samples, stored as a one-column frame labelled with the purity level
    avg_diff = diff_df.mean().to_frame(name=pur_lvl)
    avg_diff_l.append(avg_diff)

# One column per purity level, one row per cell type
avg_diff_df = pd.concat(avg_diff_l, axis=1)

In [None]:
# Write the collated hspe predictions as a tab-separated file
preds_df.to_csv(Path(prefix) / "data/results/hspe.csv", sep="\t")

## BayesPrism

In [None]:
# Collate BayesPrism predictions over all purity levels.
#   preds_l    - one predictions table per purity level
#   avg_diff_l - per purity level, the mean of |preds - truth| for each cell type
avg_diff_l = []
preds_l = []

for pur_lvl in tqdm(purity_levels):
    # Predictions for this purity level; the first column becomes the index
    subset_preds_df = pd.read_csv(
        Path(prefix).joinpath(f"data/bprism/results/{pur_lvl}/results.csv"),
        sep=",",
        index_col=0,
    )

    # Map the underscore-separated cell-type names back to the names in c_types
    subset_preds_df = subset_preds_df.rename(
        columns={
            "T_cells": "T-cells",
            "B_cells": "B-cells",
            "Cancer_Epithelial": "Cancer Epithelial",
        }
    )

    # Ground-truth rows at this purity level. np.isclose instead of == so that
    # floating-point round-off in the stored purity cannot silently produce an
    # empty subset (which would turn every mean below into NaN)
    subset_truth_df = truth_df[np.isclose(truth_df["Cancer Epithelial"], pur_lvl)]

    # Absolute error per sample and cell type; pandas aligns rows by index label
    diff_df = (subset_preds_df[c_types].sort_index() - subset_truth_df.sort_index()).abs()
    # Mean over samples, stored as a one-column frame labelled with the purity level
    avg_diff = diff_df.mean().to_frame(name=pur_lvl)

    avg_diff_l.append(avg_diff)
    preds_l.append(subset_preds_df)

# One column per purity level, one row per cell type
avg_diff_df = pd.concat(avg_diff_l, axis=1)
# Predictions of all purity levels stacked row-wise
preds_df = pd.concat(preds_l, axis=0)

In [None]:
# Write the collated BayesPrism predictions as a tab-separated file
preds_df.to_csv(Path(prefix) / "data/results/bprism.csv", sep="\t")