# Collate model predictions from purity-level partitions

In [None]:
import numpy as np
import pandas as pd
import anndata as adata
import scanpy as sc

from tqdm import tqdm
from pathlib import Path
from typing import Tuple, List, Dict

import plotly.graph_objects as go
import plotly as plotly
import plotly.express as px

%load_ext blackcellmagic

In [None]:
# Root directory of the experiment; every data path below is built from it.
# NOTE(review): "???" looks like a redacted/placeholder path component - replace
# with the real location before running.
prefix = "???/03_immune_lineages_experiment/subset_level"
# Purity levels to collate. Each value is used both as a sub-directory / file
# name component of the results and as the value matched against the
# "Cancer Epithelial" column of the ground truth.
purity_levels = [0.5]
# Cell types to evaluate. The order matters: truth and prediction frames are
# indexed with this list, so it defines their column order.
c_types = [
    "Endothelial",
    "CAFs",
    "PVL",
    "B cells Memory",
    "B cells Naive",
    "T_cells_c4_CD8+_ZFP36",
    "T_cells_c6_IFIT1",
    "T_cells_c7_CD8+_IFNG",
    "T_cells_c8_CD8+_LAG3",
    "T_cells_c0_CD4+_CCR7",
    "T_cells_c1_CD4+_IL7R",
    "T_cells_c2_CD4+_T-regs_FOXP3",
    "T_cells_c3_CD4+_Tfh_CXCL13",
    "T_cells_c9_NK_cells_AREG",
    "T_cells_c11_MKI67",
    "T_cells_c10_NKT_cells_FCGR3A",
    "Myeloid_c10_Macrophage_1_EGR1",
    "Myeloid_c12_Monocyte_1_IL1B",
    "Myeloid_c2_LAM2_APOE",
    "Myeloid_c1_LAM1_FABP5",
    "Cycling_Myeloid",
    "Myeloid_c4_DCs_pDC_IRF7",
    "Normal Epithelial",
    "Plasmablasts",
    "Myeloid_c8_Monocyte_2_S100A9",
    "Myeloid_c9_Macrophage_2_CXCL10",
    "Myeloid_c11_cDC2_CD1C",
    "Cancer Epithelial",
    "Myeloid_c3_cDC1_CLEC9A",
]

## Prepare our groundtruth

If we haven't yet extracted the ground truth from the test AnnData object

In [None]:
# Read the test AnnData file and use its `.obs` table as ground truth:
# the "batch" column is removed and missing entries are replaced with 0.
test_adata = sc.read_h5ad(Path(prefix) / "data/test/test_sim_mixts.h5ad")
truth_df = test_adata.obs.drop(columns=["batch"]).fillna(0)

In [None]:
# 'T_cells_c5_CD8+_GZMK' and 'Myeloid_c5_Macrophage_3_SIGLEC1' do not exist in
# the test data, so give each of them an all-zero column before the columns
# are put into the c_types order.
# NOTE(review): neither name is listed in c_types in this notebook, so the
# selection below drops the two columns again - confirm whether c_types
# should include them.
for absent_c_type in ("T_cells_c5_CD8+_GZMK", "Myeloid_c5_Macrophage_3_SIGLEC1"):
    truth_df[absent_c_type] = 0
truth_df = truth_df[c_types]

In [None]:
# Ensure the results/ directory (and any missing parents) exists;
# nothing happens if it is already there.
(Path(prefix) / "data/results/").mkdir(parents=True, exist_ok=True)

In [None]:
# Persist the ground truth. Despite the .csv extension the file is
# tab-separated, matching how it is read back later.
truth_df.to_csv(Path(prefix) / "data/results/truth.csv", sep="\t")

If we have already extracted the groundtruth

In [None]:
# Reload the saved (tab-separated) ground truth, using the first column as
# the index, and keep only the c_types columns in c_types order.
truth_csv = Path(prefix) / "data/results/truth.csv"
truth_df = pd.read_csv(truth_csv, sep="\t", index_col=0)[c_types]
# Show the first two rows as a quick sanity check
truth_df.head(2)

### CIBERSORTx

In [None]:
# Name of the CIBERSORTx output file to read for every purity level.
# If we run in normal mode, the results file is called CIBERSORTx_Results.txt
# If we run in S-mode or B-mode, the results file will be called CIBERSORTx_Adjusted.txt
# Adjust the filename accordingly
results_f = "CIBERSORTx_Results.txt"

In [None]:
# Collate CIBERSORTx predictions.
# avg_diff_l: one single-column frame per purity level holding the mean
#             absolute error |prediction - truth| of every cell type.
# preds_l:    the prediction frame of every purity level.
avg_diff_l = []
preds_l = []

for pur_lvl in tqdm(purity_levels):
    # Read the tab-separated results file of this purity level
    subset_preds_df = pd.read_csv(
        Path(prefix).joinpath(f"data/cbx/results_perm_100/{pur_lvl}/{results_f}"),
        sep="\t",
        index_col=0,
    )
    # Drop the P-value / Correlation / RMSE columns before storing the predictions
    subset_preds_df.drop(["P-value", "Correlation", "RMSE"], axis=1, inplace=True)
    preds_l.append(subset_preds_df)

    # Ground-truth rows of this purity level. A tolerance-based comparison is
    # used because purity values that went through arithmetic or a CSV
    # round-trip are not guaranteed to be bit-identical to the literal in
    # purity_levels; an exact `==` would then silently select no rows.
    subset_truth_df = truth_df[np.isclose(truth_df["Cancer Epithelial"], pur_lvl)]

    # Mean absolute error per cell type; pandas aligns both frames on their
    # index and columns when subtracting.
    diff_df = (subset_preds_df[c_types].sort_index() - subset_truth_df.sort_index()).abs()
    avg_diff_l.append(diff_df.mean().to_frame(name=pur_lvl))

# Cell types x purity levels table of errors, and all predictions stacked row-wise
avg_diff_df = pd.concat(avg_diff_l, axis=1)
preds_df = pd.concat(preds_l, axis=0)

### Scaden

In [None]:
# Collate Scaden predictions.
# avg_diff_l: one single-column frame per purity level holding the mean
#             absolute error |prediction - truth| of every cell type.
# preds_l:    the prediction frame of every purity level.
avg_diff_l = []
preds_l = []

for pur_lvl in tqdm(purity_levels):
    # Read the tab-separated results file of this purity level
    subset_preds_df = pd.read_csv(
        Path(prefix).joinpath(f"data/scaden/kondrashova_train/results_{pur_lvl}.txt"),
        sep="\t",
        index_col=0,
    )

    # Ground-truth rows of this purity level. A tolerance-based comparison is
    # used because purity values that went through arithmetic or a CSV
    # round-trip are not guaranteed to be bit-identical to the literal in
    # purity_levels; an exact `==` would then silently select no rows.
    subset_truth_df = truth_df[np.isclose(truth_df["Cancer Epithelial"], pur_lvl)]

    # Mean absolute error per cell type; pandas aligns both frames on their
    # index and columns when subtracting.
    diff_df = (subset_preds_df[c_types].sort_index() - subset_truth_df.sort_index()).abs()

    preds_l.append(subset_preds_df)
    avg_diff_l.append(diff_df.mean().to_frame(name=pur_lvl))

# All predictions stacked row-wise, and the cell types x purity levels table of errors
preds_df = pd.concat(preds_l, axis=0)
avg_diff_df = pd.concat(avg_diff_l, axis=1)

### EPIC

We use CBX-derived reference profiles with Cancer Epithelial removed from them. <br><br>
In this case, we treat `otherCells` in the EPIC output as the cancer cell fraction.

In [None]:
# Collate EPIC predictions.
# avg_diff_l: one single-column frame per purity level holding the mean
#             absolute error |prediction - truth| of every cell type.
# preds_l:    the prediction frame of every purity level.
avg_diff_l = []
preds_l = []

for pur_lvl in tqdm(purity_levels):
    # Read the comma-separated results file of this purity level
    subset_preds_df = pd.read_csv(
        Path(prefix).joinpath(
            f"data/epic/cbx_sig_matrix/results/{pur_lvl}/results.csv"
        ),
        sep=",",
        index_col=0,
    )

    # Map EPIC's column names back to the names used in c_types: "otherCells"
    # stands in for Cancer Epithelial, and the remaining keys are the names
    # with "." in place of spaces, "+" and "-".
    subset_preds_df.rename(
        columns={
            "otherCells": "Cancer Epithelial",
            "B.cells.Memory": "B cells Memory",
            "B.cells.Naive": "B cells Naive",
            "T_cells_c4_CD8._ZFP36": "T_cells_c4_CD8+_ZFP36",
            "T_cells_c7_CD8._IFNG": "T_cells_c7_CD8+_IFNG",
            "T_cells_c8_CD8._LAG3": "T_cells_c8_CD8+_LAG3",
            "T_cells_c0_CD4._CCR7": "T_cells_c0_CD4+_CCR7",
            "T_cells_c1_CD4._IL7R": "T_cells_c1_CD4+_IL7R",
            "T_cells_c2_CD4._T.regs_FOXP3": "T_cells_c2_CD4+_T-regs_FOXP3",
            "T_cells_c3_CD4._Tfh_CXCL13": "T_cells_c3_CD4+_Tfh_CXCL13",
            "Normal.Epithelial": "Normal Epithelial",
        },
        inplace=True,
    )

    # Ground-truth rows of this purity level. A tolerance-based comparison is
    # used because purity values that went through arithmetic or a CSV
    # round-trip are not guaranteed to be bit-identical to the literal in
    # purity_levels; an exact `==` would then silently select no rows.
    subset_truth_df = truth_df[np.isclose(truth_df["Cancer Epithelial"], pur_lvl)]

    # Mean absolute error per cell type; pandas aligns both frames on their
    # index and columns when subtracting.
    diff_df = (subset_preds_df[c_types].sort_index() - subset_truth_df.sort_index()).abs()

    preds_l.append(subset_preds_df)
    avg_diff_l.append(diff_df.mean().to_frame(name=pur_lvl))

# Cell types x purity levels table of errors, and all predictions stacked row-wise
avg_diff_df = pd.concat(avg_diff_l, axis=1)
preds_df = pd.concat(preds_l, axis=0)

### CPM

We ran 4 CPM experiments:<br>
- Only selected 1,330 cells per type (for computational reasons) <br>
    - [Done] with cell state space generated from only training patients <br>
    - [] with cell state space generated from all patients (with test patients filtered out after cell state space is constructed). <br><br>
- Selected all cells per type <br>
    - [Done] with cell state space generated from only training patients <br>
    - [] with cell state space generated from all patients (with test patients filtered out after cell state space is constructed). 

In [None]:
# CPM experiment whose results are collated; the value is interpolated into
# the results path. Swap the commented line in to pick the other experiment.
# experiment = "expr_1_train_p_cell_state_1330_per_ctype"
experiment = "expr_2_original_cellstate_1330_per_ctype/"

# Two fresh, independent accumulators: per-purity-level average
# |preds - truth| frames, and the per-purity-level prediction frames
avg_diff_l, presd_l = [], []

In [None]:
# Collate CPM predictions for the selected `experiment`.
# The accumulators are (re)created in this cell so that re-running it does not
# append a second copy of every frame to lists left over from a previous run.
# avg_diff_l: one single-column frame per purity level holding the mean
#             absolute error |prediction - truth| of every cell type.
# preds_l:    the prediction frame of every purity level.
avg_diff_l = []
preds_l = []

for pur_lvl in tqdm(purity_levels):
    # Read the comma-separated results file of this purity level
    subset_preds_df = pd.read_csv(
        Path(prefix).joinpath(f"data/cpm/{experiment}/results/{pur_lvl}/results.csv"),
        sep=",",
        index_col=0,
    )

    # Ground-truth rows of this purity level. A tolerance-based comparison is
    # used because purity values that went through arithmetic or a CSV
    # round-trip are not guaranteed to be bit-identical to the literal in
    # purity_levels; an exact `==` would then silently select no rows.
    subset_truth_df = truth_df[np.isclose(truth_df["Cancer Epithelial"], pur_lvl)]

    # Mean absolute error per cell type; pandas aligns both frames on their
    # index and columns when subtracting.
    diff_df = (subset_preds_df[c_types].sort_index() - subset_truth_df.sort_index()).abs()

    avg_diff_l.append(diff_df.mean().to_frame(name=pur_lvl))
    preds_l.append(subset_preds_df)

# Cell types x purity levels table of errors, and all predictions stacked row-wise
avg_diff_df = pd.concat(avg_diff_l, axis=1)
preds_df = pd.concat(preds_l, axis=0)

# Keep the original (misspelt) global name bound to the same list for any
# later cell that still reads it
presd_l = preds_l

### bisque

In [None]:
# Choose whether we're extracting results from scaled or non-scaled data.
# The value is the sub-directory of data/bisque/ that the results are read from.
results_dir = "results_minmaxscale"

In [None]:
# Collate bisque predictions from the chosen `results_dir`.
# avg_diff_l: one single-column frame per purity level holding the mean
#             absolute error |prediction - truth| of every cell type.
# preds_l:    the prediction frame of every purity level.
avg_diff_l = []
preds_l = []

for pur_lvl in tqdm(purity_levels):
    # Read the comma-separated results file and transpose it so that the
    # cell types end up as columns (they are selected with c_types below)
    subset_preds_df = pd.read_csv(
        Path(prefix).joinpath(f"data/bisque/{results_dir}/{pur_lvl}/results.csv"),
        sep=",",
        index_col=0,
    ).T

    # Ground-truth rows of this purity level. A tolerance-based comparison is
    # used because purity values that went through arithmetic or a CSV
    # round-trip are not guaranteed to be bit-identical to the literal in
    # purity_levels; an exact `==` would then silently select no rows.
    subset_truth_df = truth_df[np.isclose(truth_df["Cancer Epithelial"], pur_lvl)]

    # Mean absolute error per cell type; pandas aligns both frames on their
    # index and columns when subtracting.
    diff_df = (subset_preds_df[c_types].sort_index() - subset_truth_df.sort_index()).abs()

    preds_l.append(subset_preds_df)
    avg_diff_l.append(diff_df.mean().to_frame(name=pur_lvl))

# All predictions stacked row-wise, and the cell types x purity levels table of errors
preds_df = pd.concat(preds_l, axis=0)
avg_diff_df = pd.concat(avg_diff_l, axis=1)

### DWLS

In [None]:
# Collate DWLS predictions.
# avg_diff_l: one single-column frame per purity level holding the mean
#             absolute error |prediction - truth| of every cell type.
# preds_l:    the prediction frame of every purity level.
avg_diff_l = []
preds_l = []

for pur_lvl in tqdm(purity_levels):
    # Read the comma-separated results file and transpose it so that the
    # cell types end up as columns (they are selected with c_types below)
    subset_preds_df = pd.read_csv(
        Path(prefix).joinpath(f"data/dwls/results/{pur_lvl}/results.csv"),
        sep=",",
        index_col=0,
    ).T

    # Map the sanitised column names (underscores instead of spaces and "-",
    # "+" removed) back to the names used in c_types
    subset_preds_df.rename(
        columns={
            "Normal_Epithelial": "Normal Epithelial",
            "Cancer_Epithelial": "Cancer Epithelial",
            "T_cells_c1_CD4_IL7R": "T_cells_c1_CD4+_IL7R",
            "T_cells_c4_CD8_ZFP36": "T_cells_c4_CD8+_ZFP36",
            "T_cells_c0_CD4_CCR7": "T_cells_c0_CD4+_CCR7",
            "T_cells_c7_CD8_IFNG": "T_cells_c7_CD8+_IFNG",
            "T_cells_c3_CD4_Tfh_CXCL13": "T_cells_c3_CD4+_Tfh_CXCL13",
            "T_cells_c8_CD8_LAG3": "T_cells_c8_CD8+_LAG3",
            "T_cells_c5_CD8_GZMK": "T_cells_c5_CD8+_GZMK",
            "T_cells_c2_CD4_T_regs_FOXP3": "T_cells_c2_CD4+_T-regs_FOXP3",
            "B_cells_Memory": "B cells Memory",
            "B_cells_Naive": "B cells Naive",
        },
        inplace=True,
    )

    # Ground-truth rows of this purity level. A tolerance-based comparison is
    # used because purity values that went through arithmetic or a CSV
    # round-trip are not guaranteed to be bit-identical to the literal in
    # purity_levels; an exact `==` would then silently select no rows.
    subset_truth_df = truth_df[np.isclose(truth_df["Cancer Epithelial"], pur_lvl)]

    # Mean absolute error per cell type; pandas aligns both frames on their
    # index and columns when subtracting.
    diff_df = (subset_preds_df[c_types].sort_index() - subset_truth_df.sort_index()).abs()

    avg_diff_l.append(diff_df.mean().to_frame(name=pur_lvl))
    preds_l.append(subset_preds_df)

# Cell types x purity levels table of errors, and all predictions stacked row-wise
avg_diff_df = pd.concat(avg_diff_l, axis=1)
preds_df = pd.concat(preds_l, axis=0)

## MuSiC

In [None]:
# Collate MuSiC predictions.
# avg_diff_l: one single-column frame per purity level holding the mean
#             absolute error |prediction - truth| of every cell type.
# preds_l:    the prediction frame of every purity level.
avg_diff_l = []
preds_l = []

for pur_lvl in tqdm(purity_levels):
    # Read the comma-separated results file of this purity level
    subset_preds_df = pd.read_csv(
        Path(prefix).joinpath(f"data/music/results/{pur_lvl}/results.csv"),
        sep=",",
        index_col=0,
    )

    # Ground-truth rows of this purity level. A tolerance-based comparison is
    # used because purity values that went through arithmetic or a CSV
    # round-trip are not guaranteed to be bit-identical to the literal in
    # purity_levels; an exact `==` would then silently select no rows.
    subset_truth_df = truth_df[np.isclose(truth_df["Cancer Epithelial"], pur_lvl)]

    # Mean absolute error per cell type; pandas aligns both frames on their
    # index and columns when subtracting.
    diff_df = (subset_preds_df[c_types].sort_index() - subset_truth_df.sort_index()).abs()

    avg_diff_l.append(diff_df.mean().to_frame(name=pur_lvl))
    preds_l.append(subset_preds_df)

# Cell types x purity levels table of errors, and all predictions stacked row-wise
avg_diff_df = pd.concat(avg_diff_l, axis=1)
preds_df = pd.concat(preds_l, axis=0)

## hspe

In [None]:
# Collate hspe predictions. Results are stored in 20 numbered partition
# directories (0-19) per purity level; every partition's frame is collected
# and all of them are stacked row-wise at the end.
# avg_diff_l is reset here but not filled in this cell (no error against the
# ground truth is computed for hspe at this point).
avg_diff_l = []
preds_l = []

# Sanitised column names (underscores instead of spaces and "-", "+" removed)
# mapped back to the names used in c_types
hspe_column_names = {
    "Normal_Epithelial": "Normal Epithelial",
    "Cancer_Epithelial": "Cancer Epithelial",
    "T_cells_c1_CD4_IL7R": "T_cells_c1_CD4+_IL7R",
    "T_cells_c4_CD8_ZFP36": "T_cells_c4_CD8+_ZFP36",
    "T_cells_c0_CD4_CCR7": "T_cells_c0_CD4+_CCR7",
    "T_cells_c7_CD8_IFNG": "T_cells_c7_CD8+_IFNG",
    "T_cells_c3_CD4_Tfh_CXCL13": "T_cells_c3_CD4+_Tfh_CXCL13",
    "T_cells_c8_CD8_LAG3": "T_cells_c8_CD8+_LAG3",
    "T_cells_c5_CD8_GZMK": "T_cells_c5_CD8+_GZMK",
    "T_cells_c2_CD4_T_regs_FOXP3": "T_cells_c2_CD4+_T-regs_FOXP3",
    "B_cells_Memory": "B cells Memory",
    "B_cells_Naive": "B cells Naive",
}

for pur_lvl in tqdm(purity_levels):
    for partition in range(20):
        # Read this partition's comma-separated results and restore the
        # column names in one step
        subset_preds_df = pd.read_csv(
            Path(prefix) / f"data/hspe/results/{pur_lvl}/{partition}/results.csv",
            sep=",",
            index_col=0,
        ).rename(columns=hspe_column_names)

        preds_l.append(subset_preds_df)

preds_df = pd.concat(preds_l, axis=0)

## BayesPrism

In [None]:
# Collate BayesPrism predictions.
# avg_diff_l: one single-column frame per purity level holding the mean
#             absolute error |prediction - truth| of every cell type.
# preds_l:    the prediction frame of every purity level.
avg_diff_l = []
preds_l = []

for pur_lvl in tqdm(purity_levels):
    # Read the comma-separated results file of this purity level
    subset_preds_df = pd.read_csv(
        Path(prefix).joinpath(f"data/bprism/results/{pur_lvl}/results.csv"),
        sep=",",
        index_col=0,
    )

    # Map the sanitised column names (underscores instead of spaces and "-",
    # "+" removed) back to the names used in c_types
    subset_preds_df.rename(
        columns={
            "Normal_Epithelial": "Normal Epithelial",
            "Cancer_Epithelial": "Cancer Epithelial",
            "T_cells_c1_CD4_IL7R": "T_cells_c1_CD4+_IL7R",
            "T_cells_c4_CD8_ZFP36": "T_cells_c4_CD8+_ZFP36",
            "T_cells_c0_CD4_CCR7": "T_cells_c0_CD4+_CCR7",
            "T_cells_c7_CD8_IFNG": "T_cells_c7_CD8+_IFNG",
            "T_cells_c3_CD4_Tfh_CXCL13": "T_cells_c3_CD4+_Tfh_CXCL13",
            "T_cells_c8_CD8_LAG3": "T_cells_c8_CD8+_LAG3",
            "T_cells_c5_CD8_GZMK": "T_cells_c5_CD8+_GZMK",
            "T_cells_c2_CD4_T_regs_FOXP3": "T_cells_c2_CD4+_T-regs_FOXP3",
            "B_cells_Memory": "B cells Memory",
            "B_cells_Naive": "B cells Naive",
        },
        inplace=True,
    )

    # Ground-truth rows of this purity level. A tolerance-based comparison is
    # used because purity values that went through arithmetic or a CSV
    # round-trip are not guaranteed to be bit-identical to the literal in
    # purity_levels; an exact `==` would then silently select no rows.
    subset_truth_df = truth_df[np.isclose(truth_df["Cancer Epithelial"], pur_lvl)]

    # Mean absolute error per cell type; pandas aligns both frames on their
    # index and columns when subtracting.
    diff_df = (subset_preds_df[c_types].sort_index() - subset_truth_df.sort_index()).abs()

    avg_diff_l.append(diff_df.mean().to_frame(name=pur_lvl))
    preds_l.append(subset_preds_df)

# Cell types x purity levels table of errors, and all predictions stacked row-wise
avg_diff_df = pd.concat(avg_diff_l, axis=1)
preds_df = pd.concat(preds_l, axis=0)