# Collate model predictions from purity-level partitions

In [None]:
import numpy as np
import pandas as pd
import anndata as adata
import scanpy as sc

from tqdm import tqdm
from pathlib import Path
from typing import Tuple, List, Dict

import plotly.graph_objects as go
import plotly as plotly
import plotly.express as px

In [None]:
# Root directory of the benchmarking experiment; every results path in this
# notebook is built relative to it.
prefix = "/working/lab_nicw/khoaT/deep_tme/tme_profiling/tme_benchmarking/deconvolution_benchmarking/2021_12_05_normal_lineages"
# prefix = "/home/khoat/Development/deep_tme/data/scRNA/Swarbrick_GSE176078/2021_12_05_normal_lineages"

# Purity levels to collate: 0.45, 0.5, 0.55 and 0.6. Each value is interpolated
# into result paths and matched against the "Cancer Epithelial" column of the
# truth table, so it must be a clean, rounded float.
# np.linspace with an explicit number of points is used rather than np.arange
# with a float step, whose output length depends on floating-point rounding.
purity_levels = np.linspace(0.45, 0.60, num=4).round(3).tolist()

# Cell-type columns selected from the truth table and from every prediction
# table before the two are compared.
c_types = [
    "Endothelial",
    "CAFs",
    "PVL",
    "B-cells",
    "T-cells",
    "Myeloid",
    "Plasmablasts",
    "Cancer Epithelial",
    "Luminal Progenitors",
    "Mature Luminal",
    "Myoepithelial",
]

In [None]:
# Ground-truth table: tab-separated, first column used as the row index
truth_df = pd.read_csv(
    Path(prefix) / "data/results/truth.csv",
    sep="\t",
    index_col=0,
)
# Keep only the cell types listed in c_types, in that order
truth_df = truth_df.loc[:, c_types]
truth_df.head(2)

### CIBERSORTx

In [None]:
# Name of the CIBERSORTx output file to collate.
# If we run in normal mode, the results file is called CIBERSORTx_Results.txt
# If we run in S-mode or B-mode, the results file is called CIBERSORTx_Adjusted.txt
# Adjust the filename accordingly
results_f = "CIBERSORTx_Results.txt"
# results_f = "CIBERSORTx_Adjusted.txt"

In [None]:
# Per-purity accumulators:
#   avg_diff_l - mean absolute (preds - truth) per cell type
#   preds_l    - prediction tables
avg_diff_l = []
preds_l = []

for pur_lvl in tqdm(purity_levels):
    # CIBERSORTx output for this purity level (tab-separated, first column as index)
    subset_preds_df = pd.read_csv(
        Path(prefix) / f"data/cbx/results/{pur_lvl}/{results_f}",
        sep="\t",
        index_col=0,
    )
    # Drop the P-value / Correlation / RMSE columns before storing the predictions
    subset_preds_df = subset_preds_df.drop(columns=["P-value", "Correlation", "RMSE"])
    preds_l.append(subset_preds_df)

    # Truth rows whose "Cancer Epithelial" value equals this purity level
    subset_truth_df = truth_df[truth_df["Cancer Epithelial"] == pur_lvl]

    # |preds - truth| on the c_types columns, averaged over rows; the result is
    # a single-column frame labelled with the purity level
    diff_df = (
        subset_preds_df[c_types].sort_index() - subset_truth_df.sort_index()
    ).abs()
    avg_diff = diff_df.mean().to_frame(name=pur_lvl)

    avg_diff_l.append(avg_diff)

# One column per purity level / all predictions stacked row-wise
avg_diff_df = pd.concat(avg_diff_l, axis=1)
preds_df = pd.concat(preds_l, axis=0)

### Scaden

In [None]:
# Scaden models to collate; results are read from data/scaden/<model>/
models = [
    "kondrashova_train",  # "normal_train"
]

# NOTE: preds_df and avg_diff_df are rebuilt on every pass of the outer loop,
# so once it finishes they describe the last entry of `models` only.
for model in tqdm(models):
    avg_diff_l = []
    preds_l = []

    for pur_lvl in tqdm(purity_levels):
        # Predictions for this model and purity level (tab-separated)
        subset_preds_df = pd.read_csv(
            Path(prefix) / f"data/scaden/{model}/results_{pur_lvl}.txt",
            sep="\t",
            index_col=0,
        )
        # Truth rows whose "Cancer Epithelial" value equals this purity level
        subset_truth_df = truth_df[truth_df["Cancer Epithelial"] == pur_lvl]

        # Mean absolute (preds - truth) per cell type, labelled by purity level
        diff_df = (
            subset_preds_df[c_types].sort_index() - subset_truth_df.sort_index()
        ).abs()
        avg_diff = diff_df.mean().to_frame(name=pur_lvl)

        preds_l.append(subset_preds_df)
        avg_diff_l.append(avg_diff)

    preds_df = pd.concat(preds_l, axis=0)
    avg_diff_df = pd.concat(avg_diff_l, axis=1)

### EPIC

#### If we're using default reference profiles

In [None]:
# Accumulators are reset here; this cell only fills preds_l (no comparison
# against truth is made, so avg_diff_l stays empty).
avg_diff_l = []
preds_l = []

for pur_lvl in tqdm(purity_levels):
    # EPIC output for this purity level (comma-separated, first column as index)
    subset_preds_df = pd.read_csv(
        Path(prefix) / f"data/epic/results/{pur_lvl}/results.csv",
        sep=",",
        index_col=0,
    )

    # Show the per-column mean of the predictions for a quick visual check
    print(subset_preds_df.mean())

    preds_l.append(subset_preds_df)

# All purity levels stacked row-wise
preds_df = pd.concat(preds_l, axis=0)

#### If we're using CBX-derived reference profiles and remove Cancer Epithelial from reference profiles
In this case, we're treating Other Cells in the output as cancer cells

In [None]:
# Per-purity accumulators:
#   avg_diff_l - mean absolute (preds - truth) per cell type
#   preds_l    - prediction tables
avg_diff_l = []
preds_l = []

for pur_lvl in tqdm(purity_levels):
    # Read predictions and rename the columns to the names used in truth_df.
    # "otherCells" is relabelled as "Cancer Epithelial" (see the note above this cell).
    subset_preds_df = pd.read_csv(
        Path(prefix) / f"data/epic/cbx_sig_matrix/results/{pur_lvl}/results.csv",
        sep=",",
        index_col=0,
    ).rename(
        columns={
            "otherCells": "Cancer Epithelial",
            "B.cells": "B-cells",
            "T.cells": "T-cells",
            "Luminal.Progenitors": "Luminal Progenitors",
            "Mature.Luminal": "Mature Luminal",
        }
    )

    # Truth rows whose "Cancer Epithelial" value equals this purity level
    subset_truth_df = truth_df[truth_df["Cancer Epithelial"] == pur_lvl]

    # Mean absolute (preds - truth) per cell type, labelled by purity level
    diff_df = (
        subset_preds_df[c_types].sort_index() - subset_truth_df.sort_index()
    ).abs()
    avg_diff = diff_df.mean().to_frame(name=pur_lvl)

    preds_l.append(subset_preds_df)
    avg_diff_l.append(avg_diff)

avg_diff_df = pd.concat(avg_diff_l, axis=1)
preds_df = pd.concat(preds_l, axis=0)

#### If we're using CBX-derived reference profiles and keep Cancer Epithelial in reference profiles
In this case, we're treating Other Cells in the output as anything that the model cannot deconvolute

In [None]:
# Per-purity mean absolute (preds - truth) per cell type.
# Note: this cell does not accumulate the prediction tables, so preds_df is
# not updated here.
avg_diff_l = []

for pur_lvl in tqdm(purity_levels):
    # Read predictions (comma-separated, first column as index)
    subset_preds_df = pd.read_csv(
        Path(prefix)
        / f"data/epic/cbx_sig_matrix_with_cancer/results/{pur_lvl}/results.csv",
        sep=",",
        index_col=0,
    )

    # Rename the dotted column names to the names used in truth_df
    subset_preds_df = subset_preds_df.rename(
        columns={
            "Cancer.Epithelial": "Cancer Epithelial",
            "B.cells": "B-cells",
            "T.cells": "T-cells",
            "Luminal.Progenitors": "Luminal Progenitors",
            "Mature.Luminal": "Mature Luminal",
        }
    )

    # otherCells is not compared against truth in this variant; remove it
    subset_preds_df = subset_preds_df.drop(columns=["otherCells"])

    # Rebuild each index label from its first three "."-separated pieces as
    # "<p0>.<p1>-<p2>", i.e. the second "." becomes "-"
    split_ids = [label.split(".") for label in subset_preds_df.index]
    subset_preds_df.index = [
        pieces[0] + "." + pieces[1] + "-" + pieces[2] for pieces in split_ids
    ]

    # Truth rows whose "Cancer Epithelial" value equals this purity level
    subset_truth_df = truth_df[truth_df["Cancer Epithelial"] == pur_lvl]

    # Mean absolute (preds - truth) per cell type, labelled by purity level
    diff_df = (
        subset_preds_df[c_types].sort_index() - subset_truth_df.sort_index()
    ).abs()
    avg_diff = diff_df.mean().to_frame(name=pur_lvl)

    avg_diff_l.append(avg_diff)

avg_diff_df = pd.concat(avg_diff_l, axis=1)

### CPM

We ran 4 CPM experiments:<br>
- Only selected 1,330 cells per type (for computational reasons) <br>
    - [Done] with cell state space generated from only training patients <br>
    - [] with cell state space generated from all patients (with test patients filtered out after cell state space is constructed). <br><br>
- Selected all cells per type <br>
    - [Done] with cell state space generated from only training patients <br>
    - [] with cell state space generated from all patients (with test patients filtered out after cell state space is constructed). 

In [None]:
# Select which experiment we'd like to generate results for.
# The value is interpolated into the results path as data/cpm/<experiment>/results/...
# experiment = "expr_1_train_p_cell_state_1330_per_ctype"
experiment = "expr_2_original_cellstate_1330_per_ctype/"

# Accumulators filled by the loading loop in the next cell:
#   avg_diff_l - average of |preds - truth| for each purity level
#   presd_l    - prediction tables for each purity level
#                (spelled "presd" here and in the loading loop; rename both together)
avg_diff_l = []
presd_l = []

In [None]:
# Fill avg_diff_l / presd_l (initialised in the previous cell), one entry per
# purity level
for pur_lvl in tqdm(purity_levels):
    # CPM output for the selected experiment (comma-separated, first column as index)
    subset_preds_df = pd.read_csv(
        Path(prefix) / f"data/cpm/{experiment}/results/{pur_lvl}/results.csv",
        sep=",",
        index_col=0,
    )

    # Truth rows whose "Cancer Epithelial" value equals this purity level
    subset_truth_df = truth_df[truth_df["Cancer Epithelial"] == pur_lvl]

    # Mean absolute (preds - truth) per cell type, labelled by purity level
    diff_df = (
        subset_preds_df[c_types].sort_index() - subset_truth_df.sort_index()
    ).abs()
    avg_diff = diff_df.mean().to_frame(name=pur_lvl)

    avg_diff_l.append(avg_diff)
    presd_l.append(subset_preds_df)

avg_diff_df = pd.concat(avg_diff_l, axis=1)
preds_df = pd.concat(presd_l, axis=0)

### bisque

In [None]:
# Choose whether we're extracting results from scaled or non-scaled data.
# This is the sub-directory of data/bisque/ that the loading loop reads from.
results_dir = "results_minmaxscale"

In [None]:
# Per-purity accumulators:
#   avg_diff_l - mean absolute (preds - truth) per cell type
#   preds_l    - prediction tables
avg_diff_l = []
preds_l = []

for pur_lvl in tqdm(purity_levels):
    # Read predictions and transpose them, so that the c_types labels can be
    # selected as columns below
    subset_preds_df = pd.read_csv(
        Path(prefix) / f"data/bisque/{results_dir}/{pur_lvl}/results.csv",
        sep=",",
        index_col=0,
    ).T

    # Truth rows whose "Cancer Epithelial" value equals this purity level
    subset_truth_df = truth_df[truth_df["Cancer Epithelial"] == pur_lvl]

    # Mean absolute (preds - truth) per cell type, labelled by purity level
    diff_df = (
        subset_preds_df[c_types].sort_index() - subset_truth_df.sort_index()
    ).abs()
    avg_diff = diff_df.mean().to_frame(name=pur_lvl)

    preds_l.append(subset_preds_df)
    avg_diff_l.append(avg_diff)

preds_df = pd.concat(preds_l, axis=0)
avg_diff_df = pd.concat(avg_diff_l, axis=1)

### CIBERSORT

In [None]:
# Sub-directory of data/cb/ holding the CIBERSORT run to collate.
# NOTE: this rebinds `experiment`, which the CPM section also sets.
experiment = "expr2_cbx_sig_matrix"

In [None]:
# Per-purity accumulators:
#   avg_diff_l - mean absolute (preds - truth) per cell type
#   preds_l    - prediction tables
avg_diff_l = []
preds_l = []

# Iterate over purity levels
for pur_lvl in tqdm(purity_levels):
    # CIBERSORT output for the selected experiment (comma-separated, first
    # column as index)
    subset_preds_df = pd.read_csv(
        Path(prefix).joinpath(f"data/cb/{experiment}/results/{pur_lvl}/results.csv"),
        sep=",",
        index_col=0,
    )
    # Truth rows whose "Cancer Epithelial" value equals this purity level
    subset_truth_df = truth_df[truth_df["Cancer Epithelial"] == pur_lvl]
    preds_l.append(subset_preds_df)

    # Absolute (preds - truth) on the c_types columns for this purity level
    diff_df = abs(subset_preds_df[c_types].sort_index() - subset_truth_df.sort_index())
    # Average over rows, then turn the Series into a single-column frame
    # labelled with the purity level so the frames can be concatenated side by side
    avg_diff = diff_df.mean().to_frame()
    avg_diff.columns = [pur_lvl]

    avg_diff_l.append(avg_diff)

# One column per purity level / all predictions stacked row-wise
avg_diff_df = pd.concat(avg_diff_l, axis=1)
preds_df = pd.concat(preds_l, axis=0)

### DWLS

In [None]:
# Per-purity accumulators:
#   avg_diff_l - mean absolute (preds - truth) per cell type
#   preds_l    - prediction tables
avg_diff_l = []
preds_l = []

for pur_lvl in tqdm(purity_levels):
    # Read predictions, transpose them, then map the underscore-style column
    # names onto the names used in truth_df
    subset_preds_df = (
        pd.read_csv(
            Path(prefix) / f"data/dwls/results/{pur_lvl}/results.csv",
            sep=",",
            index_col=0,
        )
        .T.rename(
            columns={
                "T_cells": "T-cells",
                "B_cells": "B-cells",
                "Luminal_Progenitors": "Luminal Progenitors",
                "Mature_Luminal": "Mature Luminal",
                "Cancer_Epithelial": "Cancer Epithelial",
            }
        )
    )

    # Truth rows whose "Cancer Epithelial" value equals this purity level
    subset_truth_df = truth_df[truth_df["Cancer Epithelial"] == pur_lvl]

    # Mean absolute (preds - truth) per cell type, labelled by purity level
    diff_df = (
        subset_preds_df[c_types].sort_index() - subset_truth_df.sort_index()
    ).abs()
    avg_diff = diff_df.mean().to_frame(name=pur_lvl)

    avg_diff_l.append(avg_diff)
    preds_l.append(subset_preds_df)

avg_diff_df = pd.concat(avg_diff_l, axis=1)
preds_df = pd.concat(preds_l, axis=0)

## MuSiC

In [None]:
# Per-purity accumulators:
#   avg_diff_l - mean absolute (preds - truth) per cell type
#   preds_l    - prediction tables
avg_diff_l = []
preds_l = []

for pur_lvl in tqdm(purity_levels):
    # MuSiC output for this purity level (comma-separated, first column as index)
    subset_preds_df = pd.read_csv(
        Path(prefix) / f"data/music/results/{pur_lvl}/results.csv",
        sep=",",
        index_col=0,
    )

    # Rebuild each index label from its first three "."-separated pieces as
    # "<p0>.<p1>-<p2>", i.e. the second "." becomes "-"
    split_ids = [label.split(".") for label in subset_preds_df.index]
    subset_preds_df.index = [
        pieces[0] + "." + pieces[1] + "-" + pieces[2] for pieces in split_ids
    ]

    # Truth rows whose "Cancer Epithelial" value equals this purity level
    subset_truth_df = truth_df[truth_df["Cancer Epithelial"] == pur_lvl]

    # Mean absolute (preds - truth) per cell type, labelled by purity level
    diff_df = (
        subset_preds_df[c_types].sort_index() - subset_truth_df.sort_index()
    ).abs()
    avg_diff = diff_df.mean().to_frame(name=pur_lvl)

    avg_diff_l.append(avg_diff)
    preds_l.append(subset_preds_df)

avg_diff_df = pd.concat(avg_diff_l, axis=1)
preds_df = pd.concat(preds_l, axis=0)

## dtangle

In [None]:
# Per-purity accumulators:
#   avg_diff_l - mean absolute (preds - truth) per cell type
#   preds_l    - prediction tables
avg_diff_l = []
preds_l = []

for pur_lvl in tqdm(purity_levels):
    # Read dtangle predictions and map the underscore-style column names onto
    # the names used in truth_df
    subset_preds_df = pd.read_csv(
        Path(prefix) / f"data/dtangle/results/{pur_lvl}/results.csv",
        sep=",",
        index_col=0,
    ).rename(
        columns={
            "T_cells": "T-cells",
            "B_cells": "B-cells",
            "Luminal_Progenitors": "Luminal Progenitors",
            "Mature_Luminal": "Mature Luminal",
            "Cancer_Epithelial": "Cancer Epithelial",
        }
    )

    # Truth rows whose "Cancer Epithelial" value equals this purity level
    subset_truth_df = truth_df[truth_df["Cancer Epithelial"] == pur_lvl]

    # Mean absolute (preds - truth) per cell type, labelled by purity level
    diff_df = (
        subset_preds_df[c_types].sort_index() - subset_truth_df.sort_index()
    ).abs()
    avg_diff = diff_df.mean().to_frame(name=pur_lvl)

    avg_diff_l.append(avg_diff)
    preds_l.append(subset_preds_df)

avg_diff_df = pd.concat(avg_diff_l, axis=1)
preds_df = pd.concat(preds_l, axis=0)

## hspe

In [None]:
# Per-purity accumulators:
#   avg_diff_l - mean absolute (preds - truth) per cell type
#   preds_l    - prediction tables
avg_diff_l = []
preds_l = []

for pur_lvl in tqdm(purity_levels):
    # Read hspe predictions and map the underscore-style column names onto
    # the names used in truth_df
    subset_preds_df = pd.read_csv(
        Path(prefix) / f"data/hspe/results/{pur_lvl}/results.csv",
        sep=",",
        index_col=0,
    ).rename(
        columns={
            "T_cells": "T-cells",
            "B_cells": "B-cells",
            "Luminal_Progenitors": "Luminal Progenitors",
            "Mature_Luminal": "Mature Luminal",
            "Cancer_Epithelial": "Cancer Epithelial",
        }
    )

    # Truth rows whose "Cancer Epithelial" value equals this purity level
    subset_truth_df = truth_df[truth_df["Cancer Epithelial"] == pur_lvl]

    # Mean absolute (preds - truth) per cell type, labelled by purity level
    diff_df = (
        subset_preds_df[c_types].sort_index() - subset_truth_df.sort_index()
    ).abs()
    avg_diff = diff_df.mean().to_frame(name=pur_lvl)

    avg_diff_l.append(avg_diff)
    preds_l.append(subset_preds_df)

avg_diff_df = pd.concat(avg_diff_l, axis=1)
preds_df = pd.concat(preds_l, axis=0)

## BayesPrism

In [None]:
# Per-purity accumulators:
#   avg_diff_l - mean absolute (preds - truth) per cell type
#   preds_l    - prediction tables
avg_diff_l = []
preds_l = []

# Only the 0.5 purity level is read in this section (the list is hard-coded
# rather than taken from purity_levels)
for pur_lvl in tqdm([0.5]):
    # Read BayesPrism predictions and map the underscore-style column names
    # onto the names used in truth_df
    subset_preds_df = pd.read_csv(
        Path(prefix) / f"data/bprism/results/{pur_lvl}/results.csv",
        sep=",",
        index_col=0,
    ).rename(
        columns={
            "T_cells": "T-cells",
            "B_cells": "B-cells",
            "Luminal_Progenitors": "Luminal Progenitors",
            "Mature_Luminal": "Mature Luminal",
            "Cancer_Epithelial": "Cancer Epithelial",
        }
    )

    # Truth rows whose "Cancer Epithelial" value equals this purity level
    subset_truth_df = truth_df[truth_df["Cancer Epithelial"] == pur_lvl]

    # Mean absolute (preds - truth) per cell type, labelled by purity level
    diff_df = (
        subset_preds_df[c_types].sort_index() - subset_truth_df.sort_index()
    ).abs()
    avg_diff = diff_df.mean().to_frame(name=pur_lvl)

    avg_diff_l.append(avg_diff)
    preds_l.append(subset_preds_df)

avg_diff_df = pd.concat(avg_diff_l, axis=1)
preds_df = pd.concat(preds_l, axis=0)