# Collate model predictions from purity-level partitions

In [None]:
import numpy as np
import pandas as pd
import anndata as adata
import scanpy as sc

from tqdm import tqdm
from pathlib import Path
from typing import Tuple, List, Dict

import plotly.graph_objects as go
import plotly as plotly
import plotly.express as px

%load_ext blackcellmagic

In [None]:
# Root folder of the TCGA bulk-validation experiment; every data path used in
# this notebook is resolved relative to it.
prefix = "???/deconvolution_benchmarking/04_tcga_bulk_validation"

# Cell-type labels, in the column order used when comparing predictions.
c_types = (
    ["Cancer Epithelial", "Normal Epithelial"]
    + ["T-cells", "B-cells", "Myeloid"]
    + ["Endothelial", "CAFs", "PVL", "Plasmablasts"]
)

### CIBERSORTx

In [None]:
# The CIBERSORTx output filename depends on the run mode:
#   - normal mode     -> CIBERSORTx_Results
#   - S-mode / B-mode -> CIBERSORTx_Adjusted.txt
# Set the filename below to match the mode that was run.
results_f = "CIBERSORTx_Adjusted.txt"

# Load the tab-separated predictions, using the first column as the index
cbx_input_path = Path(prefix) / f"data/cbx/results/{results_f}"
preds_df = pd.read_csv(cbx_input_path, sep="\t", index_col=0)

# Write the predictions to the shared results folder (tab-separated)
cbx_output_path = Path(prefix) / "data/results/cbx.csv"
preds_df.to_csv(cbx_output_path, sep="\t")

### Scaden

In [None]:
# Name of the Scaden model folder whose results should be collected
model = "filtered_ensembl_id"

# Load the tab-separated predictions, using the first column as the index
scaden_input_path = Path(prefix) / f"data/scaden/{model}/results.txt"
preds_df = pd.read_csv(scaden_input_path, sep="\t", index_col=0)

# Write the predictions to the shared results folder (tab-separated)
scaden_output_path = Path(prefix) / "data/results/scaden.csv"
preds_df.to_csv(scaden_output_path, sep="\t")

### EPIC

#### If we're using default reference profiles

In [None]:
# Read the comma-separated EPIC predictions, using the first column as the index
preds_df = pd.read_csv(
    Path(prefix).joinpath("data/epic/cbx_sig_matrix/results/results.csv"),
    sep=",",
    index_col=0,
)

# EPIC replaces "-" by "." and adds "X" to the beginning of uuid that starts with a number
# Revert these
preds_df.index = [i.replace(".", "-") for i in preds_df.index]
# Drop only the single leading "X". Splitting on "X" would truncate any ID that
# contains another "X" further along, and indexing i[0] fails on an empty label.
preds_df.index = [i[1:] if i.startswith("X") else i for i in preds_df.index]

# Fix up column names so they match the labels used elsewhere in this notebook
preds_df.rename(
    columns={
        "otherCells": "Cancer Epithelial",
        "B.cells": "B-cells",
        "T.cells": "T-cells",
        "Normal.Epithelial": "Normal Epithelial",
    },
    inplace=True,
)

# Save predictions (tab-separated)
preds_df.to_csv(Path(prefix).joinpath("data/results/epic.csv"), sep="\t")

### CPM

We ran 4 CPM experiments:<br>
- Selected only 1,330 cells per type (for computational reasons) <br>
    - [Done] with cell state space generated from only training patients <br>
    - [] with cell state space generated from all patients (with test patients filtered out after cell state space is constructed). <br><br>
- Selected all cells per type <br>
    - [Done] with cell state space generated from only training patients <br>
    - [] with cell state space generated from all patients (with test patients filtered out after cell state space is constructed). 

In [None]:
# Select which experiments we'd like to generate results for
experiment = "expr_2_original_cellstate_1330_per_ctype/"

# Read the comma-separated CPM predictions, using the first column as the index
preds_df = pd.read_csv(
    Path(prefix).joinpath(f"data/cpm/{experiment}/results/results.csv"),
    sep=",",
    index_col=0,
)

# CPM replaces "-" by "." and adds "X" to the beginning of uuid that starts with a number
# Revert these
preds_df.index = [i.replace(".", "-") for i in preds_df.index]
# Drop only the single leading "X". Splitting on "X" would truncate any ID that
# contains another "X" further along, and indexing i[0] fails on an empty label.
preds_df.index = [i[1:] if i.startswith("X") else i for i in preds_df.index]

# Save predictions (tab-separated)
preds_df.to_csv(Path(prefix).joinpath("data/results/cpm.csv"), sep="\t")

### bisque

In [None]:
# Which bisque run to collect; used in both the input folder and the output filename
expt = "linear"

# Read predictions and transpose them so that the labels handled below as
# sample IDs end up on the index
preds_df = pd.read_csv(
    Path(prefix).joinpath(f"data/bisque/results_{expt}/results.csv"),
    sep=",",
    index_col=0,
).T

# The exported IDs have "-" replaced by "." and an "X" added to the beginning of
# uuid that starts with a number (presumably done on the R side -- TODO confirm)
# Revert these
preds_df.index = [i.replace(".", "-") for i in preds_df.index]
# Drop only the single leading "X". Splitting on "X" would truncate any ID that
# contains another "X" further along, and indexing i[0] fails on an empty label.
preds_df.index = [i[1:] if i.startswith("X") else i for i in preds_df.index]

# Save predictions (tab-separated)
preds_df.to_csv(Path(prefix).joinpath(f"data/results/bisque_{expt}.csv"), sep="\t")

### DWLS

In [None]:
# Which signature-matrix build method's DWLS results to collect; used in both
# the input folder and the output filename
sig_matrix_build_method = "seurat"

# Read predictions and transpose them so that the labels handled below as
# sample IDs end up on the index
preds_df = pd.read_csv(
    Path(prefix).joinpath(f"data/dwls/results_{sig_matrix_build_method}/results.csv"),
    sep=",",
    index_col=0,
).T

# The exported IDs have "-" replaced by "." and an "X" added to the beginning of
# uuid that starts with a number (presumably done on the R side -- TODO confirm)
# Revert these
preds_df.index = [i.replace(".", "-") for i in preds_df.index]
# Drop only the single leading "X". Splitting on "X" would truncate any ID that
# contains another "X" further along, and indexing i[0] fails on an empty label.
preds_df.index = [i[1:] if i.startswith("X") else i for i in preds_df.index]

# Fix up column names so they match the labels used elsewhere in this notebook
preds_df.rename(
    columns={
        "T_cells": "T-cells",
        "B_cells": "B-cells",
        "Normal_Epithelial": "Normal Epithelial",
        "Cancer_Epithelial": "Cancer Epithelial",
    },
    inplace=True,
)

# Save predictions (tab-separated)
preds_df.to_csv(
    Path(prefix).joinpath(f"data/results/dwls_{sig_matrix_build_method}.csv"), sep="\t"
)

## MuSiC

In [None]:
# Read the comma-separated MuSiC predictions, using the first column as the index
preds_df = pd.read_csv(
    Path(prefix).joinpath("data/music/results/results.csv"),
    sep=",",
    index_col=0,
)

# The exported IDs have "-" replaced by "." and an "X" added to the beginning of
# uuid that starts with a number (presumably done on the R side -- TODO confirm)
# Revert these
preds_df.index = [i.replace(".", "-") for i in preds_df.index]
# Drop only the single leading "X". Splitting on "X" would truncate any ID that
# contains another "X" further along, and indexing i[0] fails on an empty label.
preds_df.index = [i[1:] if i.startswith("X") else i for i in preds_df.index]

# Save predictions (tab-separated)
preds_df.to_csv(Path(prefix).joinpath("data/results/music.csv"), sep="\t")

## hspe

In [None]:
# Accumulator for the per-partition prediction frames
preds_l = []

# hspe column names use underscores; map them to the labels used in this notebook
hspe_column_fixes = {
    "T_cells": "T-cells",
    "B_cells": "B-cells",
    "Normal_Epithelial": "Normal Epithelial",
    "Cancer_Epithelial": "Cancer Epithelial",
}

# Results are stored in one folder per partition, numbered 0 through 19
for partition in range(20):
    partition_path = Path(prefix) / f"data/hspe/results/{partition}/results.csv"

    # Load this partition's predictions and fix up its column names
    subset_preds_df = pd.read_csv(partition_path, sep=",", index_col=0).rename(
        columns=hspe_column_fixes
    )

    preds_l.append(subset_preds_df)

# Stack all partitions row-wise into a single frame
preds_df = pd.concat(preds_l, axis=0)

# Save predictions (tab-separated)
hspe_output_path = Path(prefix) / "data/results/hspe.csv"
preds_df.to_csv(hspe_output_path, sep="\t")

## BayesPrism v2

In [None]:
# We run BayesPrism v2 in different experiments. Decide which set of results to pull
# Experiment name is also the suffix of the results folder
expt = "marker_genes_cell_states"

# Read the comma-separated BayesPrism predictions, using the first column as the index
preds_df = pd.read_csv(
    Path(prefix).joinpath(f"data/bprism_v2/results_{expt}/results.csv"),
    sep=",",
    index_col=0,
)

# The exported IDs have "-" replaced by "." and an "X" added to the beginning of
# uuid that starts with a number (presumably done on the R side -- TODO confirm)
# Revert these
preds_df.index = [i.replace(".", "-") for i in preds_df.index]
# Drop only the single leading "X". Splitting on "X" would truncate any ID that
# contains another "X" further along, and indexing i[0] fails on an empty label.
preds_df.index = [i[1:] if i.startswith("X") else i for i in preds_df.index]

# Fix up column names so they match the labels used elsewhere in this notebook
preds_df.rename(
    columns={
        "T_cells": "T-cells",
        "B_cells": "B-cells",
        "Normal_Epithelial": "Normal Epithelial",
        "Cancer_Epithelial": "Cancer Epithelial",
    },
    inplace=True,
)
# NOTE(review): unlike the other sections, no "save predictions" step is visible
# for BayesPrism here -- confirm whether one is missing.

    # NOTE(review): these indented statements are a loop body whose header is not
    # visible here; `truth_df`, `pur_lvl` and `avg_diff_l` are not defined in the
    # visible part of the file. Presumably the loop iterates over purity levels
    # -- TODO confirm, or remove this fragment if it is stale.
    # Keep only the truth rows whose "Cancer Epithelial" value equals pur_lvl
    subset_truth_df = truth_df[truth_df["Cancer Epithelial"] == pur_lvl]

    # Calculate |preds - truth| for each purity level (both frames sorted by index first)
    diff_df = abs(subset_preds_df[c_types].sort_index() - subset_truth_df.sort_index())
    # Signed (non-absolute) alternative, left disabled:
    # diff_df = subset_preds_df[c_types] - subset_truth_df
    # Column-wise mean of the absolute differences, as a one-column frame labelled pur_lvl
    avg_diff = diff_df.mean().to_frame()
    avg_diff.columns = [pur_lvl]

    avg_diff_l.append(avg_diff)
    preds_l.append(subset_preds_df)

# Combine the per-iteration mean differences side by side (one column each)
avg_diff_df = pd.concat(avg_diff_l, axis=1)
# Stack the per-iteration predictions row-wise
preds_df = pd.concat(preds_l, axis=0)