# Prepare training and test data for specific tools

In [None]:
import json
import shutil

import pandas as pd
import numpy as np
import scanpy as sc
import anndata as adata

from tqdm import tqdm
from pathlib import Path
from sklearn import preprocessing as pp
from sklearn.preprocessing import LabelEncoder

In [None]:
# Root directory of this experiment; the paths built with Path(prefix) below are resolved against it
# NOTE(review): the leading "???" looks like a redacted placeholder - confirm the real location before running
prefix = "???/deconvolution_benchmarking/01_purity_levels_experiment/exclude_normal_epithelial"

In [None]:
# Training patient IDs
train_p_ids = [
    "CID3586",
    "CID3941",
    "CID3963",
    "CID44041",
    "CID4530N",
    "CID3838",
    "CID3946",
    "CID4040",
    "CID4461",
    "CID44991",
    "CID45171",
    "CID4535",
    "CID3948",
    "CID4398",
    "CID4463",
    "CID4495",
    "CID4513",
    "CID4465",
]

# Test patient IDs
test_p_ids = [
    "CID4067",
    "CID4290A",
    "CID4471",
    "CID3921",
    "CID4066",
    "CID4523",
    "CID44971",
    "CID4515",
]

# 19 tumour purity levels: 5% to 95% in steps of 5%
# Built from an integer range so the number of levels cannot be affected by
# floating-point accumulation of a fractional step; the resulting values are
# the same 0.05, 0.1, ..., 0.95 as before
pur_lvls = (np.arange(5, 100, 5) / 100).round(3).tolist()

# 8 major cell types (no Normal Epithelial)
c_types = [
    "B-cells",
    "CAFs",
    "Cancer Epithelial",
    "Endothelial",
    "Myeloid",
    "PVL",
    "Plasmablasts",
    "T-cells",
]

## 0. Process simulated test mixtures
- Grab the .h5ad file containing all the test mixtures we previously generated
- Also save it into partitions corresponding to purity levels (which will be used for all tools)

In [None]:
# Read the simulated test mixtures and pull out the counts as a DataFrame
test_adata = sc.read_h5ad(Path(prefix) / "data/test/test_sim_mixts.h5ad")
test_counts_df = test_adata.to_df()

# Labels: a copy of obs without the "batch" column and with NaN replaced by 0
test_labels_df = test_adata.obs.copy()
test_labels_df = test_labels_df.drop(columns=["batch"])
test_labels_df = test_labels_df.fillna(0)

In [None]:
# Write one tab-separated counts file per tumour purity level.
# The counts are transposed before saving.
for pur_lvl in tqdm(pur_lvls):
    # Match the purity level with a tolerance instead of ==: the stored fractions
    # are floats, and an exact comparison silently produces an empty partition
    # if a value differs from pur_lvl by a rounding error
    is_pur_lvl = np.isclose(
        test_labels_df["Cancer Epithelial"].to_numpy(dtype=float), pur_lvl
    )
    subset_obs_df = test_labels_df[is_pur_lvl]
    subset_test_counts_df = test_counts_df.loc[subset_obs_df.index, :]

    subset_test_counts_df.T.to_csv(
        Path(prefix).joinpath(f"data/test/test_counts_{pur_lvl}_pur_lvl.txt"), sep="\t"
    )

## 1. CIBERSORTx

#### Prepare scRNA-Seq training data

In [None]:
# Load the prepared single-cell reference AnnData and view it as a DataFrame
train_sc_adata = sc.read_h5ad(Path(prefix) / "data/train/scRNA_ref.h5ad")
train_sc_df = train_sc_adata.to_df()

# Name the row index
train_sc_df.index.name = "gene_symbol"

CBX requires a single-cell reference matrix with cell labels as columns and gene symbols as rows <br>
train_sc_df already has cell ids as columns, so we just need to replace them with cell labels

In [None]:
# Sanity check: the columns of train_sc_df and the index of the "cell_labels"
# entry in train_sc_adata.var must list the same cell ids in the same order
assert np.array_equal(
    train_sc_df.columns.to_numpy(),
    train_sc_adata.var["cell_labels"].index.to_numpy(),
)

# With the order confirmed, swap the cell ids for their (string) cell labels
train_sc_df.columns = train_sc_adata.var["cell_labels"].astype(str).to_numpy()

In [None]:
# Create the CIBERSORTx data directory (and any missing parents) if needed
(Path(prefix) / "data/cbx/").mkdir(parents=True, exist_ok=True)

# Write the relabelled reference as a tab-separated file, 5000 rows per chunk
train_sc_df.to_csv(
    Path(prefix) / "data/cbx/scRNA_ref.txt",
    sep="\t",
    chunksize=5000,
)

## 2. Scaden

#### Prepare AnnData training data

In [None]:
# Load the simulated training mixtures prepared in previous steps
train_adata = sc.read_h5ad(Path(prefix) / "data/train/training_sim_mixts.h5ad")

# Labels live in obs; counts are exposed as a DataFrame
train_labels_df = train_adata.obs
train_counts_df = train_adata.to_df()

In [None]:
# Work on a copy of the training AnnData; drop the "batch" column from obs
# and replace NaN by 0
scaden_train_adata = train_adata.copy()
scaden_train_adata.obs = scaden_train_adata.obs.drop(columns=["batch"]).fillna(0)

# Scaden requires the cell fractions DataFrame to have a column called "ds",
# which records the dataset each row comes from. During training one can then
# select which datasets get used, which is quite handy.
# Here there is only one dataset, so every row gets ds="Wu_et_al_GSE176078"
scaden_train_adata.obs["ds"] = "Wu_et_al_GSE176078"

In [None]:
# Store the cell type names and the (empty) "unknown" entry in uns
scaden_train_adata.uns["cell_types"] = [
    "T-cells", "B-cells", "Myeloid", "CAFs",
    "Plasmablasts", "Cancer Epithelial", "Endothelial", "PVL",
]
scaden_train_adata.uns["unknown"] = ""

In [None]:
# Name the two axes of the AnnData object:
# the obs index holds mixture ids, the var index holds gene symbols
scaden_train_adata.obs.index.name = "mixture_id"
scaden_train_adata.var.index.name = "gene_symbol"

In [None]:
# Create the Scaden data directory (and any missing parents) if needed
(Path(prefix) / "data/scaden/").mkdir(parents=True, exist_ok=True)

# Write the prepared training AnnData
scaden_train_adata.write_h5ad(Path(prefix) / "data/scaden/train_counts.h5ad")

### 3. CPM

For CPM, we need to prepare 3 files (in addition to bulk counts):
- single-cell reference:    rows as genes, columns as cells
- cell labels:              one single column with cell labels
- UMAP/tSNE:                first column is cell labels, next 2 columns are UMAP/tSNE coordinates

#### Single cell reference data

In [None]:
# Start from the single-cell reference matrix generated for the with-Normal
# experiment; Normal Epithelial cells are removed further down
with_normal_prefix = "???/01_purity_levels_experiment/include_normal_epithelial"
experiment = "expr_2_original_cellstate_1330_per_ctype"

sc_with_normal_df = pd.read_csv(
    Path(with_normal_prefix) / f"data/cpm/{experiment}/scRNA_ref_1330_per_ctype.txt",
    sep="\t",
    index_col=0,
)

#### UMAP coordinates

In [None]:
# Load the UMAP manifold coordinates created by Seurat
umap_df = pd.read_csv(
    Path(prefix) / "data/Whole_miniatlas_umap.coords.tsv",
    sep="\t",
    index_col=0,
)

# Remove the row labelled "TYPE" (it holds the datatype, not coordinates),
# then cast the remaining values to float
umap_df = umap_df.drop(index=["TYPE"]).astype(float)

#### Single cell labels

In [None]:
# Load the cell metadata table
meta_df = pd.read_csv(
    Path(prefix) / "data/Whole_miniatlas_meta_no_normal.csv",
    sep="\t",
    index_col=0,
)

# Keep only the cells that belong to training patients
train_meta_df = meta_df.loc[meta_df["Patient"].isin(train_p_ids)]

#### Filter out Normal Epithelial in meta_df to grab cell ids

In [None]:
# Recover the ids of the sampled cells from the reference column names:
# split each name on "_" and join its second and third pieces
sampled_cell_ids = [
    f"{parts[1]}_{parts[2]}"
    for parts in (col.split("_") for col in sc_with_normal_df.columns)
]

In [None]:
# Restrict the training metadata to the sampled cells and drop every cell
# labelled Normal Epithelial
train_meta_without_normal_df = train_meta_df.loc[
    train_meta_df.index.isin(sampled_cell_ids)
    & (train_meta_df["cell_labels"] != "Normal Epithelial")
]

# Cell labels, sorted by cell id
sc_without_normal_labels_df = train_meta_without_normal_df[
    ["cell_labels"]
].sort_index()

# UMAP coordinates of the retained cells, sorted by cell id as well
umap_without_normal_df = umap_df.loc[
    umap_df.index.isin(train_meta_without_normal_df.index)
].sort_index()

# Rebuild the reference column names as "<cell label>_<cell id>" in the sorted
# order and select them, so the reference columns follow the labels DataFrame
ordered_ref_columns = (
    sc_without_normal_labels_df["cell_labels"].values
    + "_"
    + sc_without_normal_labels_df.index.values
)
sc_withouth_normal_df = sc_with_normal_df[ordered_ref_columns]

In [None]:
# Output location when using the original cell-state from Wu et al. with only
# 1,330 cells per type
experiment = "expr_1_original_cellstate_1330_per_ctype"
(Path(prefix) / f"data/cpm/{experiment}").mkdir(parents=True, exist_ok=True)

# Cell labels (tab-separated)
sc_without_normal_labels_df.to_csv(
    Path(prefix) / f"data/cpm/{experiment}/single_cell_label.csv",
    sep="\t",
)

# UMAP coordinates (comma-separated)
umap_without_normal_df.to_csv(
    Path(prefix) / f"data/cpm/{experiment}/cell_state.csv",
    sep=",",
)

# Single-cell reference without Normal Epithelial (tab-separated, 1000 rows per chunk)
sc_withouth_normal_df.to_csv(
    Path(prefix) / f"data/cpm/{experiment}/scRNA_ref_1330_per_ctype_without_normal.txt",
    sep="\t",
    chunksize=1000,
)

### 4. bisque
bisque expects a .h5ad file holding non-logged single-cell gene counts in the bisque/ folder <br>
This file would have been previously generated for CPM

In [None]:
# bisque also needs a DataFrame with cell ids, cell labels and patient ids.
# All of this can be extracted from the original metadata csv.

# Load the metadata created by Seurat
meta_df = pd.read_csv(
    Path(prefix) / "data/Whole_miniatlas_meta_no_normal.csv",
    sep="\t",
    index_col=0,
)

# Keep training patients only, and drop Normal Epithelial cells
train_meta_df = meta_df.loc[
    meta_df["Patient"].isin(train_p_ids)
    & (meta_df["cell_labels"] != "Normal Epithelial")
]

In [None]:
# Create the bisque data directory (and any missing parents) if needed
(Path(prefix) / "data/bisque/").mkdir(parents=True, exist_ok=True)

In [None]:
# Build the phenotype DataFrame: the metadata index becomes a regular column,
# followed by the patient id and the cell label of each cell
pheno_df = train_meta_df[["Patient", "cell_labels"]].reset_index()
pheno_df.columns = ["cell_ids", "patient_ids", "cell_labels"]

# Save as a tab-separated file
pheno_df.to_csv(Path(prefix) / "data/bisque/phenotypes.csv", sep="\t")

In [None]:
# Load the single-cell reference counts
sc_adata = sc.read_h5ad(Path(prefix) / "data/train/scRNA_ref.h5ad")
sc_df = sc_adata.to_df()

# Min-max scale to [0, 1]. The scaler works column-wise on its input, and the
# input is the transpose of sc_df, so each row of sc_df is rescaled on its own;
# the result is transposed back to the original orientation
mms = pp.MinMaxScaler(copy=True, feature_range=(0, 1))
scaled_sc_arr = mms.fit_transform(sc_df.T).T
scaled_sc_df = pd.DataFrame(
    scaled_sc_arr,
    columns=sc_df.columns,
    index=sc_df.index,
)

# Save the scaled (linear, not logged) counts
scaled_sc_df.to_csv(
    Path(prefix) / "data/bisque/scaled_scRNA_ref.csv",
    sep="\t",
    chunksize=5000,
)

### 5. DWLS
DWLS only expects single cell labels accompanying the single-cell data

In [None]:
# Single-cell counts
sc_adata = sc.read_h5ad(Path(prefix) / "data/train/scRNA_ref.h5ad")
sc_df = sc_adata.to_df()

# Metadata, restricted to the training patients
meta_df = pd.read_csv(
    Path(prefix) / "data/Whole_miniatlas_meta_no_normal.csv",
    sep="\t",
    index_col=0,
)

train_meta_df = meta_df.loc[meta_df["Patient"].isin(train_p_ids)]

In [None]:
# Create the DWLS data directory (and any missing parents) if needed
(Path(prefix) / "data/dwls/").mkdir(parents=True, exist_ok=True)

In [None]:
# Extract cell labels into a DataFrame, sorted by cell id
labels_df = train_meta_df[["cell_labels"]].sort_index()

# Apparently R/3.5.0 doesn't understand how to parse the character "-"
# meaning "T-cells" will be read as a vector of "T" and "cells"
# Also R/3.5.0 can't parse " "
# Replace all cell types with these characters by "_"
#
# The result is assigned back to the column. Calling replace(..., inplace=True)
# on labels_df["cell_labels"] is a chained in-place operation: under pandas
# Copy-on-Write it modifies a temporary object and leaves labels_df unchanged,
# so the un-renamed labels would be written to disk
labels_df["cell_labels"] = labels_df["cell_labels"].replace(
    {
        "T-cells": "T_cells",
        "B-cells": "B_cells",
        "Cancer Epithelial": "Cancer_Epithelial",
    }
)

labels_df.to_csv(Path(prefix).joinpath("data/dwls/single_cell_labels.csv"), sep="\t")

In [None]:
# Select the reference columns in the order of the labels DataFrame's index,
# so both files list the cells in the same order
sc_df = sc_df[labels_df.index]

sc_df.to_csv(
    Path(prefix) / "data/dwls/scRNA_ref.csv",
    sep="\t",
    chunksize=5000,
)

### 6. EPIC

In [None]:
# Create the EPIC signature-matrix directory (and any missing parents) if needed
(Path(prefix) / "data/epic/cbx_sig_matrix/").mkdir(parents=True, exist_ok=True)

In [None]:
# EPIC runs on the signature matrix and marker genes generated by CIBERSORTx,
# so that matrix has to be copied over before running EPIC.
# CIBERSORTx builds its signature matrix from the single-cell reference
# => the matrices are identical across tumour purity levels; the one from the
#    first purity level is used here
cbx_sig_mat_f = "CIBERSORTx_scRNA_ref_inferred_phenoclasses.CIBERSORTx_scRNA_ref_inferred_refsample.bm.K999.txt"
shutil.copy(
    Path(prefix) / f"data/cbx/results/{pur_lvls[0]}/{cbx_sig_mat_f}",
    Path(prefix) / "data/epic/cbx_sig_matrix/cbx_sig_matrix.txt",
)

In [None]:
# Load the copied CIBERSORTx signature matrix
cbx_sig_matrix_df = pd.read_csv(
    Path(prefix) / "data/epic/cbx_sig_matrix/cbx_sig_matrix.txt",
    sep="\t",
    index_col=0,
)

# EPIC treats the "unknown" cells in a tumour as the cancer cells,
# so the Cancer Epithelial column is removed from the signature matrix
cbx_sig_matrix_df = cbx_sig_matrix_df.drop(columns=["Cancer Epithelial"])

# Save the reduced matrix as EPIC's reference profiles
cbx_sig_matrix_df.to_csv(
    Path(prefix) / "data/epic/cbx_sig_matrix/reference_profiles.csv",
    sep="\t",
)

# The marker genes are the row labels of the signature matrix; turn them into
# a one-column DataFrame (a column named "NAME" is renamed to "gene_symbol") and save
marker_gene_labels_df = cbx_sig_matrix_df.index.to_frame().rename(
    columns={"NAME": "gene_symbol"}
)

marker_gene_labels_df.to_csv(
    Path(prefix) / "data/epic/cbx_sig_matrix/marker_gene_symbols.csv",
    sep="\t",
)

### 7. hspe
hspe performs tumour deconvolution by first building a list of marker genes for each cell type. Both methods (dtangle and hspe) assume that each cell type has a unique list of marker genes. For each cell type, hspe uses the log2-transformed expression of that cell type's marker genes to deconvolve the cell type's proportion within the mixture using a linear mixing equation

In [None]:
# Load the prepared single-cell reference AnnData and view it as a DataFrame
train_sc_adata = sc.read_h5ad(Path(prefix) / "data/train/scRNA_ref.h5ad")
train_sc_df = train_sc_adata.to_df()

# Name the row index
train_sc_df.index.name = "gene_symbol"

# Apply log2(x + 1).
# Both dtangle and hspe only mention log2, without the + 1. Plain log2 is
# undefined for zero counts (log2(0) = -infinity), so 1 is added first.
# A gene expression value of 0 still maps to 0 after this transformation.
#
# Both dtangle and hspe also require bulk mixtures and the single-cell
# reference to have genes as columns and samples as rows, hence the transpose
log_train_sc_df = np.log2(train_sc_df + 1).T

In [None]:
# Load the simulated test mixtures: counts as a DataFrame, labels from obs
test_adata = sc.read_h5ad(Path(prefix).joinpath("data/test/test_sim_mixts.h5ad"))
test_counts_df = test_adata.to_df()

# Take a copy of obs so the in-place edits below do not modify test_adata.obs
# itself (the earlier test-mixture cell copies obs for the same reason)
test_labels_df = test_adata.obs.copy()

# Drop the "batch" column and fill NaN by 0
test_labels_df.drop(["batch"], axis=1, inplace=True)
test_labels_df.fillna(0, inplace=True)

# Apply log2(x + 1) on the test counts
log_test_counts_df = np.log2(test_counts_df + 1)

##### Save train & test counts

In [None]:
# Sanity check before saving: the logged train and test DataFrames must have
# the same genes (columns) in the same order
assert np.array_equal(
    log_test_counts_df.columns.to_numpy(),
    log_train_sc_df.columns.to_numpy(),
)

In [None]:
# Create the hspe data directory (and any missing parents) if needed
(Path(prefix) / "data/hspe/").mkdir(parents=True, exist_ok=True)

In [None]:
# Save test data by purity level.
# Within each tumour purity level the mixtures are split into 20 shards,
# which lets the run be parallelised 380-fold (19 purity levels x 20 shards)
n_shards = 20

for pur_lvl in tqdm(pur_lvls):
    # Match the purity level with a tolerance instead of ==, so a rounding
    # difference in the stored fractions cannot silently empty a partition
    is_pur_lvl = np.isclose(
        test_labels_df["Cancer Epithelial"].to_numpy(dtype=float), pur_lvl
    )
    subset_obs_df = test_labels_df[is_pur_lvl]
    subset_test_counts_df = log_test_counts_df.loc[subset_obs_df.index, :]

    # Split the mixture ids once per purity level (not once per shard);
    # array_split keeps the row order and allows uneven shard sizes
    shard_id_groups = np.array_split(subset_obs_df.index.to_numpy(), n_shards)

    for shard, shard_ids in enumerate(tqdm(shard_id_groups)):
        shard_test_counts_df = subset_test_counts_df.loc[shard_ids, :]

        shard_test_counts_df.to_csv(
            Path(prefix).joinpath(
                f"data/hspe/logged_test_counts_{pur_lvl}_pur_lvl_{shard}.txt"
            ),
            sep="\t",
        )

##### Extract pure samples
Both dtangle and hspe require a pure_samples variable. This is a list variable in which each item corresponds to one cell type and holds the indexes of all cells of that type in the single-cell reference DataFrame <br>

We need to retrieve cell type of the single-cell reference data and save this information into a .json file

In [None]:
# Load the metadata created by Seurat
meta_df = pd.read_csv(
    Path(prefix) / "data/Whole_miniatlas_meta_no_normal.csv",
    sep="\t",
    index_col=0,
)

# Keep training patients only, and drop Normal Epithelial cells
train_meta_df = meta_df.loc[
    meta_df["Patient"].isin(train_p_ids)
    & (meta_df["cell_labels"] != "Normal Epithelial")
]

In [None]:
# Row labels of log_train_sc_df are the cell ids; their positions are what R needs.
# Positions are taken straight from the index. This avoids copying the whole
# matrix through reset_index(), and it does not depend on the index being
# unnamed (reset_index() only produces a column called "index" in that case)
ref_cell_ids = log_train_sc_df.index

# Iterate over cell types and extract cell positions from the single-cell reference
pure_samples_d = {}

for c_type in tqdm(train_meta_df["cell_labels"].unique()):
    c_ids = (train_meta_df[train_meta_df["cell_labels"] == c_type]).index.tolist()
    c_positions = np.flatnonzero(ref_cell_ids.isin(c_ids))

    # Python starts indexes from 0 and R starts from 1
    # Add 1 to each position and add to pure_samples_d
    pure_samples_d[c_type] = (c_positions + 1).tolist()

# Remap keys containing spaces and hyphens
pure_samples_d["T_cells"] = pure_samples_d.pop("T-cells")
pure_samples_d["B_cells"] = pure_samples_d.pop("B-cells")
pure_samples_d["Cancer_Epithelial"] = pure_samples_d.pop("Cancer Epithelial")

In [None]:
# Save pure_samples_d into a json file
# The file is opened in a with-block so the handle is closed (and the content
# flushed to disk) as soon as the dump finishes
with open(Path(prefix).joinpath("data/hspe/pure_samples.json"), "w") as json_f:
    json.dump(pure_samples_d, json_f, indent=4)

In [None]:
# Save the logged single-cell reference (tab-separated, 5000 rows per chunk)
log_train_sc_df.to_csv(
    Path(prefix) / "data/hspe/scRNA_ref.csv",
    sep="\t",
    chunksize=5000,
)

### 8. MuSiC
MuSiC requires single-cell and bulk expressions in ExpressionSet objects <br>
The single-cell ExpressionSet also needs a phenoType item containing
- **sampleID**        index of patient
- **SubjectName**      patient id
- **cellTypeID**       index of cell type
- **cellType**         cell annotation labels


In [None]:
# Load the prepared single-cell reference AnnData and view it as a DataFrame
train_sc_adata = sc.read_h5ad(Path(prefix) / "data/train/scRNA_ref.h5ad")
train_sc_df = train_sc_adata.to_df()

# Name the row index
train_sc_df.index.name = "gene_symbol"

In [None]:
# Load the metadata created by Seurat
meta_df = pd.read_csv(
    Path(prefix) / "data/Whole_miniatlas_meta_no_normal.csv",
    sep="\t",
    index_col=0,
)

# Keep training patients only, and drop Normal Epithelial cells
train_meta_df = meta_df.loc[
    meta_df["Patient"].isin(train_p_ids)
    & (meta_df["cell_labels"] != "Normal Epithelial")
]

In [None]:
# Take the "Patient" and "cell_labels" columns and rename them to the column
# names MuSiC requires
music_column_names = {"Patient": "SubjectName", "cell_labels": "cellType"}
pheno_df = train_meta_df[["Patient", "cell_labels"]].rename(
    columns=music_column_names
)

# Clear the name of the index
pheno_df.index.name = None

In [None]:
# cellTypeID: numeric code of each cell label, with the encoder fitted on
# c_types; 1 is added so the codes start from 1
l_encoder = LabelEncoder().fit(c_types)
pheno_df["cellTypeID"] = l_encoder.transform(pheno_df["cellType"]) + 1

# sampleID: numeric code of each patient id, with a fresh encoder fitted on the
# patient ids present in pheno_df; again shifted to start from 1
l_encoder = LabelEncoder().fit(pheno_df["SubjectName"].unique())
pheno_df["sampleID"] = l_encoder.transform(pheno_df["SubjectName"]) + 1

In [None]:
# Create the MuSiC data directory (and any missing parents) if needed
(Path(prefix) / "data/music/").mkdir(parents=True, exist_ok=True)

In [None]:
# Write the phenotype table (tab-separated)
pheno_df.to_csv(Path(prefix) / "data/music/pheno.csv", sep="\t")

# Select the reference columns in the order of pheno_df's index, then write
# the counts (tab-separated, 5000 rows per chunk)
train_sc_df = train_sc_df[pheno_df.index]
train_sc_df.to_csv(
    Path(prefix) / "data/music/scRNA_ref.csv",
    sep="\t",
    chunksize=5000,
)

### 9. BayesPrism

In [None]:
# Load the prepared single-cell reference AnnData and view it as a DataFrame
train_sc_adata = sc.read_h5ad(Path(prefix) / "data/train/scRNA_ref.h5ad")
train_sc_df = train_sc_adata.to_df()

# Name the row index
train_sc_df.index.name = "gene_symbol"

In [None]:
# Load the metadata created by Seurat and keep the training patients' cells
meta_df = pd.read_csv(
    Path(prefix) / "data/Whole_miniatlas_meta_no_normal.csv",
    sep="\t",
    index_col=0,
)
train_meta_df = meta_df.loc[meta_df["Patient"].isin(train_p_ids)]

# Re-order the metadata rows to follow the columns of the counts DataFrame
train_meta_df = train_meta_df.reindex(index=train_sc_df.columns)

In [None]:
# Extract cell labels into a DataFrame of their own
# (an explicit copy, so edits here never touch or depend on train_meta_df)
labels_df = train_meta_df[["cell_labels"]].copy()

# Apparently R/3.5.0 doesn't understand how to parse the character "-"
# meaning "T-cells" will be read as a vector of "T" and "cells"
# Also R/3.5.0 can't parse " "
# Replace all cell types with these characters by "_"
#
# The result is assigned back to the column. Calling replace(..., inplace=True)
# on labels_df["cell_labels"] is a chained in-place operation: under pandas
# Copy-on-Write it modifies a temporary object and leaves labels_df unchanged
labels_df["cell_labels"] = labels_df["cell_labels"].replace(
    {
        "T-cells": "T_cells",
        "B-cells": "B_cells",
        "Cancer Epithelial": "Cancer_Epithelial",
    }
)

In [None]:
# Create the BayesPrism data directory (and any missing parents) if needed
(Path(prefix) / "data/bprism/").mkdir(parents=True, exist_ok=True)

In [None]:
# Write the cell labels (tab-separated)
labels_df.to_csv(Path(prefix) / "data/bprism/single_cell_labels.csv", sep="\t")

# Write the transposed single-cell counts (tab-separated, 5000 rows per chunk)
train_sc_df.T.to_csv(
    Path(prefix) / "data/bprism/scRNA_ref.csv",
    sep="\t",
    chunksize=5000,
)