# Prepare training and test data for specific tools

In [None]:
import json

import pandas as pd
import numpy as np
import scanpy as sc
import anndata as adata

from tqdm import tqdm
from pathlib import Path
from collections import ChainMap
from sklearn import preprocessing as pp
from sklearn.preprocessing import LabelEncoder

%load_ext blackcellmagic

In [None]:
# Root directory of this benchmark run; every input and output path in the
# notebook is built by joining onto this prefix.
# NOTE(review): the leading "???" looks like a redacted placeholder — point it
# at the real base directory before running.
prefix = "???/deconvolution_benchmarking/06_batch_effect_validation/bassez_et_al"

## Metadata

In [None]:
# Tumour purity levels 0.05, 0.10, ..., 0.95, rounded to 3 decimals to strip
# floating-point noise left by np.arange
pur_lvls = np.round(np.arange(0.05, 1, 0.05), decimals=3).tolist()

### Pal et al

In [None]:
# Pal et al patients held out for testing, grouped by clinical subtype
pal_et_al_test_p_ids = [
    # HER2+
    "HER2_0176",
    "HER2_0031",
    # ER+
    "ER_0319",
    "ER_0043_T",
    "ER_0025",
    "ER_0001",
    # TNBC
    "TN_B1_0554",
    "TN_0114_T2",
]

# Pal et al patients used for training, grouped by clinical subtype
pal_et_al_train_p_ids = [
    # HER2+
    "HER2_0308",
    "HER2_0337",
    "HER2_0161",
    # ER+
    "ER_0032",
    "ER_0114_T3",
    "ER_0167_T",
    "ER_0151",
    "ER_0360",
    "ER_0042",
    "ER_0040_T",
    "ER_0163",
    "ER_0125",
    # TNBC
    "TN_B1_0177",
    "TN_0135",
    "TN_B1_0131",
    "TN_0126",
]

# Major cell types in the Pal et al annotation
# NOTE(review): "Cancer_epithelial" / "Normal_epithelial" are spelled with a
# lower-case "e" here, while the rest of the notebook uses "Cancer_Epithelial"
# — confirm which spelling is intended if this list is ever used for lookups.
pal_et_al_c_types = [
    "Cancer_epithelial",
    "Normal_epithelial",
    "T_cells",
    "B_cells",
    "Myeloid",
    "TAMs",
    "DCs",
    "Endothelial",
    "CAFs",
    "Pericytes",
    "Plasma_cells",
]

### Wu et al

In [None]:
# Wu et al patients used for training
wu_et_al_train_p_ids = [
    "CID3586",
    "CID3941",
    "CID3963",
    "CID44041",
    "CID4530N",
    "CID3838",
    "CID3946",
    "CID4040",
    "CID4461",
    "CID44991",
    "CID45171",
    "CID4535",
    "CID3948",
    "CID4398",
    "CID4463",
    "CID4495",
    "CID4513",
    "CID4465",
]
# Wu et al patients held out for testing
wu_et_al_test_p_ids = [
    "CID4067",
    "CID4290A",
    "CID4471",
    "CID3921",
    "CID4066",
    "CID4523",
    "CID44971",
    "CID4515",
]
# Major cell types in the Wu et al annotation, in their original spelling
# (hyphens and spaces are replaced by underscores later in the notebook)
wu_et_al_c_types = [
    "B-cells",
    "CAFs",
    "Cancer Epithelial",
    "Endothelial",
    "Myeloid",
    "Normal Epithelial",
    "PVL",
    "Plasmablasts",
    "T-cells",
]

## 1. Process train and test data

In [None]:
# Read the simulated test mixtures into an AnnData object
test_mixts_path = Path(prefix) / "data/test/test_sim_mixts.h5ad"
test_adata = sc.read_h5ad(test_mixts_path)

#### Unify Wu et al and Pal et al data

In [None]:
# Load the Wu et al single-cell reference and simulated training mixtures.
# The reference is transposed right after loading
# (presumably it is stored as genes x cells — TODO confirm).
wu_et_al_sc_adata = sc.read_h5ad(
    Path(prefix).joinpath("data/wu_et_al/scRNA_ref.h5ad")
).T
wu_et_al_train_mixts_adata = sc.read_h5ad(
    Path(prefix).joinpath("data/wu_et_al/training_sim_mixts.h5ad")
)

# Gene symbols present in both the simulated test mixtures and the Wu et al
# single-cell reference (np.intersect1d returns them sorted and unique)
wu_et_al_pal_et_al_intersect_genes = np.intersect1d(
    test_adata.var["gene_symbol"].values, wu_et_al_sc_adata.var["gene_symbol"].values
)

# Restrict all three objects to the shared genes.
# Subsetting an AnnData returns a view of the parent. The following cells
# modify .obs / .uns of these objects, so take explicit copies here instead of
# relying on anndata converting the views implicitly (with a warning).
filtered_test_adata = test_adata[:, wu_et_al_pal_et_al_intersect_genes].copy()
filtered_wu_et_al_sc_adata = wu_et_al_sc_adata[
    :, wu_et_al_pal_et_al_intersect_genes
].copy()
filtered_wu_et_al_train_mixts_adata = wu_et_al_train_mixts_adata[
    :, wu_et_al_pal_et_al_intersect_genes
].copy()

In [None]:
# Replace hyphens and spaces in the Wu et al cell-type names with underscores.
# The same mapping is applied to the single-cell labels and to the
# per-cell-type columns of the training mixtures.
cell_type_renames = {
    "B-cells": "B_cells",
    "Cancer Epithelial": "Cancer_Epithelial",
    "Normal Epithelial": "Normal_Epithelial",
    "T-cells": "T_cells",
}

# Single-cell reference: decode the labels to str first, then rename them
decoded_cell_labels = filtered_wu_et_al_sc_adata.obs["cell_labels"].str.decode(
    encoding="utf-8"
)
filtered_wu_et_al_sc_adata.obs["cell_labels"] = decoded_cell_labels.replace(
    cell_type_renames
)

# Training mixtures: rename the matching obs columns in place
filtered_wu_et_al_train_mixts_adata.obs.rename(
    columns=cell_type_renames, inplace=True
)

In [None]:
# Save the gene-filtered test mixtures as a single AnnData file
# (.uns is emptied before writing)
filtered_test_adata.uns = {}
filtered_test_adata.write_h5ad(
    Path(prefix).joinpath("data/test/intersected_test_sim_mixts").with_suffix(".h5ad")
)

# Convert gene counts and metadata to DataFrames
filtered_test_counts_df = filtered_test_adata.to_df()
filtered_test_labels_df = filtered_test_adata.obs.copy()

# Drop the "batch" column and fill NaN by 0
filtered_test_labels_df.drop(["batch"], axis=1, inplace=True)
filtered_test_labels_df.fillna(0, inplace=True)

# Save test data into one partition per purity level (len(pur_lvls) files),
# written with genes as rows and mixtures as columns
for pur_lvl in tqdm(pur_lvls):
    # Match the purity level with a tolerance: an exact == on floats silently
    # selects nothing if the stored proportion differs by rounding noise or
    # was saved in a narrower float type
    at_pur_lvl = np.isclose(
        filtered_test_labels_df["Cancer_Epithelial"].to_numpy(dtype=float), pur_lvl
    )
    subset_obs_df = filtered_test_labels_df[at_pur_lvl]
    subset_test_counts_df = filtered_test_counts_df.loc[subset_obs_df.index, :]

    subset_test_counts_df.T.to_csv(
        Path(prefix).joinpath(f"data/test/test_counts_{pur_lvl}_pur_lvl.tsv"), sep="\t"
    )

# Save filtered single-cell reference
filtered_wu_et_al_sc_adata.write_h5ad(
    Path(prefix).joinpath("data/train/scRNA_ref").with_suffix(".h5ad")
)

# Save filtered train simulated mixtures
filtered_wu_et_al_train_mixts_adata.write_h5ad(
    Path(prefix).joinpath("data/train/train_sim_mixts").with_suffix(".h5ad")
)

## 2. Prepare data for each method

#### Load single-cell metadata

In [None]:
# Train metadata is from Wu et al
# (the file is named .csv but is read as tab-separated)
wu_et_al_meta_df = pd.read_csv(
    Path(prefix).joinpath("data/wu_et_al/Whole_miniatlas_meta_9_10.csv"),
    index_col=0,
    sep="\t",
)
# Keep only cells from the training patients. .copy() makes this an
# independent frame, so the column assignment below does not write into a
# slice of wu_et_al_meta_df (which triggers pandas' SettingWithCopyWarning).
train_meta_df = wu_et_al_meta_df[
    wu_et_al_meta_df["Patient"].isin(wu_et_al_train_p_ids)
].copy()

# Replace hyphens and spaces in cell-type labels with underscores
train_meta_df["cell_labels"] = train_meta_df["cell_labels"].replace(
    {
        "T-cells": "T_cells",
        "B-cells": "B_cells",
        "Normal Epithelial": "Normal_Epithelial",
        "Cancer Epithelial": "Cancer_Epithelial",
    }
)

# Test metadata is from Pal et al, restricted to the test patients
pal_et_al_meta_df = pd.read_csv(
    Path(prefix).joinpath("data/Miniatlas_major_immune_lineage.tsv"),
    index_col=0,
    sep="\t",
)
test_meta_df = pal_et_al_meta_df[
    pal_et_al_meta_df["Patient"].isin(pal_et_al_test_p_ids)
]

#### Load single-cell reference from Wu et al

In [None]:
# Read the filtered single-cell reference (rows are cells, columns are genes)
train_sc_path = Path(prefix) / "data/train/scRNA_ref.h5ad"
train_sc_adata = sc.read_h5ad(train_sc_path)

# Most methods expect the reference with genes as rows and cells as columns,
# so convert to a DataFrame and transpose it
cells_by_genes_df = train_sc_adata.to_df()
train_sc_df = cells_by_genes_df.T

# Label the two axes
train_sc_df.index.name = "gene_symbol"
train_sc_df.columns.name = "cell_id"

#### 1. CIBERSORTx

In [None]:
# Work on an independent copy of the genes x cells reference
cbx_sc_df = train_sc_df.copy(deep=True)

# Make sure the gene axis is labelled "gene_symbol"
cbx_sc_df.index.name = "gene_symbol"

In [None]:
# CBX expects a reference matrix whose column headers are cell-type labels and
# whose rows are gene symbols.
# First put the cell columns in the same order as the rows of train_meta_df
cbx_cell_order = train_meta_df.index
cbx_sc_df = cbx_sc_df.loc[:, cbx_cell_order]

# Sanity check: columns and metadata rows now line up one-to-one
assert np.array_equal(cbx_sc_df.columns.values, train_meta_df.index.values)

# With the order guaranteed, swap the cell ids for their cell-type labels
cbx_sc_df.columns = train_meta_df["cell_labels"]

In [None]:
# Write the CIBERSORTx reference as a tab-separated file, 5000 rows at a time
cbx_out_path = Path(prefix) / "data/cbx/scRNA_ref.tsv"
cbx_sc_df.to_csv(cbx_out_path, sep="\t", chunksize=5000)

#### 2. Scaden

#### Prepare AnnData training data

In [None]:
# Load the AnnData object that holds all simulated training mixtures
scaden_train_path = Path(prefix) / "data/train/train_sim_mixts.h5ad"
scaden_train_adata = sc.read_h5ad(scaden_train_path)

# Clean the cell-fraction table: remove the "batch" column and replace NaN by 0.
# Scaden also requires a column called "ds" that records which dataset each row
# comes from, so that datasets can be selected at training time. There is only
# one dataset here, so every row gets ds="Pal_et_al".
# NOTE(review): these mixtures are the Wu et al training mixtures saved earlier
# in the notebook, while the tag reads "Pal_et_al" — confirm the tag is intended.
scaden_train_adata.obs = (
    scaden_train_adata.obs.drop(columns=["batch"]).fillna(0).assign(ds="Pal_et_al")
)

# Record the cell types and an empty "unknown" entry in .uns
scaden_cell_types = train_meta_df["cell_labels"].unique().tolist()
scaden_train_adata.uns["cell_types"] = scaden_cell_types
scaden_train_adata.uns["unknown"] = ""

# Name the mixture axis
scaden_train_adata.obs.index.name = "mixture_id"

In [None]:
# Write the Scaden training AnnData to disk
scaden_out_path = Path(prefix) / "data/scaden/train_counts.h5ad"
scaden_train_adata.write_h5ad(scaden_out_path)

### 4. bisque

In [None]:
# Independent copy of the training metadata for bisque
bisque_meta_df = train_meta_df.copy(deep=True)

In [None]:
# Build the phenotype table: one row per cell with its id, patient and label.
# reset_index() moves the cell ids out of the index into the first column.
pheno_source_cols = ["Patient", "cell_labels"]
pheno_df = bisque_meta_df.loc[:, pheno_source_cols].reset_index()
pheno_df.columns = ["cell_ids", "patient_ids", "cell_labels"]

# Save as a tab-separated file
bisque_pheno_path = Path(prefix) / "data/bisque/phenotypes.tsv"
pheno_df.to_csv(bisque_pheno_path, sep="\t")

In [None]:
# Independent copy of the genes x cells reference for bisque
bisque_sc_df = train_sc_df.copy(deep=True)

In [None]:
# Order the cell columns exactly as the rows of the phenotype DataFrame
bisque_cell_order = pheno_df["cell_ids"].values
bisque_sc_df = bisque_sc_df.loc[:, bisque_cell_order]

# Save linear counts as a tab-separated file, 5000 rows at a time
bisque_out_path = Path(prefix) / "data/bisque/scRNA_ref.tsv"
bisque_sc_df.to_csv(bisque_out_path, sep="\t", chunksize=5000)

In [None]:
# NOTE(review): bisque_scaled_log_sc_df is not assigned anywhere in the visible
# notebook before this line, so this cell raises NameError unless the variable
# is defined elsewhere — confirm how the scaled + logged reference is built.
# Re-arrange single-cell DataFrame to match the same order of cell ids as phenotype DataFrame
bisque_scaled_log_sc_df = bisque_scaled_log_sc_df[pheno_df["cell_ids"].values]

# Save the re-ordered DataFrame as a tab-separated file, 5000 rows at a time
bisque_scaled_log_sc_df.to_csv(
    Path(prefix).joinpath("data/bisque/scaled_logged_scRNA_ref.tsv"),
    sep="\t",
    chunksize=5000,
)

### 5. DWLS
DWLS only expects single cell labels accompanying the single-cell data

In [None]:
# Independent copies of the reference counts and the training metadata for DWLS
dwls_sc_df = train_sc_df.copy(deep=True)
dwls_meta_df = train_meta_df.copy(deep=True)

In [None]:
# One-column table of cell labels, sorted by cell id
labels_df = dwls_meta_df.loc[:, ["cell_labels"]].sort_index()
dwls_labels_path = Path(prefix) / "data/dwls/single_cell_labels.tsv"
labels_df.to_csv(dwls_labels_path, sep="\t")

# Put the cell columns of the counts in the same (sorted) order as the labels
dwls_sc_df = dwls_sc_df.loc[:, labels_df.index]

In [None]:
# Write the DWLS reference as a tab-separated file, 5000 rows at a time
dwls_out_path = Path(prefix) / "data/dwls/scRNA_ref.tsv"
dwls_sc_df.to_csv(dwls_out_path, sep="\t", chunksize=5000)

### 6. EPIC
EPIC relies on the signature matrix and marker genes generated by CIBERSORTx to run <br>
This processing script assumes that these 2 files have already been put in the data/epic folders
- Signature matrix (containing all genes): cbx_sig_matrix.txt
- Marker genes (a subset of signature matrix): cbx_sig_matrix.txt

In [None]:
# All CBX signature matrices are the same across tumour purity levels, so a
# single one is loaded
epic_dir = Path(prefix) / "data/epic/cbx_sig_matrix"
cbx_sig_matrix_df = pd.read_csv(
    epic_dir / "CIBERSORTx_sigmatrix.txt", index_col=0, sep="\t"
)

# EPIC treats the "unknown" cells in a tumour as cancer cells, so the
# Cancer_Epithelial column is removed from the signature matrix
cbx_sig_matrix_df = cbx_sig_matrix_df.drop(columns=["Cancer_Epithelial"])

# Save the remaining reference profiles as a tab-separated file
cbx_sig_matrix_df.to_csv(epic_dir / "reference_profiles.tsv", sep="\t")

# The marker genes are the row labels of the signature matrix. Turn them into
# a one-column frame, rename the "NAME" column to "gene_symbol" and save it as
# a tab-separated file.
marker_gene_labels_df = cbx_sig_matrix_df.index.to_frame().rename(
    columns={"NAME": "gene_symbol"}
)
marker_gene_labels_df.to_csv(epic_dir / "marker_gene_symbols.tsv", sep="\t")

### 7. hspe

In [None]:
# Independent copies of the reference counts and the training metadata for hspe
hspe_sc_df = train_sc_df.copy(deep=True)
hspe_meta_df = train_meta_df.copy(deep=True)

In [None]:
# Apply log2(x + 1) to the reference counts.
# Both dtangle and hspe only mention log2 without the + 1, which is undefined
# for zero counts (log2(0) is -infinity). Adding 1 first avoids this, and a
# count of 0 still maps to 0 after the transformation.
# Both methods also need genes as columns and samples as rows, so the logged
# frame is transposed in the same step.
hspe_log_sc_df = np.log2(hspe_sc_df.add(1)).T

In [None]:
# Load the gene-filtered test mixtures; to_df() gives samples as rows and genes
# as columns
test_adata = sc.read_h5ad(Path(prefix) / "data/test/intersected_test_sim_mixts.h5ad")
test_counts_df = test_adata.to_df()

# test_labels_df is the AnnData's own obs table (not a copy), so the in-place
# clean-up below also changes test_adata.obs
test_labels_df = test_adata.obs

# Remove the "batch" column and replace NaN by 0
test_labels_df.drop(columns=["batch"], inplace=True)
test_labels_df.fillna(value=0, inplace=True)

# Apply log2(x + 1) to the test counts
log_test_counts_df = np.log2(test_counts_df.add(1))

##### Save train & test counts

In [None]:
# Sanity check before saving: the train and test DataFrames must have the same
# genes in the same order
train_genes = hspe_log_sc_df.columns.to_numpy()
test_genes = log_test_counts_df.columns.to_numpy()
assert np.array_equal(train_genes, test_genes)

In [None]:
# Train metadata is from Wu et al
# (the file is named .csv but is read as tab-separated)
wu_et_al_meta_df = pd.read_csv(
    Path(prefix).joinpath("data/wu_et_al/Whole_miniatlas_meta_9_10.csv"),
    index_col=0,
    sep="\t",
)
# Keep only cells from the training patients. .copy() makes this an
# independent frame, so the column assignment below does not write into a
# slice of wu_et_al_meta_df (which triggers pandas' SettingWithCopyWarning).
train_meta_df = wu_et_al_meta_df[
    wu_et_al_meta_df["Patient"].isin(wu_et_al_train_p_ids)
].copy()
# Replace hyphens and spaces in cell-type labels with underscores
train_meta_df["cell_labels"] = train_meta_df["cell_labels"].replace(
    {
        "T-cells": "T_cells",
        "B-cells": "B_cells",
        "Normal Epithelial": "Normal_Epithelial",
        "Cancer Epithelial": "Cancer_Epithelial",
    }
)

# Test metadata is from Pal et al, restricted to the test patients
pal_et_al_meta_df = pd.read_csv(
    Path(prefix).joinpath("data/Miniatlas_major_immune_lineage.tsv"),
    index_col=0,
    sep="\t",
)
test_meta_df = pal_et_al_meta_df[
    pal_et_al_meta_df["Patient"].isin(pal_et_al_test_p_ids)
]

In [None]:
# Read the filtered single-cell reference (rows are cells, columns are genes)
train_sc_path = Path(prefix) / "data/train/scRNA_ref.h5ad"
train_sc_adata = sc.read_h5ad(train_sc_path)

# Most methods expect the reference with genes as rows and cells as columns,
# so convert to a DataFrame and transpose it
cells_by_genes_df = train_sc_adata.to_df()
train_sc_df = cells_by_genes_df.T

# Label the two axes
train_sc_df.index.name = "gene_symbol"
train_sc_df.columns.name = "cell_id"

In [None]:
# Save the single-cell reference for hspe, both logged and linear, as
# tab-separated files. hspe_sc_df is transposed on the way out.
hspe_dir = Path(prefix) / "data/hspe"
hspe_log_sc_df.to_csv(hspe_dir / "scRNA_ref_logged.tsv", sep="\t")
hspe_sc_df.T.to_csv(hspe_dir / "scRNA_ref.tsv", sep="\t")

In [None]:
# Number of shards each purity level is split into.
# With 19 purity levels this allows the run to be parallelised 380-fold.
n_shards = 20

# Save logged test counts by purity level and shard
for pur_lvl in tqdm(pur_lvls):
    # Match the purity level with a tolerance: an exact == on floats silently
    # selects nothing if the stored proportion differs by rounding noise or
    # was saved in a narrower float type
    at_pur_lvl = np.isclose(
        test_labels_df["Cancer_Epithelial"].to_numpy(dtype=float), pur_lvl
    )
    subset_obs_df = test_labels_df[at_pur_lvl]
    subset_test_counts_df = log_test_counts_df.loc[subset_obs_df.index, :]

    # Split the mixture ids into shards once per purity level instead of
    # re-splitting the whole table on every shard iteration.
    # np.array_split tolerates shard sizes that do not divide evenly.
    shard_id_groups = np.array_split(subset_obs_df.index.to_numpy(), n_shards)

    for shard, shard_ids in enumerate(tqdm(shard_id_groups)):
        shard_test_counts_df = subset_test_counts_df.loc[shard_ids, :]

        shard_test_counts_df.to_csv(
            Path(prefix).joinpath(
                f"data/hspe/logged_test_counts_{pur_lvl}_pur_lvl_{shard}.tsv"
            ),
            sep="\t",
        )

##### Extract pure samples
Both dtangle and hspe require a `pure_samples` variable. This is a list in which each item corresponds to one cell type and holds the indexes of all cells of that type in the single-cell reference DataFrame <br>

We need to retrieve cell type of the single-cell reference data and save this information into a .json file

In [None]:
# Independent copy of the training metadata for the pure-sample lookup
hspe_meta_df = train_meta_df.copy(deep=True)

In [None]:
# Reset the index of hspe_log_sc_df: the cell ids move into a "cell_id" column
# and the new index is the 0-based row position of each cell
reset_hspe_log_sc_df = hspe_log_sc_df.reset_index()

# Map each cell type to the row positions of its cells in the reference
pure_samples_d = {}

meta_cell_labels = hspe_meta_df["cell_labels"]
for c_type in tqdm(meta_cell_labels.unique()):
    # Ids of all cells annotated with this cell type
    c_ids = hspe_meta_df.index[(meta_cell_labels == c_type).to_numpy()].tolist()

    # Row positions of those cells in the single-cell reference
    is_c_type = reset_hspe_log_sc_df["cell_id"].isin(c_ids).to_numpy()
    c_indexes = reset_hspe_log_sc_df.index[is_c_type]

    # Python indexes start at 0 and R indexes start at 1, so shift by one
    pure_samples_d[c_type] = (c_indexes + 1).tolist()

In [None]:
# Save pure_samples_d into a json file.
# The with-block closes the file handle (and flushes it) once the dump is
# done; a bare open() passed straight to json.dump is never closed explicitly.
with open(
    Path(prefix).joinpath("data/hspe/pure_samples.json"), "w", encoding="utf-8"
) as pure_samples_file:
    json.dump(pure_samples_d, pure_samples_file, indent=4)

### 8. MuSiC
MuSiC requires single-cell and bulk expressions in ExpressionSet objects <br>
The single-cell ExpressionSet also needs a phenoType item containing
- **sampleID**        index of patient
- **SubjectName**      patient id
- **cellTypeID**       index of cell type
- **cellType**         cell annotation labels

In [None]:
# Independent copies of the reference counts and the training metadata for MuSiC
music_sc_df = train_sc_df.copy(deep=True)
music_meta_df = train_meta_df.copy(deep=True)

# Re-order the metadata rows to follow the cell columns of the counts
music_meta_df = music_meta_df.reindex(index=music_sc_df.columns)

Metadata for running MuSiC with neither marker genes nor cell subtypes

In [None]:
# Take the "Patient" and "cell_labels" columns and rename them to the names
# MuSiC expects
music_column_names = {"Patient": "SubjectName", "cell_labels": "cellType"}
pheno_df = train_meta_df.loc[:, ["Patient", "cell_labels"]].rename(
    columns=music_column_names
)

# Drop the name of the index
pheno_df.index.name = None

In [None]:
# cellTypeID: integer code of each cell's type, shifted to start at 1
l_encoder = LabelEncoder().fit(train_meta_df["cell_labels"].unique())
pheno_df["cellTypeID"] = l_encoder.transform(pheno_df["cellType"]) + 1

# sampleID: integer code of each cell's patient, shifted to start at 1
l_encoder = LabelEncoder().fit(pheno_df["SubjectName"].unique())
pheno_df["sampleID"] = l_encoder.transform(pheno_df["SubjectName"]) + 1

In [None]:
# Write the MuSiC phenotype table as a tab-separated file
music_pheno_path = Path(prefix) / "data/music/pheno.tsv"
pheno_df.to_csv(music_pheno_path, sep="\t")

In [None]:
# Order the cell columns like the rows of pheno_df, then save the train counts
# as a tab-separated file, 5000 rows at a time
music_sc_df = music_sc_df.loc[:, pheno_df.index]
music_out_path = Path(prefix) / "data/music/scRNA_ref.tsv"
music_sc_df.to_csv(music_out_path, sep="\t", chunksize=5000)

### 9. BayesPrism

In [None]:
# Independent copies of the reference counts and the training metadata for
# BayesPrism
bprism_sc_df = train_sc_df.copy(deep=True)
bprism_meta_df = train_meta_df.copy(deep=True)

# Re-order the metadata rows to follow the cell columns of the counts
bprism_meta_df = bprism_meta_df.reindex(index=bprism_sc_df.columns)

In [None]:
# Rename the label column to "cell_type_labels" and save just that column as a
# tab-separated file
bprism_meta_df = bprism_meta_df.rename(columns={"cell_labels": "cell_type_labels"})
bprism_labels_path = Path(prefix) / "data/bprism_v2/single_cell_labels.tsv"
bprism_meta_df.loc[:, ["cell_type_labels"]].to_csv(bprism_labels_path, sep="\t")

In [None]:
# Save the single-cell counts transposed (cells as rows, genes as columns) as a
# tab-separated file, 5000 rows at a time
bprism_out_path = Path(prefix) / "data/bprism_v2/scRNA_ref.tsv"
bprism_cells_by_genes_df = bprism_sc_df.T
bprism_cells_by_genes_df.to_csv(bprism_out_path, sep="\t", chunksize=5000)