# Prepare training and test data for specific tools

In [None]:
import json

import pandas as pd
import numpy as np
import scanpy as sc
import anndata as adata

from tqdm import tqdm
from pathlib import Path
from collections import ChainMap
from sklearn import preprocessing as pp
from sklearn.preprocessing import LabelEncoder

%load_ext blackcellmagic

In [None]:
# Root directory that every data path in this notebook is resolved against
# NOTE(review): the leading "???" looks like a placeholder for a machine-specific root — confirm before running
prefix = "???/deconvolution_benchmarking/05_external_scrna_validation/bassez_et_al"

In [None]:
# Training patient IDs, grouped by subtype
# Used below to select the training rows of the single-cell metadata
train_p_ids = [
    # HER2+
    "BIOKEY_13",
    # ER+
    "BIOKEY_3",
    "BIOKEY_5",
    "BIOKEY_12",
    "BIOKEY_18",
    "BIOKEY_22",
    "BIOKEY_24",
    "BIOKEY_27",
    "BIOKEY_29",
    "BIOKEY_30",
    "BIOKEY_40",
    "BIOKEY_42",
    # TNBC
    "BIOKEY_2",
    "BIOKEY_9",
    "BIOKEY_10",
    "BIOKEY_11",
    "BIOKEY_14",
    "BIOKEY_15",
    "BIOKEY_33",
    "BIOKEY_35",
    "BIOKEY_36",
    "BIOKEY_41",
]
# Test patient IDs, grouped by subtype
# Used below to select the test rows of the single-cell metadata
test_p_ids = [
    # HER2+
    "BIOKEY_28",
    # ER+
    "BIOKEY_4",
    "BIOKEY_6",
    "BIOKEY_7",
    "BIOKEY_17",
    "BIOKEY_21",
    "BIOKEY_37",
    # TNBC
    "BIOKEY_1",
    "BIOKEY_16",
    "BIOKEY_19",
    "BIOKEY_26",
    "BIOKEY_31",
]
# Tumour purity levels 0.05, 0.10, ..., 0.95 (19 values)
# Rounded so np.arange float drift does not leak into comparisons and file names
pur_lvls = np.arange(0.05, 1, 0.05).round(3).tolist()
# Cell type labels; "Cancer_cell" is also the obs column matched against pur_lvls further down
c_types = [
    "Cancer_cell",
    "T_cell",
    "B_cell",
    "Myeloid_cell",
    "Endothelial_cell",
    "Fibroblast",
    "Mast_cell",
    "pDC",
]

## 0. Process simulated test mixtures
- Load the .h5ad file containing all the test mixtures we previously generated
- Also save it into partitions corresponding to purity levels

In [None]:
# Read the simulated test mixtures and pull out the expression matrix
test_adata = sc.read_h5ad(Path(prefix) / "data/test/test_sim_mixts.h5ad")
test_counts_df = test_adata.to_df()

# Label table: a separate frame derived from .obs, without the "batch" column
# and with missing values replaced by 0 (test_adata.obs itself is left as loaded)
test_labels_df = test_adata.obs.drop(columns=["batch"]).fillna(0)

In [None]:
# Save test data into 19 partitions corresponding to the 19 purity levels
for pur_lvl in tqdm(pur_lvls):
    # Purity levels are floats: match with a tolerance instead of exact equality,
    # so a last-bit difference cannot silently produce an empty partition
    is_pur_lvl = np.isclose(
        test_labels_df["Cancer_cell"].to_numpy(dtype=float), pur_lvl
    )
    subset_obs_df = test_labels_df[is_pur_lvl]
    subset_test_counts_df = test_counts_df.loc[subset_obs_df.index, :]

    # Written transposed relative to the AnnData layout
    subset_test_counts_df.T.to_csv(
        Path(prefix).joinpath(f"data/test/test_counts_{pur_lvl}_pur_lvl.tsv"), sep="\t"
    )

## 2. Prepare data for each method

#### Load single-cell metadata

In [None]:
# Read the full single-cell metadata table (first column becomes the index)
meta_path = Path(prefix) / "data/Miniatlas_meta_9_10.tsv"
meta_df = pd.read_csv(meta_path, sep="\t", index_col=0)

# Partition the metadata by patient membership in the train / test lists
is_train_patient = meta_df["Patient"].isin(train_p_ids)
is_test_patient = meta_df["Patient"].isin(test_p_ids)
train_meta_df = meta_df[is_train_patient]
test_meta_df = meta_df[is_test_patient]

#### Load single-cell reference containing only original cells processed above

In [None]:
# Read the single-cell reference (AnnData layout: cells as rows, genes as columns)
train_sc_adata = sc.read_h5ad(Path(prefix) / "data/train/scRNA_ref.h5ad")

# Most methods want the reference with genes as rows and cells as columns,
# so flip the matrix once here and reuse it for every method below
train_sc_df = train_sc_adata.to_df().transpose()

# Label both axes
train_sc_df.index.name = "gene_symbol"
train_sc_df.columns.name = "cell_id"

#### 1. CIBERSORTx

In [None]:
# Private copy of the genes-by-cells reference for CIBERSORTx
cbx_sc_df = train_sc_df.copy(deep=True)

# Make sure the row axis is labelled as gene symbols
cbx_sc_df.index.name = "gene_symbol"

In [None]:
# CBX requires a single-cell reference matrix with cell labels as columns and gene symbols as rows
cell_labels = train_sc_adata.obs["cell_labels"]

# Guard: the cell ids in cbx_sc_df columns and in train_sc_adata.obs must be in the same order,
# otherwise the positional relabelling below would attach labels to the wrong cells
assert np.array_equal(cbx_sc_df.columns.values, cell_labels.index.values)

# Swap the cell ids for their cell labels
cbx_sc_df.columns = cell_labels.astype(str).values

In [None]:
# Write the CIBERSORTx reference as a tab-separated file, 5000 rows per write
cbx_ref_path = Path(prefix) / "data/cbx/scRNA_ref.tsv"
cbx_sc_df.to_csv(cbx_ref_path, sep="\t", chunksize=5000)

#### 2. Scaden

#### Prepare AnnData training data

In [None]:
# Load the concatenated AnnData object that contains all simulated training mixtures
scaden_train_adata = sc.read_h5ad(Path(prefix) / "data/train/training_sim_mixts.h5ad")
scaden_train_counts_df = scaden_train_adata.to_df()
# NOTE: this alias is taken before .obs is replaced below, so it still carries the "batch" column
scaden_train_labels_df = scaden_train_adata.obs

# Strip the "batch" column from obs, then replace NaN with 0
cleaned_obs_df = scaden_train_adata.obs.drop(columns=["batch"])
scaden_train_adata.obs = cleaned_obs_df.fillna(0)

# Scaden requires the cell fractions DataFrame to have a column called "ds"
# It records which dataset each row comes from, so that datasets can be selected at training time
# There is only one dataset here, so every row gets ds="Bassez_et_al"
scaden_train_adata.obs["ds"] = "Bassez_et_al"

# Record the cell types and an empty "unknown" entry in uns
scaden_train_adata.uns["cell_types"] = c_types
scaden_train_adata.uns["unknown"] = ""

# Name the obs index
scaden_train_adata.obs.index.name = "mixture_id"

In [None]:
# Write the prepared Scaden training AnnData to disk
scaden_train_path = Path(prefix) / "data/scaden/train_counts.h5ad"
scaden_train_adata.write_h5ad(scaden_train_path)

### 3. CPM
Cannot be run as it requires UMAP coordinates

### 4. bisque
bisque expects a .h5ad file holding non-logged single-cell gene counts in the bisque/ folder <br>
This file would have been previously generated for CPM

In [None]:
# Private copy of the training metadata for bisque
bisque_meta_df = train_meta_df.copy(deep=True)

In [None]:
# Build the phenotype table: cell ids (the former index), patient ids and cell labels
pheno_df = bisque_meta_df.loc[:, ["Patient", "cell_labels"]].reset_index()
pheno_df = pheno_df.set_axis(["cell_ids", "patient_ids", "cell_labels"], axis=1)

pheno_path = Path(prefix) / "data/bisque/phenotypes.tsv"
pheno_df.to_csv(pheno_path, sep="\t")

In [None]:
# Private copy of the genes-by-cells reference for bisque
bisque_sc_df = train_sc_df.copy(deep=True)

#### If we're using scaled non-logged counts

In [None]:
# Put the reference columns in the same cell-id order as the phenotype DataFrame
ordered_cell_ids = pheno_df["cell_ids"].to_numpy()
bisque_sc_df = bisque_sc_df.loc[:, ordered_cell_ids]

# Min-max scale to [0, 1]. The scaler works per column, so it is fed the
# cells-by-genes view (each gene is scaled across cells) and the result is flipped back
mms = pp.MinMaxScaler(feature_range=(0, 1), copy=True)
scaled_sc_arr = mms.fit_transform(bisque_sc_df.T).T
bisque_scaled_sc_df = pd.DataFrame(
    scaled_sc_arr,
    index=bisque_sc_df.index,
    columns=bisque_sc_df.columns,
)

# Write the scaled linear counts
scaled_ref_path = Path(prefix) / "data/bisque/scaled_scRNA_ref.tsv"
bisque_scaled_sc_df.to_csv(scaled_ref_path, sep="\t", chunksize=5000)

### 5. DWLS
DWLS only expects single cell labels accompanying the single-cell data

In [None]:
# Private copies of the reference counts and training metadata for DWLS
dwls_meta_df = train_meta_df.copy(deep=True)
dwls_sc_df = train_sc_df.copy(deep=True)

In [None]:
# One-column table of cell labels, ordered by cell id
labels_df = dwls_meta_df.loc[:, ["cell_labels"]].sort_index()

labels_path = Path(prefix) / "data/dwls/single_cell_labels.tsv"
labels_df.to_csv(labels_path, sep="\t")

In [None]:
# Put the reference columns in the same cell-id order as the labels DataFrame
dwls_sc_df = dwls_sc_df.loc[:, labels_df.index]

dwls_ref_path = Path(prefix) / "data/dwls/scRNA_ref.tsv"
dwls_sc_df.to_csv(dwls_ref_path, sep="\t", chunksize=5000)

### 6. EPIC
EPIC relies on the signature matrix and marker genes generated by CIBERSORTx to run <br>
This processing script assumes that these 2 files have already been put in the data/epic folders
- Signature matrix (containing all genes): cbx_sig_matrix.txt
- Marker genes (a subset of signature matrix): cbx_sig_matrix.txt

In [None]:
# All CBX signature matrices are the same across tumour purity levels, so one is enough
epic_dir = Path(prefix) / "data/epic/cbx_sig_matrix"
cbx_sig_matrix_df = pd.read_csv(
    epic_dir / "CIBERSORTx_sigmatrix.txt", sep="\t", index_col=0
)

# EPIC treats the "unknown" cells in a tumour as cancer cells,
# so the Cancer_cell column must not be part of the reference profiles
cbx_sig_matrix_df = cbx_sig_matrix_df.drop(columns=["Cancer_cell"])

# Write the reference profiles
cbx_sig_matrix_df.to_csv(epic_dir / "reference_profiles.tsv", sep="\t")

# The marker genes are the row labels of the signature matrix; save them as a one-column .tsv
marker_gene_labels_df = cbx_sig_matrix_df.index.to_frame()
marker_gene_labels_df = marker_gene_labels_df.rename(columns={"NAME": "gene_symbol"})

marker_gene_labels_df.to_csv(epic_dir / "marker_gene_symbols.tsv", sep="\t")

### 7. hspe

In [None]:
# Private copies of the reference counts and training metadata for hspe
hspe_meta_df = train_meta_df.copy(deep=True)
hspe_sc_df = train_sc_df.copy(deep=True)

In [None]:
# Apply log2(x + 1)
# Both dtangle and hspe only mention log2 without the + 1, but log2(0) is -infinity,
# so 1 is added to the gene expressions first; zero expression still maps to 0
# Both tools also require genes as columns and samples as rows, hence the transpose
hspe_log_sc_df = np.log2(hspe_sc_df.add(1)).transpose()

In [None]:
# Load the test mixtures; to_df() already has samples as rows and genes as columns
# NOTE(review): this reads "test_mixtures.h5ad" while section 0 reads "test_sim_mixts.h5ad" — confirm which file is intended
test_adata = sc.read_h5ad(Path(prefix).joinpath("data/test/test_mixtures.h5ad"))
test_counts_df = test_adata.to_df()

# Drop the "batch" column and fill NaN by 0
# Built as a new frame so test_adata.obs is not mutated through an alias
# (matches how section 0 copies .obs before editing it)
test_labels_df = test_adata.obs.drop(["batch"], axis=1).fillna(0)

# Apply log2(x + 1) to the test counts, same transform as the single-cell reference
log_test_counts_df = np.log2(test_counts_df + 1)

##### Save train & test counts

In [None]:
# Sanity check before writing anything: the reference and the test mixtures
# must list the same genes in the same order
train_genes = hspe_log_sc_df.columns.to_numpy()
test_genes = log_test_counts_df.columns.to_numpy()
assert np.array_equal(train_genes, test_genes)

In [None]:
# Write the logged single-cell reference
hspe_ref_path = Path(prefix) / "data/hspe/scRNA_ref.tsv"
hspe_log_sc_df.to_csv(hspe_ref_path, sep="\t")

In [None]:
# Save test data by purity levels
# Within each tumour purity, the data is split into 20 shards
# This allows the run to be parallelized 380-fold (19 purity levels x 20 shards)
n_shards = 20

for pur_lvl in tqdm(pur_lvls):
    # Purity levels are floats: match with a tolerance instead of exact equality
    is_pur_lvl = np.isclose(
        test_labels_df["Cancer_cell"].to_numpy(dtype=float), pur_lvl
    )
    subset_obs_df = test_labels_df[is_pur_lvl]
    subset_test_counts_df = log_test_counts_df.loc[subset_obs_df.index, :]

    # Compute the shard boundaries once per purity level, on row positions
    # np.array_split gives the same near-equal split it would give on the frame itself
    shard_positions = np.array_split(np.arange(len(subset_test_counts_df)), n_shards)

    for shard, positions in enumerate(tqdm(shard_positions)):
        shard_test_counts_df = subset_test_counts_df.iloc[positions, :]

        shard_test_counts_df.to_csv(
            Path(prefix).joinpath(
                f"data/hspe/logged_test_counts_{pur_lvl}_pur_lvl_{shard}.tsv"
            ),
            sep="\t",
        )

##### Extract pure samples
Both dtangle and hspe require a pure_samples variable. This is a list in which each item corresponds to one cell type and holds the indexes of all cells of that type in the single-cell reference DataFrame <br>

We need to retrieve cell type of the single-cell reference data and save this information into a .json file

In [None]:
# Fresh copy of the training metadata for the pure-sample extraction
hspe_meta_df = train_meta_df.copy(deep=True)

In [None]:
# Reset the index of hspe_log_sc_df so the row positions become the index
# and the cell ids move into a "cell_id" column
reset_hspe_log_sc_df = hspe_log_sc_df.reset_index()
ref_cell_ids = reset_hspe_log_sc_df["cell_id"]

# For every cell type, collect the row positions of its cells in the reference
pure_samples_d = {}

for c_type in tqdm(hspe_meta_df["cell_labels"].unique()):
    is_c_type = (hspe_meta_df["cell_labels"] == c_type).to_numpy()
    c_ids = hspe_meta_df.index[is_c_type].tolist()
    c_indexes = reset_hspe_log_sc_df.index[ref_cell_ids.isin(c_ids).to_numpy()]

    # Python indexes start at 0 while R starts at 1, so shift by one before storing
    pure_samples_d[c_type] = (c_indexes + 1).tolist()

In [None]:
# Save pure_samples_d into a json file
# The context manager closes (and flushes) the file even if json.dump raises
with open(Path(prefix).joinpath("data/hspe/pure_samples.json"), "w") as pure_samples_file:
    json.dump(pure_samples_d, pure_samples_file, indent=4)

### 8. MuSiC
MuSiC requires single-cell and bulk expressions in ExpressionSet objects <br>
The single-cell ExpressionSet also needs a phenoType item containing
- **sampleID**        index of patient
- **SubjectName**      patient id
- **cellTypeID**       index of cell type
- **cellType**         cell annotation labels

In [None]:
# Private copies of the reference counts and training metadata for MuSiC
music_meta_df = train_meta_df.copy(deep=True)
music_sc_df = train_sc_df.copy(deep=True)

# Put the metadata rows in the same order as the columns of the counts DataFrame
# NOTE(review): the MuSiC cells visible below build pheno_df from train_meta_df, not from this frame — confirm that is intended
music_meta_df = music_meta_df.reindex(index=music_sc_df.columns)

Metadata for running MuSiC with neither marker genes nor cell subtypes

In [None]:
# Take the "Patient" and "cell_labels" columns and give them the names MuSiC expects
music_column_names = {"Patient": "SubjectName", "cell_labels": "cellType"}
pheno_df = train_meta_df[["Patient", "cell_labels"]].rename(columns=music_column_names)

# Leave the index unnamed
pheno_df.index.name = None

In [None]:
# cellTypeID: 1-based integer code of each cell label, with the encoder fitted on c_types
l_encoder = LabelEncoder().fit(c_types)
cell_type_codes = l_encoder.transform(pheno_df["cellType"])
pheno_df["cellTypeID"] = cell_type_codes + 1

# sampleID: 1-based integer code of each patient id, with the encoder fitted on the patients present
l_encoder = LabelEncoder().fit(pheno_df["SubjectName"].unique())
patient_codes = l_encoder.transform(pheno_df["SubjectName"])
pheno_df["sampleID"] = patient_codes + 1

In [None]:
# Write the MuSiC phenotype table
music_pheno_path = Path(prefix) / "data/music/pheno.tsv"
pheno_df.to_csv(music_pheno_path, sep="\t")

In [None]:
# Order the count columns like the rows of pheno_df, then write the reference
music_sc_df = music_sc_df.loc[:, pheno_df.index]

music_ref_path = Path(prefix) / "data/music/scRNA_ref.tsv"
music_sc_df.to_csv(music_ref_path, sep="\t", chunksize=5000)

### 9. BayesPrism

In [None]:
# Private copies of the reference counts and training metadata for BayesPrism
bprism_meta_df = train_meta_df.copy(deep=True)
bprism_sc_df = train_sc_df.copy(deep=True)

# Put the metadata rows in the same order as the columns of the counts DataFrame
bprism_meta_df = bprism_meta_df.reindex(index=bprism_sc_df.columns)

In [None]:
# Rename the label column to "cell_type_labels" and write it as a one-column table
bprism_meta_df = bprism_meta_df.rename(columns={"cell_labels": "cell_type_labels"})

bprism_labels_path = Path(prefix) / "data/bprism/single_cell_labels.tsv"
bprism_meta_df.loc[:, ["cell_type_labels"]].to_csv(bprism_labels_path, sep="\t")

In [None]:
# Write the single-cell counts transposed, i.e. with cells as rows and genes as columns
bprism_ref_path = Path(prefix) / "data/bprism/scRNA_ref.tsv"
bprism_sc_df.transpose().to_csv(bprism_ref_path, sep="\t", chunksize=5000)