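# Preprocess GSE154600 scRNA-seq data and generate pseudo-bulk profiles

This notebook loads the processed single-cell AnnData of study GSE154600, writes the dense expression matrix (for CIBERSORTx) and the gene ids, splits the cells into train and test sets, and generates pseudo-bulk profiles for every sample, stimulation and data split.

TODO: expand this description.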

In [1]:
import pathlib
import yaml
import subprocess
import pickle

import pandas as pd
import numpy as np
import scanpy as sc
import anndata as ad
from sklearn.model_selection import train_test_split

from buddi.preprocessing import sc_preprocess


## Notebook Parameters

In [2]:
# TODO consider whether to move these into config.yml

# Names of the adata.obs columns read by this notebook
SAMPLE_ID_COL = 'sample_id'
STIM_COL = 'stim'
CELL_TYPE_COL = 'scpred_CellType'

# Name of the adata.var column that stores the gene identifiers
GENE_ID_COL = 'gene_ids'

# Name of the adata.obs column that receives the train/test assignment,
# and the random seed used when drawing that split
DATASPLIT_COL = 'isTraining'
DATASPLIT_SEED = 42

## Load config

In [3]:
# Locate the root directory of the analysis repository by asking git.
# check=True makes a failing git call (e.g. the notebook is run outside of a
# git work tree) raise subprocess.CalledProcessError right here, instead of
# returning an empty stdout that pathlib.Path would silently interpret as the
# current working directory.
_git_toplevel = subprocess.run(
    ["git", "rev-parse", "--show-toplevel"],
    capture_output=True,
    text=True,
    check=True,
).stdout.strip()
REPO_ROOT = pathlib.Path(_git_toplevel)

# The analysis configuration is expected at the top level of the repository
CONFIG_FILE = REPO_ROOT / 'config.yml'
assert CONFIG_FILE.exists(), f"Config file not found at {CONFIG_FILE}"

# Parse the YAML configuration; the file handle is closed by the with-block
with open(CONFIG_FILE, 'r') as file:
    config_dict = yaml.safe_load(file)

## Retrieve Path to Processed Single-Cell RNA-seq Data and relevant Metadata

In [4]:
# TODO consider whether to move this into config.yml as well
STUDY_GEO_ID = 'GSE154600'

# Root directory of the single-cell data, read from the loaded configuration
SC_DATA_PATH = pathlib.Path(config_dict['data_path']['sc_data_path'])

# Directory that holds the processed single-cell data of this study
SC_ADATA_PATH = SC_DATA_PATH.joinpath(f'{STUDY_GEO_ID}_processed')
assert SC_ADATA_PATH.exists(), f"Processed Single-cell Data path {SC_ADATA_PATH} does not exist"

# The processed AnnData file inside that directory
SC_ADATA_FILE = SC_ADATA_PATH.joinpath(f'{STUDY_GEO_ID}_processed.h5ad')
assert SC_ADATA_FILE.exists(), f"Processed Single-cell Data file {SC_ADATA_FILE} does not exist"

# Directory that holds the metadata of this study
SC_METADATA_PATH = SC_DATA_PATH.joinpath(f'{STUDY_GEO_ID}_metadata')
assert SC_METADATA_PATH.exists(), f"Single-cell Metadata path {SC_METADATA_PATH} does not exist"

## Define Path to write Pre-Processing Outputs

In [5]:
# Pre-processing outputs go under <repo root>/processed_data, which must already exist
PREPROCESSING_OUTPUT_PATH = REPO_ROOT.joinpath('processed_data')
assert PREPROCESSING_OUTPUT_PATH.exists(), f"Preprocessing output path {PREPROCESSING_OUTPUT_PATH} does not exist"

# Sub-directory for the files written by this notebook; created on demand
# and left untouched when it is already present
SC_AUGMENTED_DATA_PATH = PREPROCESSING_OUTPUT_PATH.joinpath('sc_augmented')
SC_AUGMENTED_DATA_PATH.mkdir(parents=True, exist_ok=True)

## Preprocessing of scRNA-seq Anndata before Moving to Pseudobulk

### Load and Preprocess Anndata

In [6]:
# Read the processed single-cell AnnData object from disk
adata = sc.read_h5ad(SC_ADATA_FILE)

# Every obs column this notebook relies on has to be present; the checks run
# in the order cell type, sample id, stimulation and stop at the first miss
for _required_col in (CELL_TYPE_COL, SAMPLE_ID_COL, STIM_COL):
    assert _required_col in adata.obs.columns, f"Column {_required_col} not found in adata.obs"

In [7]:
# Make the variable names unique (anndata helper), then copy the resulting
# var index into a regular column so the gene ids can be addressed by name
adata.var_names_make_unique()
_gene_index_values = adata.var.index.tolist()
adata.var[GENE_ID_COL] = _gene_index_values

# Swap every underscore in the sample ids for a hyphen
_sample_ids = adata.obs[SAMPLE_ID_COL]
adata.obs[SAMPLE_ID_COL] = _sample_ids.str.replace('_', '-')

Print some basic information

In [8]:
# Show, in this order: the AnnData shape, the first rows of the variable
# annotations and the first rows of the observation annotations
for _summary in (adata.shape, adata.var.head(), adata.obs.head()):
    print(_summary)

(36111, 24520)
              gene_ids  n_cells     mt  n_cells_by_counts  mean_counts  \
gene_ids                                                                 
AL627309.1  AL627309.1       23  False                 23     0.000614   
AL669831.5  AL669831.5      629  False                629     0.017388   
FAM87B          FAM87B       26  False                 26     0.000694   
LINC00115    LINC00115      463  False                463     0.012661   
FAM41C          FAM41C      279  False                279     0.007746   

            pct_dropout_by_counts  total_counts  
gene_ids                                         
AL627309.1              99.938567          23.0  
AL669831.5              98.319934         651.0  
FAM87B                  99.930554          26.0  
LINC00115               98.763322         474.0  
FAM41C                  99.254788         290.0  
                           GSM             Barcode  Cluster          cellType  \
AAACCTGAGCTGCCCA-1  GSM4675273  AAA

## Some Stats

### Stimulation

In [9]:
# Count the cells of every (sample id, stimulation) pair ...
tab = adata.obs.groupby(by=[SAMPLE_ID_COL, STIM_COL]).size()
# ... and pivot the innermost index level (stimulation) into columns for display
tab.unstack(level=-1)

stim,CTRL
sample_id,Unnamed: 1_level_1
Samp-T59,11689
Samp-T76,11876
Samp-T77,4974
Samp-T89,4291
Samp-T90,3281


### Cell type

In [10]:
# Number of cells per annotated cell type
adata.obs.loc[:, CELL_TYPE_COL].value_counts()

CD8+ T-cells         8183
Fibroblasts          4385
Epithelial cells     4021
Macrophages          3979
Adipocytes           3955
Monocytes            3507
Mesangial cells      3488
CD4+ T-cells         1476
NK cells             1409
B-cells              1169
Endothelial cells     539
Name: scpred_CellType, dtype: int64

## Write the Dense Expression Matrix and Celltype column to use in CIBERSORTx


In [11]:
sc_profile_file = SC_AUGMENTED_DATA_PATH / f'{STUDY_GEO_ID}_sig.pkl'

# Build a dense cells x genes table whose columns are the gene ids and whose
# first column is the cell type of each cell.
# NOTE(review): .todense() assumes adata.X is stored as a sparse matrix — confirm.
dense_matrix = adata.X.todense()
dense_df = pd.DataFrame(dense_matrix, columns = adata.var[GENE_ID_COL])
dense_df.insert(loc=0, column=CELL_TYPE_COL, value=adata.obs[CELL_TYPE_COL].to_list())

# Write through a context manager so the file handle is flushed and closed
# as soon as the dump finishes (the bare open() call was never closed).
with open(sc_profile_file, "wb") as _sig_handle:
    pickle.dump(dense_df, _sig_handle)

# free up memory
del dense_matrix
del dense_df

## Output Gene ids

In [12]:
gene_out_file = SC_AUGMENTED_DATA_PATH / f'{STUDY_GEO_ID}_genes.pkl'

# The gene id column of adata.var, kept in `gene_ids` for the cells below
gene_ids = adata.var[GENE_ID_COL]

# Write through a context manager so the file handle is flushed and closed
# as soon as the dump finishes (the bare open() call was never closed).
with open(gene_out_file, "wb") as _genes_handle:
    pickle.dump(gene_ids, _genes_handle)

## Make Pseudobulks

First, perform a random train/test split of the cells, stratified by sample id, stimulation, and cell type.

In [13]:
# Labels that the split is stratified on: sample id, stimulation and cell type
_stratify_labels = adata.obs[[SAMPLE_ID_COL, STIM_COL, CELL_TYPE_COL]]

# Hold out 20% of the cells as the test set, with a fixed seed
train_idx, test_idx = train_test_split(
    adata.obs.index,
    stratify=_stratify_labels,
    test_size=0.2,
    random_state=DATASPLIT_SEED,
)

# Record the assignment of every cell in the DATASPLIT_COL column of adata.obs
for _split_label, _split_idx in (('Train', train_idx), ('Test', test_idx)):
    adata.obs.loc[_split_idx, DATASPLIT_COL] = _split_label

In [14]:
# Settings of the pseudo-bulk generation
ADD_PER_CELL_TYPE_NOISE = True
N_CELLS_PER_PSEUDO_BULK = 5_000
N_PSEUDO_BULKS_PER_CONDITION = 1_000
# number of single cell type dominant pseudo-bulk profiles requested per cell type
N_SINGLE_CELLTYPE_PBS_PER_CELL_TYPE = 100

# Unique values for experiment/perturbation/cell type
samples = adata.obs[SAMPLE_ID_COL].unique()
stims = adata.obs[STIM_COL].unique()
cell_types = adata.obs[CELL_TYPE_COL].unique()
datasplits = adata.obs[DATASPLIT_COL].unique()

n_samples = len(samples)
n_genes = len(gene_ids)
n_cell_types = len(cell_types)

# Define cell-type level noise for the generated pseudo-bulk profiles
if ADD_PER_CELL_TYPE_NOISE:
    # this produces a list of numpy arrays, each of length n_genes
    # to reflect the expression noise associated with each specific cell type
    # NOTE(review): with mean=0 and sigma=0 every drawn value equals
    # exp(0) = 1, so these vectors are constant — confirm whether a non-zero
    # sigma was intended.
    per_cell_type_noise = [
        np.random.lognormal(0, 0, n_genes) for _ in range(n_cell_types)]
else:
    per_cell_type_noise = None

# Generate pseudo-bulk profiles grouping by sample_id and stim
for _sample in samples:
    for _stim in stims:
        for _datasplit in datasplits:

            print(f"Generating pseudo-bulk profiles for sample {_sample}, stim {_stim}, and datasplit {_datasplit} ...")
            
            ## Subset adata to the current sample, stim and train/test split
            subset_idx = np.where(
                np.logical_and.reduce((
                    adata.obs[SAMPLE_ID_COL] == _sample, 
                    adata.obs[STIM_COL] == _stim,
                    adata.obs[DATASPLIT_COL] == _datasplit
                ))
            )[0]
            
            # nothing to generate for combinations that have no cells
            if len(subset_idx) == 0:
                continue
            subset_adata = adata[subset_idx, :]

            print("Generating random prop pseudo-bulk profiles ...")
            random_prop_pb_outputs = sc_preprocess.make_prop_and_sum(
                in_adata=subset_adata,
                # the number of pseudo-bulk profiles to generate
                num_samples=N_PSEUDO_BULKS_PER_CONDITION,
                # the number of cells included/sampled when generating each pseudo-bulk profile
                num_cells=N_CELLS_PER_PSEUDO_BULK,
                # pseudo-bulk profiles will be generated with random proportions
                use_true_prop=False,
                # apply the per cell type noise
                cell_noise=per_cell_type_noise,
                # no sample noise
                useSampleNoise=False,
            )

            count_df, pb_df, test_count_df, test_pb_df = random_prop_pb_outputs
            # divide the count matrix by the sum of each row to get proportions
            prop_df = count_df.div(count_df.sum(axis=1), axis=0)
            # test_prop_df and test_pb_df are not written out by this cell
            test_prop_df = test_count_df.div(test_count_df.sum(axis=1), axis=0)

            del count_df
            del test_count_df

            n_random_prop_pbs = len(pb_df)

            print("Generating single cell type dominant pseudo-bulk profiles ...")

            # Generate pseudo-bulk profiles where a single cell type dominates
            # this will produce num_samp * n_cell_types pseudo-bulk profiles
            ct_prop_df = sc_preprocess.get_single_celltype_prop_matrix(
                num_samp=N_SINGLE_CELLTYPE_PBS_PER_CELL_TYPE,
                cell_order=cell_types
            )

            # Use proportion matrix to generate pseudo-bulk profiles
            sc_prop_df, sc_pb_df, _ = sc_preprocess.use_prop_make_sum(
                in_adata=subset_adata,
                num_cells=N_CELLS_PER_PSEUDO_BULK,
                # use the generated single cell type dominant proportion matrix
                props_vec=ct_prop_df,
                # apply the same per cell type noise used for random prop pseudo-bulk profiles
                cell_noise=per_cell_type_noise,
                # no sample noise
                sample_noise=None,
                useSampleNoise=False
            )

            n_single_celltype_pbs = len(sc_pb_df)

            print('Concatenating the two types of pseudo-bulk profiles ...')
            prop_df =  pd.concat([prop_df, sc_prop_df])
            pb_df = pd.concat([pb_df, sc_pb_df])

            n_total_pbs = n_random_prop_pbs + n_single_celltype_pbs

            # One metadata row per pseudo-bulk profile, random-proportion
            # profiles first, matching the concatenation order above
            metadata_df = pd.DataFrame(
                data = {"sample_id":[_sample]*n_total_pbs,
                        "stim":[_stim]*n_total_pbs,
                        "isTraining":[_datasplit]*n_total_pbs,
                        "cell_prop_type":['random']*n_random_prop_pbs + ['single_celltype']*n_single_celltype_pbs,
                        "samp_type":['sc_ref']*n_total_pbs}
                        )
            
            print("Writing the pseudo-bulk profiles ...")
            pseudobulk_file = SC_AUGMENTED_DATA_PATH / f'{STUDY_GEO_ID}_{_sample}_{_stim}_{_datasplit}_pseudo_splits.pkl'
            prop_file = SC_AUGMENTED_DATA_PATH / f'{STUDY_GEO_ID}_{_sample}_{_stim}_{_datasplit}_prop_splits.pkl'
            meta_file = SC_AUGMENTED_DATA_PATH / f'{STUDY_GEO_ID}_{_sample}_{_stim}_{_datasplit}_meta_splits.pkl'

            # Write each output through a context manager so its file handle is
            # closed before the next iteration (the bare open() calls leaked
            # three handles per sample/stim/datasplit combination).
            with open(prop_file, "wb") as _prop_handle:
                pickle.dump(prop_df, _prop_handle)
            with open(pseudobulk_file, "wb") as _pb_handle:
                pickle.dump(pb_df, _pb_handle)
            with open(meta_file, "wb") as _meta_handle:
                pickle.dump(metadata_df, _meta_handle)


Generating pseudo-bulk profiles for sample Samp-T59, stim CTRL, and datasplit Train ...
Generating random prop pseudo-bulk profiles ...
0
100
200
300
400
500
600
700
800
900
1000
Generating single cell type dominant pseudo-bulk profiles ...
0
100
200
300
400
500
600
700
800
900
1000
Concatenating the two types of pseudo-bulk profiles ...
Writing the pseudo-bulk profiles ...
Generating pseudo-bulk profiles for sample Samp-T59, stim CTRL, and datasplit Test ...
Generating random prop pseudo-bulk profiles ...
0
100
200
300
400
500
600
700
800
900
1000
Generating single cell type dominant pseudo-bulk profiles ...
0
100
200
300
400
500
600
700
800
900
1000
Concatenating the two types of pseudo-bulk profiles ...
Writing the pseudo-bulk profiles ...
Generating pseudo-bulk profiles for sample Samp-T76, stim CTRL, and datasplit Train ...
Generating random prop pseudo-bulk profiles ...
0
100
200
300
400
500
600
700
800
900
1000
Generating single cell type dominant pseudo-bulk profiles ...
0
100
