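# Prepare processed scRNA-seq data (GSE154600) and export a dense expression table for CIBERSORTx

This notebook loads the processed single-cell AnnData object for the study, checks that the required annotation columns are present, reports basic per-sample and per-cell-type statistics, and writes a dense expression matrix with cell-type labels to disk. TODO: expand this summary.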

In [1]:
import pathlib
import yaml
import subprocess
import pickle

import pandas as pd
import scanpy as sc
import anndata as ad


## Notebook Parameters

In [2]:
# Names of the adata.obs columns this notebook relies on.
# TODO consider whether to move these into config.yml
CELL_TYPE_COL = "cellType"    # cell type label of each cell
SAMPLE_ID_COL = "sample_id"   # sample identifier, used as a grouping key
STIM_COL = "stim"             # stimulation condition, used as a grouping key

## Load config

In [3]:
# Get the root directory of the analysis repository by asking git.
# check=True makes a failing git call (e.g. the notebook is run outside a git
# work tree) raise CalledProcessError right here. Without it, stdout would be
# empty and REPO_ROOT would silently become Path(''), i.e. the current
# working directory.
git_toplevel = subprocess.run(
    ["git", "rev-parse", "--show-toplevel"],
    capture_output=True,
    text=True,
    check=True,
).stdout.strip()
REPO_ROOT = pathlib.Path(git_toplevel)

# The analysis configuration is read from config.yml at the repository root.
CONFIG_FILE = REPO_ROOT / 'config.yml'
assert CONFIG_FILE.exists(), f"Config file not found at {CONFIG_FILE}"

# Read with an explicit encoding so the result does not depend on the
# platform's default locale.
with open(CONFIG_FILE, 'r', encoding='utf-8') as file:
    config_dict = yaml.safe_load(file)

# An empty YAML document parses to None; fail here with a clear message rather
# than with a TypeError when a later cell indexes into config_dict.
assert isinstance(config_dict, dict), f"Config file {CONFIG_FILE} is empty or is not a mapping"

## Retrieve Path to Processed Single-Cell RNA-seq Data and relevant Metadata

In [4]:
# Study identifier; it prefixes every data directory and file name below.
STUDY_GEO_ID = "GSE154600"  # TODO consider whether to move this into config.yml as well

# Root directory of the single-cell data, taken from the loaded configuration.
SC_DATA_PATH = pathlib.Path(config_dict["data_path"]["sc_data_path"])

# Directory with the processed data, and the .h5ad file expected inside it.
SC_ADATA_PATH = SC_DATA_PATH.joinpath(f"{STUDY_GEO_ID}_processed")
assert SC_ADATA_PATH.exists(), f"Processed Single-cell Data path {SC_ADATA_PATH} does not exist"
SC_ADATA_FILE = SC_ADATA_PATH.joinpath(f"{STUDY_GEO_ID}_processed.h5ad")
assert SC_ADATA_FILE.exists(), f"Processed Single-cell Data file {SC_ADATA_FILE} does not exist"

# Directory with the accompanying metadata for the same study.
SC_METADATA_PATH = SC_DATA_PATH.joinpath(f"{STUDY_GEO_ID}_metadata")
assert SC_METADATA_PATH.exists(), f"Single-cell Metadata path {SC_METADATA_PATH} does not exist"

## Define Path to write Pre-Processing Outputs

In [5]:
# Preprocessing outputs go under <repo root>/processed_data, which must already exist.
PREPROCESSING_OUTPUT_PATH = REPO_ROOT.joinpath("processed_data")
assert PREPROCESSING_OUTPUT_PATH.exists(), f"Preprocessing output path {PREPROCESSING_OUTPUT_PATH} does not exist"

# Sub-directory for this notebook's outputs; created if missing, reused otherwise.
SC_AUGMENTED_DATA_PATH = PREPROCESSING_OUTPUT_PATH.joinpath("sc_augmented")
SC_AUGMENTED_DATA_PATH.mkdir(parents=True, exist_ok=True)

## Preprocessing of scRNA-seq Anndata before Moving to Pseudobulk

### Load and Preprocess Anndata

In [6]:
# Read the processed AnnData object from the .h5ad file located in the path cell above.
adata = sc.read_h5ad(SC_ADATA_FILE)
# Make the variable (gene) names unique; the call is used for its effect on
# `adata` itself, its return value is not kept.
adata.var_names_make_unique()

In [7]:
# Every annotation column named in the parameters cell must exist in adata.obs;
# stop at the first one that is missing.
for required_col in (CELL_TYPE_COL, SAMPLE_ID_COL, STIM_COL):
    assert required_col in adata.obs.columns, f"Column {required_col} not found in adata.obs"

Print some basic information about the loaded AnnData object: its shape and the first rows of `adata.var` and `adata.obs`.

In [8]:
# Shape of the AnnData object, then a preview of the variable and observation
# annotation tables, each on its own line(s).
print(adata.shape, adata.var.head(), adata.obs.head(), sep="\n")

(36111, 24520)
              gene_ids  n_cells     mt  n_cells_by_counts  mean_counts  \
gene_ids                                                                 
AL627309.1  AL627309.1       23  False                 23     0.000614   
AL669831.5  AL669831.5      629  False                629     0.017388   
FAM87B          FAM87B       26  False                 26     0.000694   
LINC00115    LINC00115      463  False                463     0.012661   
FAM41C          FAM41C      279  False                279     0.007746   

            pct_dropout_by_counts  total_counts  
gene_ids                                         
AL627309.1              99.938567          23.0  
AL669831.5              98.319934         651.0  
FAM87B                  99.930554          26.0  
LINC00115               98.763322         474.0  
FAM41C                  99.254788         290.0  
                           GSM             Barcode  Cluster          cellType  \
AAACCTGAGCTGCCCA-1  GSM4675273  AAA

## Some Stats

### Stimulation

In [9]:
# Number of cells for each (sample, stimulation) pair ...
tab = adata.obs.groupby(by=[SAMPLE_ID_COL, STIM_COL]).size()
# ... shown with the innermost grouping level (stimulation) pivoted into columns.
tab.unstack(level=-1)

stim,CTRL
sample_id,Unnamed: 1_level_1
Samp_T59,11689
Samp_T76,11876
Samp_T77,4974
Samp_T89,4291
Samp_T90,3281


### Cell type

In [10]:
# Number of cells per cell type label, most frequent first.
cell_type_labels = adata.obs[CELL_TYPE_COL]
cell_type_labels.value_counts()

CD8+ T-cells         8183
Fibroblasts          4385
Epithelial cells     4021
Macrophages          3979
Adipocytes           3955
Monocytes            3507
Mesangial cells      3488
CD4+ T-cells         1476
NK cells             1409
B-cells              1169
Endothelial cells     539
Name: cellType, dtype: int64

## Write the Dense Expression Matrix and Cell-Type Column for Use in CIBERSORTx


In [11]:
# Destination of the pickled expression table.
sc_profile_file = SC_AUGMENTED_DATA_PATH / f'{STUDY_GEO_ID}_sig.pkl'

# Densify the expression matrix. Sparse matrices expose `toarray()`, which
# returns a plain ndarray; if X is already dense it is used as-is instead of
# failing on a missing method.
expression = adata.X
dense_matrix = expression.toarray() if hasattr(expression, "toarray") else expression

# One row per cell, one column per gene, with the cell type label inserted as
# the first column. The row index is the default integer index.
# NOTE(review): column names come from adata.var['gene_ids'], not from
# adata.var_names, so any renaming done by var_names_make_unique() is not
# reflected here and duplicate column names are possible - confirm intended.
dense_df = pd.DataFrame(dense_matrix, columns=adata.var['gene_ids'])
dense_df.insert(loc=0, column=CELL_TYPE_COL, value=adata.obs[CELL_TYPE_COL].to_list())

# Use a context manager so the file is flushed and closed deterministically,
# even if pickling raises.
with open(sc_profile_file, "wb") as sc_profile_fh:
    pickle.dump(dense_df, sc_profile_fh)