# This notebook reads the processed scRNA-seq dataset and produces a tab-delimited txt reference expression matrix as input to the CIBERSORTx signature matrix pipeline

In [1]:
import sys
import pathlib
import yaml
import subprocess
import pickle

import pandas as pd
import numpy as np
import scanpy as sc
import anndata as ad
from sklearn.model_selection import train_test_split

## Preprocessing Parameters

## Load config
The config file specifies the paths to the data and to the software repo (these are kept in a config file because the project is currently in active development)

In [2]:
# Get the root directory of the analysis repository by asking git for the
# top-level directory of the working tree containing the current directory.
# check=True makes a failing git call (e.g. the notebook is run outside a git
# checkout) raise CalledProcessError right here, instead of silently yielding
# an empty path and a misleading "Config file not found" assertion below.
REPO_ROOT = subprocess.run(
    ["git", "rev-parse", "--show-toplevel"],
    capture_output=True,
    text=True,
    check=True,
).stdout.strip()
REPO_ROOT = pathlib.Path(REPO_ROOT)

# The config file is expected at the repository root.
CONFIG_FILE = REPO_ROOT / 'config.yml'
assert CONFIG_FILE.exists(), f"Config file not found at {CONFIG_FILE}"

# Parse the YAML config; later cells read paths out of config_dict.
with open(CONFIG_FILE, 'r') as file:
    config_dict = yaml.safe_load(file)

## Retrieve Path to Processed Single-Cell RNA-seq Data and relevant Metadata

In [3]:
# GEO accession of the study; it prefixes every per-study directory and file name below.
STUDY_GEO_ID = 'GSE154600' # TODO consider whether to move this into config.yml as well
# Root directory of the single-cell data, read from the loaded config.
SC_DATA_PATH = pathlib.Path(config_dict['data_path']['sc_data_path'])

# Expected locations, all derived from the data root and the study accession.
SC_ADATA_PATH = SC_DATA_PATH.joinpath(f'{STUDY_GEO_ID}_processed')
SC_ADATA_FILE = SC_ADATA_PATH.joinpath(f'{STUDY_GEO_ID}_processed.h5ad')
SC_METADATA_PATH = SC_DATA_PATH.joinpath(f'{STUDY_GEO_ID}_metadata')

# Fail fast if any of them is missing: processed directory, then the .h5ad
# file inside it, then the metadata directory.
assert SC_ADATA_PATH.exists(), f"Processed Single-cell Data path {SC_ADATA_PATH} does not exist"
assert SC_ADATA_FILE.exists(), f"Processed Single-cell Data file {SC_ADATA_FILE} does not exist"
assert SC_METADATA_PATH.exists(), f"Single-cell Metadata path {SC_METADATA_PATH} does not exist"

## Define Path to write Pre-Processing Outputs

In [4]:
# Pre-processing outputs live under <repo root>/processed_data, which must already exist.
PREPROCESSING_OUTPUT_PATH = REPO_ROOT.joinpath('processed_data')
assert PREPROCESSING_OUTPUT_PATH.exists(), f"Preprocessing output path {PREPROCESSING_OUTPUT_PATH} does not exist"

# Sub-directory that receives the CIBERSORTx input files; created if absent, reused otherwise.
CIBERSORTX_INPUT_PATH = PREPROCESSING_OUTPUT_PATH.joinpath('cibersortx_input')
CIBERSORTX_INPUT_PATH.mkdir(exist_ok=True, parents=True)

In [5]:
# Name of the adata.var column holding the gene identifier used for the export.
GENE_ID_COL = 'gene_ids'

# Load the processed single-cell AnnData object from disk.
adata = sc.read_h5ad(SC_ADATA_FILE)
# De-duplicate variable names in place.
adata.var_names_make_unique()
# Mirror the var index into an explicit column. Note that this assignment
# replaces any existing column of the same name in adata.var.
adata.var[GENE_ID_COL] = adata.var.index.to_list()

## Examine the number of unique classes in each cell-type annotation column

In [6]:
# Count the distinct labels in each candidate cell-type annotation column of adata.obs.
unique_classes = {
    column: adata.obs[column].nunique()
    for column in ('cellType', 'hpca_celltype', 'encode_celltype')
}

unique_classes

{'cellType': 11, 'hpca_celltype': 31, 'encode_celltype': 11}

## Select the cell-type annotation column for CIBERSORTx and downstream analysis

In [7]:
# adata.obs column used as the cell-type label from here on.
CELL_TYPE_COL = 'encode_celltype'
# Membership on a DataFrame tests its column labels.
assert CELL_TYPE_COL in adata.obs, f"Column {CELL_TYPE_COL} not found in adata.obs"

## Produce scRNA reference matrix as input to cibersortx signature matrix generation
### Select a single sample from the scRNA-seq dataset as input to CIBERSORTx due to file size restrictions

In [8]:
# Sample accession (a value of adata.obs['GSM']) used to build the reference matrix.
sample = 'GSM4675273'
# Switch for the optional per-cell-type sub-sampling step.
SUB_SAMPLE = True

## Single Sample Subset
sample_mask = adata.obs['GSM'] == sample
# Fail fast on a mistyped or missing accession: an empty selection would
# otherwise flow through silently and end up as an empty reference matrix.
assert sample_mask.any(), f"Sample {sample} not found in adata.obs['GSM']"
# .copy() detaches the subset so the in-place normalisation below leaves `adata` untouched.
adata_single_sample = adata[sample_mask].copy() # single sample adata

# Rescale every cell to a total of 1e4 (modifies adata_single_sample in place).
# NOTE(review): this assumes adata.X holds un-normalised, non-log values —
# confirm against the upstream processing of the .h5ad file.
sc.pp.normalize_total(adata_single_sample, target_sum=1e4)

### Further sub-sample within single sample to reduce reference matrix size

In [None]:
MIN_N = 200          # cell types with at most this many cells are kept in full
SUB_SAMP_PROP = 0.6  # fraction of cells kept for cell types with more than MIN_N cells
SUB_SAMP_SEED = 42   # fixed seed so that Restart & Run All reproduces the same subset

# NOTE: this cell overwrites adata_single_sample, so re-running it on its own
# sub-samples the already sub-sampled data; re-run the single-sample subset
# cell first when iterating.
if SUB_SAMPLE:

    print(adata_single_sample.shape)
    # Dedicated, seeded generator: the draw no longer depends on global numpy RNG state.
    rng = np.random.default_rng(SUB_SAMP_SEED)
    # Observation names of the cells to retain, accumulated cell type by cell type
    selected_indices = []

    # observed=True restricts the grouping to cell types actually present in
    # this sample and avoids the pandas warning about the changing default.
    groups = adata_single_sample.obs.groupby(CELL_TYPE_COL, observed=True).groups
    for indices in groups.values():
        num_cells = len(indices)

        if num_cells > MIN_N:
            # Large cell type: keep a random SUB_SAMP_PROP fraction, without replacement
            subsample_size = int(SUB_SAMP_PROP * num_cells)
            sampled_indices = rng.choice(indices, size=subsample_size, replace=False)
        else:
            # Small cell type (<= MIN_N cells): keep every cell
            sampled_indices = indices

        selected_indices.extend(sampled_indices)

    # Subset to the selected cells; .copy() materialises the result instead of
    # leaving a view onto the full single-sample object.
    adata_single_sample = adata_single_sample[selected_indices, :].copy()

print(adata_single_sample.shape)

(11689, 24520)
(7101, 24520)


  for cell_type, indices in adata_single_sample.obs.groupby(CELL_TYPE_COL).groups.items():


### Export as tab-delimited txt

In [10]:
# Order the cells by cell type so that columns of the same type are contiguous in the export.
sorted_obs_names = adata_single_sample.obs.sort_values(by=CELL_TYPE_COL).index
adata_single_sample = adata_single_sample[sorted_obs_names] # sort by cell type

## Transpose as (n_genes, n_cells)
# .X may be a scipy sparse matrix or already a dense array; densify only when
# needed, and always end up with a plain ndarray rather than np.matrix.
expression = adata_single_sample.X
if hasattr(expression, 'toarray'):
    expression = expression.toarray()
dense_matrix = np.asarray(expression).T

## Cell type (with duplicates) as column, gene id as row index
dense_df = pd.DataFrame(
    dense_matrix,
    index=adata_single_sample.var[GENE_ID_COL].to_list(),
    columns=adata_single_sample.obs[CELL_TYPE_COL].to_list(),
)
# Sanity check: one row per gene, one column per retained cell.
assert dense_df.shape == (adata_single_sample.n_vars, adata_single_sample.n_obs), \
    f"Unexpected reference matrix shape {dense_df.shape}"

## Save to file
dense_df.to_csv(CIBERSORTX_INPUT_PATH / f'{sample}_cibersortx_sc_reference_input.txt', sep='\t')

In [11]:
# Preview the first five genes (rows) of the exported reference matrix.
dense_df.head(n=5)

Unnamed: 0,Adipocytes,Adipocytes.1,Adipocytes.2,Adipocytes.3,Adipocytes.4,Adipocytes.5,Adipocytes.6,Adipocytes.7,Adipocytes.8,Adipocytes.9,...,NK cells,NK cells.1,NK cells.2,NK cells.3,NK cells.4,NK cells.5,NK cells.6,NK cells.7,NK cells.8,NK cells.9
AL627309.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AL669831.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FAM87B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LINC00115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FAM41C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
