# This notebook formats the bulk RNA-seq expression data to match the metadata fields from the generated pseudobulks

The intended inputs to this notebook are paired `.tsv` expression and metadata files.

The end product of this notebook will be a single `.h5ad` file that includes all bulk observations and the following metadata fields:
- stimulation column `stim`
- sample id column `sample_id`

In [1]:
import sys
import pathlib
import yaml
import subprocess
import pickle

import pandas as pd
import numpy as np
import scanpy as sc
import anndata as ad
from sklearn.model_selection import train_test_split

## Preprocessing Parameters

In [2]:
# Names of the metadata (obs) columns written to the output AnnData object.
STIM_COL = 'stim'
SAMPLE_ID_COL = 'sample_id'

# Name of the gene-level (var) column that stores the gene identifiers.
GENE_ID_COL = 'gene_ids'

# Random seed reserved for any data splitting.
DATASPLIT_SEED = 42

## Load config
The config file specifies the paths to the data and to the software repo (the latter is loaded from a local path because it is currently in active development).

In [3]:
# Get the root directory of the analysis repository
# Get the root directory of the analysis repository.
# check=True makes a failing git call (e.g. the notebook is run outside a git
# checkout, or git is unavailable) raise CalledProcessError right here instead
# of silently yielding an empty string, which pathlib would turn into the
# current working directory.
REPO_ROOT = subprocess.run(
    ["git", "rev-parse", "--show-toplevel"],
    capture_output=True,
    text=True,
    check=True,
).stdout.strip()
REPO_ROOT = pathlib.Path(REPO_ROOT)

# The config file is expected at the top level of the repository.
CONFIG_FILE = REPO_ROOT / 'config.yml'
assert CONFIG_FILE.exists(), f"Config file not found at {CONFIG_FILE}"

with open(CONFIG_FILE, 'r') as file:
    config_dict = yaml.safe_load(file)

# yaml.safe_load returns None for an empty file; fail early with a clear
# message rather than with a TypeError at the first key lookup further down.
assert isinstance(config_dict, dict), f"Config file {CONFIG_FILE} is empty or not a mapping"

## Add dev buddi fork to path and import

In [4]:
# Location of the local buddi fork checkout, as given in the config file.
buddi_fork_path = pathlib.Path(config_dict['software_path']['buddi_HGSC'])
assert buddi_fork_path.exists(), f"buddi fork not found at {buddi_fork_path}"

# Importing through sys.path is a stop-gap while the fork is under active
# modification; once that work is done this will be replaced by a proper
# installation + import.
sys.path.insert(0, str(buddi_fork_path))
from buddi import preprocessing
from buddi.preprocessing import utils, generate_pseudo_bulks

## Retrieve Path to Processed Bulk RNA-seq Data and relevant Metadata

In [5]:
# Directory holding the bulk expression and metadata tables (from the config).
BULK_DATA_PATH = pathlib.Path(config_dict['data_path']['bulk_data_path'])

# Expression tables, one per cohort.
BULK_BLACK_TSV_FILE = BULK_DATA_PATH.joinpath('supp_table_6_black_expr.tsv')
BULK_WHITE_TSV_FILE = BULK_DATA_PATH.joinpath('supp_table_7_white_expr.tsv')

# Metadata tables, one per cohort.
BULK_BLACK_METADATA_FILE = BULK_DATA_PATH.joinpath('supp_table_3_main_black_metadata_table.tsv')
BULK_WHITE_METADATA_FILE = BULK_DATA_PATH.joinpath('supp_table_4_main_white_metadata_table.tsv')

# Fail early, naming the first missing input.
assert BULK_BLACK_TSV_FILE.exists(), f"Black bulk expression data file not found at {BULK_BLACK_TSV_FILE}"
assert BULK_WHITE_TSV_FILE.exists(), f"White bulk expression data file not found at {BULK_WHITE_TSV_FILE}"
assert BULK_BLACK_METADATA_FILE.exists(), f"Black bulk metadata file not found at {BULK_BLACK_METADATA_FILE}"
assert BULK_WHITE_METADATA_FILE.exists(), f"White bulk metadata file not found at {BULK_WHITE_METADATA_FILE}"

## Define Path to write Pre-Processing Outputs

In [6]:
# The top-level output directory must already exist; it is not created here.
PREPROCESSING_OUTPUT_PATH = REPO_ROOT.joinpath('processed_data')
assert PREPROCESSING_OUTPUT_PATH.exists(), f"Preprocessing output path {PREPROCESSING_OUTPUT_PATH} does not exist"

# Sub-directory for this notebook's outputs; created on first run.
BULK_FORMAT_DATA_PATH = PREPROCESSING_OUTPUT_PATH.joinpath('bulk_formatted')
BULK_FORMAT_DATA_PATH.mkdir(parents=True, exist_ok=True)

# Destination of the formatted AnnData file.
BULK_FORMAT_EXPR_FILE = BULK_FORMAT_DATA_PATH.joinpath('schildkraut_bulk_processed.h5ad')

## Preprocessing of Bulk Data
### Load and Preprocess tsv data

In [7]:
# Per-cohort frames are collected in lists and concatenated once after the
# loop. Concatenating onto an initially empty DataFrame inside the loop relies
# on pandas' deprecated handling of empty frames during dtype inference.
expr_frames = []
metadata_frames = []

for stim, expr_file, metadata_file in zip(
    ['black', 'white'],
    [BULK_BLACK_TSV_FILE, BULK_WHITE_TSV_FILE],
    [BULK_BLACK_METADATA_FILE, BULK_WHITE_METADATA_FILE]):

    # Load the expression data

    # Data is originally in the format of genes x samples
    expr_df = pd.read_csv(expr_file, sep='\t', index_col=0)
    expr_df = expr_df.T # Transpose to samples x genes
    # Drop any rows (samples) with missing values. dropna() returns a new
    # frame, so the result has to be assigned back for the drop to take effect.
    expr_df = expr_df.dropna()
    assert not expr_df.empty, f"Every sample in {expr_file} contains missing values"
    expr_df.index = expr_df.index.str.replace('Sample_', '', regex=False) # Remove the 'Sample_' prefix from the sample IDs

    # Load the metadata
    metadata_df = pd.read_csv(metadata_file, sep='\t', index_col=0)

    # Subset to only the samples that have both expression and metadata
    overlapping_rows = expr_df.index.intersection(metadata_df.index)
    expr_df = expr_df.loc[overlapping_rows]
    # Explicit copy so that adding the stim column below writes to an
    # independent frame rather than to a selection of the loaded table.
    metadata_df = metadata_df.loc[overlapping_rows].copy()

    # Add the stim column
    metadata_df[STIM_COL] = stim

    expr_frames.append(expr_df)
    metadata_frames.append(metadata_df)

# Stack the cohorts; genes present in only one cohort become NaN columns here.
bulk_expr_df = pd.concat(expr_frames)
bulk_metadata_df = pd.concat(metadata_frames)

# Expression rows and metadata rows must describe the same samples in the
# same order (this also catches duplicated sample IDs in either table).
assert bulk_expr_df.index.equals(bulk_metadata_df.index), \
    "Expression and metadata sample indices are not aligned"

In [8]:
# Shape of the combined expression table before filtering genes.
print(bulk_expr_df.shape)

# Genes that are not shared between the two datasets show up as columns
# containing missing values after concatenation; remove those columns.
bulk_expr_df = bulk_expr_df.dropna(axis='columns', how='any')
print(bulk_expr_df.shape)

# Shape of the combined metadata table, for comparison of the row counts.
print(bulk_metadata_df.shape)

(588, 18983)
(588, 18509)
(588, 20)


## Create anndata object and export

In [9]:
# Wrap the expression matrix and per-sample metadata in one AnnData object.
adata = ad.AnnData(X=bulk_expr_df, obs=bulk_metadata_df)

# Expose the gene and sample identifiers (currently the indices) as columns.
adata.var[GENE_ID_COL] = adata.var.index.tolist()
adata.obs[SAMPLE_ID_COL] = adata.obs.index.tolist()

# Cast every object- or category-typed metadata column to string before writing.
for obs_col in adata.obs.columns:
    obs_dtype = adata.obs[obs_col].dtype
    if obs_dtype == object or obs_dtype == "category":
        adata.obs[obs_col] = adata.obs[obs_col].astype("str")

adata.write(BULK_FORMAT_EXPR_FILE)

In [10]:
# Tabulate sample IDs against the stim label.
ct = pd.crosstab(index=adata.obs[SAMPLE_ID_COL], columns=adata.obs[STIM_COL])

# Lift all pandas display limits for this one print so the table is not truncated.
unlimited_display = (
    'display.max_rows', None,
    'display.max_columns', None,
    'display.width', None,
    'display.max_colwidth', None,
)
with pd.option_context(*unlimited_display):
    print(ct)

stim       black  white
sample_id              
18341X1        0      1
18341X10       1      0
18341X11       0      1
18341X12       1      0
18341X13       1      0
18341X14       1      0
18341X15       1      0
18341X17       1      0
18341X18       0      1
18341X19       0      1
18341X2        0      1
18341X20       0      1
18341X21       0      1
18341X22       1      0
18341X23       1      0
18341X24       0      1
18341X25       0      1
18341X26       1      0
18341X27       1      0
18341X28       1      0
18341X29       1      0
18341X3        0      1
18341X30       1      0
18341X31       1      0
18341X32       1      0
18341X33       1      0
18341X34       1      0
18341X35       1      0
18341X36       1      0
18341X38       1      0
18341X39       1      0
18341X4        0      1
18341X40       1      0
18341X41       1      0
18341X42       0      1
18341X43       0      1
18341X44       0      1
18341X45       1      0
18341X46       0      1
18341X47       0

In [11]:
# Persist the gene identifiers of the formatted bulk data as a pickle file
# next to the .h5ad output.
gene_out_file = BULK_FORMAT_DATA_PATH / 'schildkraut_genes.pkl'
gene_ids = adata.var[GENE_ID_COL]
# The context manager guarantees the handle is flushed and closed even if
# pickling fails; a bare open(...) passed to pickle.dump is never closed.
with open(gene_out_file, "wb") as gene_file:
    pickle.dump(gene_ids, gene_file)