# This notebook formats a bulk RNA-seq dataset

In [1]:
import gc
import sys
import pathlib
import gzip
import subprocess
import pickle

import pandas as pd
import numpy as np
import scanpy as sc
import anndata as ad
from sklearn.model_selection import train_test_split

## Preprocessing Parameters

In [2]:
# Column names written to the formatted AnnData object (`*_COL`) and the raw
# metadata columns they are derived from (`*_COL_SOURCE`).
SAMPLE_ID_COL = 'sample_id'
SAMPLE_COL_SOURCE = 'source name'
STIM_COL = 'stim'
STIM_COL_SOURCE = 'characteristics: sex'


def get_stim_id(in_str):
    """Translate a raw sex code into the label stored in the `stim` column.

    Parameters
    ----------
    in_str : str
        Raw value from the `STIM_COL_SOURCE` metadata column.

    Returns
    -------
    str
        "male" when the value is "m" or "male" (ignoring case and surrounding
        whitespace); "female" for every other value.

    Notes
    -----
    Anything that is not recognised as male falls back to "female", including
    missing values that were stringified to "nan". Check the raw column for
    unexpected codes before relying on the label.
    """
    # Normalise first so that "M", " m " and "male" are not silently mislabelled.
    normalized = str(in_str).strip().lower()
    if normalized in ("m", "male"):
        return "male"
    return "female"


# Name of the `.var` column that stores the gene identifiers.
GENE_ID_COL = 'gene_ids'

# Seed for reproducible data splitting.
DATASPLIT_SEED = 42

## Retrieve Path to Processed Bulk RNA-seq Data and relevant Metadata

In [3]:
## Input data path
# All inputs are resolved relative to the current working directory, so the
# notebook has to be run from the directory that contains `example_data`.
DATA_PATH = pathlib.Path('.').absolute() / 'example_data'
assert DATA_PATH.exists(), f'Input data directory not found: {DATA_PATH}'
BULK_DATA_FILE = DATA_PATH / 'bulk_data' / 'GSE132040_190214.csv.gz'
assert BULK_DATA_FILE.exists(), f'Bulk expression file not found: {BULK_DATA_FILE}'
BULK_METADATA_FILE = DATA_PATH / 'bulk_data' / 'GSE132040_MACA_Bulk_metadata.csv'
assert BULK_METADATA_FILE.exists(), f'Bulk metadata file not found: {BULK_METADATA_FILE}'

## Output path
# The top-level output directory must already exist; only the
# `bulk_formatted` sub-directory is created on demand.
PREPROCESS_OUTPUT_PATH = DATA_PATH / 'preprocessed_data'
assert PREPROCESS_OUTPUT_PATH.exists(), 'Please create the output directory "preprocessed_data" first'
PREPROCESS_BULK_FORMAT_PATH = PREPROCESS_OUTPUT_PATH / 'bulk_formatted'
PREPROCESS_BULK_FORMAT_PATH.mkdir(parents=True, exist_ok=True)
# Formatted AnnData object and the pickled gene identifiers.
PREPROCESS_BULK_FORMAT_FILE = PREPROCESS_BULK_FORMAT_PATH / 'GSE132040_190214_bulk_formatted.h5ad'
PREPROCESS_BULK_FORMAT_GENE_FILE = PREPROCESS_BULK_FORMAT_PATH / 'GSE132040_190214_bulk_genes.pkl'

## Preprocessing of Bulk Data
### Assemble csv expression and csv metadata to an adata object 

In [4]:
# Load the metadata file
metadata = pd.read_csv(BULK_METADATA_FILE, index_col=0)
# Append the suffix so the metadata index can be looked up with the sample
# labels of the expression table (presumably those labels carry the
# '.gencode.vM19' suffix -- confirm against the CSV header).
metadata.index = metadata.index.astype(str) + '.gencode.vM19'

# Load the bulk data file as a DataFrame and transpose it so that the labels
# used to look up the metadata (the CSV's columns) become the rows.
with gzip.open(BULK_DATA_FILE, 'rt') as f:
    expression = pd.read_csv(f, index_col=0).T

# Fail with an explicit message when some samples have no metadata row.
missing_samples = expression.index.difference(metadata.index)
if len(missing_samples) > 0:
    raise KeyError(
        f'{len(missing_samples)} sample(s) in {BULK_DATA_FILE.name} have no metadata entry, '
        f'e.g. {list(missing_samples[:5])}'
    )

# Put the metadata rows in the same order as the expression rows
metadata = metadata.loc[expression.index]

# Assemble anndata object
adata = ad.AnnData(expression, obs=metadata)

# Free the intermediate tables; only `adata` is needed from here on.
del expression
del metadata
del missing_samples
gc.collect()

10

### Format metadata

In [5]:
# remove non-gene IDs (variable names that start with a double underscore)
gene_idx = np.where(np.logical_not(adata.var_names.str.startswith('__')))[0]
# Copy so that the assignment to `.obs` below acts on a real object rather
# than on a view of the full dataset.
adata = adata[:, gene_idx].copy()

# format the tissue: the part of the source name before the first underscore
adata.obs["tissue"] = [x.split("_")[0] for x in adata.obs["source name"]]

# subset to post-pubescent liver
is_liver = adata.obs["tissue"] == "Liver"
# Compare on the string form: if pandas parsed the age column as integers,
# a direct comparison against the string "1" would never match and no sample
# would be dropped. NOTE(review): confirm the dtype of this column.
not_age_one = adata.obs["characteristics: age"].astype(str) != "1"
# Copy again so later cells can add columns without modifying a view.
adata = adata[(is_liver & not_age_one).to_numpy()].copy()

assert adata.n_obs > 0, 'No samples left after subsetting to liver samples with age != 1'

  adata.obs["tissue"] = [x.split("_")[0] for x in adata.obs["source name"]]


### Format for BuDDI and write

In [8]:
# The preceding subsetting can leave `adata` as a view; materialise it first
# so the column assignments below do not modify a view.
if adata.is_view:
    adata = adata.copy()

# Sample identifier: copied unchanged from the raw metadata column.
adata.obs[SAMPLE_ID_COL] = adata.obs[SAMPLE_COL_SOURCE]
# Stimulation label: raw sex code (stringified) mapped through get_stim_id.
adata.obs[STIM_COL] = [get_stim_id(str(x)) for x in adata.obs[STIM_COL_SOURCE].tolist()]
# Gene identifiers: the variable index duplicated into a regular column.
adata.var[GENE_ID_COL] = adata.var.index.tolist()

  adata.obs[SAMPLE_ID_COL] = adata.obs[SAMPLE_COL_SOURCE]


### Sample vs Stim Contingency Table

In [9]:
# Count how many observations fall into each (sample, stim) combination.
sample_labels = adata.obs[SAMPLE_ID_COL]
stim_labels = adata.obs[STIM_COL]
ct = pd.crosstab(sample_labels, stim_labels)

# Temporarily lift every pandas display limit so the whole table is printed.
unlimited_display = (
    'display.max_rows', None,
    'display.max_columns', None,
    'display.width', None,
    'display.max_colwidth', None,
)
with pd.option_context(*unlimited_display):
    print(ct)

stim       female  male
sample_id              
Liver_1         1     0
Liver_10        0     1
Liver_11        0     1
Liver_12        0     1
Liver_13        0     1
Liver_14        0     1
Liver_16        0     1
Liver_17        0     1
Liver_18        0     1
Liver_19        0     1
Liver_2         1     0
Liver_20        0     1
Liver_21        1     0
Liver_22        0     1
Liver_23        0     1
Liver_24        1     0
Liver_25        0     1
Liver_26        0     1
Liver_27        1     0
Liver_28        0     1
Liver_29        1     0
Liver_3         0     1
Liver_31        1     0
Liver_32        0     1
Liver_33        1     0
Liver_34        0     1
Liver_35        0     1
Liver_36        0     1
Liver_38        0     1
Liver_39        0     1
Liver_4         1     0
Liver_40        0     1
Liver_42        0     1
Liver_43        0     1
Liver_44        0     1
Liver_45        0     1
Liver_46        0     1
Liver_47        1     0
Liver_48        1     0
Liver_49        

In [None]:
# Drop the `.raw` attribute before writing the object to disk.
del adata.raw
adata.write(PREPROCESS_BULK_FORMAT_FILE)

# Pickle the gene identifier column; the context manager guarantees the file
# is flushed and closed even if pickling fails.
with open(PREPROCESS_BULK_FORMAT_GENE_FILE, "wb") as gene_file:
    pickle.dump(adata.var[GENE_ID_COL], gene_file)

  adata.obs[SAMPLE_ID_COL] = adata.obs[SAMPLE_COL_SOURCE]
