# This notebook formats bulk RNA-seq dataset

In [1]:
import gc
import sys
import pathlib
import gzip
import subprocess
import pickle

import pandas as pd
import numpy as np
import scanpy as sc
import anndata as ad
from sklearn.model_selection import train_test_split

## Preprocessing Parameters

In [2]:
SAMPLE_ID_COL = 'sample_id'
SAMPLE_COL_SOURCE = 'source name'
STIM_COL = 'stim'
STIM_COL_SOURCE = 'characteristics: sex'
def get_stim_id(in_str):
    out_str = "female"
    if in_str == "m":
        out_str = "male"
           
    return(out_str)

GENE_ID_COL = 'gene_ids'

DATASPLIT_SEED = 42

## Retrieve Path to Processed Bulk RNA-seq Data and relevant Metadata

In [4]:
## Input data path
DATA_PATH = pathlib.Path('.').absolute() / 'example_data'
assert DATA_PATH.exists()
BULK_DATA_FILE = DATA_PATH / 'bulk_data' / 'GSE132040_190214.csv.gz'
assert BULK_DATA_FILE.exists()
BULK_METADATA_FILE = DATA_PATH / 'bulk_data' / 'GSE132040_MACA_Bulk_metadata.csv'
assert BULK_METADATA_FILE.exists()

## Output path
PREPROCESS_OUTPUT_PATH = pathlib.Path('.').absolute() / 'example_data' / 'preprocessed_data'
assert PREPROCESS_OUTPUT_PATH.exists(), 'Please create the output directory "preprocessed_data" first'
PREPROCESS_BULK_FORMAT_PATH = PREPROCESS_OUTPUT_PATH / 'bulk_formatted'
PREPROCESS_BULK_FORMAT_PATH.mkdir(parents=True, exist_ok=True)
PREPROCESS_BULK_FORMAT_FILE = PREPROCESS_BULK_FORMAT_PATH / 'GSE132040_190214_bulk_formatted.h5ad'
PREPROCESS_BULK_FORMAT_GENE_FILE = PREPROCESS_BULK_FORMAT_PATH / 'GSE132040_190214_bulk_genes.pkl'

## Preprocessing of Bulk Data
### Assemble csv expression and csv metadata to an adata object 

In [5]:
# Load the metadata file
metadata = pd.read_csv(BULK_METADATA_FILE, index_col=0)
metadata.index = metadata.index.astype(str) + '.gencode.vM19'

# Load the bulk data file as an AnnData object
with gzip.open(BULK_DATA_FILE, 'rt') as f:
    X = pd.read_csv(f, index_col=0)
    X = X.T
    metadata = metadata.loc[X.index] # order

# Assemble anndata object
adata = ad.AnnData(X, obs=metadata)

del X
del metadata
gc.collect()

21

### Format metadata

In [6]:
# remove non-gene IDs
gene_idx = np.where(np.logical_not(adata.var_names.str.startswith('__')))[0]
adata = adata[:, gene_idx]

# format the tissue 
adata.obs["tissue"] = [x.split("_")[0] for x in adata.obs["source name"]]

# subset to post-pubescent liver
adata = adata[np.where(adata.obs["tissue"] == "Liver")]
adata = adata[np.where(adata.obs["characteristics: age"] != "1")]

  adata.obs["tissue"] = [x.split("_")[0] for x in adata.obs["source name"]]


In [7]:
adata.obs.head()

Unnamed: 0,title,source name,organism,characteristics: age,characteristics: developmental stage,characteristics: sex,molecule,description,processed data file,raw file,BioSample,Instrument Model,tissue
A12_384Bulk_Plate2_S12.gencode.vM19,Tabula Muris Senis (bulk RNA seq),Liver_56,Mus musculus C57/BL6,6,months postnatal,m,total RNA,,190214_A00111_0269_AHH3J3DSXX__190214_A00111_0...,SRR9126831,SAMN11854575,Illumina NovaSeq 6000,Liver
A21_384Bulk_Plate2_S21.gencode.vM19,Tabula Muris Senis (bulk RNA seq),Liver_46,Mus musculus C57/BL6,6,months postnatal,m,total RNA,,190214_A00111_0269_AHH3J3DSXX__190214_A00111_0...,SRR9127503,SAMN11854599,Illumina NovaSeq 6000,Liver
A6_384Bulk_Plate1_S6.gencode.vM19,Tabula Muris Senis (bulk RNA seq),Liver_8,Mus musculus C57/BL6,3,months postnatal,f,total RNA,,190214_A00111_0269_AHH3J3DSXX__190214_A00111_0...,SRR9127256,SAMN11854613,Illumina NovaSeq 6000,Liver
A7_384Bulk_Plate3_S7.gencode.vM19,Tabula Muris Senis (bulk RNA seq),Liver_14,Mus musculus C57/BL6,3,months postnatal,m,total RNA,,190214_A00111_0269_AHH3J3DSXX__190214_A00111_0...,SRR9127050,SAMN11854618,Illumina NovaSeq 6000,Liver
A8_384Bulk_Plate2_S8.gencode.vM19,Tabula Muris Senis (bulk RNA seq),Liver_9,Mus musculus C57/BL6,15,months postnatal,f,total RNA,,190214_A00111_0269_AHH3J3DSXX__190214_A00111_0...,SRR9127056,SAMN11854620,Illumina NovaSeq 6000,Liver


### format for BuDDI and write

In [8]:
adata.obs[SAMPLE_ID_COL] = adata.obs[SAMPLE_COL_SOURCE]



adata.obs[STIM_COL] = [get_stim_id(str(x)) for x in adata.obs[STIM_COL_SOURCE].tolist()]
adata.var[GENE_ID_COL] = adata.var.index.tolist()

del adata.raw
adata.write(PREPROCESS_BULK_FORMAT_FILE)

pickle.dump(adata.var[GENE_ID_COL], open(PREPROCESS_BULK_FORMAT_GENE_FILE, "wb"))

  adata.obs[SAMPLE_ID_COL] = adata.obs[SAMPLE_COL_SOURCE]
