In [1]:
import pandas as pd
import scanpy as sc
from scipy.sparse import csr_matrix
from scipy.io import mmwrite, mmread
import pickle as pkl
import os
import json
import warnings
from utils import safe_make_dir
# Shared locations for every CASE cell in this notebook:
# inputs are read from below root_dir, processed files are written below out_dir.
root_dir = 'raw_data/raw_data/'
out_dir = 'raw_data'

# NOTE(review): this silences every warning for the rest of the session,
# including any raised by scanpy/anndata/pandas in the cells that follow.
warnings.filterwarnings('ignore')

### CASE1: pbmc9 — split donors into a source set and a target set

In [2]:
# CASE1: read the pbmc9 file, add the `celltype`/`batch` columns, split the cells
# into a source and a target set by donor, and write both sets to out_dir/pbmc9.
pbmc9_in = os.path.join(root_dir, 'pbmc9/pbmc35699.h5ad')
adata = sc.read_h5ad(pbmc9_in)

# Work on the matrix kept in `.raw` (presumably unnormalised counts — confirm).
adata.X = adata.raw.X

# `celltype` and `batch` are copies of `cell_type` and `donor_id`.
adata.obs = adata.obs.assign(
    celltype=adata.obs['cell_type'],
    batch=adata.obs['donor_id'],
)
# Index the genes by the `feature_name` column.
adata.var.set_index('feature_name', inplace=True)

source_donors = ['NL1', 'Emp2', 'Emp3']
target_donors = ['Emp1', 'NL2', 'NL3']

# Source set: the three source donors, without 'plasma cell'.
s_data = adata[adata.obs['donor_id'].isin(source_donors), :]
s_data = s_data[s_data.obs['celltype'] != 'plasma cell', :]

# Target set: the three target donors, without 'dendritic cell'.
t_data = adata[adata.obs['donor_id'].isin(target_donors), :]
t_data = t_data[t_data.obs['celltype'] != 'dendritic cell', :]

pbmc9_out = os.path.join(out_dir, 'pbmc9')
safe_make_dir(pbmc9_out)
s_data.write_h5ad(os.path.join(out_dir, 'pbmc9/pbmc9_s.h5ad'))
t_data.write_h5ad(os.path.join(out_dir, 'pbmc9/pbmc9_t.h5ad'))

raw_data/pbmc9 exists!



### CASE2: pbmc40 (source, COVID-19 samples only) and pbmc10 (target)

In [7]:
# CASE2: prepare pbmc40 as the source set and pbmc10 as the target set,
# then write both to out_dir/pbmc40.
n_top_samples = 20        # number of largest samples kept in the source set
min_cells_per_type = 200  # source cell types with fewer cells are removed

pbmc40_in = os.path.join(root_dir, 'pbmc40/pbmc40.h5ad')
pbmc10_in = os.path.join(root_dir, 'pbmc40/pbmc10.h5ad')
s_data = sc.read_h5ad(pbmc40_in)
t_data = sc.read_h5ad(pbmc10_in)

# Work on the matrices kept in `.raw` (presumably unnormalised counts — confirm).
s_data.X = s_data.raw.X
t_data.X = t_data.raw.X

# Source: keep COVID-19 cells only, then the samples with the most cells
# (value_counts sorts by descending count, so the head holds the largest samples).
s_data = s_data[s_data.obs['disease'] == 'COVID-19']
sample_sizes = s_data.obs['sample_id'].value_counts()
top_samples = sample_sizes.index[:n_top_samples].tolist()
s_data = s_data[s_data.obs['sample_id'].isin(top_samples)]

# `celltype` copies `cell_type`; `batch` is the sample for the source set
# and the donor for the target set.
s_data.obs = s_data.obs.assign(
    celltype=s_data.obs['cell_type'],
    batch=s_data.obs['sample_id'],
)
t_data.obs = t_data.obs.assign(
    celltype=t_data.obs['cell_type'],
    batch=t_data.obs['donor_id'],
)

# Index the genes of both sets by the `feature_name` column.
s_data.var.set_index('feature_name', inplace=True)
t_data.var.set_index('feature_name', inplace=True)

# Remove source cell types that have fewer than `min_cells_per_type` cells.
celltype_sizes = s_data.obs['celltype'].value_counts()
del_celltypes = celltype_sizes[celltype_sizes < min_cells_per_type].index
s_data = s_data[~s_data.obs['celltype'].isin(del_celltypes), :]

pbmc40_out = os.path.join(out_dir, 'pbmc40')
safe_make_dir(pbmc40_out)
s_data.write_h5ad(os.path.join(out_dir, 'pbmc40/pbmc40.h5ad'))
t_data.write_h5ad(os.path.join(out_dir, 'pbmc40/pbmc10.h5ad'))

raw_data/pbmc40 created!



### CASE3: cross_species — map the cell-type labels of human, mouse, macaqueM and macaqueF through ct_dict

In [12]:
# CASE3: relabel the cell types of the four cross-species files through the
# mapping stored in ct_dict.json and write the results to out_dir/cross_species.

# Read the label mapping; the `with` block closes the file handle afterwards.
with open(os.path.join(root_dir, 'cross_species/ct_dict.json'), 'rb') as ct_file:
    ct_dict = json.load(ct_file)


def _load_cross_species(file_name):
    """Read one file from root_dir/cross_species and relabel its cell types.

    Parameters
    ----------
    file_name : str
        File name inside the ``cross_species`` folder, e.g. ``'human.h5ad'``.

    Returns
    -------
    The loaded object with ``obs['celltype']`` mapped through ``ct_dict``.

    Raises
    ------
    KeyError
        If ``obs`` has no ``celltype`` or no ``batch`` column.
    """
    species_adata = sc.read_h5ad(os.path.join(root_dir, 'cross_species/' + file_name))
    # Labels that are not keys of ct_dict become NaN here.
    # NOTE(review): confirm that every label is covered, or that NaN is intended.
    species_adata.obs['celltype'] = species_adata.obs['celltype'].map(ct_dict)
    # Later steps expect a `batch` column; fail early when it is missing.
    if 'batch' not in species_adata.obs.columns:
        raise KeyError('batch')
    return species_adata


human = _load_cross_species('human.h5ad')
mouse = _load_cross_species('mouse.h5ad')
mm = _load_cross_species('macaqueM.h5ad')
mf = _load_cross_species('macaqueF.h5ad')

safe_make_dir(os.path.join(out_dir, 'cross_species'))
# Each object is written under the same file name it was read from.
for file_name, species_adata in (
    ('human.h5ad', human),
    ('mouse.h5ad', mouse),
    ('macaqueM.h5ad', mm),
    ('macaqueF.h5ad', mf),
):
    species_adata.write_h5ad(os.path.join(out_dir, 'cross_species/' + file_name))

... storing 'celltype' as categorical


raw_data/cross_species created!



... storing 'celltype' as categorical
... storing 'celltype' as categorical
... storing 'celltype' as categorical


### CASE4: mg — reference (source) and query (target) sets

In [13]:
# CASE4: prepare the mg reference (source) and query (target) files and write
# both to out_dir/mg.

# Source: cast the matrix to float32 and derive the label/batch columns.
mg_ref_in = os.path.join(root_dir, 'mg/mg_ref.h5ad')
s_data = sc.read_h5ad(mg_ref_in)
s_data.X = s_data.X.astype('float32')
s_data.obs = s_data.obs.assign(
    celltype=s_data.obs['cell_type__custom'],
    celltype_major=s_data.obs['cell_type__ontology_label'],
    batch=s_data.obs['biosample_id'],
)

# Target: same cast; its labels come from the `predicted_labels_Celltypist` column.
mg_query_in = os.path.join(root_dir, 'mg/mg_query.h5ad')
t_data = sc.read_h5ad(mg_query_in)
t_data.X = t_data.X.astype('float32')
t_data.obs = t_data.obs.assign(
    celltype=t_data.obs['predicted_labels_Celltypist'],
    batch=t_data.obs['batch'],
)

mg_out = os.path.join(out_dir, 'mg')
safe_make_dir(mg_out)
s_data.write_h5ad(os.path.join(out_dir, 'mg/mg_ref.h5ad'))
t_data.write_h5ad(os.path.join(out_dir, 'mg/mg_query.h5ad'))

raw_data/mg created!

