In [1]:
import os
import scanpy as sc
import pickle as pkl
import warnings
from utils import pre_process_data, get_common_hvg, safe_make_dir

# Directory layout used by every cell below: raw inputs are read from
# `root_dir`, processed outputs are written under `out_dir`.
root_dir, out_dir = 'raw_data', 'data'

# Install a blanket "ignore" filter for all warning categories.
warnings.filterwarnings(action='ignore')

In [7]:
# CASE1: pbmc9 source/target pair.
# Load the raw source (s) and target (t) AnnData files.
s_data = sc.read_h5ad(os.path.join(root_dir, 'pbmc9/pbmc9_s.h5ad'))
t_data = sc.read_h5ad(os.path.join(root_dir, 'pbmc9/pbmc9_t.h5ad'))

# `Series.astype` returns a new Series; assign it back so the cell-type
# labels are actually converted to strings (the bare call was a no-op).
s_data.obs['celltype'] = s_data.obs['celltype'].astype('str')
t_data.obs['celltype'] = t_data.obs['celltype'].astype('str')

s_data = pre_process_data(s_data, n_top_genes=1200)
t_data = pre_process_data(t_data, n_top_genes=1200)

# Write the processed objects under out_dir/pbmc9.
safe_make_dir(os.path.join(out_dir, 'pbmc9'))
s_data.write_h5ad(os.path.join(out_dir, 'pbmc9/pbmc9_s.h5ad'))
t_data.write_h5ad(os.path.join(out_dir, 'pbmc9/pbmc9_t.h5ad'))

# Pickle the gene list returned by get_common_hvg; the file name embeds
# the number of genes.
common_hvg = get_common_hvg([s_data, t_data])
hvg_path = os.path.join(out_dir, f'pbmc9/hvg_{len(common_hvg)}_pbmc9.pkl')
# Context manager guarantees the handle is flushed and closed.
with open(hvg_path, 'wb') as hvg_file:
    pkl.dump(list(common_hvg), hvg_file)
print('hvg file is saved in: ', hvg_path)

AnnData expects .var.index to contain strings, but got values like:
    ['ENSG00000238009.6', 'WASH9P', 'ENSG00000228463.10', 'ENSG00000237094.13', 'ENSG00000230021.10']

    Inferred to be: categorical

  names = self._prep_dim_index(names, "var")
  adata.obs['n_genes'] = number
  view_to_actual(adata)
  disp_grouped = df.groupby('mean_bin')['dispersions']
AnnData expects .var.index to contain strings, but got values like:
    ['ENSG00000238009.6', 'WASH9P', 'ENSG00000228463.10', 'ENSG00000237094.13', 'ENSG00000230021.10']

    Inferred to be: categorical

  names = self._prep_dim_index(names, "var")
  adata.obs['n_genes'] = number
  view_to_actual(adata)
  disp_grouped = df.groupby('mean_bin')['dispersions']


data/pbmc9 exists!

hvg file is saved in:  data/pbmc9/hvg_1815_pbmc9.pkl


  pkl.dump(list(common_hvg), open(os.path.join(out_dir, f'pbmc9/hvg_{len(common_hvg)}_pbmc9.pkl'), 'wb'))


In [None]:
# CASE2: pbmc40 (source) / pbmc10 (target) pair.
# Load the raw source (s) and target (t) AnnData files.
s_data = sc.read_h5ad(os.path.join(root_dir, 'pbmc40/pbmc40.h5ad'))
t_data = sc.read_h5ad(os.path.join(root_dir, 'pbmc40/pbmc10.h5ad'))

# `Series.astype` returns a new Series; assign it back so the cell-type
# labels are actually converted to strings (the bare call was a no-op).
s_data.obs['celltype'] = s_data.obs['celltype'].astype('str')
t_data.obs['celltype'] = t_data.obs['celltype'].astype('str')

s_data = pre_process_data(s_data, n_top_genes=1200)
t_data = pre_process_data(t_data, n_top_genes=1200)

# Write the processed objects under out_dir/pbmc40.
safe_make_dir(os.path.join(out_dir, 'pbmc40'))
s_data.write_h5ad(os.path.join(out_dir, 'pbmc40/pbmc40.h5ad'))
t_data.write_h5ad(os.path.join(out_dir, 'pbmc40/pbmc10.h5ad'))

# Pickle the gene list returned by get_common_hvg; the file name embeds
# the number of genes.
common_hvg = get_common_hvg([s_data, t_data])
hvg_path = os.path.join(out_dir, f'pbmc40/hvg_{len(common_hvg)}_pbmc40.pkl')
# Context manager guarantees the handle is flushed and closed.
with open(hvg_path, 'wb') as hvg_file:
    pkl.dump(list(common_hvg), hvg_file)
print('hvg file is saved in: ', hvg_path)

In [9]:
# CASE3: cross-species setting — human as source, mouse and two macaque
# datasets (macaqueM / macaqueF) as targets.
s_data = sc.read_h5ad(os.path.join(root_dir, 'cross_species/human.h5ad'))
t_data_mouse = sc.read_h5ad(os.path.join(root_dir, 'cross_species/mouse.h5ad'))
t_data_mm = sc.read_h5ad(os.path.join(root_dir, 'cross_species/macaqueM.h5ad'))
t_data_mf = sc.read_h5ad(os.path.join(root_dir, 'cross_species/macaqueF.h5ad'))

s_data = pre_process_data(s_data, n_top_genes=1600)
t_data_mouse = pre_process_data(t_data_mouse, n_top_genes=1600)
t_data_mm = pre_process_data(t_data_mm, n_top_genes=1600)
t_data_mf = pre_process_data(t_data_mf, n_top_genes=1600)

# Upper-case every gene name in all four datasets
# (presumably so symbols compare equal across species — TODO confirm).
s_data.var.index = s_data.var.index.str.upper()
t_data_mouse.var.index = t_data_mouse.var.index.str.upper()
t_data_mm.var.index = t_data_mm.var.index.str.upper()
t_data_mf.var.index = t_data_mf.var.index.str.upper()

# Write the processed objects under out_dir/cross_species.
safe_make_dir(os.path.join(out_dir, 'cross_species'))
s_data.write_h5ad(os.path.join(out_dir, 'cross_species/human.h5ad'))
t_data_mouse.write_h5ad(os.path.join(out_dir, 'cross_species/mouse.h5ad'))
t_data_mm.write_h5ad(os.path.join(out_dir, 'cross_species/macaqueM.h5ad'))
t_data_mf.write_h5ad(os.path.join(out_dir, 'cross_species/macaqueF.h5ad'))

# Pickle the gene list returned by get_common_hvg for all four datasets;
# the file name embeds the number of genes.
common_hvg = get_common_hvg([s_data, t_data_mm, t_data_mf, t_data_mouse])
hvg_path = os.path.join(out_dir, f'cross_species/hvg_{len(common_hvg)}_cross_species.pkl')
# Context manager guarantees the handle is flushed and closed.
with open(hvg_path, 'wb') as hvg_file:
    pkl.dump(list(common_hvg), hvg_file)
print('hvg file is saved in: ', hvg_path)

  adata.obs['n_genes'] = number
  view_to_actual(adata)
  disp_grouped = df.groupby('mean_bin')['dispersions']
  adata.obs['n_genes'] = number
  view_to_actual(adata)
  disp_grouped = df.groupby('mean_bin')['dispersions']
  adata.obs['n_genes'] = number
  view_to_actual(adata)
  disp_grouped = df.groupby('mean_bin')['dispersions']
  adata.obs['n_genes'] = number
  view_to_actual(adata)
  disp_grouped = df.groupby('mean_bin')['dispersions']


data/cross_species exists!

hvg file is saved in:  data/cross_species/hvg_1208_cross_species.pkl


  pkl.dump(list(common_hvg), open(os.path.join(out_dir, f'cross_species/hvg_{len(common_hvg)}_cross_species.pkl'), 'wb'))


In [None]:
# CASE4: mg reference/query pair with a hand-picked marker-gene background.
# Marker genes passed to get_common_hvg alongside the datasets; the trailing
# comment on each row names the cell population the markers belong to.
s_hvg_background = [
    'ZBTB16', 'ZNF462',  # NKT
    'FOXP3', 'IKZF2', 'CYTOR', 'CCR4',  # Treg
    'CXCR5', 'MAF',  # CD4 Tcm fh
    'FOSB', 'FOS',  # CD4 Tcm Th0
    'RORC', 'TNFRSF4',  # CD4 Tcm Th17
    'ZBTB16', 'ZNF462',  # NKT periphery (was ' ZNF462' with a stray leading space)
    'ZNF683', 'KLRK1',  # aa CD8 T
    'EOMES', 'GZMK',  # CD8 Tem
    'GZMK', 'KLRB1',  # CD4 Tem Th1/17
    'GZMH', 'CST7',  # CD4 Tem Th1
    'RGS1', 'CCL4L2',  # CD8 Trm
    'NKG7', 'SATB1',  # CD8 Temra
    'TBX21', 'CX3CR1'  # CD4 Temra Th1
]

s_data = sc.read_h5ad(os.path.join(root_dir, 'mg/mg_ref.h5ad'))
t_data = sc.read_h5ad(os.path.join(root_dir, 'mg/mg_query.h5ad'))

s_data = pre_process_data(s_data, n_top_genes=1200)
t_data = pre_process_data(t_data, n_top_genes=1200)

# Tag every cell name with its origin ('_s' = reference/source,
# '_t' = query/target). A plain string suffix replaces the original
# `+ '_' + ['s']`, which depended on broadcasting a one-element list.
s_data.obs.index = s_data.obs.index + '_s'
t_data.obs.index = t_data.obs.index + '_t'

# Prefix the query batch labels with the literal 'batch'.
t_data.obs['batch'] = 'batch' + t_data.obs['batch'].astype(str)

# Write the processed objects under out_dir/mg.
safe_make_dir(os.path.join(out_dir, 'mg'))
s_data.write_h5ad(os.path.join(out_dir, 'mg/mg_ref.h5ad'))
t_data.write_h5ad(os.path.join(out_dir, 'mg/mg_query.h5ad'))

# Pickle the gene list returned by get_common_hvg (datasets plus the marker
# background); the file name embeds the number of genes.
common_hvg = get_common_hvg([s_data, t_data], [s_hvg_background])
hvg_path = os.path.join(out_dir, f'mg/hvg_{len(common_hvg)}_mg.pkl')
# Context manager guarantees the handle is flushed and closed.
# The payload is wrapped in list() to match the other CASE cells
# (NOTE(review): a no-op copy if get_common_hvg already returns a list).
with open(hvg_path, 'wb') as hvg_file:
    pkl.dump(list(common_hvg), hvg_file)
print('hvg file is saved in: ', hvg_path)