## Processing of the human sample m1d1n (M1_donor1_neun); the other samples are processed in the same way.

In [1]:
from pathlib import Path

import anndata
import numpy as np
import pandas as pd
import scanpy as sc
import xchrom as xc
from scipy.io import mmread
from scipy.sparse import csr_matrix

In [2]:
# Build `adata` from the combined, filtered human files (GSE229169):
# per-cell metadata, gene ids, barcodes and the count matrix.
path1 = './raw_data/alone/human/'

metadata = pd.read_csv(f"{path1}/GSE229169_human_filtered_metadata.tsv.gz", sep='\t')

features = pd.read_csv(f'{path1}/GSE229169_human_filtered_features.tsv.gz', sep='\t', header=None)
features.columns = ['gene_ids']

cell_names = pd.read_csv(f'{path1}/GSE229169_human_filtered_barcodes.tsv.gz', sep = '\t', header=None, index_col=None)
cell_names.columns = ['cell_ids']

# The matrix is transposed on load so that rows line up with the barcodes
# (obs) and columns with the gene ids (var).
X = csr_matrix(mmread(f'{path1}/GSE229169_human_filtered_matrix.mtx.gz').T)

obs_frame = pd.DataFrame(index=cell_names['cell_ids'])
var_frame = pd.DataFrame(index=features['gene_ids'])
adata = anndata.AnnData(X, obs=obs_frame, var=var_frame)

# Swap the barcode-only obs table for the full metadata, then index it by the
# 'cell' column. Rows are matched by position here.
# NOTE(review): assumes the metadata rows are in the same order as the
# barcodes file -- confirm.
adata.obs = metadata
adata.obs.index = adata.obs['cell']

sample_counts = adata.obs['orig.ident'].value_counts()
print('sample and cell numbers:\n', sample_counts)

sample and cell numbers:
 M1_donor1_neun    10795
M1_donor2_neun     8283
M1_donor3          7466
M1_donor2          6690
M1_donor1          4489
M1_donor3_neun     1662
M1_donor3_rep2     1552
Name: orig.ident, dtype: int64


In [4]:
# Peek at the first two cell names to see the naming scheme used in the metadata.
adata.obs_names[:2]

Index(['M1_donor1_AAACAGCCAACAACAA-1', 'M1_donor1_AAACAGCCAGGTATTT-1'], dtype='object', name='cell')

In [5]:
# Load the raw multiome matrix of one sample, split it into its RNA and ATAC
# parts, and prefix every barcode with the sample id.
# To process another sample, change `sample_id` only.
sample_id = 'M1_donor1_neun'
m1d1n_path = './raw_data/total/human/'


def _with_sample_prefix(ad_view, sample):
    """Return an independent copy of `ad_view` whose obs index is '<sample>_<barcode>'."""
    ad = ad_view.copy()
    ad.obs.index = [f'{sample}_{cell}' for cell in ad.obs.index]
    return ad


cell_names = pd.read_csv(f'{m1d1n_path}/{sample_id}_barcodes.tsv.gz', sep='\t', header=None)
cell_names.columns = ['cell_ids']
features = pd.read_csv(f'{m1d1n_path}/{sample_id}_features.tsv.gz', sep='\t', header=None)
features.columns = ['ids','names','feature_types','chr','start','end']

# Transposed on load so that rows line up with barcodes (obs) and columns
# with features (var).
X = csr_matrix(mmread(f'{m1d1n_path}/{sample_id}_matrix.mtx.gz').T)
hd1 = anndata.AnnData(X, obs=pd.DataFrame(index=cell_names.cell_ids), var=pd.DataFrame(index = features.names))
# Attach the full feature annotation (ids, names, type, genomic coordinates).
hd1.var = features

# The feature table mixes both modalities; split on 'feature_types'.
hd1_rna = hd1[:,hd1.var['feature_types']=='Gene Expression']
hd1_atac = hd1[:,hd1.var['feature_types']=='Peaks']

rna = _with_sample_prefix(hd1_rna, sample_id)
print(rna.obs.index[:3])
atac = _with_sample_prefix(hd1_atac, sample_id)
print(atac.obs.index[:3])

  utils.warn_names_duplicates("var")


Index(['M1_donor1_neun_AAACAGCCAAACGCGA-1',
       'M1_donor1_neun_AAACAGCCAGTAGGTG-1',
       'M1_donor1_neun_AAACAGCCATAAGTTC-1'],
      dtype='object')
Index(['M1_donor1_neun_AAACAGCCAAACGCGA-1',
       'M1_donor1_neun_AAACAGCCAGTAGGTG-1',
       'M1_donor1_neun_AAACAGCCATAAGTTC-1'],
      dtype='object')




In [6]:
## 1. extract paired cells and data
# Keep only the cells that appear both in the annotated metadata object
# (`adata`) and in this sample's raw data.
common_cells = adata.obs.index.intersection(rna.obs.index)
print('cell numbers:\n',len(common_cells))
if len(common_cells) == 0:
    # An empty overlap would otherwise produce empty outputs further down.
    raise ValueError('No shared cell names between `adata` and `rna`; check the barcode prefix.')

adata_c = adata[common_cells, :]
# .copy() turns the slices into standalone AnnData objects, so adding an obs
# column below does not write into a view (which makes anndata copy
# implicitly and emit a warning).
rna_c = rna[common_cells, :].copy()
atac_c = atac[common_cells, :].copy()

# The annotated 'subclass' label becomes the cell type; pandas aligns the
# values on the cell names.
rna_c.obs['cell_type'] = adata_c.obs['subclass']
atac_c.obs['cell_type'] = adata_c.obs['subclass']

  rna_c.obs['cell_type'] = adata_c.obs['subclass']


cell numbers:
 10795


  atac_c.obs['cell_type'] = adata_c.obs['subclass']


In [7]:
## 2. filter data
# Settings passed to xchrom's multiome filter.
# NOTE(review): the exact meaning of filter_ratio is defined by
# xc.pp.filter_multiome_data -- check its documentation before changing it.
filter_settings = dict(species='human', filter_ratio=0.05)
rna_f, atac_f = xc.pp.filter_multiome_data(ad_rna=rna_c, ad_atac=atac_c, **filter_settings)

RNA data after filtering: View of AnnData object with n_obs × n_vars = 10795 × 12481
    obs: 'cell_type', 'n_genes'
    var: 'ids', 'names', 'feature_types', 'chr', 'start', 'end', 'n_cells'
ATAC data after filtering: View of AnnData object with n_obs × n_vars = 10795 × 50118
    obs: 'cell_type', 'n_genes'
    var: 'ids', 'names', 'feature_types', 'chr', 'start', 'end', 'n_cells'


In [8]:
# Save the filtered RNA and ATAC objects for this sample.
out_dir = Path('./data/1_human/m1d1n')
# Create the output folder (and any missing parents) so the writes below do
# not depend on it already existing.
out_dir.mkdir(parents=True, exist_ok=True)
rna_f.write_h5ad(out_dir / 'ad_rna.h5ad')
atac_f.write_h5ad(out_dir / 'ad_atac.h5ad')

  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c


In [9]:
# Re-read the saved RNA object and export its gene id / gene name table.
hu = sc.read_h5ad('./data/1_human/m1d1n/ad_rna.h5ad')
# Select the two columns by name rather than by position, so the export does
# not depend on the column order of `var`.
hu_genelst = hu.var[['ids', 'names']]
print(hu_genelst)
hu_genelst.to_csv('./hu_genelst.csv', index=False)

                   ids       names
14     ENSG00000237491   LINC01409
16     ENSG00000228794   LINC01128
24     ENSG00000188976       NOC2L
29     ENSG00000188290        HES4
30     ENSG00000187608       ISG15
...                ...         ...
36570  ENSG00000198695      MT-ND6
36571  ENSG00000198727      MT-CYB
36574  ENSG00000274847       MAFIP
36580  ENSG00000273748  AL592183.1
36599  ENSG00000278817  AC007325.4

[12481 rows x 2 columns]
