## 0. Prerequisite

In [1]:
# utils
import gc
import sys
import time
from collections import Counter

import numpy as np
import pandas as pd
import scipy

# scRNA
import anndata
import scanpy as sc

# custom functions
from pipeline import dl, pp, cl, pl, utils

%load_ext autoreload
%autoreload 2

In [3]:
# Read the list of dataset locations: the file has no header row, so the
# paths end up in the single column labelled 0.
dataset_path = pd.read_csv(
    'dataset.txt',
    index_col=None,
    header=None,
)
dataset_path[0]

0    /projects/zhanglab/users/johnson/data/HumanEmb...
1    /projects/zhanglab/users/johnson/data/HumanPBM...
2    /projects/zhanglab/users/johnson/data/MouseEmb...
3    /projects/zhanglab/users/johnson/data/HumanBre...
4    /projects/zhanglab/users/johnson/data/HumanDev...
5    /projects/zhanglab/users/johnson/data/HumanHea...
6    /projects/zhanglab/users/johnson/data/GorillaM...
Name: 0, dtype: object

## 1. Processed Data

### Preprocessing


In [104]:


# Pick the dataset by its row label in the path table and load it.
# NOTE(review): the path listing displayed earlier in this notebook only shows
# rows 0-6, so label 9 depends on a different dataset.txt than the one shown;
# confirm the index on a fresh Restart & Run All.
file = dataset_path[0][9]
adata = dl.load_data(file)

# Normalize every cell to a total of 1e4 counts, log1p-transform, then keep
# only the highly variable genes (subset=True drops all other genes in place).
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, subset=True)

# Ask the garbage collector to release objects left over from loading and
# subsetting before the next cells run (gc is imported in the top import cell).
gc.collect()

adata

load:  /projects/zhanglab/users/johnson/data/dIPFC.h5ad
n_obs × n_vars:  (610719, 27930)
running time:  0.0001366138458251953


  disp_grouped = df.groupby('mean_bin')['dispersions']


AnnData object with n_obs × n_vars = 610719 × 4494
    obs: 'nGene', 'nUMI', 'percent.mt', 'mapped_reads', 'subtype', 'subclass', 'class', 'tech_rep', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'donor_id', 'suspension_type', 'is_primary_data', 'Source', 'author_age_year', '# technical replicates', 'PMI', '# nuclei with RNA assay', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes'
    var: 'name', 'gene_symbols', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype

### data label

In [105]:
# choose clustering label
file_name = 'dIPFC.h5ad'
target = 'cell_type'

# remove unlabeled cells: real missing values (NaN/None) as well as the
# literal string 'nan' (a plain `!= 'nan'` comparison keeps real NaN entries)
target_values = adata.obs[target]
adata = adata[target_values.notna() & (target_values != 'nan')]

# count cells per label
cell_types = Counter(adata.obs[target])
print('[prev] #cell_type:', len(cell_types))

# keep only cell types with more than 200 cells
cell_types = {key: value for key, value in cell_types.items() if value > 200}
print('[post] #cell_type:', len(cell_types))

cell_types

[prev] #cell_type: 25
[post] #cell_type: 24


{'oligodendrocyte': 131014,
 'L2/3 intratelencephalic projecting glutamatergic neuron': 158915,
 'sst GABAergic cortical interneuron': 29295,
 'oligodendrocyte precursor cell': 25543,
 'endothelial cell': 20869,
 'vip GABAergic cortical interneuron': 22359,
 'astrocyte': 65115,
 'L6 corticothalamic-projecting glutamatergic cortical neuron': 12375,
 'L6 intratelencephalic projecting glutamatergic neuron of the primary motor cortex': 19352,
 'microglial cell': 25988,
 'L5/6 near-projecting glutamatergic neuron of the primary motor cortex': 7315,
 'pericyte': 14158,
 'lamp5 GABAergic cortical interneuron': 13582,
 'T cell': 1416,
 'pvalb GABAergic cortical interneuron': 22730,
 'inhibitory interneuron': 9331,
 'macrophage': 1869,
 'L6b glutamatergic cortical neuron': 9502,
 'vascular leptomeningeal cell': 6033,
 'smooth muscle cell': 5769,
 'L5 extratelencephalic projecting glutamatergic cortical neuron': 2365,
 'chandelier pvalb GABAergic cortical interneuron': 4548,
 'erythroid lineage 

In [106]:
# Keep only the cells whose label survived the size filter. The slice of an
# AnnData is a view, so materialize it with .copy() before label encoding:
# dl.label_encode presumably assigns adata.obs['label'] (see the resulting obs
# columns), and writing into a view triggers an implicit copy plus a warning.
adata = adata[adata.obs[target].isin(cell_types.keys())].copy()
labels_dict = dl.label_encode(adata, target)
adata

  adata.obs['label'] = labels


AnnData object with n_obs × n_vars = 610653 × 4494
    obs: 'nGene', 'nUMI', 'percent.mt', 'mapped_reads', 'subtype', 'subclass', 'class', 'tech_rep', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'donor_id', 'suspension_type', 'is_primary_data', 'Source', 'author_age_year', '# technical replicates', 'PMI', '# nuclei with RNA assay', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'label'
    var: 'name', 'gene_symbols', 'feature_is_filtered', 'feature_name', 'feature_reference', 'featur

## 2. Write

In [107]:
# Save the filtered, label-encoded AnnData into the processed-data directory
# under the dataset's file name (dir_path already ends with a separator).
dir_path = '/projects/zhanglab/users/david/data/processed/'
adata.write(f'{dir_path}{file_name}')
adata

AnnData object with n_obs × n_vars = 610653 × 4494
    obs: 'nGene', 'nUMI', 'percent.mt', 'mapped_reads', 'subtype', 'subclass', 'class', 'tech_rep', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'donor_id', 'suspension_type', 'is_primary_data', 'Source', 'author_age_year', '# technical replicates', 'PMI', '# nuclei with RNA assay', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'label'
    var: 'name', 'gene_symbols', 'feature_is_filtered', 'feature_name', 'feature_reference', 'featur