In [1]:
# import package
import discotoolkit as dt
import scanpy as sc
import pandas as pd
import numpy as np



For the sake of this tutorial, we will first download a sample as an h5ad file using the download data function, and then apply the discotoolkit CELLiD annotation function to it.

In [2]:
# Restrict the query to a single sample, identified by its DISCO sample ID.
# The variable is called `sample_filter` so that Python's built-in `filter`
# is not shadowed for the rest of the notebook.
sample_filter = dt.Filter(sample="AML003_3p")

# Retrieve the DISCO metadata entries that match the filter
metadata = dt.filter_disco_metadata(sample_filter)

# Download the matching data; samples that were downloaded before are skipped
dt.download_disco_data(metadata)

INFO:root:Retrieving metadata from DISCO database
INFO:root:Filtering sample
INFO:root:Retrieving cell type information of each sample from DISCO database
INFO:root:1 samples and 6086 cells were found
INFO:root: AML003_3p has been downloaded before. Ignore ...


In [3]:
# Helper that lists the atlases available in the DISCO database;
# any of these names can be passed to the CELLiD annotation function.
available_atlases = dt.get_atlas()
print(available_atlases)

['breast', 'skin', 'stomach', 'adipose', 'intestine', 'testis', 'skeletal_muscle', 'pancreas', 'PDAC', 'bladder', 'brain', 'gingiva', 'fibroblast', 'ovary', 'blood', 'heart', 'thymus', 'eye', 'breast_milk', 'adrenal_gland', 'lung', 'ovarian_cancer', 'liver', 'placenta', 'tonsil', 'bone_marrow', 'kidney']


In [4]:
# Read the downloaded h5ad file; `adata.X` is expected to hold the raw counts.
adata = sc.read_h5ad("DISCOtmp/AML003_3p.h5ad")

# Normalise every cell to 10,000 total counts and then log1p-transform.
# This is done on a copy so that `adata` keeps its original values and the
# cell gives the same result when it is re-run.
### Skip the two scanpy calls below if the data has already been normalised
adata_norm = adata.copy()
sc.pp.normalize_total(adata_norm, target_sum=1e4)
sc.pp.log1p(adata_norm)

# Cells x genes table of normalised expression (works for sparse or dense X)
expression = adata_norm.to_df()

# Cluster label of every cell, in the same order as the rows of `expression`.
# Grouping by this external array (rather than adding a "cluster" column)
# avoids overwriting a gene that happens to be named "cluster".
cluster_labels = np.asarray(adata.obs["seurat_clusters"])

# Average expression of each gene within each cluster
integrated_data = (
    expression.groupby(cluster_labels)
    .mean()
    .rename_axis(index="cluster")
    .transpose()
)

# CELLiD expects genes as the index and cluster labels as the columns.
# Preview of the resulting gene x cluster table:
integrated_data.head()


This is where adjacency matrices should go now.
  warn(


cluster,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
MIR1302-2HG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FAM138A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OR4F5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AL627309.1,0.011724,0.006387,0.004899,0.004184,0.001418,0.006817,0.001781,0.015216,0.005277,0.005516,0.002518,0.011815,0.0,0.014784,0.004878,0.018821,0.0
AL627309.3,0.0,0.0,0.001803,0.001054,0.0,0.0,0.005733,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Annotate every cluster with CELLiD, keeping only the top prediction
# (n_predict=1) from the selected reference atlas.
# NOTE(review): the sample ID suggests an AML sample, and `dt.get_atlas()`
# also lists "blood" and "bone_marrow" — confirm "lung" is the intended atlas.
cell_type = dt.CELLiD_cluster(
    rna=integrated_data,
    atlas=["lung"],
    n_predict=1,
)

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [6]:
# Display the CELLiD result: the predicted cell type, the atlas it came from,
# the score, and the input cluster (`input_index`) for each prediction.
cell_type

Unnamed: 0,cell_type,atlas,score,input_index
0,Cycling AT2,lung,0.598743,0
0,Cycling AT2,lung,0.608232,1
0,Cycling AT2,lung,0.595361,2
0,Red blood cell,lung,0.655653,3
0,Red blood cell,lung,0.654327,4
0,Cycling AT2,lung,0.606264,5
0,Red blood cell,lung,0.64777,6
0,Cycling AT2,lung,0.610074,7
0,Red blood cell,lung,0.629338,8
0,ADAMDEC1+ADAM28+ fibroblast,lung,0.593066,9
