In [1]:
# !pip install scgen
# Import packages
# The main package used here is scanpy
import numpy as np
import pandas as pd
import matplotlib.pyplot as pl
from matplotlib import rcParams
import scanpy as sc
import os
import time
from datetime import timedelta
import random
import scgen

In [2]:
# The name of the current working directory is used as the prefix of output files.
base_name = os.path.split(os.getcwd())[1]
print(base_name)

dataset9_Human_cell_atlas


In [3]:
# Record the scanpy version and the versions of its dependencies alongside the results.
print(sc.__version__)
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()
# Optional figure defaults, currently disabled:
# sc.settings.set_figure_params(dpi=300, frameon=False)  # low dpi (dots per inch) yields small inline figures

1.4.3
scanpy==1.4.3 anndata==0.6.21 umap==0.3.9 numpy==1.16.2 scipy==1.2.1 pandas==0.24.2 scikit-learn==0.20.3 statsmodels==0.9.0 


In [4]:
def save_images(base_name, dpi=300, fig_type = ".png"):
    """Save the current matplotlib figure to disk and close it.

    Parameters
    ----------
    base_name : str
        Output path, with or without a file extension. When it has no
        extension, ``fig_type`` is appended.
    dpi : int
        Resolution passed to ``pl.savefig``.
    fig_type : str
        Extension (including the leading dot) appended when ``base_name``
        has none.
    """
    output_dir = os.path.dirname(base_name)
    # Create the target directory when it is missing. The earlier check was
    # inverted: it called makedirs only for directories that already existed
    # (which raises FileExistsError) and never created a missing one.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    _, extension = os.path.splitext(base_name)
    if extension == "":
        base_name = base_name + fig_type
    pl.savefig(base_name, dpi=dpi)
    pl.close()
    
def plotTSNE(adata, color_group, n_pcs=20, perplexity=30, save_filename='tsne', use_repx = False):
    """Compute a t-SNE embedding with scanpy, plot it and save the figure.

    Parameters
    ----------
    adata : AnnData
        Object passed to ``sc.tl.tsne`` and ``sc.pl.tsne``.
    color_group : str or list of str
        Forwarded as ``color`` to ``sc.pl.tsne``.
    n_pcs, perplexity : int
        Forwarded to ``sc.tl.tsne``.
    save_filename : str
        Path handed to ``save_images``.
    use_repx : bool
        When true, ``sc.tl.tsne`` is called with ``use_rep='X'``; otherwise
        it is called with ``n_jobs=20``.
    """
    random.seed(42)
    # The two modes differ only in a single extra keyword argument.
    if use_repx:
        mode_kwargs = {'use_rep': 'X'}
    else:
        mode_kwargs = {'n_jobs': 20}
    sc.tl.tsne(adata, random_state=0, n_pcs=n_pcs, perplexity=perplexity, **mode_kwargs)
    sc.pl.tsne(adata, color=color_group, show=False, wspace=.4)
    save_images(save_filename)
    
def plotUMAP(adata, color_group, save_filename, use_repx = False):
    """Compute a UMAP embedding with scanpy, plot it and save the figure.

    Parameters
    ----------
    adata : AnnData
        Object passed to ``sc.tl.umap`` and ``sc.pl.umap``.
    color_group : str or list of str
        Forwarded as ``color`` to ``sc.pl.umap``.
    save_filename : str
        Path handed to ``save_images``.
    use_repx : bool
        Accepted but currently unused: the neighbor-graph step it selected
        is disabled (see the commented reference below).
    """
    # Disabled neighbor computation, kept for reference:
    #     if use_repx:
    #         sc.pp.neighbors(adata, use_rep='X')
    #     else:
    #         sc.pp.neighbors(adata,n_neighbors=10, n_pcs=20)
    # NOTE(review): presumably the caller must have run sc.pp.neighbors already - confirm.
    sc.tl.umap(adata)
    plot_options = {'color': color_group, 'show': False, 'wspace': .4}
    sc.pl.umap(adata, **plot_options)
    save_images(save_filename)

In [5]:
# Read the expression table exported from R as tab-separated text. The file name
# indicates it is already log-transformed and transposed; the printed AnnData has
# 762000 observations (cell barcodes) and 18969 variables (Ensembl gene IDs).
expression_data = '../../dataset/dataset9_Human_cell_atlas/HCA_all_UMI_filtered_log_transposed.txt'
adata = sc.read_text(expression_data, delimiter='\t', first_column_names=True, dtype='float64')
print(adata)  # n_obs x n_vars summary
print(adata.obs_names[0:3])  # first three observation names
print(adata.var_names[0:3])  # first three variable names



AnnData object with n_obs × n_vars = 762000 × 18969 
Index(['MantonCB1_HiSeq_1-AAACCTGAGGAGTTGC-1',
       'MantonCB1_HiSeq_1-AAACCTGAGGCATTGG-1',
       'MantonCB1_HiSeq_1-AAACCTGCACAGACAG-1'],
      dtype='object')
Index(['ENSG00000238009', 'ENSG00000279457', 'ENSG00000228463'], dtype='object')


In [6]:
# Load the per-cell annotation table into a pandas DataFrame
# (tab-separated, header in the first row, cell names in the first column).
cell_info = "./HCA_all_cell_info.txt"
sample_adata = pd.read_csv(cell_info, sep='\t', header=0, index_col=0)
print(sample_adata.shape)
print(sample_adata.columns)
print(sample_adata.index)

(762000, 7)
Index(['batch_detail', 'cell_type', 'donor', 'age', 'batch', 'sex', 'species'], dtype='object')
Index(['MantonCB1_HiSeq_1-AAACCTGAGGAGTTGC-1',
       'MantonCB1_HiSeq_1-AAACCTGAGGCATTGG-1',
       'MantonCB1_HiSeq_1-AAACCTGCACAGACAG-1',
       'MantonCB1_HiSeq_1-AAACCTGCACAGATTC-1',
       'MantonCB1_HiSeq_1-AAACCTGCACCCAGTG-1',
       'MantonCB1_HiSeq_1-AAACCTGCACTGTGTA-1',
       'MantonCB1_HiSeq_1-AAACCTGCAGACGCAA-1',
       'MantonCB1_HiSeq_1-AAACCTGCAGGTCCAC-1',
       'MantonCB1_HiSeq_1-AAACCTGCAGTCTTCC-1',
       'MantonCB1_HiSeq_1-AAACCTGGTTCCACGG-1',
       ...
       'MantonBM8_HiSeq_8-TTTGTCAAGTTTAGGA-1',
       'MantonBM8_HiSeq_8-TTTGTCACAACTGCTA-1',
       'MantonBM8_HiSeq_8-TTTGTCACACTGTGTA-1',
       'MantonBM8_HiSeq_8-TTTGTCAGTACCGCTG-1',
       'MantonBM8_HiSeq_8-TTTGTCAGTATCAGTC-1',
       'MantonBM8_HiSeq_8-TTTGTCAGTCAACATC-1',
       'MantonBM8_HiSeq_8-TTTGTCATCATTTGGG-1',
       'MantonBM8_HiSeq_8-TTTGTCATCCTCAACC-1',
       'MantonBM8_HiSeq_8-TTTGTCATC

In [7]:
# Copy the batch and cell-type labels onto the cells, looked up by cell name,
# and print the length of each new column.
for label in ("batch", "cell_type"):
    adata.obs[label] = sample_adata.loc[adata.obs_names, label]
    print(len(adata.obs[label]))
# Optionally save the annotated object as h5ad for easy access:
# adata.write_h5ad(os.path.join(data_dir,'hvg_dataset2_cellatlas.h5ad'))

762000
762000


In [8]:
# Embed the uncorrected data: PCA, then a neighbor graph (15 neighbors, first 20 PCs), then UMAP.
# These steps are slow on this dataset (see the timings in the log output of this cell).
sc.tl.pca(adata, svd_solver='arpack')
sc.pp.neighbors(adata,n_neighbors=15, n_pcs=20)
sc.tl.umap(adata)

computing PCA with n_comps = 50
    finished (0:13:09.04)
computing neighbors
    using 'X_pca' with n_pcs = 20
    finished (0:03:56.82) --> added to `.uns['neighbors']`
    'distances', distances for each pair of neighbors
    'connectivities', weighted adjacency matrix
computing UMAP
    using 'X_pca' with n_pcs = 20
    finished (0:20:50.68) --> added
    'X_umap', UMAP coordinates (adata.obsm)


In [9]:
# Plot the uncorrected UMAP coloured by batch and save the figure.
sc.pl.umap(adata, color=["batch"], wspace=.3, show=False)
# Name the file with the shared base_name prefix, like the t-SNE outputs, instead of
# the hard-coded 'dataset10_umap', which referred to a different dataset number
# than the working directory of this notebook.
save_images(base_name + '_umap')

... storing 'batch' as categorical


In [10]:
# Colour the t-SNE of the uncorrected data by cell type and by batch.
color_group = ["cell_type","batch"]
plotTSNE(adata, color_group, n_pcs=20, perplexity=90, save_filename=base_name + '_tsne')

computing tSNE
    using 'X_pca' with n_pcs = 20
    using the 'MulticoreTSNE' package by Ulyanov (2017)
    finished (0:57:57.27) --> added
    'X_tsne', tSNE coordinates (adata.obsm)


In [None]:
print("Create a network")
# Start of the timed section; t2 is taken after the batch correction step.
t1 = time.time()
# Initialize scGen; the input dimension is the number of genes (columns of adata).
# scgen is already imported in the first cell, so it is not imported again here.
network = scgen.VAEArith(x_dimension=adata.shape[1], model_path="./results_cellatlas/batch_hvg")
# Need to check batch_size

W0806 19:36:56.125436 139810207172416 deprecation_wrapper.py:119] From /home/xm/.local/lib/python3.7/site-packages/scgen/models/_vae.py:42: The name tf.reset_default_graph is deprecated. Please use tf.compat.v1.reset_default_graph instead.

W0806 19:36:56.127121 139810207172416 deprecation_wrapper.py:119] From /home/xm/.local/lib/python3.7/site-packages/scgen/models/_vae.py:48: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.



Create a network


W0806 19:36:59.182927 139810207172416 lazy_loader.py:50] 
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

W0806 19:36:59.184667 139810207172416 deprecation_wrapper.py:119] From /home/xm/.local/lib/python3.7/site-packages/scgen/models/_vae.py:78: The name tf.variable_scope is deprecated. Please use tf.compat.v1.variable_scope instead.

W0806 19:36:59.185702 139810207172416 deprecation_wrapper.py:119] From /home/xm/.local/lib/python3.7/site-packages/scgen/models/_vae.py:78: The name tf.AUTO_REUSE is deprecated. Please use tf.compat.v1.AUTO_REUSE instead.

W0806 19:36:59.187080 139810207172416 deprecation.py:323] From /home/xm/.local/lib/python3.7/site-packages/scgen/models/_vae

In [None]:
print("Train a network")
# Train scGen with n_epochs=100 and batch_size=50.
# Requirement: adata should contain 2 vectors: adata.obs["cell_type"] and adata.obs["batch"]
network.train(train_data=adata, n_epochs=100, batch_size=50)

Train a network


In [None]:
from scipy import sparse
import anndata
# Using trained model to correct, normalize data 
# Using batch removal function from scGen package 

#corrected_adata = scgen.batch_removal(network, total_ann)

# In case this function does not work, replace batch_removal function by this function: batch_removal_v2
# Hoa Tran
def batch_removal_v2(network, adata):
    """Batch-correct ``adata`` in the latent space of a trained network.

    All cells are encoded with ``network.to_latent``. For each cell type that
    occurs in at least two batches, the latent vectors of every batch are
    shifted by the difference between the mean latent vector of the largest
    batch (for that cell type) and that batch's own mean. Cell types that
    occur in a single batch are passed through without a shift. The latent
    vectors are then decoded with ``network.reconstruct(..., use_data=True)``.

    Parameters
    ----------
    network
        Object providing ``to_latent(X)`` and ``reconstruct(X, use_data=True)``.
    adata
        AnnData whose ``.obs`` contains the columns ``"cell_type"``,
        ``"batch"`` and ``"cell_name"``; ``.X`` may be sparse or dense.

    Returns
    -------
    AnnData built from the reconstructed values, with ``obs`` columns
    ``"cell_type"``, ``"batch"`` and ``"cell_name"``, ``var_names`` copied
    from ``adata`` and ``obs_names`` set from ``"cell_name"``. Rows are
    grouped by cell type (shared cell types first, then single-batch ones),
    so they are not in the input order.
    """
    # Encode every cell; a sparse matrix is densified first.
    if sparse.issparse(adata.X):
        latent_all = network.to_latent(adata.X.A)
    else:
        latent_all = network.to_latent(adata.X)
    adata_latent = anndata.AnnData(latent_all)
    # Carry the labels over positionally (as plain lists).
    adata_latent.obs["cell_type"] = adata.obs["cell_type"].tolist()
    adata_latent.obs["batch"] = adata.obs["batch"].tolist()
    adata_latent.obs["cell_name"] = adata.obs["cell_name"].tolist()   # Hoa: keep cell name info
    unique_cell_types = np.unique(adata_latent.obs["cell_type"])
    shared_ct = []      # cell types present in two or more batches (to be shifted)
    not_shared_ct = []  # cell types present in a single batch (left unshifted)
    for cell_type in unique_cell_types:
        temp_cell = adata_latent[adata_latent.obs["cell_type"] == cell_type]
        if len(np.unique(temp_cell.obs["batch"])) < 2:
            cell_type_ann = adata_latent[adata_latent.obs["cell_type"] == cell_type]
            not_shared_ct.append(cell_type_ann)
            continue
        temp_cell = adata_latent[adata_latent.obs["cell_type"] == cell_type]
        batch_list = {}     # batch label -> slice of temp_cell for that batch
        batch_ind = {}      # batch label -> boolean mask over temp_cell
        max_batch = 0       # size of the largest batch seen so far
        max_batch_ind = ""  # label of the largest batch seen so far
        batches = np.unique(temp_cell.obs["batch"])
        for i in batches:
            temp = temp_cell[temp_cell.obs["batch"] == i]
            temp_ind = temp_cell.obs["batch"] == i
            if max_batch < len(temp):
                max_batch = len(temp)
                max_batch_ind = i
            batch_list[i] = temp
            batch_ind[i] = temp_ind
        # The largest batch is the reference every batch is shifted towards.
        max_batch_ann = batch_list[max_batch_ind]
        for study in batch_list:
            # Shift this batch so its mean latent vector matches the reference mean.
            delta = np.average(max_batch_ann.X, axis=0) - np.average(batch_list[study].X, axis=0)
            batch_list[study].X = delta + batch_list[study].X
            # NOTE(review): this assigns through a slice of temp_cell; whether the
            # values land in temp_cell depends on anndata's view semantics for the
            # installed version - confirm.
            temp_cell[batch_ind[study]].X = batch_list[study].X
        shared_ct.append(temp_cell)
    # NOTE(review): shared_ct is empty when no cell type spans two batches, so this
    # call would receive no arguments - confirm how that case should be handled.
    all_shared_ann = anndata.AnnData.concatenate(*shared_ct, batch_key="concat_batch")
    del all_shared_ann.obs["concat_batch"]
    if len(not_shared_ct) < 1:
        # Every cell type was shared: decode the shifted latent vectors directly.
        corrected = anndata.AnnData(network.reconstruct(all_shared_ann.X, use_data=True))
        corrected.obs["cell_type"] = all_shared_ann.obs["cell_type"].tolist()
        corrected.obs["batch"] = all_shared_ann.obs["batch"].tolist()
        corrected.obs["cell_name"] = all_shared_ann.obs["cell_name"].tolist() # Hoa: keep cell name info
        corrected.var_names = adata.var_names.tolist()
        corrected.obs_names = corrected.obs['cell_name'] # Hoa: assign cell names as obs_names
        return corrected
    else:
        # Append the single-batch cell types after the shared ones, then decode.
        all_not_shared_ann = anndata.AnnData.concatenate(*not_shared_ct, batch_key="concat_batch")
        all_corrected_data = anndata.AnnData.concatenate(all_shared_ann, all_not_shared_ann, batch_key="concat_batch")
        del all_corrected_data.obs["concat_batch"]
        corrected = anndata.AnnData(network.reconstruct(all_corrected_data.X, use_data=True), )
        # Labels are rebuilt in the same shared-then-unshared order as the rows.
        corrected.obs["cell_type"] = all_shared_ann.obs["cell_type"].tolist() + all_not_shared_ann.obs[
            "cell_type"].tolist()
        corrected.obs["batch"] = all_shared_ann.obs["batch"].tolist() + all_not_shared_ann.obs["batch"].tolist()
        corrected.obs["cell_name"] = all_shared_ann.obs["cell_name"].tolist() + all_not_shared_ann.obs[
            "cell_name"].tolist()     # Hoa: keep cell name info
        corrected.var_names = adata.var_names.tolist()
        corrected.obs_names = corrected.obs['cell_name'] # Hoa: assign cell names as obs_names
        return corrected

In [None]:
print("Correct data")
# Correct data using the batch_removal_v2 function
# Input: adata and network model
# batch_removal_v2 reads adata.obs['cell_name'], so store the cell names as a column first.
adata.obs['cell_name'] = adata.obs_names
adata.obs['batch']=adata.obs['batch'].astype('category')
corrected_adata = batch_removal_v2(network, adata)
# End of the timed section that started at t1.
t2 = time.time()
print('Took '+str(timedelta(seconds=t2-t1)))
# corrected_adata = scgen.batch_removal(network, adata1)
# For verification
# print(corrected_adata.obs['cell_name'][350:400])
# print(corrected_adata.obs['cell_type'][350:400])
# A bare expression that is not the last statement of a cell is discarded,
# so the names are printed explicitly.
print(corrected_adata.obs_names[1:10])
print(corrected_adata)

In [None]:
# Break the elapsed time between t1 and t2 into several representations,
# print them, and save a one-row summary as CSV.
usecase_name = 'scGen'
filename = 'hvg_scGen_exetime.csv'  # assigned but not used below; the CSV is named from base_name

time_taken = t2 - t1
time_taken_mins = divmod(time_taken, 60)  # (minutes, remaining seconds)
time_taken_hours, rest = divmod(time_taken, 3600)
hours_mins, hours_secs = divmod(rest, 60)

print(t1)
print(t2)
print('Took seconds: ' + str(timedelta(seconds=round(time_taken))))
print('Took minutes: ' + str(time_taken_mins))
print('Took hours_minutes_seconds: ', str(time_taken_hours), str(hours_mins), str(hours_secs))

# All values are stored as strings.
data = {
    'use_case': usecase_name,
    'exetime_secs': str(round(time_taken)),
    'exetimehours': str(time_taken_hours),
    'exetimemins': str(hours_mins),
    'exetimesecs': str(round(hours_secs)),
}

# Single-row DataFrame, indexed by 'exetime'.
df = pd.DataFrame(data, index=['exetime'])
print(df)
df.to_csv(base_name + "_exetime.csv")

In [None]:
# Compute a 20-component PCA on the batch-corrected data.
sc.tl.pca(corrected_adata, svd_solver='arpack', n_comps=20)
# In-place sign flip of the PCA coordinates.
corrected_adata.obsm['X_pca'] *= -1 # multiply by -1 to match Seurat, same scale

In [None]:
# t-SNE of the corrected data, with the same colouring and settings as the uncorrected plot.
plotTSNE(
    corrected_adata,
    color_group,
    n_pcs=20,
    perplexity=90,
    save_filename=base_name + '_scgene_corrected_tsne',
)

In [None]:
# Export the uncorrected AnnData as CSV files under the directory base_name + "_results".
# NOTE(review): per the anndata docs, write_csvs skips the data matrix unless
# skip_data=False is passed - confirm that exporting annotations/embeddings only is intended.
adata.write_csvs(base_name + "_results")

In [None]:
# Export the corrected AnnData as CSV files under the directory base_name + "_corrected_results".
# NOTE(review): per the anndata docs, write_csvs skips the data matrix unless
# skip_data=False is passed - confirm that the corrected matrix is not needed on disk.
corrected_adata.write_csvs(base_name + "_corrected_results")