In [105]:
# !pip install scgen
# Import packages
# The main package used here is scanpy
import numpy as np
import pandas as pd
import matplotlib.pyplot as pl
from matplotlib import rcParams
import scanpy as sc
import os
import time
from datetime import timedelta
import random
import scgen

In [106]:
# Use the name of the current working directory as the prefix for every output file.
base_name = os.path.split(os.getcwd())[1]
print(base_name)

dataset10_hematoMNN_Hoa


In [107]:
# Print the scanpy version, raise the logging verbosity and list the versions
# of scanpy and the packages it reports.
print(sc.__version__)
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()
# Optional figure settings, currently disabled:
# sc.settings.set_figure_params(dpi=300, frameon=False)  # low dpi (dots per inch) yields small inline figures

1.4.3
scanpy==1.4.3 anndata==0.6.21 umap==0.3.9 numpy==1.16.2 scipy==1.2.1 pandas==0.24.2 scikit-learn==0.20.3 statsmodels==0.9.0 


In [108]:
def save_images(base_name, dpi=300, fig_type = ".png"):
    """Save the current matplotlib figure to disk and close it.

    Parameters
    ----------
    base_name : str
        Output path. When it has no file extension, `fig_type` is appended.
    dpi : int
        Resolution forwarded to `pl.savefig`.
    fig_type : str
        Extension (including the leading dot) used when `base_name` has none.
    """
    output_dir = os.path.dirname(base_name)
    # Create the target directory when the path has a directory part.
    # The previous check called makedirs only when the directory already
    # existed (raising FileExistsError) and never when it was missing.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    _, extension = os.path.splitext(base_name)
    if extension == "":
        base_name = base_name + fig_type
    pl.savefig(base_name, dpi=dpi)
    pl.close()
    
def plotTSNE(adata, color_group, n_pcs=20, perplexity=30, save_filename='tsne', use_repx = False):
    """Compute a t-SNE embedding of `adata`, plot it and save the figure.

    Parameters
    ----------
    adata
        Object handed to `sc.tl.tsne` and `sc.pl.tsne`.
    color_group
        Passed as `color` to `sc.pl.tsne`.
    n_pcs, perplexity
        Forwarded unchanged to `sc.tl.tsne`.
    save_filename
        Path handed to `save_images`.
    use_repx
        When true, `sc.tl.tsne` is called with `use_rep='X'`; otherwise it is
        called with `n_jobs=20` instead.
    """
    random.seed(42)
    tsne_options = {'random_state': 0, 'n_pcs': n_pcs, 'perplexity': perplexity}
    if use_repx:
        tsne_options['use_rep'] = 'X'
    else:
        tsne_options['n_jobs'] = 20
    sc.tl.tsne(adata, **tsne_options)
    sc.pl.tsne(adata, color=color_group, show=False, wspace=.4)
    save_images(save_filename)
    
def plotUMAP(adata, color_group, save_filename, use_repx = False):
    """Compute a UMAP embedding of `adata`, plot it and save the figure.

    `use_repx` is accepted but currently unused: the neighbor-graph step it
    used to select is disabled, so this function never calls `sc.pp.neighbors`.
    """
    # Disabled neighbor computation, kept for reference:
    #     use_repx true  -> sc.pp.neighbors(adata, use_rep='X')
    #     use_repx false -> sc.pp.neighbors(adata,n_neighbors=10, n_pcs=20)
    plot_options = dict(color=color_group, show=False, wspace=.4)
    sc.tl.umap(adata)
    sc.pl.umap(adata, **plot_options)
    save_images(save_filename)

In [109]:
# Read the expression table (tab-delimited text, first column holds the row names).
# The table was exported from R; the file name indicates it was transposed and
# log-transformed, and the loaded object has cells as rows (obs) and genes as columns (var).
expression_data = '../../dataset/dataset10_hematoMNN_Hoa/dataset10_hematoMNN_Hoa_all_batch_rc_transpose_log.txt'
adata = sc.read_text(expression_data, delimiter='\t', first_column_names=True, dtype='float64')
print(adata)  # 4649 cells x 3467 genes in the recorded run
print(adata.obs_names[0:3])
print(adata.var_names[0:3])



AnnData object with n_obs × n_vars = 4649 × 3467 
Index(['W31106', 'W31107', 'W31108'], dtype='object')
Index(['ENSMUSG00000000171', 'ENSMUSG00000000290', 'ENSMUSG00000000594'], dtype='object')


In [110]:
# Read the per-cell annotation table into a pandas DataFrame indexed by its first column
# (cell names); the recorded run shows the columns 'cell_type' and 'batch'.
cell_info = "../../dataset/dataset10_hematoMNN_Hoa/dataset10_hematoMNN_Hoa_all_cell_anno.txt"
sample_adata = pd.read_csv(cell_info,header=0, index_col=0, sep='\t')
print(sample_adata.values.shape)
print(sample_adata.keys())
print(sample_adata.index)

(4649, 2)
Index(['cell_type', 'batch'], dtype='object')
Index(['W31106', 'W31107', 'W31108', 'W31109', 'W31110', 'W31111', 'W31113',
       'W31114', 'W31115', 'W31116',
       ...
       'Prog_851', 'Prog_809', 'Prog_816', 'Prog_822', 'Prog_828', 'Prog_834',
       'Prog_840', 'Prog_846', 'Prog_852', 'Prog_810'],
      dtype='object', length=4649)


In [124]:
# Copy the batch and cell-type labels into adata.obs, looked up by cell name
# so the annotation rows line up with adata.obs_names.
for annotation in ("batch", "cell_type"):
    adata.obs[annotation] = sample_adata.loc[adata.obs_names, annotation]
    print(len(adata.obs[annotation]))

# Save output into h5ad, easy to access 
# adata.write_h5ad(os.path.join(data_dir,'hvg_dataset2_cellatlas.h5ad'))

4649
4649


In [125]:
# Embed the uncorrected data: PCA, a neighbor graph (15 neighbors, first 20 PCs), then UMAP.
sc.tl.pca(adata, svd_solver='arpack')
sc.pp.neighbors(adata,n_neighbors=15, n_pcs=20)
sc.tl.umap(adata)

computing PCA with n_comps = 50
    finished (0:00:01.69)
computing neighbors
    using 'X_pca' with n_pcs = 20
    finished (0:00:00.77) --> added to `.uns['neighbors']`
    'distances', distances for each pair of neighbors
    'connectivities', weighted adjacency matrix
computing UMAP
    using 'X_pca' with n_pcs = 20
    finished (0:00:16.81) --> added
    'X_umap', UMAP coordinates (adata.obsm)


In [126]:
# Plot the UMAP of the uncorrected data colored by batch and save the figure.
# NOTE(review): the file name is hard-coded here, while the t-SNE plots are prefixed with base_name.
sc.pl.umap(adata, color=["batch"], wspace=.3, show=False)
save_images('dataset10_umap')

... storing 'batch' as categorical
... storing 'cell_type' as categorical


In [127]:
# Color the embeddings by cell type and by batch.
color_group = ["cell_type","batch"]
# t-SNE of the uncorrected data with n_pcs=20 and perplexity=90, saved under base_name + '_tsne'.
plotTSNE(adata, color_group, 20, 90, base_name + '_tsne')

computing tSNE
    using 'X_pca' with n_pcs = 20
    using the 'MulticoreTSNE' package by Ulyanov (2017)
    finished (0:00:13.80) --> added
    'X_tsne', tSNE coordinates (adata.obsm)


In [128]:
print("Create a network")
# Start the wall-clock timer; t1 is read again when the elapsed time is reported.
t1 = time.time()
# Initialize scGen; x_dimension is the number of columns (genes) of adata.
# scgen is already imported in the first cell; this repeated import is redundant.
import scgen
# NOTE(review): model_path points to "results_cellatlas" although this notebook
# processes dataset10 — confirm this is the intended location.
network = scgen.VAEArith(x_dimension=adata.shape[1], model_path="./results_cellatlas/batch_hvg")
# TODO: check the batch_size setting

Create a network


In [129]:
print("Train a network")
# Train scGen for 100 epochs with a batch size of 50.
# Requirement noted by the author: adata must contain adata.obs["cell_type"] and adata.obs["batch"].
network.train(train_data=adata, n_epochs=100, batch_size=50)

Train a network


In [130]:
from scipy import sparse
import anndata
# Use the trained model to correct (normalize) the data.
# The batch-removal function shipped with the scGen package would be called like this:

#corrected_adata = scgen.batch_removal(network, total_ann)

# If scgen.batch_removal does not work, use the replacement defined below: batch_removal_v2
# Hoa Tran
def batch_removal_v2(network, adata):
    """Remove batch effects in the network's latent space and decode the result.

    The data are encoded with `network.to_latent`. For every cell type that
    occurs in at least two batches, the latent vectors of each batch are
    shifted by the difference between the mean of the largest batch and the
    mean of that batch. Cell types seen in fewer than two batches are left
    unshifted. The latent vectors are then decoded with `network.reconstruct`.

    Parameters
    ----------
    network
        Trained model providing `to_latent` and `reconstruct`.
    adata
        Annotated data whose `.obs` contains the columns "cell_type", "batch"
        and "cell_name"; `.X` may be dense or sparse.

    Returns
    -------
    AnnData holding the reconstructed matrix, with `obs` columns "cell_type",
    "batch" and "cell_name", `var_names` copied from `adata`, and `obs_names`
    set from "cell_name". Rows are grouped by cell type (shared cell types
    first, then the non-shared ones), so row order can differ from `adata`.

    NOTE(review): if no cell type spans two or more batches, `shared_ct` stays
    empty and `concatenate` is called without any object — confirm how that
    case should be handled.
    """
    # Encode into latent space; a sparse matrix is densified first.
    if sparse.issparse(adata.X):
        latent_all = network.to_latent(adata.X.A)
    else:
        latent_all = network.to_latent(adata.X)
    # Wrap the latent matrix and carry the per-cell labels along.
    adata_latent = anndata.AnnData(latent_all)
    adata_latent.obs["cell_type"] = adata.obs["cell_type"].tolist()
    adata_latent.obs["batch"] = adata.obs["batch"].tolist()
    adata_latent.obs["cell_name"] = adata.obs["cell_name"].tolist()   # Hoa: keep the cell names
    unique_cell_types = np.unique(adata_latent.obs["cell_type"])
    shared_ct = []      # cell types present in two or more batches (corrected)
    not_shared_ct = []  # cell types present in a single batch (kept as they are)
    for cell_type in unique_cell_types:
        temp_cell = adata_latent[adata_latent.obs["cell_type"] == cell_type]
        if len(np.unique(temp_cell.obs["batch"])) < 2:
            # Only one batch contains this cell type: nothing to align.
            cell_type_ann = adata_latent[adata_latent.obs["cell_type"] == cell_type]
            not_shared_ct.append(cell_type_ann)
            continue
        # Same subset as above, selected again.
        temp_cell = adata_latent[adata_latent.obs["cell_type"] == cell_type]
        batch_list = {}     # batch label -> cells of this type in that batch
        batch_ind = {}      # batch label -> boolean mask over temp_cell
        max_batch = 0       # size of the largest batch seen so far
        max_batch_ind = ""  # label of that largest batch
        batches = np.unique(temp_cell.obs["batch"])
        for i in batches:
            temp = temp_cell[temp_cell.obs["batch"] == i]
            temp_ind = temp_cell.obs["batch"] == i
            if max_batch < len(temp):
                max_batch = len(temp)
                max_batch_ind = i
            batch_list[i] = temp
            batch_ind[i] = temp_ind
        # The largest batch serves as the reference for this cell type.
        max_batch_ann = batch_list[max_batch_ind]
        for study in batch_list:
            # Shift this batch so that its latent mean equals the reference mean.
            delta = np.average(max_batch_ann.X, axis=0) - np.average(batch_list[study].X, axis=0)
            batch_list[study].X = delta + batch_list[study].X
            # Write the shifted values back through a masked subset of temp_cell.
            # The recorded run emitted "Trying to set attribute `.X` of view,
            # making a copy." while executing this function.
            # NOTE(review): whether this write reaches temp_cell depends on
            # anndata's view semantics — verify on the installed version.
            temp_cell[batch_ind[study]].X = batch_list[study].X
        shared_ct.append(temp_cell)
    # Stack all corrected cell types; the helper column added by concatenate is dropped.
    all_shared_ann = anndata.AnnData.concatenate(*shared_ct, batch_key="concat_batch")
    del all_shared_ann.obs["concat_batch"]
    if len(not_shared_ct) < 1:
        # Every cell type was shared: decode the corrected latent vectors directly.
        # presumably use_data=True marks the input as latent vectors — confirm against scGen
        corrected = anndata.AnnData(network.reconstruct(all_shared_ann.X, use_data=True))
        corrected.obs["cell_type"] = all_shared_ann.obs["cell_type"].tolist()
        corrected.obs["batch"] = all_shared_ann.obs["batch"].tolist()
        corrected.obs["cell_name"] = all_shared_ann.obs["cell_name"].tolist() # Hoa: keep the cell names
        corrected.var_names = adata.var_names.tolist()
        corrected.obs_names = corrected.obs['cell_name'] # Hoa: use the cell names as obs_names
        return corrected
    else:
        # Append the unshifted single-batch cell types after the corrected ones, then decode.
        all_not_shared_ann = anndata.AnnData.concatenate(*not_shared_ct, batch_key="concat_batch")
        all_corrected_data = anndata.AnnData.concatenate(all_shared_ann, all_not_shared_ann, batch_key="concat_batch")
        del all_corrected_data.obs["concat_batch"]
        corrected = anndata.AnnData(network.reconstruct(all_corrected_data.X, use_data=True), )
        # Labels are rebuilt in the same shared-then-not-shared order as the rows.
        corrected.obs["cell_type"] = all_shared_ann.obs["cell_type"].tolist() + all_not_shared_ann.obs[
            "cell_type"].tolist()
        corrected.obs["batch"] = all_shared_ann.obs["batch"].tolist() + all_not_shared_ann.obs["batch"].tolist()
        corrected.obs["cell_name"] = all_shared_ann.obs["cell_name"].tolist() + all_not_shared_ann.obs[
            "cell_name"].tolist()     # Hoa: keep the cell names
        corrected.var_names = adata.var_names.tolist()
        corrected.obs_names = corrected.obs['cell_name'] # Hoa: use the cell names as obs_names
        return corrected

In [131]:
print("Correct data")
# Correct the data with batch_removal_v2.
# Input: adata and the trained network model
# batch_removal_v2 reads adata.obs['cell_name'], so the cell names are stored as a column first.
adata.obs['cell_name'] = adata.obs_names
adata.obs['batch']=adata.obs['batch'].astype('category')
corrected_adata = batch_removal_v2(network, adata)
# t1 was taken before the network was created, so the elapsed time spans
# network creation, training and correction.
t2 = time.time()
print('Took '+str(timedelta(seconds=t2-t1)))
# Alternative using the stock scGen function:
# corrected_adata = scgen.batch_removal(network, adata1)
# For verification
# print(corrected_adata.obs['cell_name'][350:400])
# print(corrected_adata.obs['cell_type'][350:400])
# This expression is not the last statement of the cell, so its value is not displayed.
corrected_adata.obs_names[1:10]
print(corrected_adata)

Correct data


Trying to set attribute `.X` of view, making a copy.
Trying to set attribute `.X` of view, making a copy.
Trying to set attribute `.X` of view, making a copy.


Took 0:04:33.743745
AnnData object with n_obs × n_vars = 4649 × 3467 
    obs: 'cell_type', 'batch', 'cell_name'


In [132]:
# Report the time elapsed between t1 and t2 and store it in a one-row CSV file.
print(t1)
print(t2)

# Elapsed seconds, also split into (minutes, seconds) and hours / minutes / seconds.
time_taken = t2 - t1
time_taken_mins = divmod(time_taken, 60)
time_taken_hours, rest = divmod(time_taken, 3600)
hours_mins, hours_secs = divmod(rest, 60)

print('Took seconds: {}'.format(timedelta(seconds=round(time_taken))))
print('Took minutes: {}'.format(time_taken_mins))
print('Took hours_minutes_seconds: ', str(time_taken_hours), str(hours_mins), str(hours_secs))

usecase_name = 'scGen'
filename = 'hvg_scGen_exetime.csv'

# All values are stored as strings, in this column order.
data = {
    'use_case': usecase_name,
    'exetime_secs': str(round(time_taken)),
    'exetimehours': str(time_taken_hours),
    'exetimemins': str(hours_mins),
    'exetimesecs': str(round(hours_secs)),
}

# Single-row DataFrame labelled 'exetime', written next to the other outputs.
df = pd.DataFrame(data, index=['exetime'])
print(df)
df.to_csv(base_name + "_exetime.csv")

1563430269.407627
1563430543.1513717
Took seconds: 0:04:34
Took minutes: (4.0, 33.74374461174011)
Took hours_minutes_seconds:  0.0 4.0 33.74374461174011
        use_case exetime_secs exetimehours exetimemins exetimesecs
exetime    scGen          274          0.0         4.0          34


In [133]:
# Recompute PCA with 20 components on the batch-corrected data.
sc.tl.pca(corrected_adata, svd_solver='arpack', n_comps=20)
# Flip the sign of all PC scores; per the author this is done to match Seurat (same scale).
corrected_adata.obsm['X_pca'] *= -1 # multiply by -1 to match Seurat, same scale

computing PCA with n_comps = 20
    finished (0:00:00.26)


In [134]:
# t-SNE of the corrected data with the same settings as for the uncorrected data (n_pcs=20, perplexity=90).
plotTSNE(corrected_adata, color_group, 20, 90, base_name + '_scgene_corrected_tsne')

computing tSNE
    using 'X_pca' with n_pcs = 20
    using the 'MulticoreTSNE' package by Ulyanov (2017)


... storing 'cell_type' as categorical
... storing 'batch' as categorical


    finished (0:00:15.85) --> added
    'X_tsne', tSNE coordinates (adata.obsm)


In [135]:
# Export the uncorrected AnnData as '.csv' files into the directory base_name + "_results".
# NOTE(review): check whether write_csvs exports the X matrix with its default arguments.
adata.write_csvs(base_name + "_results")

writing '.csv' files to dataset10_hematoMNN_Hoa_results


In [136]:
# Export the corrected AnnData as '.csv' files into the directory base_name + "_corrected_results".
# NOTE(review): check whether write_csvs exports the X matrix with its default arguments.
corrected_adata.write_csvs(base_name + "_corrected_results")

writing '.csv' files to dataset10_hematoMNN_Hoa_corrected_results
