In [1]:
# Hoa Tran
# !pip install scgen
# More infos at: https://github.com/theislab/scGen
# Import packages
# The main package used here is scanpy
import scgen
print(scgen.__version__)
import numpy as np
import pandas as pd
import matplotlib.pyplot as pl
from matplotlib import rcParams
import scanpy as sc
from scipy import sparse
import anndata
import os
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()
# sc.settings.set_figure_params(dpi=300, frameon=False)  # low dpi (dots per inch) yields small inline figures

1.0.0
scanpy==1.4.1 anndata==0.6.19 numpy==1.16.1 scipy==1.1.0 pandas==0.23.4 scikit-learn==0.20.0 statsmodels==0.8.0 python-igraph==0.7.1 louvain==0.6.1 


In [2]:
def plotTSNE(adata, color_group, n_pcs=20, perplexity=90, save_filename='tsne', use_repx = False):
    """Compute a tSNE embedding for `adata`, plot it and save the figure.

    Parameters
    ----------
    adata : AnnData object handed to `sc.tl.tsne` / `sc.pl.tsne`.
    color_group : value forwarded as `color=` to `sc.pl.tsne`.
    n_pcs : forwarded as `n_pcs=` to `sc.tl.tsne`.
    perplexity : forwarded as `perplexity=` to `sc.tl.tsne`.
    save_filename : base name (no extension) handed to `save_images`.
    use_repx : when true, `use_rep='X'` is additionally passed to `sc.tl.tsne`.
    """
    # Arguments common to both variants; random_state is fixed at 0.
    tsne_kwargs = dict(random_state=0, n_pcs=n_pcs, perplexity=perplexity)
    if use_repx:
        tsne_kwargs['use_rep'] = 'X'
    sc.tl.tsne(adata, **tsne_kwargs)

    # show=False keeps the figure open so that save_images can write it out.
    sc.pl.tsne(adata, color=color_group, show=False, wspace=.4)
    save_images(save_filename)

In [3]:
def write_to_csv(mat, genesname, cellsname, filename, save_dir):
    """Write a matrix to `save_dir/filename` as a labelled CSV file.

    Parameters
    ----------
    mat : numpy array, or any object exposing `.toarray()` (e.g. a scipy
        sparse matrix), which is converted to a dense array first.
    genesname : column labels of the written table.
    cellsname : row labels (index) of the written table.
    filename : name of the CSV file.
    save_dir : folder the file is written to.
    """
    # Anything that is not already a numpy array is densified via .toarray().
    dense = mat if isinstance(mat, np.ndarray) else mat.toarray()
    out_path = os.path.join(save_dir, filename)
    pd.DataFrame(dense, columns=genesname, index=cellsname).to_csv(out_path)

In [4]:
# Save the current figure as a PNG image in the output folder (save_dir)
def save_images(basename, out_dir=None):
    """Save the current matplotlib figure as `<out_dir>/<basename>.png` and close it.

    Parameters
    ----------
    basename : str
        File name without extension; '.png' is appended.
    out_dir : str, optional
        Destination folder. When omitted, the notebook-level global
        `save_dir` is used, which is the original behaviour; that global
        must have been assigned before this function is called, otherwise
        a NameError is raised.
    """
    if out_dir is None:
        # Backward-compatible fallback on the global set by the processing loop.
        out_dir = save_dir
    outname = os.path.join(out_dir, basename + '.png')
    pl.savefig(outname, dpi=150)
    # Close the figure so repeated calls in a loop do not accumulate open figures.
    pl.close()

In [5]:
#corrected_adata = scgen.batch_removal(network, total_ann)
# !error

# If scgen.batch_removal produces an output matrix without cell names, that output cannot be used for evaluation.
# In that case, replace batch_removal with batch_removal_v2,
# which uses a vector to keep the cell names.
# Hoa Tran
def batch_removal_v2(network, adata):
    """Batch-correct `adata` in the network's latent space, keeping cell names.

    The data are encoded with `network.to_latent`. For every cell type that
    occurs in at least two batches, the latent vectors of each batch are
    shifted by (mean of the largest batch - mean of that batch), so that the
    per-batch means coincide with the mean of the largest batch. Cell types
    seen in fewer than two batches are left unshifted. The latent vectors are
    then decoded with `network.reconstruct(..., use_data=True)`.

    Parameters
    ----------
    network : object exposing `to_latent(X)` and `reconstruct(X, use_data=True)`.
    adata : AnnData whose `.obs` has the columns "cell_type", "batch" and
        "cell_name"; `.X` may be dense or scipy-sparse.

    Returns
    -------
    AnnData built from the reconstructed matrix, with obs columns
    "cell_type", "batch" and "cell_name", `var_names` taken from `adata`
    and `obs_names` set from "cell_name". Rows are ordered by the
    concatenation below (shared cell types first, grouped by cell type, then
    the non-shared ones), not by the row order of `adata`.

    NOTE(review): if no cell type is present in two or more batches,
    `shared_ct` stays empty and `concatenate` is called without arguments —
    presumably this fails; confirm before using on such data.
    """
    # Encode to latent space; `.A` densifies a sparse matrix first.
    if sparse.issparse(adata.X):
        latent_all = network.to_latent(adata.X.A)
    else:
        latent_all = network.to_latent(adata.X)
    adata_latent = anndata.AnnData(latent_all)
    # Copy labels as plain lists, i.e. positionally, ignoring the obs index.
    adata_latent.obs["cell_type"] = adata.obs["cell_type"].tolist()
    adata_latent.obs["batch"] = adata.obs["batch"].tolist()
    adata_latent.obs["cell_name"] = adata.obs["cell_name"].tolist()   # keep cell names (added by Hoa)
    unique_cell_types = np.unique(adata_latent.obs["cell_type"])
    shared_ct = []      # cell types found in >= 2 batches (shifted below)
    not_shared_ct = []  # cell types found in a single batch (left as is)
    for cell_type in unique_cell_types:
        temp_cell = adata_latent[adata_latent.obs["cell_type"] == cell_type]
        if len(np.unique(temp_cell.obs["batch"])) < 2:
            # Only one batch holds this cell type: nothing to align.
            cell_type_ann = adata_latent[adata_latent.obs["cell_type"] == cell_type]
            not_shared_ct.append(cell_type_ann)
            continue
        temp_cell = adata_latent[adata_latent.obs["cell_type"] == cell_type]
        batch_list = {}     # batch label -> subset of temp_cell for that batch
        batch_ind = {}      # batch label -> boolean mask over temp_cell rows
        max_batch = 0       # size of the largest batch seen so far
        max_batch_ind = ""  # label of the largest batch seen so far
        batches = np.unique(temp_cell.obs["batch"])
        for i in batches:
            temp = temp_cell[temp_cell.obs["batch"] == i]
            temp_ind = temp_cell.obs["batch"] == i
            # Strict '<': on a tie, the batch encountered first stays the reference.
            if max_batch < len(temp):
                max_batch = len(temp)
                max_batch_ind = i
            batch_list[i] = temp
            batch_ind[i] = temp_ind
        max_batch_ann = batch_list[max_batch_ind]
        for study in batch_list:
            # Shift so this batch's mean equals the reference batch's mean
            # (delta is zero for the reference batch itself).
            delta = np.average(max_batch_ann.X, axis=0) - np.average(batch_list[study].X, axis=0)
            batch_list[study].X = delta + batch_list[study].X
            # NOTE(review): writes the shifted rows back through a subset of a
            # subset; relies on anndata's view handling — confirm when upgrading anndata.
            temp_cell[batch_ind[study]].X = batch_list[study].X
        shared_ct.append(temp_cell)
    all_shared_ann = anndata.AnnData.concatenate(*shared_ct, batch_key="concat_batch")
    # Drop the bookkeeping column added by concatenate.
    del all_shared_ann.obs["concat_batch"]
    if len(not_shared_ct) < 1:
        # Every cell type was shared: decode the shifted latent vectors directly.
        corrected = anndata.AnnData(network.reconstruct(all_shared_ann.X, use_data=True))
        corrected.obs["cell_type"] = all_shared_ann.obs["cell_type"].tolist()
        corrected.obs["batch"] = all_shared_ann.obs["batch"].tolist()
        corrected.obs["cell_name"] = all_shared_ann.obs["cell_name"].tolist() # keep cell names (added by Hoa)
        corrected.var_names = adata.var_names.tolist()
        corrected.obs_names = corrected.obs['cell_name'] # restore cell names as obs_names (added by Hoa)
        return corrected
    else:
        # Append the unshifted single-batch cell types after the shared ones.
        all_not_shared_ann = anndata.AnnData.concatenate(*not_shared_ct, batch_key="concat_batch")
        all_corrected_data = anndata.AnnData.concatenate(all_shared_ann, all_not_shared_ann, batch_key="concat_batch")
        del all_corrected_data.obs["concat_batch"]
        corrected = anndata.AnnData(network.reconstruct(all_corrected_data.X, use_data=True), )
        # Labels are rebuilt in the same order as the concatenation above.
        corrected.obs["cell_type"] = all_shared_ann.obs["cell_type"].tolist() + all_not_shared_ann.obs[
            "cell_type"].tolist()
        corrected.obs["batch"] = all_shared_ann.obs["batch"].tolist() + all_not_shared_ann.obs["batch"].tolist()
        corrected.obs["cell_name"] = all_shared_ann.obs["cell_name"].tolist() + all_not_shared_ann.obs[
            "cell_name"].tolist()     # keep cell names (added by Hoa)
        corrected.var_names = adata.var_names.tolist()
        corrected.obs_names = corrected.obs['cell_name'] # restore cell names as obs_names (added by Hoa)
        return corrected

In [6]:
# Root folder for the scGen results; the processing loop writes into
# <dirname>/<simulation>/<gene set>/.
# Alternative location:
#   /acrc/jinmiao/CJM_lab/Marion/Project/Hoa_batch_normalization/simulation_dataset_V3/demo_scGen/
dirname = '/acrc/jinmiao/CJM_lab/Marion/Project/demo_scGen/'

# Root folder of the input data; one sub-folder per simulation.
# Alternative location:
#   /acrc/jinmiao/CJM_lab/Marion/Project/Hoa_batch_normalization/simulation_dataset_V3/data/
data_dir = '/acrc/jinmiao/CJM_lab/Marion/Project/data/'

print(dirname)
print(data_dir)

# Simulated datasets to process (sub-folder names under data_dir).
simulation = [
    'simul1_dropout_005_b1_500_b2_900',
    'simul2_dropout_025_b1_500_b2_900',
    'simul3_dropout_005_b1_500_b2_450',
    'simul4_dropout_025_b1_500_b2_450',
    'simul5_dropout_005_b1_80_b2_400',
    'simul6_dropout_025_b1_80_b2_400',
]
# Gene sets to run for every simulation: all genes, or highly variable genes only.
counts = ['all', 'HVG']

/acrc/jinmiao/CJM_lab/Marion/Project/demo_scGen/
/acrc/jinmiao/CJM_lab/Marion/Project/data/


In [7]:
# Run scGen batch correction for every simulation and every gene set, then
# save the tSNE plot, the tSNE coordinates and the corrected matrix.
for simu_name in simulation:
    print(simu_name)
    for genes in counts:
        print(genes)

        # Create the folder that receives the results (parents included).
        # `save_dir` must keep this name: save_images() reads it as a global.
        save_dir = os.path.join(dirname, simu_name, genes)
        os.makedirs(save_dir, exist_ok=True)

        # Read the count matrix (cells in rows, genes in columns)
        counts_file = 'counts_HVG.txt' if genes == 'HVG' else 'counts.txt'
        counts_df = pd.read_csv(os.path.join(data_dir, simu_name, counts_file),
                                sep='\t', header=0, index_col=0)

        # Transform to a Scanpy (AnnData) object
        adata = sc.AnnData(counts_df)
        print(adata)
        print(adata.obs_names[0:3])  # cells: observation
        print(adata.var_names[0:3])  # genes: variable

        # Read sample file, which contains cell type and batch info
        sample_adata = pd.read_csv(os.path.join(data_dir, simu_name, 'cellinfo.txt'),
                                   header=0, index_col=0, sep='\t')
        print(sample_adata.values.shape)
        print(sample_adata.keys())

        # Save label information into the adata object. Each label is selected
        # as a Series (not a one-column DataFrame) and aligned on the cell names.
        adata.obs['cell_type'] = sample_adata.loc[adata.obs_names, 'Group']
        adata.obs['batch'] = sample_adata.loc[adata.obs_names, 'Batch']
        print(len(adata.obs['cell_type']))
        print(len(adata.obs['batch']))

        # Normalize and log-transform before batch effect correction
        sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
        sc.pp.log1p(adata)

        # scGen network training
        # adata.shape[1]: number of features, here the number of genes
        # model_path: where to save the trained model file
        print("Create a network")
        network = scgen.VAEArith(x_dimension=adata.shape[1],
                                 model_path=os.path.join(save_dir, 'network'))
        print("Train a network")
        # Train scGen for 100 epochs
        # The batch_size can be changed to 30 if the dataset is too small
        # Requirement for scGen: adata must contain two label vectors:
        # adata.obs["cell_type"] and adata.obs["batch"]
        network.train(train_data=adata, n_epochs=100, batch_size=50)

        # scGen run
        print("Correct data")
        # Correct the data with batch_removal_v2 (inputs: network model and adata).
        # The cell names are stored in an obs column first so they survive the correction.
        adata.obs['cell_name'] = adata.obs_names
        corrected_adata = batch_removal_v2(network, adata)  # batch-corrected expression matrix
        print(corrected_adata.obs_names[1:5])
        print(corrected_adata)

        # Run PCA
        npcs = 20
        sc.tl.pca(corrected_adata, svd_solver='arpack', n_comps=npcs)
        corrected_adata.obsm['X_pca'] *= -1  # multiply by -1 to match Seurat

        # Plot tSNE
        # Only the first `npcs` PCs are taken into account; this is an
        # important parameter that can affect the output.
        perplex = 30
        color_group = ["cell_type", "batch"]
        corrected_adata.obs['batch'] = corrected_adata.obs['batch'].astype('category')  # factor function in R
        plotTSNE(corrected_adata, color_group, npcs, perplex, 'tsne_scGen_corrected')  # n_pcs=20, perplexity=30

        # tSNE coordinates, together with batch and cell type labels
        tsne_coords = corrected_adata.obsm['X_tsne']
        colnt = ["tSNE_" + str(i + 1) for i in range(tsne_coords.shape[1])]
        df = pd.DataFrame(tsne_coords, columns=colnt, index=corrected_adata.obs_names)
        df['batch'] = pd.Series(corrected_adata.obs['batch'], index=corrected_adata.obs_names)
        df['celltype'] = pd.Series(corrected_adata.obs['cell_type'], index=corrected_adata.obs_names)
        df.to_csv(os.path.join(save_dir, 'scGen_tsne.csv'))

        # Save as h5ad format. Corrected data have same dimensions as input data
        corrected_adata.write_h5ad(os.path.join(save_dir, 'output.h5ad'))

        # Write normalized data to csv file
        filename = 'output.csv'
        write_to_csv(corrected_adata.X, corrected_adata.var_names, corrected_adata.obs_names, filename, save_dir)

simul1_dropout_005_b1_500_b2_900
all
AnnData object with n_obs × n_vars = 1400 × 5000 
Index(['Cell1', 'Cell2', 'Cell3'], dtype='object')
Index(['Gene1', 'Gene2', 'Gene3'], dtype='object')
(1400, 4)
Index(['Cell', 'Batch', 'Group', 'ExpLibSize'], dtype='object')
1400
1400
Create a network
Instructions for updating:
Colocations handled automatically by placer.

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Use keras.layers.batch_normalization instead.
Instructions for updating:
Use keras.layers.dropout instead.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Train a network
Correct data
Index(['Cell2', '

... storing 'cell_type' as categorical


    finished (0:00:06.61) --> added
    'X_tsne', tSNE coordinates (adata.obsm)
HVG
AnnData object with n_obs × n_vars = 1400 × 712 
Index(['Cell1', 'Cell2', 'Cell3'], dtype='object')
Index(['Gene3', 'Gene24', 'Gene25'], dtype='object')
(1400, 4)
Index(['Cell', 'Batch', 'Group', 'ExpLibSize'], dtype='object')
1400
1400
Create a network
Train a network
Correct data
Index(['Cell2', 'Cell4', 'Cell7', 'Cell10'], dtype='object', name='cell_name')
AnnData object with n_obs × n_vars = 1400 × 5000 
    obs: 'cell_type', 'batch', 'cell_name'
computing tSNE
    using 'X_pca' with n_pcs = 20
    using the 'MulticoreTSNE' package by Ulyanov (2017)


... storing 'cell_type' as categorical


    finished (0:00:07.24) --> added
    'X_tsne', tSNE coordinates (adata.obsm)
HVG
AnnData object with n_obs × n_vars = 1400 × 740 
Index(['Cell1', 'Cell2', 'Cell3'], dtype='object')
Index(['Gene3', 'Gene24', 'Gene25'], dtype='object')
(1400, 4)
Index(['Cell', 'Batch', 'Group', 'ExpLibSize'], dtype='object')
1400
1400
Create a network
Train a network
Correct data
Index(['Cell2', 'Cell4', 'Cell7', 'Cell10'], dtype='object', name='cell_name')
AnnData object with n_obs × n_vars = 1400 × 740 
    obs: 'cell_type', 'batch', 'cell_name'
computing tSNE
    using 'X_pca' with n_pcs = 20
    using the 'MulticoreTSNE' package by Ulyanov (2017)


... storing 'cell_type' as categorical


    finished (0:00:07.70) --> added
    'X_tsne', tSNE coordinates (adata.obsm)
simul3_dropout_005_b1_500_b2_450
all
AnnData object with n_obs × n_vars = 950 × 5000 
Index(['Cell1', 'Cell2', 'Cell3'], dtype='object')
Index(['Gene1', 'Gene2', 'Gene3'], dtype='object')
(950, 4)
Index(['Cell', 'Batch', 'Group', 'ExpLibSize'], dtype='object')
950
950
Create a network
Train a network
Correct data
Index(['Cell2', 'Cell4', 'Cell7', 'Cell10'], dtype='object', name='cell_name')
AnnData object with n_obs × n_vars = 950 × 5000 
    obs: 'cell_type', 'batch', 'cell_name'
computing tSNE
    using 'X_pca' with n_pcs = 20
    using the 'MulticoreTSNE' package by Ulyanov (2017)


... storing 'cell_type' as categorical


    finished (0:00:04.81) --> added
    'X_tsne', tSNE coordinates (adata.obsm)
HVG
AnnData object with n_obs × n_vars = 950 × 699 
Index(['Cell1', 'Cell2', 'Cell3'], dtype='object')
Index(['Gene9', 'Gene15', 'Gene20'], dtype='object')
(950, 4)
Index(['Cell', 'Batch', 'Group', 'ExpLibSize'], dtype='object')
950
950
Create a network
Train a network
Correct data
Index(['Cell2', 'Cell4', 'Cell7', 'Cell10'], dtype='object', name='cell_name')
AnnData object with n_obs × n_vars = 950 × 699 
    obs: 'cell_type', 'batch', 'cell_name'
computing tSNE
    using 'X_pca' with n_pcs = 20
    using the 'MulticoreTSNE' package by Ulyanov (2017)


... storing 'cell_type' as categorical


    finished (0:00:04.95) --> added
    'X_tsne', tSNE coordinates (adata.obsm)
simul4_dropout_025_b1_500_b2_450
all
AnnData object with n_obs × n_vars = 950 × 5000 
Index(['Cell1', 'Cell2', 'Cell3'], dtype='object')
Index(['Gene1', 'Gene2', 'Gene3'], dtype='object')
(950, 4)
Index(['Cell', 'Batch', 'Group', 'ExpLibSize'], dtype='object')
950
950
Create a network
Train a network
Correct data
Index(['Cell2', 'Cell4', 'Cell7', 'Cell10'], dtype='object', name='cell_name')
AnnData object with n_obs × n_vars = 950 × 5000 
    obs: 'cell_type', 'batch', 'cell_name'
computing tSNE
    using 'X_pca' with n_pcs = 20
    using the 'MulticoreTSNE' package by Ulyanov (2017)


... storing 'cell_type' as categorical


    finished (0:00:04.40) --> added
    'X_tsne', tSNE coordinates (adata.obsm)
HVG
AnnData object with n_obs × n_vars = 950 × 720 
Index(['Cell1', 'Cell2', 'Cell3'], dtype='object')
Index(['Gene9', 'Gene15', 'Gene20'], dtype='object')
(950, 4)
Index(['Cell', 'Batch', 'Group', 'ExpLibSize'], dtype='object')
950
950
Create a network
Train a network
Correct data
Index(['Cell2', 'Cell4', 'Cell7', 'Cell10'], dtype='object', name='cell_name')
AnnData object with n_obs × n_vars = 950 × 720 
    obs: 'cell_type', 'batch', 'cell_name'
computing tSNE
    using 'X_pca' with n_pcs = 20
    using the 'MulticoreTSNE' package by Ulyanov (2017)


... storing 'cell_type' as categorical


    finished (0:00:05.12) --> added
    'X_tsne', tSNE coordinates (adata.obsm)
simul5_dropout_005_b1_80_b2_400
all
AnnData object with n_obs × n_vars = 480 × 5000 
Index(['Cell1', 'Cell2', 'Cell3'], dtype='object')
Index(['Gene1', 'Gene2', 'Gene3'], dtype='object')
(480, 4)
Index(['Cell', 'Batch', 'Group', 'ExpLibSize'], dtype='object')
480
480
Create a network
Train a network
Correct data
Index(['Cell2', 'Cell4', 'Cell7', 'Cell10'], dtype='object', name='cell_name')
AnnData object with n_obs × n_vars = 480 × 5000 
    obs: 'cell_type', 'batch', 'cell_name'
computing tSNE
    using 'X_pca' with n_pcs = 20
    using the 'MulticoreTSNE' package by Ulyanov (2017)


... storing 'cell_type' as categorical


    finished (0:00:02.13) --> added
    'X_tsne', tSNE coordinates (adata.obsm)
HVG
AnnData object with n_obs × n_vars = 480 × 647 
Index(['Cell1', 'Cell2', 'Cell3'], dtype='object')
Index(['Gene2', 'Gene13', 'Gene32'], dtype='object')
(480, 4)
Index(['Cell', 'Batch', 'Group', 'ExpLibSize'], dtype='object')
480
480
Create a network
Train a network
Correct data
Index(['Cell2', 'Cell4', 'Cell7', 'Cell10'], dtype='object', name='cell_name')
AnnData object with n_obs × n_vars = 480 × 647 
    obs: 'cell_type', 'batch', 'cell_name'
computing tSNE
    using 'X_pca' with n_pcs = 20
    using the 'MulticoreTSNE' package by Ulyanov (2017)


... storing 'cell_type' as categorical


    finished (0:00:02.27) --> added
    'X_tsne', tSNE coordinates (adata.obsm)
simul6_dropout_025_b1_80_b2_400
all
AnnData object with n_obs × n_vars = 480 × 5000 
Index(['Cell1', 'Cell2', 'Cell3'], dtype='object')
Index(['Gene1', 'Gene2', 'Gene3'], dtype='object')
(480, 4)
Index(['Cell', 'Batch', 'Group', 'ExpLibSize'], dtype='object')
480
480
Create a network
Train a network
Correct data
Index(['Cell2', 'Cell4', 'Cell7', 'Cell10'], dtype='object', name='cell_name')
AnnData object with n_obs × n_vars = 480 × 5000 
    obs: 'cell_type', 'batch', 'cell_name'
computing tSNE
    using 'X_pca' with n_pcs = 20
    using the 'MulticoreTSNE' package by Ulyanov (2017)


... storing 'cell_type' as categorical


    finished (0:00:02.16) --> added
    'X_tsne', tSNE coordinates (adata.obsm)
HVG
AnnData object with n_obs × n_vars = 480 × 676 
Index(['Cell1', 'Cell2', 'Cell3'], dtype='object')
Index(['Gene2', 'Gene13', 'Gene18'], dtype='object')
(480, 4)
Index(['Cell', 'Batch', 'Group', 'ExpLibSize'], dtype='object')
480
480
Create a network
Train a network
Correct data
Index(['Cell2', 'Cell4', 'Cell7', 'Cell10'], dtype='object', name='cell_name')
AnnData object with n_obs × n_vars = 480 × 676 
    obs: 'cell_type', 'batch', 'cell_name'
computing tSNE
    using 'X_pca' with n_pcs = 20
    using the 'MulticoreTSNE' package by Ulyanov (2017)


... storing 'cell_type' as categorical


    finished (0:00:02.33) --> added
    'X_tsne', tSNE coordinates (adata.obsm)
