In [12]:
import numpy as np
import scanorama
import pandas as pd
import matplotlib.pyplot as pl
from matplotlib import rcParams
import scanpy as sc
import os

In [13]:
from scanorama import correct, visualize, process_data
from scanorama import dimensionality_reduce

In [14]:
def plotTSNE(adata, color_group, n_pcs=20, perplexity=90, save_filename='tsne', use_repx = False):
    """Compute a t-SNE embedding on ``adata``, draw it and save the figure.

    Parameters
    ----------
    adata
        Object handed to ``sc.tl.tsne`` and ``sc.pl.tsne``.
    color_group
        Forwarded as ``color`` to ``sc.pl.tsne``.
    n_pcs, perplexity
        Forwarded unchanged to ``sc.tl.tsne``.
    save_filename
        Base name passed to ``save_images``.
    use_repx
        When truthy, ``use_rep='X'`` is additionally passed to ``sc.tl.tsne``.
    """
    # Arguments common to both code paths; the seed is fixed at 0.
    tsne_kwargs = dict(random_state=0, n_pcs=n_pcs, perplexity=perplexity)
    if use_repx:
        tsne_kwargs['use_rep'] = 'X'
    sc.tl.tsne(adata, **tsne_kwargs)

    # show=False keeps the figure open so that save_images() can write it out.
    sc.pl.tsne(adata, color=color_group, show=False, wspace=.4)
    save_images(save_filename)

In [15]:
def write_to_csv(mat, genesname, cellsname, filename, save_dir):
    """Write a matrix to ``<save_dir>/<filename>`` as a labelled CSV file.

    Parameters
    ----------
    mat
        The values to write. Objects exposing ``toarray()`` (for example
        SciPy sparse matrices) are densified through it; everything else
        (NumPy arrays, nested lists, DataFrames, ...) goes through
        ``np.asarray``.
    genesname
        Column labels of the resulting table.
    cellsname
        Row labels (index) of the resulting table.
    filename
        Name of the CSV file to create.
    save_dir
        Directory in which the file is written; it must already exist.
    """
    # Dispatch on capability rather than on "is not an ndarray": the old
    # else-branch crashed with AttributeError for lists and DataFrames.
    if hasattr(mat, 'toarray'):
        dense = mat.toarray()
    else:
        dense = np.asarray(mat)

    df = pd.DataFrame(dense, columns=genesname, index=cellsname)
    df.to_csv(os.path.join(save_dir, filename))

In [16]:
def save_images(basename, out_dir=None):
    """Save the current matplotlib figure as ``<basename>.png`` and close it.

    Parameters
    ----------
    basename : str
        File name without extension; ``'.png'`` is appended.
    out_dir : str, optional
        Directory that receives the image. When omitted, the notebook-level
        global ``save_dir`` is used, so existing ``save_images(name)`` calls
        keep working. Passing it explicitly removes the dependency on that
        global being defined by an earlier cell.
    """
    if out_dir is None:
        # Backward-compatible fallback: `save_dir` is assigned by the driver
        # loop elsewhere in the notebook and raises NameError if it is not set.
        out_dir = save_dir
    # savefig does not create missing directories, so make sure it exists.
    os.makedirs(out_dir, exist_ok=True)
    outname = os.path.join(out_dir, basename + '.png')
    pl.savefig(outname, dpi=150)
    pl.close()

In [17]:
# Root folder under which the per-simulation result folders are created.
dirname = '/acrc/jinmiao/CJM_lab/Marion/Project/Hoa_batch_normalization/simulation_dataset_V3/demo_scanorama/'
# Root folder from which the per-simulation input files are read.
data_dir = '/acrc/jinmiao/CJM_lab/Marion/Project/Hoa_batch_normalization/simulation_dataset_V3/data/'
for _root in (dirname, data_dir):
    print(_root)

# Names of the simulated datasets to process (one sub-folder each).
simulation = [
    'simul1_dropout_005_b1_500_b2_900',
    'simul2_dropout_025_b1_500_b2_900',
    'simul3_dropout_005_b1_500_b2_450',
    'simul4_dropout_025_b1_500_b2_450',
    'simul5_dropout_005_b1_80_b2_400',
    'simul6_dropout_025_b1_80_b2_400',
]
# Gene sets to run for every dataset.
counts = ['all', 'HVG']

/acrc/jinmiao/CJM_lab/Marion/Project/demo_scanorama/
/acrc/jinmiao/CJM_lab/Marion/Project/data/


In [18]:
# Settings shared by every run; hoisted because they never change inside the loop.
npcs = 20                              # principal components computed and passed to t-SNE
perplex = 30                           # t-SNE perplexity
color_group = ["cell_type", "batch"]   # obs columns used to colour the t-SNE plots

# Run Scanorama on every simulated dataset, once per gene set, and write the
# t-SNE figure, t-SNE coordinates, corrected h5ad and corrected CSV for each.
for simu_name in simulation:
    print(simu_name)
    for genes in counts:
        print(genes)

        # Create folder to save the results (intermediate folders included).
        # NOTE: `save_dir` is also read as a global by save_images().
        save_dir = os.path.join(dirname, simu_name, genes)
        os.makedirs(save_dir, exist_ok=True)

        # read data
        counts_file = 'counts_HVG.txt' if genes == 'HVG' else 'counts.txt'
        counts_df = pd.read_csv(os.path.join(data_dir, simu_name, counts_file),
                                sep='\t', header=0, index_col=0)

        # transform to Scanpy object
        adata = sc.AnnData(counts_df)
        print(adata)
        print(adata.obs_names[0:3])  # cells: observation
        print(adata.var_names[0:3])  # genes: variable

        # Read sample file, which contain celltype and batch info
        sample_adata = pd.read_csv(os.path.join(data_dir, simu_name, 'cellinfo.txt'),
                                   header=0, index_col=0, sep='\t')
        print(sample_adata.values.shape)
        print(sample_adata.keys())

        # Save label information into adata object.
        # Select a Series (not a one-column DataFrame) so the assignment is a
        # plain index-aligned column assignment.
        adata.obs['cell_type'] = sample_adata.loc[adata.obs_names, 'Group']
        adata.obs['batch'] = sample_adata.loc[adata.obs_names, 'Batch']
        print(len(adata.obs['cell_type']))
        print(len(adata.obs['batch']))

        # separate the object into two objects
        #sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
        #sc.pp.log1p(adata)
        batch1 = adata[adata.obs['batch'] == 'Batch1', :].copy()
        print(batch1)
        batch2 = adata[adata.obs['batch'] == 'Batch2', :].copy()
        print(batch2)

        # scanorama run
        adata_ls = [batch1, batch2]
        corrected = scanorama.correct_scanpy(adata_ls, batch_size=30, return_dense=True, knn=10)
        corrected_adata = sc.AnnData(np.concatenate([part.X for part in corrected]))
        print(corrected_adata)

        # The rows above are stacked batch by batch, so the cell annotations
        # must be stacked the same way. Copying them from `adata` is only
        # correct when the input happens to be sorted by batch already.
        corrected_obs = pd.concat([part.obs for part in corrected])
        # NOTE(review): Scanorama's merge step may reorder genes; taking the
        # names from its output avoids assuming the input order was kept.
        # Confirm against the installed scanorama version.
        corrected_adata.var_names = corrected[0].var_names
        corrected_adata.obs_names = corrected_obs.index
        corrected_adata.obs = corrected_obs

        # run PCA
        sc.tl.pca(corrected_adata, svd_solver='arpack', n_comps=npcs)
        corrected_adata.obsm['X_pca'] *= -1  # multiply by -1 to match Seurat
        # sc.pp.neighbors(corrected_adata,n_neighbors=15, n_pcs=20)

        # plot tSNE
        # take into account only first `npcs` PCs vectors, important parameter, can affect the output
        corrected_adata.obs['batch'] = corrected_adata.obs['batch'].astype('category')  # factor function in R
        plotTSNE(corrected_adata, color_group, npcs, perplex, 'tsne_scanorama_corrected')

        # tSNE coordinates, one "tSNE_<k>" column per embedding dimension
        colnt = ["tSNE_" + str(i + 1) for i in range(corrected_adata.obsm['X_tsne'].shape[1])]
        df = pd.DataFrame(corrected_adata.obsm['X_tsne'], columns=colnt, index=corrected_adata.obs_names)
        df['batch'] = pd.Series(corrected_adata.obs['batch'], index=corrected_adata.obs_names)
        df['celltype'] = pd.Series(corrected_adata.obs['cell_type'], index=corrected_adata.obs_names)
        df.to_csv(os.path.join(save_dir, 'scanorama_tsne.csv'))

        # Save as h5ad format. Corrected data have same dimensions as input data
        corrected_adata.write_h5ad(os.path.join(save_dir, 'output.h5ad'))

        # Write corrected data to csv file
        filename = 'output.csv'
        write_to_csv(corrected_adata.X, corrected_adata.var_names, corrected_adata.obs_names, filename, save_dir)

simul1_dropout_005_b1_500_b2_900
all
AnnData object with n_obs × n_vars = 1400 × 5000 
Index(['Cell1', 'Cell2', 'Cell3'], dtype='object')
Index(['Gene1', 'Gene2', 'Gene3'], dtype='object')
(1400, 4)
Index(['Cell', 'Batch', 'Group', 'ExpLibSize'], dtype='object')
1400
1400
AnnData object with n_obs × n_vars = 500 × 5000 
    obs: 'cell_type', 'batch'
AnnData object with n_obs × n_vars = 900 × 5000 
    obs: 'cell_type', 'batch'
Found 5000 genes among all datasets
[[0.    0.712]
 [0.    0.   ]]
Processing datasets (0, 1)
AnnData object with n_obs × n_vars = 1400 × 5000 


... storing 'cell_type' as categorical


HVG
AnnData object with n_obs × n_vars = 1400 × 712 
Index(['Cell1', 'Cell2', 'Cell3'], dtype='object')
Index(['Gene3', 'Gene24', 'Gene25'], dtype='object')
(1400, 4)
Index(['Cell', 'Batch', 'Group', 'ExpLibSize'], dtype='object')
1400
1400
AnnData object with n_obs × n_vars = 500 × 712 
    obs: 'cell_type', 'batch'
AnnData object with n_obs × n_vars = 900 × 712 
    obs: 'cell_type', 'batch'
Found 712 genes among all datasets
[[0.    0.786]
 [0.    0.   ]]
Processing datasets (0, 1)
AnnData object with n_obs × n_vars = 1400 × 712 


... storing 'cell_type' as categorical


simul2_dropout_025_b1_500_b2_900
all
AnnData object with n_obs × n_vars = 1400 × 5000 
Index(['Cell1', 'Cell2', 'Cell3'], dtype='object')
Index(['Gene1', 'Gene2', 'Gene3'], dtype='object')
(1400, 4)
Index(['Cell', 'Batch', 'Group', 'ExpLibSize'], dtype='object')
1400
1400
AnnData object with n_obs × n_vars = 500 × 5000 
    obs: 'cell_type', 'batch'
AnnData object with n_obs × n_vars = 900 × 5000 
    obs: 'cell_type', 'batch'
Found 5000 genes among all datasets
[[0.    0.724]
 [0.    0.   ]]
Processing datasets (0, 1)
AnnData object with n_obs × n_vars = 1400 × 5000 


... storing 'cell_type' as categorical


HVG
AnnData object with n_obs × n_vars = 1400 × 740 
Index(['Cell1', 'Cell2', 'Cell3'], dtype='object')
Index(['Gene3', 'Gene24', 'Gene25'], dtype='object')
(1400, 4)
Index(['Cell', 'Batch', 'Group', 'ExpLibSize'], dtype='object')
1400
1400
AnnData object with n_obs × n_vars = 500 × 740 
    obs: 'cell_type', 'batch'
AnnData object with n_obs × n_vars = 900 × 740 
    obs: 'cell_type', 'batch'
Found 740 genes among all datasets
[[0.    0.816]
 [0.    0.   ]]
Processing datasets (0, 1)
AnnData object with n_obs × n_vars = 1400 × 740 


... storing 'cell_type' as categorical


simul3_dropout_005_b1_500_b2_450
all
AnnData object with n_obs × n_vars = 950 × 5000 
Index(['Cell1', 'Cell2', 'Cell3'], dtype='object')
Index(['Gene1', 'Gene2', 'Gene3'], dtype='object')
(950, 4)
Index(['Cell', 'Batch', 'Group', 'ExpLibSize'], dtype='object')
950
950
AnnData object with n_obs × n_vars = 500 × 5000 
    obs: 'cell_type', 'batch'
AnnData object with n_obs × n_vars = 450 × 5000 
    obs: 'cell_type', 'batch'
Found 5000 genes among all datasets
[[0.         0.79555556]
 [0.         0.        ]]
Processing datasets (0, 1)
AnnData object with n_obs × n_vars = 950 × 5000 


... storing 'cell_type' as categorical


HVG
AnnData object with n_obs × n_vars = 950 × 699 
Index(['Cell1', 'Cell2', 'Cell3'], dtype='object')
Index(['Gene9', 'Gene15', 'Gene20'], dtype='object')
(950, 4)
Index(['Cell', 'Batch', 'Group', 'ExpLibSize'], dtype='object')
950
950
AnnData object with n_obs × n_vars = 500 × 699 
    obs: 'cell_type', 'batch'
AnnData object with n_obs × n_vars = 450 × 699 
    obs: 'cell_type', 'batch'
Found 699 genes among all datasets
[[0.         0.74444444]
 [0.         0.        ]]
Processing datasets (0, 1)
AnnData object with n_obs × n_vars = 950 × 699 


... storing 'cell_type' as categorical


simul4_dropout_025_b1_500_b2_450
all
AnnData object with n_obs × n_vars = 950 × 5000 
Index(['Cell1', 'Cell2', 'Cell3'], dtype='object')
Index(['Gene1', 'Gene2', 'Gene3'], dtype='object')
(950, 4)
Index(['Cell', 'Batch', 'Group', 'ExpLibSize'], dtype='object')
950
950
AnnData object with n_obs × n_vars = 500 × 5000 
    obs: 'cell_type', 'batch'
AnnData object with n_obs × n_vars = 450 × 5000 
    obs: 'cell_type', 'batch'
Found 5000 genes among all datasets
[[0.         0.85333333]
 [0.         0.        ]]
Processing datasets (0, 1)
AnnData object with n_obs × n_vars = 950 × 5000 


... storing 'cell_type' as categorical


HVG
AnnData object with n_obs × n_vars = 950 × 720 
Index(['Cell1', 'Cell2', 'Cell3'], dtype='object')
Index(['Gene9', 'Gene15', 'Gene20'], dtype='object')
(950, 4)
Index(['Cell', 'Batch', 'Group', 'ExpLibSize'], dtype='object')
950
950
AnnData object with n_obs × n_vars = 500 × 720 
    obs: 'cell_type', 'batch'
AnnData object with n_obs × n_vars = 450 × 720 
    obs: 'cell_type', 'batch'
Found 720 genes among all datasets
[[0.         0.75111111]
 [0.         0.        ]]
Processing datasets (0, 1)
AnnData object with n_obs × n_vars = 950 × 720 


... storing 'cell_type' as categorical


simul5_dropout_005_b1_80_b2_400
all
AnnData object with n_obs × n_vars = 480 × 5000 
Index(['Cell1', 'Cell2', 'Cell3'], dtype='object')
Index(['Gene1', 'Gene2', 'Gene3'], dtype='object')
(480, 4)
Index(['Cell', 'Batch', 'Group', 'ExpLibSize'], dtype='object')
480
480
AnnData object with n_obs × n_vars = 80 × 5000 
    obs: 'cell_type', 'batch'
AnnData object with n_obs × n_vars = 400 × 5000 
    obs: 'cell_type', 'batch'
Found 5000 genes among all datasets
[[0.     0.9875]
 [0.     0.    ]]
Processing datasets (0, 1)
AnnData object with n_obs × n_vars = 480 × 5000 


... storing 'cell_type' as categorical


HVG
AnnData object with n_obs × n_vars = 480 × 647 
Index(['Cell1', 'Cell2', 'Cell3'], dtype='object')
Index(['Gene2', 'Gene13', 'Gene32'], dtype='object')
(480, 4)
Index(['Cell', 'Batch', 'Group', 'ExpLibSize'], dtype='object')
480
480
AnnData object with n_obs × n_vars = 80 × 647 
    obs: 'cell_type', 'batch'
AnnData object with n_obs × n_vars = 400 × 647 
    obs: 'cell_type', 'batch'
Found 647 genes among all datasets
[[0.   0.95]
 [0.   0.  ]]
Processing datasets (0, 1)
AnnData object with n_obs × n_vars = 480 × 647 


... storing 'cell_type' as categorical


simul6_dropout_025_b1_80_b2_400
all
AnnData object with n_obs × n_vars = 480 × 5000 
Index(['Cell1', 'Cell2', 'Cell3'], dtype='object')
Index(['Gene1', 'Gene2', 'Gene3'], dtype='object')
(480, 4)
Index(['Cell', 'Batch', 'Group', 'ExpLibSize'], dtype='object')
480
480
AnnData object with n_obs × n_vars = 80 × 5000 
    obs: 'cell_type', 'batch'
AnnData object with n_obs × n_vars = 400 × 5000 
    obs: 'cell_type', 'batch'
Found 5000 genes among all datasets
[[0. 1.]
 [0. 0.]]
Processing datasets (0, 1)
AnnData object with n_obs × n_vars = 480 × 5000 


... storing 'cell_type' as categorical


HVG
AnnData object with n_obs × n_vars = 480 × 676 
Index(['Cell1', 'Cell2', 'Cell3'], dtype='object')
Index(['Gene2', 'Gene13', 'Gene18'], dtype='object')
(480, 4)
Index(['Cell', 'Batch', 'Group', 'ExpLibSize'], dtype='object')
480
480
AnnData object with n_obs × n_vars = 80 × 676 
    obs: 'cell_type', 'batch'
AnnData object with n_obs × n_vars = 400 × 676 
    obs: 'cell_type', 'batch'
Found 676 genes among all datasets
[[0.    0.975]
 [0.    0.   ]]
Processing datasets (0, 1)
AnnData object with n_obs × n_vars = 480 × 676 


... storing 'cell_type' as categorical
