In [5]:
# Scanorama batch effect correction
# See more at: https://github.com/brianhie/scanorama
# Hoa Tran 

import numpy as np
import pandas as pd
import matplotlib.pyplot as pl
from matplotlib import rcParams
import time
from datetime import timedelta
import scanpy as sc
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()

scanpy==1.4+18.gaabe446 anndata==0.6.17 numpy==1.15.4 scipy==1.1.0 pandas==0.23.4 scikit-learn==0.20.2 statsmodels==0.9.0 python-igraph==0.7.1 louvain==0.6.1 


In [6]:
# Create the folders that will hold the results and the figures
import os
dirname = os.getcwd()
print(dirname)
data_dir = '/acrc/jinmiao/CJM_lab/hoatran/demo_normalization/dataset/dataset7_Mouse_retina/final/'

# Results folder. The trailing slash is kept on purpose: later cells build file
# paths with plain string concatenation (save_dir + filename).
save_dir = os.path.join(dirname, 'results/results_dataset7_MouseRetina/')
# makedirs() also creates the missing parent folder ('results/'), and
# exist_ok=True makes the cell safe to re-run without a separate
# os.path.exists() check (which is racy and had to be repeated per level).
os.makedirs(save_dir, exist_ok=True)

# Figures folder used by save_images()
save_fig_dir='./figures/dataset7_MouseRetina/'
os.makedirs(save_fig_dir, exist_ok=True)
def save_images(filename, save_fig_dir):
    """Write the current matplotlib figure to a PNG file and close it.

    The output path is ``save_fig_dir + filename + '.png'``. ``save_fig_dir``
    is used as a plain string prefix, so it has to end with a path separator
    for the file to land inside that folder.

    Parameters
    ----------
    filename : str
        File name without extension.
    save_fig_dir : str
        Prefix (normally a folder path ending in '/') prepended to the name.
    """
    png_path = ''.join((save_fig_dir, filename, '.png'))
    pl.savefig(png_path, dpi=150)
    pl.close()

/acrc/jinmiao/CJM_lab/hoatran/demo_normalization/demo_Scanorama/scanorama_python


In [7]:
def load_data(data_dir, myDataFn, mySampleFn, batchid, batchlb,save_dir, savefn, saveh5ad=True):
    """Build an AnnData object from an expression table and an annotation table.

    Both files are read from ``data_dir`` as tab-separated text with a header
    row and the first column as index. The expression table is transposed, so
    its columns become observations (``obs_names``) and its rows become
    variables (``var_names``). The annotation table must contain a
    ``CellType`` column and be indexed by the observation names.

    Parameters
    ----------
    data_dir : str
        Folder containing both input files.
    myDataFn, mySampleFn : str
        File names of the expression table and of the annotation table.
    batchid, batchlb
        Values stored for every observation in ``obs['batch']`` and
        ``obs['batchlb']``.
    save_dir, savefn : str
        Folder and file name of the h5ad output (used only if ``saveh5ad``).
    saveh5ad : bool
        When true, write the object with ``write_h5ad``.

    Returns
    -------
    The AnnData object that was built.
    """
    expr_table = pd.read_csv(os.path.join(data_dir, myDataFn), sep='\t', header=0, index_col=0)
    # Quick look at the expression table: shape plus a few column / row labels
    print(expr_table.shape)
    print(expr_table.columns[1:3])
    print(expr_table.index[1:3])

    annot_table = pd.read_csv(os.path.join(data_dir, mySampleFn), sep='\t', header=0, index_col=0)
    print(annot_table.shape)
    print(annot_table.columns)
    print(annot_table.index[1:5])

    # Transpose so that the table's columns end up as observations
    adata = sc.AnnData(expr_table.values.T)
    adata.obs_names = expr_table.columns
    adata.var_names = expr_table.index

    # Annotations are looked up by observation name; batch id/label are
    # constant for the whole object and stored as given.
    adata.obs['cell_type'] = annot_table.loc[adata.obs_names,['CellType']]
    adata.obs['batch'] = batchid
    adata.obs['batchlb'] = batchlb

    # h5ad is lightweight and easy to load again (comparable to rds in R)
    if saveh5ad:
        h5ad_path = os.path.join(save_dir, savefn)
        adata.write_h5ad(h5ad_path)

    print(adata)
    return adata

In [8]:
# Folder holding the per-batch AnnData files (h5ad)
data_dir = '/acrc/jinmiao/CJM_lab/hoatran/demo_normalization/dataset/dataset7_Mouse_retina/raw_data_python/'
savefn1 = 'myRawData1.h5ad'
savefn2 = 'myRawData2.h5ad'

# Disabled alternative: build the two objects from the tab-separated text tables with load_data()
# adata1 = load_data(data_dir, 'b1_exprs.txt', 'b1_celltype.txt', 1, 'batch1', save_dir, savefn1, saveh5ad=False)
# adata2 = load_data(data_dir, 'b2_exprs.txt', 'b2_celltype.txt', 2, 'batch2', save_dir, savefn2, saveh5ad=False)

# Read both batches back from h5ad and show their summaries
adata1, adata2 = [sc.read_h5ad(os.path.join(data_dir, h5ad_name)) for h5ad_name in (savefn1, savefn2)]
print(adata1, adata2, sep='\n')

AnnData object with n_obs × n_vars = 26830 × 12333 
    obs: 'cell_type', 'batch', 'batchlb'
AnnData object with n_obs × n_vars = 44808 × 12333 
    obs: 'cell_type', 'batch', 'batchlb'


In [9]:
# Number of positions at which both batches list the same gene name
(adata2.var_names == adata1.var_names).sum()

12333

In [10]:
from scanorama import correct, visualize, process_data
from scanorama import dimensionality_reduce
import scanorama
adata_ls = [adata1, adata2]


In [11]:

# Batch correction (timed).
# correct_scanpy() receives the list of per-batch AnnData objects (adata_ls).
# The interval is measured with time.perf_counter(): unlike time.time() it is
# monotonic, so a system clock adjustment during this long-running call cannot
# distort t2 - t1. Only the difference of the two readings is meaningful.
t1 = time.perf_counter()
corrected = scanorama.correct_scanpy(adata_ls, batch_size=50, return_dense=True, knn=20)
# Alternative call kept for reference (disabled):
# corrected, genes = scanorama.correct(adata_ls, adata1.var_names,batch_size=30)
t2 = time.perf_counter()
print('Took '+str(timedelta(seconds=t2-t1)))

Found 12333 genes among all datasets
[[0.         0.51625047]
 [0.         0.        ]]
Processing datasets (0, 1)
Took 0:33:44.778367


In [16]:
# Stack the corrected matrices of all batches row-wise into a single AnnData.
# Iterating over `corrected` (instead of indexing [0] and [1]) keeps this cell
# valid for any number of batches; the row order follows the order of `corrected`.
adata = sc.AnnData(np.concatenate([batch.X for batch in corrected]))
print(adata)

AnnData object with n_obs × n_vars = 71638 × 12333 


In [17]:
# Sanity check: each corrected batch should still list its cells in the same
# order as the input batch (only the first 10 names are compared).
for corrected_batch, input_batch in zip(corrected, (adata1, adata2)):
    print(corrected_batch)
    print(corrected_batch.obs_names[0:10]==input_batch.obs_names[0:10])

AnnData object with n_obs × n_vars = 26830 × 12333 
    obs: 'cell_type', 'batch', 'batchlb'
[ True  True  True  True  True  True  True  True  True  True]
AnnData object with n_obs × n_vars = 44808 × 12333 
    obs: 'cell_type', 'batch', 'batchlb'
[ True  True  True  True  True  True  True  True  True  True]


In [18]:
# Re-attach the annotations to the concatenated object.
# Gene names are taken from the first corrected batch.
adata.var_names = corrected[0].var_names
# Cell names and per-cell metadata are concatenated batch after batch, i.e. in
# the order of `corrected`, which is the order in which the rows were stacked.
# Looping over the batches and the keys replaces three copy-pasted lines that
# were hard-coded to exactly two batches.
adata.obs_names = [name for batch in corrected for name in batch.obs_names.tolist()]
for obs_key in ('cell_type', 'batch', 'batchlb'):
    adata.obs[obs_key] = [value for batch in corrected for value in batch.obs[obs_key].tolist()]
adata

AnnData object with n_obs × n_vars = 71638 × 12333 
    obs: 'cell_type', 'batch', 'batchlb'

In [19]:
# PCA of the corrected matrix, keeping `npcs` components (stored in adata.obsm['X_pca']).
npcs = 20  
sc.tl.pca(adata, svd_solver='arpack', n_comps=npcs)  
# In-place sign flip of all PC scores. This must run right after sc.tl.pca in
# the same cell: executing the flip on its own a second time would undo it.
adata.obsm['X_pca'] *= -1  # multiply by -1 to match Seurat
adata

AnnData object with n_obs × n_vars = 71638 × 12333 
    obs: 'cell_type', 'batch', 'batchlb'
    uns: 'pca'
    obsm: 'X_pca'
    varm: 'PCs'

In [21]:
def getExecutionTime(t1, t2, save_dir, usecase_name,filename):
    """Print the elapsed time between two clock readings and save it as CSV.

    Parameters
    ----------
    t1, t2 : number
        Start and end readings in seconds; only ``t2 - t1`` is used.
    save_dir : str
        Output folder. It is prepended to ``filename`` as a plain string, so
        it has to end with a path separator.
    usecase_name : str
        Label written to the ``use_case`` column.
    filename : str
        Name of the CSV file to write.

    The CSV holds one row (index ``exetime``) with the columns ``use_case``,
    ``exetime_secs`` (total seconds, rounded), ``exetimehours``,
    ``exetimemins`` and ``exetimesecs`` (the same duration split up).
    Nothing is returned.
    """
    time_taken = (t2 - t1)
    # Round the total ONCE and split the rounded value. Rounding the total and
    # the leftover seconds independently could report e.g. 59.6 s as
    # "0 min 60 s" next to exetime_secs == 60.
    total_secs = round(time_taken)
    # Cast back to the input's numeric type so that float readings keep
    # producing '0.0' / '32.0' in the hour and minute columns.
    time_taken_hours, rest = divmod(type(time_taken)(total_secs), 3600)
    hours_mins, hours_secs = divmod(rest, 60)
    print('Took seconds: '+str(timedelta(seconds=total_secs)))
    # Tuple of (whole minutes, remaining seconds) of the unrounded duration
    print('Took minutes: '+str(divmod(time_taken, 60)))
    print('Took hours_minutes_seconds: ',str(time_taken_hours),str(hours_mins),str(hours_secs))

    data = {'use_case':usecase_name, 'exetime_secs':str(total_secs),
            'exetimehours': str(time_taken_hours),
            'exetimemins': str(hours_mins),
            'exetimesecs':str(round(hours_secs))}
    df = pd.DataFrame(data, index =['exetime'])
    print(df)
    df.to_csv(save_dir+filename)

# Record how long the main batch-effect removal call took (t1: start, t2: end)
filename = 'scanorama_exetime.csv'
usecase_name = 'scanorama_exetime' 
getExecutionTime(t1, t2, save_dir=save_dir, usecase_name=usecase_name, filename=filename)
print(save_dir)

Took seconds: 0:32:43
Took minutes: (32.0, 42.52858233451843)
Took hours_minutes_seconds:  0.0 32.0 42.52858233451843
                  use_case exetime_secs exetimehours exetimemins exetimesecs
exetime  scanorama_exetime         1963          0.0        32.0          43
/acrc/jinmiao/CJM_lab/hoatran/demo_normalization/demo_Scanorama/scanorama_python/results/results_dataset7_MouseRetina/


In [22]:
# Embedding parameters used by the plotting calls below
npcs = 20          # pre-defined number of principal components
perplex = 30       # pre-defined t-SNE perplexity
nb_neighbors = 15  # number of neighbours for the kNN graph

# save_images() comes from the set-up cell at the top of the notebook.

def plotTSNE(adata, color_group, save_filename='tsne', save_dir='', n_pcs=20, perplex=30, use_repx = False):
    """Compute a t-SNE embedding of ``adata``, plot it and save the figure.

    Parameters
    ----------
    adata
        AnnData object; ``sc.tl.tsne`` stores the embedding on it.
    color_group
        Passed as ``color`` to ``sc.pl.tsne`` (e.g. a list of ``obs`` keys).
    save_filename, save_dir : str
        Forwarded to ``save_images`` (file name without extension, and the
        prefix/folder it is written to).
    n_pcs : int
        Number of principal components used when ``use_repx`` is false.
    perplex
        t-SNE perplexity.
    use_repx : bool
        If true, run t-SNE on the full matrix (``use_rep='X'``) instead of
        on the principal components.
    """
    if use_repx:
        tsne_input = {'use_rep': 'X'}   # all genes, entire matrix
    else:
        tsne_input = {'n_pcs': n_pcs}   # principal components
    sc.tl.tsne(adata, perplexity=perplex, **tsne_input)
    sc.pl.tsne(adata, color=color_group, show=False, wspace=.3)
    save_images(save_filename, save_dir)
    
def plotUMAP(adata, color_group, save_filename='umap', save_dir='', npcs=20, nb_neighbors=15, use_repx = False):
    """Build the neighbour graph of ``adata``, compute UMAP, plot and save it.

    Parameters
    ----------
    adata
        AnnData object; the neighbour graph and the embedding are stored on it.
    color_group
        Passed as ``color`` to ``sc.pl.umap`` (e.g. a list of ``obs`` keys).
    save_filename, save_dir : str
        Forwarded to ``save_images`` (file name without extension, and the
        prefix/folder it is written to).
    npcs : int
        Number of principal components used when ``use_repx`` is false.
    nb_neighbors : int
        Neighbourhood size of the graph, honoured in both modes.
    use_repx : bool
        If true, build the graph on the full matrix (``use_rep='X'``)
        instead of on the principal components.
    """
    if use_repx:
        # Full matrix. n_neighbors is passed here as well: this branch used to
        # omit it, so a caller-supplied nb_neighbors was silently ignored.
        sc.pp.neighbors(adata, n_neighbors=nb_neighbors, use_rep='X')
    else:
        # Principal components
        sc.pp.neighbors(adata, n_neighbors=nb_neighbors, n_pcs=npcs)

    sc.tl.umap(adata)
    sc.pl.umap(adata, color=color_group, show=False)
    save_images(save_filename, save_dir)

    
# Plot the corrected data coloured by batch label and by cell type
color_group = ["batchlb","cell_type"] 
save_fn_tsne = 'scanorama_tsne'
save_fn_umap = 'scanorama_umap'
print(save_fig_dir)
plotUMAP(adata, color_group, save_filename=save_fn_umap, save_dir=save_fig_dir,
         npcs=npcs, nb_neighbors=nb_neighbors, use_repx=False)
plotTSNE(adata, color_group, save_filename=save_fn_tsne, save_dir=save_fig_dir,
         n_pcs=npcs, perplex=perplex, use_repx=False)

./figures/dataset7_MouseRetina/
computing neighbors
    using 'X_pca' with n_pcs = 20
    finished (0:00:50.88) --> added to `.uns['neighbors']`
    'distances', distances for each pair of neighbors
    'connectivities', weighted adjacency matrix
computing UMAP


... storing 'cell_type' as categorical
... storing 'batchlb' as categorical


    finished (0:01:22.60) --> added
    'X_umap', UMAP coordinates (adata.obsm)
computing tSNE
    using 'X_pca' with n_pcs = 20
    using the 'MulticoreTSNE' package by Ulyanov (2017)
    finished (0:11:56.13) --> added
    'X_tsne', tSNE coordinates (adata.obsm)


In [23]:
def save_output_txt(adata, save_dir): 
    """Export the UMAP, t-SNE and PCA coordinates of ``adata`` as CSV files.

    Three files are written to ``save_dir``. Each has one row per cell
    (indexed by ``adata.obs_names``), the coordinate columns, and the
    annotation columns ``batch``, ``batchlb`` and ``celltype`` taken from
    ``adata.obs['batch']``, ``['batchlb']`` and ``['cell_type']``:

    * ``scanorama_umap.csv`` -- every column of ``obsm['X_umap']`` (``UMAP1``, ...)
    * ``scanorama_tsne.csv`` -- every column of ``obsm['X_tsne']`` (``tSNE_1``, ...)
    * ``scanorama_pca.csv``  -- at most the first 20 columns of ``obsm['X_pca']``
      (``X_pca1``, ...), saved for the ASW evaluation

    Parameters
    ----------
    adata
        Object exposing ``obsm``, ``obs`` and ``obs_names`` as described above.
    save_dir : str
        Output folder; it is prepended to the file names as a plain string,
        so it has to end with a path separator.
    """
    # (obsm key, column-name prefix, max. number of columns or None for all, file name)
    exports = (
        ('X_umap', 'UMAP', None, 'scanorama_umap.csv'),
        ('X_tsne', 'tSNE_', None, 'scanorama_tsne.csv'),
        ('X_pca', 'X_pca', 20, 'scanorama_pca.csv'),
    )
    for obsm_key, prefix, max_dims, out_name in exports:
        coords = adata.obsm[obsm_key]
        n_dims = coords.shape[1]
        if max_dims is not None:
            # Cap at the columns that actually exist: 20 hard-coded column
            # names made DataFrame() fail when fewer than 20 PCs were stored.
            n_dims = min(n_dims, max_dims)
        columns = [prefix + str(i + 1) for i in range(n_dims)]

        df = pd.DataFrame(coords[:, :n_dims], columns=columns, index=adata.obs_names)
        df['batch'] = pd.Series(adata.obs['batch'], index=adata.obs_names)
        df['batchlb'] = pd.Series(adata.obs['batchlb'], index=adata.obs_names)
        df['celltype'] = pd.Series(adata.obs['cell_type'], index=adata.obs_names)
        df.to_csv(save_dir + out_name)

# Apply to our data    
save_output_txt(adata, save_dir)

In [24]:
savefn = 'scanorama_normalized_adata.h5ad'
adata.write_h5ad(os.path.join(save_dir,savefn))

In [16]:
# Resume point: reload a saved AnnData object instead of recomputing the correction.
# NOTE(review): the earlier cell wrote this file name under
# 'results/results_dataset7_MouseRetina/', while this cell reads it from
# 'results_without_normalization/results_dataset7_MouseRetina/'. Presumably the
# results folder was renamed or moved by hand between runs -- confirm, because
# otherwise this cell fails on a clean Restart & Run All.
savefn = 'scanorama_normalized_adata.h5ad'
# save_dir is rebound here, so every later save_dir-based output goes to this folder.
save_dir = os.path.join(dirname, 'results_without_normalization/results_dataset7_MouseRetina/')
adata = sc.read_h5ad(os.path.join(save_dir,savefn))
adata

AnnData object with n_obs × n_vars = 71638 × 12333 
    obs: 'cell_type', 'batch', 'batchlb'
    uns: 'batchlb_colors', 'cell_type_colors', 'neighbors', 'pca'
    obsm: 'X_pca', 'X_umap', 'X_tsne'
    varm: 'PCs'

In [17]:
# External fastMNN PCA table; its index (cell names) is used below to subset adata
data_dir_1 = '/acrc/jinmiao/CJM_lab/hoatran/demo_normalization/xiaomeng/generate_PCA_tSNE_UMAP_v1.1/dataset7/'
fn = 'dataset7_fastMNN_pca.csv'
# Comma-separated (the read_csv default), header row, first column as index
mypca = pd.read_csv(os.path.join(data_dir_1, fn), header=0, index_col=0)
# Quick look: shape plus a few column / row labels
print(mypca.shape)
print(mypca.columns[1:3])
print(mypca.index[1:3])

(62732, 23)
Index(['V2', 'V3'], dtype='object')
Index(['Bipolar1_CAAAGCATTTGC', 'Bipolar1_CTTTTGATTGAC'], dtype='object')


In [18]:
adata_ext = adata[mypca.index,:].copy()
adata_ext

AnnData object with n_obs × n_vars = 62732 × 12333 
    obs: 'cell_type', 'batch', 'batchlb'
    uns: 'batchlb_colors', 'cell_type_colors', 'neighbors', 'pca'
    obsm: 'X_pca', 'X_umap', 'X_tsne'
    varm: 'PCs'

In [19]:
print(save_fig_dir)

./figures/dataset7_MouseRetina/


In [20]:
# Recompute PCA on the cell subset; this replaces the 'X_pca' entry that
# adata_ext carried over from the full object.
npcs = 20  
sc.tl.pca(adata_ext, svd_solver='arpack', n_comps=npcs)  
# In-place sign flip of all PC scores. This must run right after sc.tl.pca:
# executing the flip on its own a second time would undo it.
adata_ext.obsm['X_pca'] *= -1  # multiply by -1 to match Seurat
adata_ext

# Embedding parameters used by the plotting calls below
npcs = 20          # pre-defined number of principal components
perplex = 30       # pre-defined t-SNE perplexity
nb_neighbors = 15  # number of neighbours for the kNN graph

# save_images() comes from the set-up cell at the top of the notebook.

# Same definition as in the earlier plotting cell (presumably repeated so this
# resumed section can run without it).
def plotTSNE(adata, color_group, save_filename='tsne', save_dir='', n_pcs=20, perplex=30, use_repx = False):
    """Compute a t-SNE embedding of ``adata``, plot it and save the figure.

    Parameters
    ----------
    adata
        AnnData object; ``sc.tl.tsne`` stores the embedding on it.
    color_group
        Passed as ``color`` to ``sc.pl.tsne`` (e.g. a list of ``obs`` keys).
    save_filename, save_dir : str
        Forwarded to ``save_images`` (file name without extension, and the
        prefix/folder it is written to).
    n_pcs : int
        Number of principal components used when ``use_repx`` is false.
    perplex
        t-SNE perplexity.
    use_repx : bool
        If true, run t-SNE on the full matrix (``use_rep='X'``) instead of
        on the principal components.
    """
    if use_repx:
        tsne_input = {'use_rep': 'X'}   # all genes, entire matrix
    else:
        tsne_input = {'n_pcs': n_pcs}   # principal components
    sc.tl.tsne(adata, perplexity=perplex, **tsne_input)
    sc.pl.tsne(adata, color=color_group, show=False, wspace=.3)
    save_images(save_filename, save_dir)
    
def plotUMAP(adata, color_group, save_filename='umap', save_dir='', npcs=20, nb_neighbors=15, use_repx = False):
    """Build the neighbour graph of ``adata``, compute UMAP, plot and save it.

    Parameters
    ----------
    adata
        AnnData object; the neighbour graph and the embedding are stored on it.
    color_group
        Passed as ``color`` to ``sc.pl.umap`` (e.g. a list of ``obs`` keys).
    save_filename, save_dir : str
        Forwarded to ``save_images`` (file name without extension, and the
        prefix/folder it is written to).
    npcs : int
        Number of principal components used when ``use_repx`` is false.
    nb_neighbors : int
        Neighbourhood size of the graph, honoured in both modes.
    use_repx : bool
        If true, build the graph on the full matrix (``use_rep='X'``)
        instead of on the principal components.
    """
    if use_repx:
        # Full matrix. n_neighbors is passed here as well: this branch used to
        # omit it, so a caller-supplied nb_neighbors was silently ignored.
        sc.pp.neighbors(adata, n_neighbors=nb_neighbors, use_rep='X')
    else:
        # Principal components
        sc.pp.neighbors(adata, n_neighbors=nb_neighbors, n_pcs=npcs)

    sc.tl.umap(adata)
    sc.pl.umap(adata, color=color_group, show=False)
    save_images(save_filename, save_dir)

    
# Plot the cell subset coloured by batch label and by cell type.
# NOTE: file names and figure folder are the same as for the full-data plots,
# so PNGs written by that earlier cell are overwritten.
color_group = ["batchlb","cell_type"] 
save_fn_tsne = 'scanorama_tsne'
save_fn_umap = 'scanorama_umap'
print(save_fig_dir)
plotUMAP(adata_ext, color_group, save_filename=save_fn_umap, save_dir=save_fig_dir,
         npcs=npcs, nb_neighbors=nb_neighbors, use_repx=False)
plotTSNE(adata_ext, color_group, save_filename=save_fn_tsne, save_dir=save_fig_dir,
         n_pcs=npcs, perplex=perplex, use_repx=False)

./figures/dataset7_MouseRetina/
computing neighbors
    using 'X_pca' with n_pcs = 20
    finished (0:00:45.61) --> added to `.uns['neighbors']`
    'distances', distances for each pair of neighbors
    'connectivities', weighted adjacency matrix
computing UMAP
    finished (0:01:12.94) --> added
    'X_umap', UMAP coordinates (adata.obsm)
computing tSNE
    using 'X_pca' with n_pcs = 20
    using the 'MulticoreTSNE' package by Ulyanov (2017)
    finished (0:10:26.01) --> added
    'X_tsne', tSNE coordinates (adata.obsm)


In [24]:
def save_output_txt(adata, save_dir): 
    """Export the UMAP, t-SNE and PCA coordinates of ``adata`` as CSV files.

    Three files are written to ``save_dir``. Each has one row per cell
    (indexed by ``adata.obs_names``), the coordinate columns, and the
    annotation columns ``batch``, ``blb`` and ``celltype`` taken from
    ``adata.obs['batch']``, ``['batchlb']`` and ``['cell_type']``:

    * ``scanorama_umap.csv`` -- every column of ``obsm['X_umap']`` (``UMAP1``, ...)
    * ``scanorama_tsne.csv`` -- every column of ``obsm['X_tsne']`` (``tSNE_1``, ...)
    * ``scanorama_pca.csv``  -- at most the first 20 columns of ``obsm['X_pca']``
      (``X_pca1``, ...), saved for the ASW evaluation

    NOTE(review): this definition names the batch-label column ``blb``, while
    the earlier definition of ``save_output_txt`` in this notebook names it
    ``batchlb``. The name is kept as found; confirm which one the downstream
    evaluation expects.

    Parameters
    ----------
    adata
        Object exposing ``obsm``, ``obs`` and ``obs_names`` as described above.
    save_dir : str
        Output folder; it is prepended to the file names as a plain string,
        so it has to end with a path separator.
    """
    # (obsm key, column-name prefix, max. number of columns or None for all, file name)
    exports = (
        ('X_umap', 'UMAP', None, 'scanorama_umap.csv'),
        ('X_tsne', 'tSNE_', None, 'scanorama_tsne.csv'),
        ('X_pca', 'X_pca', 20, 'scanorama_pca.csv'),
    )
    for obsm_key, prefix, max_dims, out_name in exports:
        coords = adata.obsm[obsm_key]
        n_dims = coords.shape[1]
        if max_dims is not None:
            # Cap at the columns that actually exist: 20 hard-coded column
            # names made DataFrame() fail when fewer than 20 PCs were stored.
            n_dims = min(n_dims, max_dims)
        columns = [prefix + str(i + 1) for i in range(n_dims)]

        df = pd.DataFrame(coords[:, :n_dims], columns=columns, index=adata.obs_names)
        df['batch'] = pd.Series(adata.obs['batch'], index=adata.obs_names)
        df['blb'] = pd.Series(adata.obs['batchlb'], index=adata.obs_names)
        df['celltype'] = pd.Series(adata.obs['cell_type'], index=adata.obs_names)
        df.to_csv(save_dir + out_name)

# Apply to our data    
save_output_txt(adata_ext, save_dir)

In [27]:
save_dir

'/acrc/jinmiao/CJM_lab/hoatran/demo_normalization/demo_Scanorama/scanorama_python/results_without_normalization/results_dataset7_MouseRetina/'

In [25]:
adata_ext.obsm['X_umap'].shape

(62732, 2)