In [3]:
# Scanorama batch effect correction
# See more at: https://github.com/brianhie/scanorama
# Hoa Tran 

import numpy as np
import pandas as pd
import matplotlib.pyplot as pl
from matplotlib import rcParams
import time
from datetime import timedelta
import scanpy as sc
# Use the highest of the verbosity levels listed below, so each scanpy step logs its hints.
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
# Print the versions of scanpy and its dependencies into the cell output for this run.
sc.logging.print_versions()

scanpy==1.4+18.gaabe446 anndata==0.6.17 numpy==1.15.4 scipy==1.1.0 pandas==0.23.4 scikit-learn==0.20.2 statsmodels==0.9.0 python-igraph==0.7.1 louvain==0.6.1 


In [4]:
# Create the folders that receive this run's result tables and figures.
import os
dirname = os.getcwd()
print(dirname)

# Both paths deliberately end with '/', so that appending a file name directly
# to either string still yields a valid path.
# Folder for result tables and the exported AnnData object (absolute path).
save_dir = os.path.join(dirname, 'results/results_dataset12_cellline/')
# Folder where save_images() writes the figures (relative to the working directory).
save_fig_dir = './figures/dataset12_cellline/'

# makedirs() creates missing parent folders itself; exist_ok=True makes the
# cell safe to re-run and removes the check-then-create race of
# `if not os.path.exists(...)`.
os.makedirs(save_dir, exist_ok=True)
os.makedirs(save_fig_dir, exist_ok=True)
def save_images(filename, save_fig_dir):
    """Save the current matplotlib figure as ``<filename>.png`` and close it.

    Parameters
    ----------
    filename : str
        File name without the ``.png`` extension.
    save_fig_dir : str
        Folder to write into. A trailing path separator is optional; an empty
        string writes relative to the current working directory.
    """
    # os.path.join only inserts a separator when one is missing, so a folder
    # given without the trailing '/' no longer produces a mangled file name.
    outname = os.path.join(save_fig_dir, filename + '.png')
    pl.savefig(outname, dpi=110)
    # Close the figure so repeated plotting calls do not pile up open figures.
    pl.close()

/acrc/jinmiao/CJM_lab/hoatran/demo_normalization/demo_Scanorama/scanorama_python


In [5]:
def load_data(data_dir, myDataFn, mySampleFn, batchid, batchlb, save_dir, savefn, saveh5ad=True):
    """Build an AnnData object for one batch from two tab-separated text files.

    Both files are read with their first line as header and their first column
    as row index. The expression table is transposed before it is stored, so
    its columns become observations and its rows become variables. The sample
    table must provide a ``CellType`` column that can be looked up by the
    expression table's column names.

    Parameters
    ----------
    data_dir : str
        Folder containing both input files.
    myDataFn, mySampleFn : str
        File names of the expression table and of the sample annotation table.
    batchid, batchlb
        Values stored for every observation in ``obs['batch']`` and
        ``obs['batchlb']``.
    save_dir, savefn : str
        Folder and file name used for the h5ad file when ``saveh5ad`` is true.
    saveh5ad : bool
        When true, write the object with ``write_h5ad`` before returning.

    Returns
    -------
    The AnnData object that was built.
    """
    def _read_table(file_name):
        # Shared reader settings for both input tables.
        return pd.read_csv(os.path.join(data_dir, file_name), header=0, index_col=0, sep='\t')

    # Expression table: print its shape and a small sample of its labels.
    expression = _read_table(myDataFn)
    print(expression.shape)
    print(expression.columns[1:3])
    print(expression.index[1:3])

    # Sample annotation table: shape, all column names, a sample of row labels.
    annotation = _read_table(mySampleFn)
    print(annotation.shape)
    print(annotation.columns)
    print(annotation.index[1:3])

    # Transpose so that the table's columns become the observations.
    adata = sc.AnnData(expression.values.T)
    adata.obs_names = expression.columns
    adata.var_names = expression.index

    # Annotations are looked up by observation name; batch id and label are
    # the same for every observation of this batch.
    adata.obs['cell_type'] = annotation.loc[adata.obs_names, ['CellType']]
    adata.obs['batch'] = batchid
    adata.obs['batchlb'] = batchlb

    # Optionally persist as h5ad so the object can be reloaded without re-parsing the text files.
    if saveh5ad:
        adata.write_h5ad(os.path.join(save_dir, savefn))

    print(adata)
    return adata

In [11]:
# Hard-coded location of the pre-built per-batch AnnData files.
data_dir = '/acrc/jinmiao/CJM_lab/hoatran/demo_normalization/dataset/dataset12_cell_line/raw_data_python/'

# The .h5ad files were originally produced from the raw text tables with
#   load_data(data_dir, 'b<k>_exprs.txt', 'b<k>_celltype.txt', <k>, 'batch<k>',
#             save_dir, savefn<k>, saveh5ad=True)          for k = 1, 2, 3
# NOTE(review): that call wrote into save_dir, while the files are read back
# from data_dir below -- confirm the files were moved/copied there.
savefn1 = 'myRawData1.h5ad'
savefn2 = 'myRawData2.h5ad'
savefn3 = 'myRawData3.h5ad'

adata1 = sc.read_h5ad(os.path.join(data_dir, savefn1))
adata2 = sc.read_h5ad(os.path.join(data_dir, savefn2))
adata3 = sc.read_h5ad(os.path.join(data_dir, savefn3))

print(adata1)
print(adata2)
print(adata3)

# The merge cell that follows stacks the three matrices row-wise and reuses
# the gene names of batch 1, which is only valid when every batch lists the
# same genes in the same order. Previously the first comparison was computed
# and thrown away and nothing stopped the run on a mismatch; enforce it here.
assert adata2.var_names.equals(adata1.var_names), 'batch 1 and batch 2 differ in gene names or gene order'
assert adata2.var_names.equals(adata3.var_names), 'batch 2 and batch 3 differ in gene names or gene order'

# Cell result: number of gene positions at which batch 2 and batch 3 agree.
int((adata2.var_names == adata3.var_names).sum())

AnnData object with n_obs × n_vars = 2885 × 32738 
    obs: 'cell_type', 'batch', 'batchlb'
AnnData object with n_obs × n_vars = 3258 × 32738 
    obs: 'cell_type', 'batch', 'batchlb'
AnnData object with n_obs × n_vars = 3388 × 32738 
    obs: 'cell_type', 'batch', 'batchlb'


32738

In [12]:
# Merge the three batches into one AnnData object, then filter cells and genes.
raw_batches = [adata1, adata2, adata3]

# Stack the expression matrices in batch order; gene names are taken from batch 1.
adata = sc.AnnData(np.concatenate([batch.X for batch in raw_batches]))
adata.obs_names = [name for batch in raw_batches for name in batch.obs_names.tolist()]
adata.var_names = adata1.var_names.tolist()

# Carry the per-cell annotations over in the same order as the stacked rows.
for obs_key in ('cell_type', 'batch', 'batchlb'):
    adata.obs[obs_key] = [value for batch in raw_batches for value in batch.obs[obs_key].tolist()]

# Filtering: cells need at least 300 detected genes (min_genes) and genes
# must be detected in at least 10 cells (min_cells).
sc.pp.filter_cells(adata, min_genes=300)
sc.pp.filter_genes(adata, min_cells=10)
adata

filtered out 16136 genes that are detected in less than 10 cells


AnnData object with n_obs × n_vars = 9531 × 16602 
    obs: 'cell_type', 'batch', 'batchlb', 'n_genes'
    var: 'n_cells'

In [14]:
# Split the filtered data back into one independent AnnData copy per batch id.
adata1, adata2, adata3 = (
    adata[adata.obs['batch'] == batch_id, :].copy() for batch_id in (1, 2, 3)
)
for batch_adata in (adata1, adata2, adata3):
    print(batch_adata)

AnnData object with n_obs × n_vars = 2885 × 16602 
    obs: 'cell_type', 'batch', 'batchlb', 'n_genes'
    var: 'n_cells'
AnnData object with n_obs × n_vars = 3258 × 16602 
    obs: 'cell_type', 'batch', 'batchlb', 'n_genes'
    var: 'n_cells'
AnnData object with n_obs × n_vars = 3388 × 16602 
    obs: 'cell_type', 'batch', 'batchlb', 'n_genes'
    var: 'n_cells'


In [15]:
# Scanorama imports. In the cells visible here only scanorama.correct_scanpy
# is called; the individually imported names are not used.
from scanorama import correct, visualize, process_data
from scanorama import dimensionality_reduce
import scanorama
# The filtered per-batch AnnData objects, in batch order; this list is the
# input of the batch-correction call.
adata_ls = [adata1, adata2, adata3]


In [16]:
# Batch correction: run Scanorama on the list of per-batch AnnData objects.
# t1 / t2 bracket the call and are reused later to report the execution time.
t1 = time.time()
corrected = scanorama.correct_scanpy(
    adata_ls,
    knn=20,
    batch_size=50,
    return_dense=True,
)
t2 = time.time()

elapsed = timedelta(seconds=t2 - t1)
print('Took ' + str(elapsed))

Found 16602 genes among all datasets
[[0.         0.00859423 0.7254766 ]
 [0.         0.         0.26089626]
 [0.         0.         0.        ]]
Processing datasets (0, 2)
Processing datasets (1, 2)
Took 0:02:14.615477


In [18]:
# Stack the corrected matrices of the three batches, in list order, into one AnnData object.
corrected_matrix = np.concatenate([corrected[i].X for i in range(3)])
adata_corrected = sc.AnnData(corrected_matrix)
print(adata_corrected)

AnnData object with n_obs × n_vars = 9531 × 16602 


In [21]:
# Restore gene names, cell names and per-cell annotations on the merged object.
# Everything is concatenated in the same batch order as the stacked matrix rows.
adata_corrected.var_names = corrected[0].var_names
adata_corrected.obs_names = [
    name for i in range(3) for name in corrected[i].obs_names.tolist()
]
for obs_key in ('cell_type', 'batch', 'batchlb'):
    adata_corrected.obs[obs_key] = [
        value for i in range(3) for value in corrected[i].obs[obs_key].tolist()
    ]
adata_corrected

AnnData object with n_obs × n_vars = 9531 × 16602 
    obs: 'cell_type', 'batch', 'batchlb'
    uns: 'pca'
    obsm: 'X_pca'
    varm: 'PCs'

In [22]:
# PCA on the corrected expression matrix, keeping `npcs` components.
npcs = 20
sc.tl.pca(adata_corrected, n_comps=npcs, svd_solver='arpack')
# Flip the sign of every component (multiply by -1 to match Seurat).
adata_corrected.obsm['X_pca'] *= -1
adata_corrected

AnnData object with n_obs × n_vars = 9531 × 16602 
    obs: 'cell_type', 'batch', 'batchlb'
    uns: 'pca'
    obsm: 'X_pca'
    varm: 'PCs'

In [23]:
def getExecutionTime(t1, t2, save_dir, usecase_name,filename):
    """Report the elapsed time between two timestamps and save it as a one-row CSV.

    Parameters
    ----------
    t1, t2 : float
        Start and end timestamps in seconds (e.g. from ``time.time()``).
    save_dir : str
        Folder the CSV is written to; a trailing path separator is optional.
    usecase_name : str
        Label stored in the ``use_case`` column.
    filename : str
        Name of the CSV file created inside ``save_dir``.

    The CSV holds a single row labelled ``exetime`` with the columns
    ``use_case``, ``exetime_secs`` (total duration rounded to whole seconds)
    and ``exetimehours`` / ``exetimemins`` / ``exetimesecs`` (that same rounded
    duration split into hours, minutes and seconds).
    """
    time_taken = (t2 - t1)

    # Console report, computed from the unrounded duration.
    # Note that the "minutes" line prints a (minutes, seconds) tuple.
    time_taken_mins = divmod(time_taken, 60)
    time_taken_hours, rest = divmod(time_taken, 3600)
    hours_mins, hours_secs = divmod(rest, 60)
    print('Took seconds: '+str(timedelta(seconds=round(time_taken))))
    print('Took minutes: '+str(time_taken_mins))
    print('Took hours_minutes_seconds: ',str(time_taken_hours),str(hours_mins),str(hours_secs))

    # For the CSV, split the *rounded* total. Rounding the seconds part after
    # the split could yield 60 seconds (e.g. 119.7 s -> 1 min 60 s) and
    # disagree with exetime_secs.
    total_secs = round(time_taken)
    # Keep the numeric type of the input so the text written to the CSV
    # ('0.0' for float timestamps, '0' for integer ones) stays as before.
    whole = float(total_secs) if isinstance(time_taken, float) else total_secs
    csv_hours, csv_rest = divmod(whole, 3600)
    csv_mins, csv_secs = divmod(csv_rest, 60)

    data = {'use_case':usecase_name, 'exetime_secs':str(total_secs),
           'exetimehours': str(csv_hours),
           'exetimemins': str(csv_mins),
           'exetimesecs':str(round(csv_secs))}
    df = pd.DataFrame(data, index =['exetime'])
    print(df)
    # os.path.join also works when save_dir has no trailing separator.
    df.to_csv(os.path.join(save_dir, filename))

# Record the runtime of the main batch-effect-removal call
# (t1: start time, t2: end time, both taken around that call).
filename = 'scanorama_exetime.csv'
usecase_name = 'scanorama_exetime'
getExecutionTime(t1, t2, save_dir=save_dir, usecase_name=usecase_name, filename=filename)
print(save_dir)

Took seconds: 0:02:15
Took minutes: (2.0, 14.615476608276367)
Took hours_minutes_seconds:  0.0 2.0 14.615476608276367
                  use_case exetime_secs exetimehours exetimemins exetimesecs
exetime  scanorama_exetime          135          0.0         2.0          15
/acrc/jinmiao/CJM_lab/hoatran/demo_normalization/demo_Scanorama/scanorama_python/results/results_dataset12_cellline/


In [24]:
# Settings for the plotting cells below.
npcs = 20  # number of principal components handed to plotUMAP / plotTSNE
perplex = 30  # perplexity handed to plotTSNE
nb_neighbors = 15  # n_neighbors handed to plotUMAP for the neighbour graph


# save_images() is defined in an earlier cell. A commented-out copy that
# differed only in dpi (150 instead of 110) used to be kept here.
    

def plotUMAP(adata, color_group, save_filename='umap', save_dir='', npcs=20, nb_neighbors=15, use_repx = False):
    """Compute a UMAP embedding of ``adata``, plot it and save the figure as PNG.

    The neighbour graph and the UMAP coordinates are computed on ``adata``
    itself via ``sc.pp.neighbors`` and ``sc.tl.umap``.

    Parameters
    ----------
    adata
        AnnData object to embed.
    color_group : str or list of str
        Passed as ``color`` to ``sc.pl.umap``.
    save_filename : str
        Figure file name without extension, passed to ``save_images``.
    save_dir : str
        Folder passed to ``save_images``.
    npcs : int
        ``n_pcs`` for the neighbour graph; only used when ``use_repx`` is false.
    nb_neighbors : int
        ``n_neighbors`` for the neighbour graph.
    use_repx : bool
        If true, build the neighbour graph with ``use_rep='X'`` instead of the
        principal components.
    """
    if use_repx:
        # Run with all genes and the entire matrix.
        # nb_neighbors used to be dropped in this branch, so a caller's value
        # was silently replaced by scanpy's own default; pass it explicitly.
        sc.pp.neighbors(adata, n_neighbors=nb_neighbors, use_rep='X')
    else:
        # Run UMAP on the first `npcs` principal components.
        sc.pp.neighbors(adata, n_neighbors=nb_neighbors, n_pcs=npcs)

    sc.tl.umap(adata)
    # show=False keeps the figure open so that save_images() can write it.
    sc.pl.umap(adata, color=color_group, show=False)
    save_images(save_filename, save_dir)

    
# Draw the UMAP of the corrected data, coloured by the 'batchlb' and
# 'cell_type' annotations, and save it in the figures folder.
color_group = ["batchlb", "cell_type"]
save_fn_umap = 'scanorama_umap'
print(save_fig_dir)
plotUMAP(
    adata_corrected,
    color_group,
    save_filename=save_fn_umap,
    save_dir=save_fig_dir,
    npcs=npcs,
    nb_neighbors=nb_neighbors,
    use_repx=False,
)
print('Save output of UMAP in :', save_fig_dir)

./figures/dataset12_cellline/
computing neighbors
    using 'X_pca' with n_pcs = 20
    finished (0:00:12.82) --> added to `.uns['neighbors']`
    'distances', distances for each pair of neighbors
    'connectivities', weighted adjacency matrix
computing UMAP


... storing 'cell_type' as categorical
... storing 'batchlb' as categorical


    finished (0:00:26.98) --> added
    'X_umap', UMAP coordinates (adata.obsm)
Save output of UMAP in : ./figures/dataset12_cellline/


In [25]:
# Function to plot t-SNE
def plotTSNE(adata, color_group, save_filename='tsne', save_dir='', n_pcs=20, perplex=30, use_repx = False):
    """Compute a t-SNE embedding of ``adata``, plot it and save the figure as PNG.

    Parameters
    ----------
    adata
        AnnData object to embed; ``sc.tl.tsne`` is run on it directly.
    color_group : str or list of str
        Passed as ``color`` to ``sc.pl.tsne``.
    save_filename : str
        Figure file name without extension, passed to ``save_images``.
    save_dir : str
        Folder passed to ``save_images``.
    n_pcs : int
        ``n_pcs`` for t-SNE; only used when ``use_repx`` is false.
    perplex : int
        t-SNE perplexity.
    use_repx : bool
        If true, run t-SNE with ``use_rep='X'`` instead of the principal
        components.
    """
    tsne_options = {'perplexity': perplex}
    if use_repx:
        # Run with all genes and the entire matrix.
        tsne_options['use_rep'] = 'X'
    else:
        # Run t-SNE on the first `n_pcs` principal components.
        tsne_options['n_pcs'] = n_pcs
    sc.tl.tsne(adata, **tsne_options)

    # show=False keeps the figure open so that save_images() can write it.
    sc.pl.tsne(adata, color=color_group, show=False, wspace=.3)
    save_images(save_filename, save_dir)

# Draw the t-SNE of the corrected data with the same colouring as the UMAP.
save_fn_tsne = 'scanorama_tsne'
plotTSNE(
    adata_corrected,
    color_group,
    save_filename=save_fn_tsne,
    save_dir=save_fig_dir,
    n_pcs=npcs,
    perplex=perplex,
    use_repx=False,
)
print('Save output of t-SNE in :', save_fig_dir)

computing tSNE
    using 'X_pca' with n_pcs = 20
    using the 'MulticoreTSNE' package by Ulyanov (2017)
    finished (0:00:59.02) --> added
    'X_tsne', tSNE coordinates (adata.obsm)
Save output of t-SNE in : ./figures/dataset12_cellline/


In [27]:
def _embedding_table(adata, obsm_key, col_prefix, max_cols=None):
    """Return one ``adata.obsm`` embedding as a DataFrame with batch and cell-type columns."""
    coords = adata.obsm[obsm_key]
    if max_cols is not None:
        coords = coords[:, :max_cols]
    # Name the columns after what is actually there, so an embedding with
    # fewer columns than requested still exports cleanly.
    col_names = [col_prefix + str(i + 1) for i in range(coords.shape[1])]
    table = pd.DataFrame(coords, columns=col_names, index=adata.obs_names)
    # Output column name -> source column in adata.obs.
    for out_col, obs_col in (('batch', 'batch'), ('batchlb', 'batchlb'), ('celltype', 'cell_type')):
        table[out_col] = pd.Series(adata.obs[obs_col], index=adata.obs_names)
    return table


def save_output_txt(adata, save_dir, n_pcs=20):
    """Export the UMAP, t-SNE and PCA coordinates of ``adata`` as CSV files.

    Three files are written into ``save_dir``: ``scanorama_umap.csv``,
    ``scanorama_tsne.csv`` and ``scanorama_pca.csv``. Each has one row per
    observation, the coordinate columns (``UMAP<i>``, ``tSNE_<i>``,
    ``X_pca<i>``) and the ``batch``, ``batchlb`` and ``celltype`` columns
    taken from ``adata.obs``.

    Parameters
    ----------
    adata
        Object providing ``obsm['X_umap']``, ``obsm['X_tsne']``,
        ``obsm['X_pca']``, ``obs`` and ``obs_names``.
    save_dir : str
        Output folder; a trailing path separator is optional.
    n_pcs : int
        Maximum number of principal components to export. If ``X_pca`` holds
        fewer, all of them are exported instead of failing on a column-count
        mismatch.
    """
    # Save output of umap for visualization
    _embedding_table(adata, 'X_umap', 'UMAP').to_csv(os.path.join(save_dir, 'scanorama_umap.csv'))

    # Save output of tsne for visualization
    _embedding_table(adata, 'X_tsne', 'tSNE_').to_csv(os.path.join(save_dir, 'scanorama_tsne.csv'))

    # Save output of pca for evaluation ASW
    _embedding_table(adata, 'X_pca', 'X_pca', max_cols=n_pcs).to_csv(os.path.join(save_dir, 'scanorama_pca.csv'))

# Export the embeddings of the corrected data as CSV tables in the results folder.
save_output_txt(adata_corrected, save_dir=save_dir)
print('Save output of normalized data in :', save_dir)

Save output of normalized data in : /acrc/jinmiao/CJM_lab/hoatran/demo_normalization/demo_Scanorama/scanorama_python/results/results_dataset12_cellline/


In [28]:
# Persist the corrected AnnData object as an h5ad file in the results folder.
savefn = 'scanorama_normalized_adata.h5ad'
h5ad_path = os.path.join(save_dir, savefn)
adata_corrected.write_h5ad(h5ad_path)
print('Save output of normalized data in :', save_dir)

Save output of normalized data in : /acrc/jinmiao/CJM_lab/hoatran/demo_normalization/demo_Scanorama/scanorama_python/results/results_dataset12_cellline/
