In [31]:
# Scanorama batch effect correction
# See more at: https://github.com/brianhie/scanorama
# Hoa Tran 
# Update code from python version 2 to python version 3, Keras
import numpy as np
import pandas as pd
import matplotlib.pyplot as pl
from matplotlib import rcParams
import time
from datetime import timedelta
import scanpy as sc
# Raise scanpy's log level to "hints" and print the versions of the installed packages
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()

scanpy==1.4+18.gaabe446 anndata==0.6.17 numpy==1.15.4 scipy==1.1.0 pandas==0.23.4 scikit-learn==0.20.2 statsmodels==0.9.0 python-igraph==0.7.1 louvain==0.6.1 


In [32]:
# Create the folders that receive the results and the figures
import os
dirname = os.getcwd()
print(dirname)

# makedirs(..., exist_ok=True) also creates missing parent folders and does not
# fail when the folder is already there, so neither a prior existence check nor
# a separate creation of the parent folder is needed.
os.makedirs('./results/results_dataset4_pancreatic/', exist_ok=True)
save_dir = os.path.join(dirname, 'results/results_dataset4_pancreatic/')

# Folder for the figure images written by save_images
save_fig_dir='./figures/dataset4_pancreatic/'
os.makedirs(save_fig_dir, exist_ok=True)
def save_images(filename, save_fig_dir):
    """Save the current matplotlib figure as a PNG (150 dpi) and close it.

    The target path is the plain string concatenation
    ``save_fig_dir + filename + '.png'``, so ``save_fig_dir`` has to end with
    a path separator already.
    """
    target_path = ''.join([save_fig_dir, filename, '.png'])
    pl.savefig(target_path, dpi=150)
    pl.close()

/acrc/jinmiao/CJM_lab/hoatran/demo_normalization/demo_Scanorama/scanorama_python


In [33]:
def load_data(data_dir, myDataFn, mySampleFn,save_dir, savefn, saveh5ad=False):
    """Build an AnnData object from an expression table and a sample sheet.

    Parameters
    ----------
    data_dir : str
        Folder that contains both input files.
    myDataFn : str
        Tab-separated expression table, read with the first row as header and
        the first column as index. The value matrix is transposed, so the
        table's columns become observations and its rows become variables.
    mySampleFn : str
        Tab-separated sample sheet, read with the first column as index. It is
        looked up by the observation names and must provide the columns
        ``celltype``, ``batch`` and ``batchlb``.
    save_dir, savefn : str
        Folder and file name of the h5ad file; only used when ``saveh5ad`` is true.
    saveh5ad : bool, optional
        When true, the assembled object is written with ``write_h5ad``.

    Returns
    -------
    The AnnData object with the obs columns ``cell_type``, ``batch`` and ``batchlb``.
    """
    myData = pd.read_csv(os.path.join(data_dir,myDataFn),header=0, index_col=0, sep='\t')
    mySample = pd.read_csv(os.path.join(data_dir,mySampleFn),header=0, index_col=0, sep='\t')
    adata = sc.AnnData(myData.values.T)
    adata.obs_names = myData.keys()
    adata.var_names = myData.index

    # Select every annotation with a scalar column label so that a Series
    # (not a one-column DataFrame) is assigned to the obs column.
    # 'batch' keeps the dtype it has in the sample sheet (no categorical cast).
    annotation_columns = (('cell_type', 'celltype'), ('batch', 'batch'), ('batchlb', 'batchlb'))
    for obs_key, sample_column in annotation_columns:
        adata.obs[obs_key] = mySample.loc[adata.obs_names, sample_column]

    # Save output into h5ad, lightweight, easy to access and load again
    # Similar to rds format in R
    if saveh5ad:
        adata.write_h5ad(os.path.join(save_dir,savefn))

    print(adata)
    return adata

In [34]:
# Folder holding the raw input of the pancreatic dataset
data_dir = '/home/hoa/hoatran/demo_normalization/dataset/dataset4_human_pancreatic/raw_data_python/'
savefn = 'myRawData1.h5ad'

# Disabled: building the AnnData object from the tab-separated text files.
# myDataFn = 'myData_pancreatic_5batches.txt'
# mySampleFn = 'mySample_pancreatic_5batches.txt'
# adata = load_data(data_dir, myDataFn, mySampleFn, save_dir, savefn,saveh5ad=True)
# NOTE(review): that call would write the h5ad file into save_dir, while the
# file is read from data_dir below - confirm where the file is expected to live.

raw_h5ad_path = os.path.join(data_dir, savefn)
adata = sc.read_h5ad(raw_h5ad_path)

In [35]:
# Display the summary of the loaded AnnData object
adata

AnnData object with n_obs × n_vars = 14767 × 15558 
    obs: 'cell_type', 'batch', 'batchlb'

In [36]:
# Cell/gene filtering is skipped for this dataset; the calls are kept for reference:
# sc.pp.filter_cells(adata, min_genes=300)
# sc.pp.filter_genes(adata, min_cells=10)

# Transforming `adata` itself is disabled as well (original note: it makes no
# difference in the case of Scanorama):
# sc.pp.log1p(adata)
# sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
adata

AnnData object with n_obs × n_vars = 14767 × 15558 
    obs: 'cell_type', 'batch', 'batchlb'

In [37]:
# Work on a copy: the copy is normalised per cell and log-transformed, and is
# only used to pick the highly variable genes; `adata` itself is not passed to
# either call.
adata_dup = adata.copy()
sc.pp.normalize_per_cell(adata_dup)
sc.pp.log1p(adata_dup)

In [38]:
# Flag the 5000 most variable genes on the normalised copy
sc.pp.highly_variable_genes(adata_dup, flavor='seurat', n_top_genes=5000)
hvg_mask = adata_dup.var['highly_variable']
print([sum(hvg_mask), len(hvg_mask)])  # [number of selected genes, number of all genes]

# Keep only the selected genes. This rebinds `adata`, so the mask no longer
# matches its gene axis if this cell is executed a second time.
adata = adata[:, hvg_mask]
adata

--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)
[5000, 15558]


View of AnnData object with n_obs × n_vars = 14767 × 5000 
    obs: 'cell_type', 'batch', 'batchlb'

In [39]:
from scanorama import correct, visualize, process_data
from scanorama import dimensionality_reduce

# Split the data into one AnnData object per batch (batch values 1 to 5) and
# print each of them.
# Original note: after concatenate, the values change from 1 to 0, 2 to 1.
per_batch = []
for batch_id in range(1, 6):
    batch_adata = adata[adata.obs['batch'] == batch_id, :].copy()
    print(batch_adata)
    per_batch.append(batch_adata)

(adata1_filtered, adata2_filtered, adata3_filtered,
 adata4_filtered, adata5_filtered) = per_batch


AnnData object with n_obs × n_vars = 8569 × 5000 
    obs: 'cell_type', 'batch', 'batchlb'
AnnData object with n_obs × n_vars = 2122 × 5000 
    obs: 'cell_type', 'batch', 'batchlb'
AnnData object with n_obs × n_vars = 2127 × 5000 
    obs: 'cell_type', 'batch', 'batchlb'
AnnData object with n_obs × n_vars = 457 × 5000 
    obs: 'cell_type', 'batch', 'batchlb'
AnnData object with n_obs × n_vars = 1492 × 5000 
    obs: 'cell_type', 'batch', 'batchlb'


In [40]:
# Per-batch objects, in the order in which they are handed to Scanorama
adata_ls = [adata1_filtered, adata2_filtered, adata3_filtered, adata4_filtered, adata5_filtered]


In [41]:
import scanorama

# Batch correction on the list of per-batch AnnData objects.
# t1 / t2 bracket the call; they are reused later to record the runtime.
t1 = time.time()
corrected = scanorama.correct_scanpy(adata_ls, knn=20, batch_size=50, return_dense=True)
t2 = time.time()
print('Took {}'.format(timedelta(seconds=t2-t1)))

Found 5000 genes among all datasets
[[0.         0.20263902 0.348378   0.09846827 0.1963807 ]
 [0.         0.         0.63195099 0.27133479 0.01979265]
 [0.         0.         0.         0.73085339 0.26474531]
 [0.         0.         0.         0.         0.27613941]
 [0.         0.         0.         0.         0.        ]]
Processing datasets (2, 3)
Processing datasets (1, 2)
Processing datasets (0, 2)
Processing datasets (3, 4)
Processing datasets (1, 3)
Processing datasets (2, 4)
Processing datasets (0, 1)
Processing datasets (0, 4)
Took 0:06:02.516935


In [42]:
# Stack the corrected matrices of the five batches, in batch order, into one AnnData
corrected_matrices = [corrected[i].X for i in range(5)]
total_ann = sc.AnnData(np.concatenate(corrected_matrices))
print(total_ann)

AnnData object with n_obs × n_vars = 14767 × 5000 


In [43]:
# Inspect a few gene names (positions 1 to 4) of the first corrected batch
print(corrected[0].var_names[1:5])
# Disabled per-batch comparison of the cell names with the input batches, e.g.:
# print(corrected[0].obs_names[0:10]==adata1_filtered.obs_names[0:10])
# print(corrected[1].obs_names[0:10]==adata2_filtered.obs_names[0:10])
# print(corrected[2].obs_names[0:10]==adata3_filtered.obs_names[0:10])
# print(corrected[3].obs_names[0:10]==adata4_filtered.obs_names[0:10])
# print(corrected[4].obs_names[0:10]==adata5_filtered.obs_names[0:10])
# Inspect the first cell names of the first corrected batch
print(corrected[0].obs_names[0:5])

Index(['A4GALT', 'AAAS', 'AADAC', 'AARSD1'], dtype='object')
Index(['0', '1', '2', '3', '4'], dtype='object')


In [44]:
# Re-attach gene names, cell names and cell annotations to the stacked matrix.
# The per-batch values are chained in the same order in which the corrected
# matrices were concatenated (batch 1 to batch 5).
source_batches = [adata1_filtered, adata2_filtered, adata3_filtered,
                  adata4_filtered, adata5_filtered]

total_ann.var_names = corrected[0].var_names
total_ann.obs_names = [name for batch in source_batches
                       for name in batch.obs_names.tolist()]
for obs_column in ('cell_type', 'batch', 'batchlb'):
    total_ann.obs[obs_column] = [value for batch in source_batches
                                 for value in batch.obs[obs_column].tolist()]
total_ann

AnnData object with n_obs × n_vars = 14767 × 5000 
    obs: 'cell_type', 'batch', 'batchlb'

In [45]:
# PCA on the stacked, corrected matrix; n_comps is not passed here
sc.tl.pca(total_ann, svd_solver='arpack') # n_comps=20
# Flip the sign of every PCA coordinate in place
total_ann.obsm['X_pca'] *= -1  # multiply by -1 to match Seurat


In [46]:
def getExecutionTime(t1, t2, save_dir, usecase_name,filename):
    """Print the elapsed time between two timestamps and save it as a one-row CSV.

    Parameters
    ----------
    t1, t2 : float
        Start and end time in seconds (e.g. values of ``time.time()``).
    save_dir : str
        Output location; used as a plain string prefix of ``filename``, so it
        has to end with a path separator.
    usecase_name : str
        Label stored in the ``use_case`` column.
    filename : str
        Name of the CSV file to write.

    The CSV row (index ``exetime``) holds the total rounded seconds plus the
    hours / minutes / seconds breakdown, all stored as strings.
    """
    time_taken = (t2 - t1)
    time_taken_mins = divmod(time_taken, 60)
    time_taken_hours, rest = divmod( time_taken, 3600)
    hours_mins, hours_secs = divmod( rest, 60)
    print('Took seconds: '+str(timedelta(seconds=round(time_taken))))
    print('Took minutes: '+str(time_taken_mins))
    print('Took hours_minutes_seconds: ',str(time_taken_hours),str(hours_mins),str(hours_secs))

    # Split the *rounded* total for the saved record. Rounding the seconds
    # after the split could yield "60" seconds (e.g. 119.6 s -> 1 min 60 s),
    # which contradicts the rounded total stored in 'exetime_secs'.
    # round(x, 0) keeps the numeric type of x, so the string formatting of the
    # saved values stays as before (e.g. '6.0' for float input).
    rounded_total = round(time_taken, 0)
    saved_hours, saved_rest = divmod(rounded_total, 3600)
    saved_mins, saved_secs = divmod(saved_rest, 60)

    data = {'use_case':usecase_name, 'exetime_secs':str(round(time_taken)),
           'exetimehours': str(saved_hours),
           'exetimemins': str(saved_mins),
           'exetimesecs':str(round(saved_secs))}
    df = pd.DataFrame(data, index =['exetime'])
    print(df)
    df.to_csv(save_dir+filename)

# Record the runtime of the main batch effect removal call
# (t1: start time, t2: end time)
usecase_name = 'scanorama_exetime'
filename = 'scanorama_exetime.csv'
getExecutionTime(t1, t2, save_dir, usecase_name=usecase_name, filename=filename)

Took seconds: 0:06:03
Took minutes: (6.0, 2.516935110092163)
Took hours_minutes_seconds:  0.0 6.0 2.516935110092163
                  use_case exetime_secs exetimehours exetimemins exetimesecs
exetime  scanorama_exetime          363          0.0         6.0           3


In [47]:
npcs = 20  # pre-defined number of principal components passed to the UMAP / t-SNE helpers
perplex = 30  # pre-defined t-SNE perplexity
nb_neighbors = 15  # number of neighbours passed to the UMAP helper


# save_images() is defined earlier in the notebook; a commented-out duplicate
# of it used to sit here.
    
# Function to plot TSNE
def plotTSNE(adata, color_group, save_filename='tsne', save_dir='', n_pcs=20, perplex=30, use_repx = False):
    """Compute a t-SNE embedding, plot it coloured by ``color_group`` and save the figure.

    With ``use_repx`` the embedding is computed on the matrix ``X``
    (``use_rep='X'``) and ``n_pcs`` is not used; otherwise ``n_pcs`` is passed
    to ``sc.tl.tsne``. ``perplex`` is the perplexity in both cases. The figure
    is written through ``save_images(save_filename, save_dir)``.
    """
    tsne_options = {'perplexity': perplex}
    if use_repx:
        # Run with all genes and the entire matrix
        tsne_options['use_rep'] = 'X'
    else:
        # Run tsne using the pcs vectors
        tsne_options['n_pcs'] = n_pcs
    sc.tl.tsne(adata, **tsne_options)

    sc.pl.tsne(adata, color=color_group, wspace=.3, show=False)
    save_images(save_filename, save_dir)
    
def plotUMAP(adata, color_group, save_filename='umap', save_dir='', npcs=20, nb_neighbors=15, use_repx = False):
    """Compute a UMAP embedding, plot it coloured by ``color_group`` and save the figure.

    Parameters
    ----------
    adata
        Object handed to ``sc.pp.neighbors``, ``sc.tl.umap`` and ``sc.pl.umap``.
    color_group
        Value forwarded as ``color`` to ``sc.pl.umap``.
    save_filename, save_dir : str
        Forwarded to ``save_images`` to write the figure.
    npcs : int
        Number of principal components for the neighbour graph; only used when
        ``use_repx`` is false.
    nb_neighbors : int
        Number of neighbours for the neighbour graph. It is now applied in
        both modes; it used to be ignored when ``use_repx`` was true.
    use_repx : bool
        When true, the neighbour graph is built on the matrix ``X``
        (``use_rep='X'``) instead of the principal components.
    """
    if use_repx:
        # Run with all genes and the entire matrix
        sc.pp.neighbors(adata, n_neighbors=nb_neighbors, use_rep='X')
    else:
        # Run umap using the pcs vectors
        sc.pp.neighbors(adata, n_neighbors=nb_neighbors, n_pcs=npcs)

    sc.tl.umap(adata)
    sc.pl.umap(adata, color=color_group, show=False)
    save_images(save_filename, save_dir)

    
# Plot UMAP first, then t-SNE, each coloured by batch label and by cell type
save_fn_tsne = 'scanorama_tsne'
save_fn_umap = 'scanorama_umap'
color_group = ["batchlb","cell_type"]
plotUMAP(total_ann, color_group, save_filename=save_fn_umap, save_dir=save_fig_dir,
         npcs=npcs, nb_neighbors=nb_neighbors, use_repx=False)
plotTSNE(total_ann, color_group, save_filename=save_fn_tsne, save_dir=save_fig_dir,
         n_pcs=npcs, perplex=perplex, use_repx=False)

computing neighbors
    using 'X_pca' with n_pcs = 20
    finished (0:00:09.85) --> added to `.uns['neighbors']`
    'distances', distances for each pair of neighbors
    'connectivities', weighted adjacency matrix
computing UMAP


... storing 'cell_type' as categorical
... storing 'batchlb' as categorical


    finished (0:00:17.33) --> added
    'X_umap', UMAP coordinates (adata.obsm)
computing tSNE
    using 'X_pca' with n_pcs = 20
    using the 'MulticoreTSNE' package by Ulyanov (2017)
    finished (0:01:40.85) --> added
    'X_tsne', tSNE coordinates (adata.obsm)


In [48]:
def _write_embedding_csv(adata, coords, prefix, out_path):
    """Write one embedding plus the batch / cell-type annotations to ``out_path``."""
    column_names = [prefix + str(i + 1) for i in range(coords.shape[1])]
    df = pd.DataFrame(coords, columns=column_names, index=adata.obs_names)
    # CSV column name -> source column in adata.obs
    for csv_column, obs_column in (('batch', 'batch'), ('batchlb', 'batchlb'), ('celltype', 'cell_type')):
        df[csv_column] = pd.Series(adata.obs[obs_column], index=adata.obs_names)
    df.to_csv(out_path)


def save_output(adata, save_dir, n_pcs=20):
    """Export the UMAP, t-SNE and PCA coordinates of ``adata`` as CSV files.

    Parameters
    ----------
    adata
        Object providing ``obsm['X_umap']``, ``obsm['X_tsne']``,
        ``obsm['X_pca']``, ``obs_names`` and the obs columns ``batch``,
        ``batchlb`` and ``cell_type``.
    save_dir : str
        Output location; used as a plain string prefix of the file names, so
        it has to end with a path separator.
    n_pcs : int, optional
        Maximum number of principal components to export (default 20). When
        ``X_pca`` holds fewer components, all of them are exported; the
        previously hard-coded 20 column names raised an error in that case.

    Writes ``scanorama_umap.csv``, ``scanorama_tsne.csv`` and
    ``scanorama_pca.csv``. Each file contains the coordinate columns followed
    by ``batch``, ``batchlb`` and ``celltype``.
    """
    _write_embedding_csv(adata, adata.obsm['X_umap'], "UMAP", save_dir+'scanorama_umap.csv')

    # Save output of tsne for visualization
    _write_embedding_csv(adata, adata.obsm['X_tsne'], "tSNE_", save_dir+'scanorama_tsne.csv')

    # Save output of pca for evaluation ASW
    _write_embedding_csv(adata, adata.obsm['X_pca'][:, :n_pcs], "X_pca", save_dir+'scanorama_pca.csv')

# Export the UMAP / t-SNE / PCA coordinates of the corrected data to the results folder
save_output(total_ann, save_dir)

In [49]:
# Write the corrected AnnData object to an h5ad file in the results folder.
# Note: this reuses the name `savefn`, which earlier held the raw input file name.
savefn = 'scanorama_normalized_adata.h5ad'
total_ann.write_h5ad(os.path.join(save_dir,savefn))