In [None]:
import scanpy as sc
import scanorama
import anndata
import os
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

In [None]:
# Logging level 3 = hints; scanpy levels are errors (0), warnings (1), info (2), hints (3).
sc.settings.verbosity = 3
# Emit scanpy's logging header.
sc.logging.print_header()
# Figure defaults shared by every plot drawn below.
sc.settings.set_figure_params(
    dpi=120,
    dpi_save=150,
    facecolor='white',
    color_map='tab20b',
)

In [None]:
# Presumably the directory where this notebook's Scanorama results should be written.
# NOTE(review): absolute, machine-specific path; no cell visible in this notebook reads it.
OUTPUT_PATH = "C:/Users/julia/Project/scanorama_output"

### SIMPLE integration process using sc.external.pp.scanorama_integrate to integrate the two samples ###

Load the AnnData objects. Filtering, preprocessing, HVG selection, regression and scaling have already been performed; the objects were saved right after running PCA.
The results were stored in h5ad files, which are now loaded:

In [None]:
# Stored .h5ad files, listed in the order BL_N, BL_C, BL_A.
sample_files = (
    "C:/Users/julia/Project/ipynb_output/BL_N/AnnData_storage/BL_N.h5ad",
    "C:/Users/julia/Project/ipynb_output/BL_C/AnnData_storage/BL_C.h5ad",
    "C:/Users/julia/Project/ipynb_output/BL_A/AnnData_storage/BL_A.h5ad",
)
# Read each file into its own AnnData object.
anndata_BLN, anndata_BLC, anndata_BLA = map(sc.read_h5ad, sample_files)

In [None]:
# Print the shape of every loaded sample, one line per sample.
for sample_label, sample_adata in (
    ('BL_N: ', anndata_BLN),
    ('BL_C: ', anndata_BLC),
    ('BL_A: ', anndata_BLA),
):
    print(sample_label, sample_adata.shape)

Add an obs column 'batch' that simply repeats the sample name for every cell, because otherwise Scanorama does not work :(

In [None]:
# Tag every cell of each sample with its sample name in obs['batch'].
for sample_adata, sample_name in (
    (anndata_BLA, 'BL_A'),
    (anndata_BLC, 'BL_C'),
    (anndata_BLN, 'BL_N'),
):
    sample_adata.obs['batch'] = sample_name

In [None]:
# Print a labelled summary of each sample in a single print call.
sample_summaries = (
    'BL_N\n', anndata_BLN,
    '\nBL_C\n', anndata_BLC,
    '\nBL_A\n', anndata_BLA,
)
print(*sample_summaries)

sc.external.pp.scanorama_integrate() does not directly accept a list of AnnData objects: you need to concatenate the objects first, then pass the result to the function, which separates them by key ('batch' in this case).

In [None]:
# Take the raw-derived AnnData of BL_C and BL_N, in that order.
raw_samples = [anndata_BLC.raw.to_adata(), anndata_BLN.raw.to_adata()]
# Stack them into one object; index_unique="_" is used as the join string for unique cell names.
combi = anndata.concat(raw_samples, index_unique="_")

In [None]:
# Display the per-cell metadata (obs) of the concatenated object.
combi.obs

The function finds 1762 genes that the samples have in common.

In [None]:
# Display the summary of the concatenated AnnData object.
combi

Run Scanorama; the key 'batch' simply has the sample name stored.

In [None]:
# Only `key` is passed; everything else keeps its default, i.e. the call is equivalent to:
#   sc.external.pp.scanorama_integrate(combi, key='batch', basis='X_pca', adjusted_basis='X_scanorama',
#                                      knn=20, sigma=15, approx=True, alpha=0.1, batch_size=5000)
sc.external.pp.scanorama_integrate(
    combi,
    key='batch',
)

In [None]:
# Display the AnnData summary again, now after the scanorama_integrate call.
combi

In [None]:
# scanorama_integrate does not modify combi.X; it stores the corrected embedding in
# combi.obsm under `adjusted_basis` (default 'X_scanorama'). `integrated_expression`
# is kept for backward compatibility, but it holds the *uncorrected* matrix.
integrated_expression = combi.X
# The actual integration result: one row per cell, one column per corrected component.
integrated_embedding = combi.obsm['X_scanorama']
integrated_embedding

In [None]:
# Build the neighbour graph on the Scanorama-corrected embedding. Without `use_rep`,
# sc.pp.neighbors falls back to the uncorrected 'X_pca', so the integration result
# would never reach the UMAP.
sc.pp.neighbors(combi, use_rep='X_scanorama')
# UMAP is computed from the neighbour graph built above.
sc.tl.umap(combi)

In [None]:
# Plot options for the batch-coloured UMAP of the integrated BL_N / BL_C object.
umap_style = dict(
    color='batch',
    legend_loc='on data',
    palette='Set1',
    color_map='magma',
    title='BL N & BL C integrated UMAP',
    return_fig=False,
    show=False,
)
sc.pl.umap(combi, **umap_style)

--------------------------------------------------------------------------------------------------------------------

Use Scanorama directly

In [None]:
# Reload the stored samples for the direct-Scanorama run (order: BL_N, BL_C, BL_A).
sample_files = (
    "C:/Users/julia/Project/ipynb_output/BL_N/AnnData_storage/BL_N.h5ad",
    "C:/Users/julia/Project/ipynb_output/BL_C/AnnData_storage/BL_C.h5ad",
    "C:/Users/julia/Project/ipynb_output/BL_A/AnnData_storage/BL_A.h5ad",
)
anndata_BLN, anndata_BLC, anndata_BLA = map(sc.read_h5ad, sample_files)

In [None]:
# Re-attach the sample name as obs['batch'] on the freshly loaded objects.
for sample_adata, sample_name in (
    (anndata_BLA, 'BL_A'),
    (anndata_BLC, 'BL_C'),
    (anndata_BLN, 'BL_N'),
):
    sample_adata.obs['batch'] = sample_name

Note: the HVGs that both sets have in common are the same as those in the concatenated dataset I made in the code above.

In [None]:
# Raw-derived AnnData objects of BL_A and BL_C, handed to Scanorama as a list.
data_list = [anndata_BLA.raw.to_adata(), anndata_BLC.raw.to_adata()]
# NOTE(review): in recent Scanorama releases integrate_scanpy appears to work in place
# (writing obsm['X_scanorama'] on each element of data_list) and to return None, in which
# case `integrated_data` is None -- confirm against the installed version.
integrated_data = scanorama.integrate_scanpy(data_list)
# integrated_data = anndata.AnnData(X=integrated_data)


In [None]:
# Print a labelled summary of both list entries after the integrate_scanpy call.
bla_entry, blc_entry = data_list[0], data_list[1]
print('BL_A:\n', bla_entry, '\nBL_C:\n', blc_entry)

In [None]:
# anndata.concat already returns an AnnData (with obs, var and obsm carried over),
# so wrapping the result in another AnnData(X=...) is redundant.
integrated_adata = anndata.concat(data_list)

In [None]:
# Display the summary of the concatenated object built from data_list.
integrated_adata

In [None]:
# Build the neighbour graph on the Scanorama embedding. Without `use_rep`, sc.pp.neighbors
# falls back to the per-sample 'X_pca' carried over from the inputs and ignores the integration.
# Assumes scanorama.integrate_scanpy stored obsm['X_scanorama'] on the inputs before they
# were concatenated -- this fails loudly if that key is missing.
sc.pp.neighbors(integrated_adata, use_rep='X_scanorama')
# UMAP is computed from the neighbour graph built above.
sc.tl.umap(integrated_adata)

In [None]:
# Plot options for the batch-coloured UMAP of the BL_A / BL_C Scanorama run.
umap_style = dict(
    color='batch',
    legend_loc='on data',
    palette='tab20b',
    color_map='magma',
    title='BL A & BL C integrated Scanorama UMAP',
    return_fig=False,
    show=False,
)
sc.pl.umap(integrated_adata, **umap_style)

In [None]:
# Display the AnnData summary again, after neighbours and UMAP have been computed.
integrated_adata

--------------------------------------------------------------------------------------------------------------------

Try out harmony

In [None]:
# Reload the stored samples for the Harmony run (order: BL_N, BL_C, BL_A).
sample_files = (
    "C:/Users/julia/Project/ipynb_output/BL_N/AnnData_storage/BL_N.h5ad",
    "C:/Users/julia/Project/ipynb_output/BL_C/AnnData_storage/BL_C.h5ad",
    "C:/Users/julia/Project/ipynb_output/BL_A/AnnData_storage/BL_A.h5ad",
)
anndata_BLN, anndata_BLC, anndata_BLA = map(sc.read_h5ad, sample_files)

In [None]:
# Re-attach the sample name as obs['batch'] on the freshly loaded objects.
for sample_adata, sample_name in (
    (anndata_BLA, 'BL_A'),
    (anndata_BLC, 'BL_C'),
    (anndata_BLN, 'BL_N'),
):
    sample_adata.obs['batch'] = sample_name

I think the results are all the same because of this line --> look into this more w/ Maurits

In [None]:
# Stack BL_A and BL_C (the loaded objects themselves, not their .raw) into one AnnData.
harmony_inputs = [anndata_BLA, anndata_BLC]
combi = anndata.concat(harmony_inputs)

In [None]:
# Run Harmony on obsm['X_pca'], grouped by 'batch'; the adjusted embedding is written
# to obsm['X_pca_harmony'].
sc.external.pp.harmony_integrate(
    combi,
    key='batch',
    basis='X_pca',
    adjusted_basis='X_pca_harmony',
)

In [None]:
# Build the neighbour graph on the Harmony-adjusted embedding. Without `use_rep`,
# sc.pp.neighbors falls back to the unadjusted 'X_pca', so the Harmony output is never
# used and the UMAP looks the same as an unintegrated one.
sc.pp.neighbors(combi, use_rep='X_pca_harmony')
# UMAP is computed from the neighbour graph built above.
sc.tl.umap(combi)

In [None]:
# Plot options for the batch-coloured UMAP of the BL_A / BL_C Harmony run.
umap_style = dict(
    color='batch',
    legend_loc='on data',
    palette='tab20b',
    color_map='magma',
    title='BL A & BL C integrated harmony UMAP',
)
sc.pl.umap(combi, **umap_style)

Using scanorama functions (not scanpy)

In [None]:
# list = [anndata_BLN.raw.X, anndata_BLC.raw.X]
# genes_list = [anndata_BLN.raw.var_names, anndata_BLC.raw.var_names]

This looks like concat, but does not return the same object. This seems to be going well: it still finds 1762 genes in common (on HVG genes) and 18682 in common (on all raw genes).
This function gets called inside `correct`.

In [None]:
# a, b = scanorama.merge_datasets(list, genes_list)

Does `correct` do batch correction and integration at the same time?

In [None]:
# datasets, genes = scanorama.correct(list, genes_list)

In [None]:
# datasets

In [None]:
# genes