In [None]:
import sys
import scanpy as sc
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import seaborn as sns
import scvi

import cell2location

from matplotlib import rcParams
# Matplotlib PDF backend font type: 42 selects Type 42 (TrueType) font embedding,
# which keeps text in exported PDFs as real, editable text.
rcParams['pdf.fonttype'] = 42 # enables correct plotting of text for PDFs

# Output locations for this notebook.
results_folder = './results/BPD/'

# Folders for the reference regression model (ref_run_name) and for the
# cell2location spatial model (run_name).
# results_folder already ends with '/', so strip it before joining to avoid a
# doubled slash ('./results/BPD//...') in saved and displayed paths.
# run_name uses 'ref_st_multiome', the folder name the deconvolution section of
# this notebook actually saves to, so both sections agree.
ref_run_name = f"{results_folder.rstrip('/')}/ref_multiome_sc"
run_name = f"{results_folder.rstrip('/')}/ref_st_multiome"

### Load single cell reference

In [None]:
# Load the single-cell (multiome) reference dataset.
adata_ref = sc.read('./data/sc/Multiome_SoupX_BPD_Control.h5ad')

# cell2location requires unnormalized counts; this notebook takes them from .raw.
# Fail with an explicit message when .raw is absent instead of an opaque
# AttributeError raised on None.
if adata_ref.raw is None:
    raise ValueError(
        "adata_ref.raw is empty: the reference h5ad must store the unnormalized "
        "counts in .raw for cell2location."
    )
adata_ref = adata_ref.raw.to_adata()

# Display the object summary as the cell output.
adata_ref

In [None]:

from cell2location.utils.filtering import filter_genes

# Gene-selection thresholds passed to cell2location's filter_genes.
gene_filter_cutoffs = {
    'cell_count_cutoff': 5,
    'cell_percentage_cutoff2': 0.03,
    'nonz_mean_cutoff': 1.12,
}
selected = filter_genes(adata_ref, **gene_filter_cutoffs)

# Keep only the selected genes; .copy() turns the view into an independent object.
adata_ref = adata_ref[:, selected].copy()

# Display the filtered object summary as the cell output.
adata_ref

## Estimation of reference cell type signatures (NB regression)

In [None]:
from cell2location.models import RegressionModel

# Register adata_ref with the regression model:
#   batch_key   - 10X reaction / sample / batch
#   labels_key  - cell type, the covariate used for constructing signatures
#   categorical_covariate_keys - multiplicative technical effects
#                                (platform, 3' vs 5', donor effect); none used here
RegressionModel.setup_anndata(
    adata=adata_ref,
    batch_key='DonorID',
    labels_key='Celltype',
    categorical_covariate_keys=None,
)

# Instantiate the regression model (training happens in a later cell)
# and print the registered setup.
mod = RegressionModel(adata_ref)
mod.view_anndata_setup()

Train the model to estimate the reference cell type signatures.

In [None]:
# Fit the regression model.
# Use all data for training (validation not implemented yet, train_size=1).
# Optional overrides left disabled in this run: train_size=1, lr=0.002, use_gpu=True.
regression_train_settings = {'max_epochs': 1000, 'batch_size': 2500}
mod.train(**regression_train_settings)

Determine whether the model needs more training: if the training loss is still decreasing at the end of the plot, increase `max_epochs`.

In [None]:
# Plot the training history of the regression model.
# NOTE(review): the positional 20 presumably skips the first 20 epochs so the
# tail of the curve is readable — confirm against cell2location's plot_history signature.
mod.plot_history(20)

In [None]:
# Persist the trained regression model under ref_run_name,
# replacing any model previously saved there.
mod.save(ref_run_name, overwrite=True)

Export the estimated reference cell type signatures (summary of the posterior distribution).

In [None]:
# Summarise the posterior distribution of the trained regression model and
# store the result in adata_ref.
reference_sampling = {'num_samples': 1000, 'batch_size': 2500}
adata_ref = mod.export_posterior(adata_ref, sample_kwargs=reference_sampling)

In [None]:
# Display the gene-level annotation table to inspect its index and columns.
adata_ref.var

In [None]:
# Use the identifiers stored in the '_index' column of adata_ref.var as the
# gene index (var_names), then remove the helper column.
# Each step is guarded so the cell can be re-run safely: previously a second
# run (or an input without '_index') raised KeyError on var['ENSEMBL'].
if '_index' in adata_ref.var.columns:
    adata_ref.var.rename(columns={'_index': 'ENSEMBL'}, inplace=True)
if 'ENSEMBL' in adata_ref.var.columns:
    adata_ref.var_names = adata_ref.var['ENSEMBL']
    # The values now live in the index; drop the duplicate column.
    adata_ref.var.drop(columns='ENSEMBL', inplace=True)

In [None]:
# Display the gene-level annotation table again to check the updated index.
adata_ref.var

In [None]:
# Write the reference AnnData (including the exported results) into the
# reference model folder and show the path as the cell output.
adata_file = ref_run_name + "/sc.h5ad"
adata_ref.write(adata_file)
adata_file

In [None]:
# Draw the model's QC plots for the trained reference regression model.
mod.plot_QC()

## Deconvolution analysis

In [None]:
import sys
import scanpy as sc
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import seaborn as sns
import scvi

import cell2location

from matplotlib import rcParams
rcParams['pdf.fonttype'] = 42 # enables correct plotting of text for PDFs

# Output locations for the deconvolution section.
results_folder = './results/BPD/'

# Folders for the reference regression model (ref_run_name) and for the
# cell2location spatial model (run_name).
# results_folder already ends with '/', so strip it before joining to avoid a
# doubled slash ('./results/BPD//...') in saved and displayed paths.
ref_run_name = f"{results_folder.rstrip('/')}/ref_multiome_sc"
run_name = f"{results_folder.rstrip('/')}/ref_st_multiome"

In [None]:
# Reload the reference AnnData written by the reference-signature section
# and display its summary.
adata_file = ref_run_name + "/sc.h5ad"
adata_ref = sc.read_h5ad(adata_file)
adata_ref

In [None]:
# Build the per-cluster expression signature table (genes x factors).
factor_names = adata_ref.uns['mod']['factor_names']
signature_columns = [f'means_per_cluster_mu_fg_{name}' for name in factor_names]

# The signatures are read from varm when the key is present there,
# otherwise from the columns of var.
if 'means_per_cluster_mu_fg' in adata_ref.varm.keys():
    signature_table = adata_ref.varm['means_per_cluster_mu_fg']
else:
    signature_table = adata_ref.var
inf_aver = signature_table[signature_columns].copy()

# Label the columns with the plain factor names and preview a 5 x 5 corner.
inf_aver.columns = factor_names
inf_aver.iloc[0:5, 0:5]

In [None]:
# Load the spatial dataset; "st.h5ad" is resolved relative to the current
# working directory. Display the object summary as the cell output.
adata_vis = sc.read_h5ad("st.h5ad")
adata_vis

In [None]:
# Find the genes present in both the spatial data and the reference signatures.
intersect = np.intersect1d(adata_vis.var_names, inf_aver.index)

# Stop early with a clear message when nothing overlaps (for example when the
# two objects use different gene identifier namespaces); otherwise the model
# would be set up on an empty gene set and fail later with an obscure error.
if len(intersect) == 0:
    raise ValueError(
        "No shared genes between adata_vis.var_names and inf_aver.index; "
        "check that both use the same gene identifiers."
    )

# Subset both objects to the shared genes, in the same order.
adata_vis = adata_vis[:, intersect].copy()
inf_aver = inf_aver.loc[intersect, :].copy()

# prepare anndata for cell2location model
cell2location.models.Cell2location.setup_anndata(adata=adata_vis, batch_key="sample")

In [None]:
# Instantiate the cell2location model (training happens in a later cell).
#   cell_state_df        - reference signatures (genes x cell types)
#   N_cells_per_location - the expected average cell abundance: a tissue-dependent
#                          hyper-prior which can be estimated from paired histology
# detection_alpha (hyperparameter controlling normalisation of within-experiment
# variation in RNA detection) is not passed, so the library default applies;
# a value of 20 was left disabled here.
spatial_model_settings = {
    'cell_state_df': inf_aver,
    'N_cells_per_location': 8,
}
mod = cell2location.models.Cell2location(adata_vis, **spatial_model_settings)
mod.view_anndata_setup()

In [None]:
# Fit the cell2location model.
#   batch_size=None - train using the full data
#   train_size=1    - use all data points in training, because cell abundance
#                     must be estimated at all locations
spatial_train_settings = {
    'max_epochs': 30000,
    'batch_size': None,
    'train_size': 1,
}
mod.train(**spatial_train_settings)

In [None]:
# Persist the trained cell2location model under run_name,
# replacing any model previously saved there.
mod.save(run_name, overwrite=True)

In [None]:
# Export the estimated cell abundance (summary of the posterior distribution)
# into adata_vis. The sampling batch size is set to the number of observations
# in the model's AnnData, i.e. all observations are processed in one batch.
n_observations = mod.adata.n_obs
spatial_sampling = {'num_samples': 1000, 'batch_size': n_observations}
adata_vis = mod.export_posterior(adata_vis, sample_kwargs=spatial_sampling)

In [None]:
# Write the spatial AnnData (including the exported results) into the
# cell2location model folder and show the path as the cell output.
adata_file = run_name + "/sp.h5ad"
adata_vis.write(adata_file)
adata_file

In [None]:
# Draw the model's QC plots for the trained cell2location model.
mod.plot_QC()

In [None]:
# Spatial QC plot across batches; the return value is kept in `fig`
# (presumably the figure object, for later saving — TODO confirm).
fig = mod.plot_spatial_QC_across_batches()

In [None]:
# List the modules imported in this session (presumably with their versions,
# as a reproducibility record — TODO confirm the exact output).
cell2location.utils.list_imported_modules()