# Mapping diseased samples on top of healthy atlas

In [None]:
# General
import scipy as sci
import numpy as np
import pandas as pd
import logging
import time
import pickle
from itertools import chain
import h5py
import scipy.sparse as sparse
import anndata as ad
import gc
import scipy.stats as stats
import torch

# Plotting
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.colors as mcolors
from matplotlib import rcParams
from matplotlib.pyplot import rc_context
from matplotlib import cm
import seaborn as sb

# Analysis
import scanpy as sc
import scanpy.external as sce
import scvi

import scarches

In [None]:
# Warnings
import warnings
warnings.filterwarnings('ignore') #(action='once')

## setup matplotlib

In [None]:
# Notebook-wide configuration: output locations, scanpy logging, warning filter.

## Output directories (figures and cache are placed under the project folder)
base_dir = '/mnt/hdd/Notebooks/Gut_project/'
sc.settings.figdir = f'{base_dir}Figures'
sc.settings.cachedir = f'{base_dir}Cache'

## Scanpy logging: verbosity level 3, then print the header and package versions
sc.settings.verbosity = 3
sc.logging.print_header()
sc.logging.print_versions()

## Silence all Python warnings from here on
import warnings
warnings.filterwarnings(action='ignore')

In [None]:
%run utils.ipynb

In [None]:
mymap = load_RdOrYl_cmap_settings(transparent=False)

### set paths:

In [None]:
import os

In [None]:
# Filesystem locations used by the reference model and the scArches surgery step.
ref_model_dir_prefix = "/mnt/hdd/data/Healthy/"  # directory in which to store the reference model directory
# NOTE(review): this prefix ends in "Disease" while the query data below is read
# from ".../Diseased/" — confirm the differing directory name is intentional.
surgery_model_dir_prefix = (
    "/mnt/hdd/data/Disease"  # directory in which to store the surgery model directory
)
path_reference_emb = (
    "/mnt/hdd/Notebooks/Gut_project/adata_markedDoublets_mergedPeaks_normalized_initialAnno_rmDoublets_integrated_imputed_annotated.h5ad"  # path to reference embedding to be created
)
path_query_data = "/mnt/hdd/data/Diseased/adata_markedDoublets_normalized_initialAnno_noimmune_scvi_wodblts.h5ad"  # input test query data
# don't change the following paths:
# ref_model_dir is the saved scANVI model directory that is loaded further below.
ref_model_dir = "/mnt/hdd/data/Healthy/Models/2024-07-31_Healthy_mdata_markedDoublets_normalized_initialAnno_rmDoublets_integrated_labelsInitialCellType_layers2_hidden512_latent50_scANVI"  # don't change this
# surgery_model_dir is "<surgery_model_dir_prefix>/surgery_model".
surgery_model_dir = os.path.join(
    surgery_model_dir_prefix, "surgery_model"
)  # don't change this

### read models

In [None]:
adata_ref = sc.read_h5ad('/mnt/hdd/data/Healthy/adata_markedDoublets_normalized_initialAnno_noimmune_multivi_orig_wodblts_metadata.h5ad')

In [None]:
adata_ref

In [None]:
# Join sample and kit into one "<sample>_<kit>" covariate label for scArches.
# Vectorised string concatenation replaces the row-wise apply (and the dead
# first assignment that was immediately overwritten).
adata_ref.obs['covariates'] = (
    adata_ref.obs['sample'].astype(str) + '_' + adata_ref.obs['kit'].astype(str)
)

In [None]:
# Restore the trained scANVI reference model and attach the reference AnnData to it.
model_path = str(ref_model_dir)
model = scvi.model.SCANVI.load(model_path, adata=adata_ref)

# Latent coordinates of the reference cells as computed by the loaded model.
latent_representation = model.get_latent_representation()

# Wrap the latent matrix in its own AnnData (the matrix becomes .X).
adata_ref_latent = ad.AnnData(X=latent_representation)

In [None]:
# Attach the reference cell metadata to the latent AnnData. A plain copy keeps
# the rows one-to-one; indexing obs with its own index via .loc would multiply
# rows whenever a barcode occurs more than once.
adata_ref_latent.obs = adata_ref.obs.copy()

In [None]:
adata_ref_latent

In [None]:
adata_ref_latent.obs

In [None]:
adata_query_unprep = sc.read_h5ad(path_query_data)

In [None]:
adata_query_unprep.X = sparse.csr_matrix(adata_query_unprep.X)

In [None]:
del adata_query_unprep.obsm
del adata_query_unprep.varm

Note that the data should have raw counts and not normalized counts in adata.X. Let’s do a quick check to see if we have integer data:

In [None]:
adata_query_unprep.X[:10, :30].toarray()

In [None]:
adata_ref.var.head(5)

In [None]:
del adata_ref
gc.collect()

As you can see, the HLCA reference model requires Ensembl IDs. Therefore, if your data includes Ensembl IDs, we can proceed and use the standard scArches function to subset and pad our query AnnData. Make sure your adata_query_unprep.var.index contains the gene IDs. If you instead only have gene names and no IDs for your query data, we will have to prepare your data manually (see below).

The test data already has ensembl ids as index:

In [None]:
adata_query_unprep.var.head(5)

In [None]:
ref_model_dir

In [None]:
adata_query = scarches.models.SCANVI.prepare_query_anndata(
    adata=adata_query_unprep, reference_model=ref_model_dir, inplace=False
)

Your query adata will now have the same number of genes as the number of model input features:

In [None]:
adata_query

In [None]:
# Convert all columns in obs to strings.
# astype(str) gives the same element-wise str() conversion as applymap(str),
# without relying on DataFrame.applymap, which is deprecated in recent pandas.
adata_query.obs = adata_query.obs.astype(str)

In [None]:
# Join sample and kit into one "<sample>_<kit>" covariate label for scArches.
# Vectorised string concatenation replaces the row-wise apply (and the dead
# first assignment that was immediately overwritten).
adata_query.obs['covariates'] = (
    adata_query.obs['sample'].astype(str) + '_' + adata_query.obs['kit'].astype(str)
)

In [None]:
adata_query

In [None]:
# Join sample and kit into one "<sample>_<kit>" covariate label for scArches.
# Vectorised string concatenation replaces the row-wise apply (and the dead
# first assignment that was immediately overwritten).
adata_ref_latent.obs['covariates'] = (
    adata_ref_latent.obs['sample'].astype(str)
    + '_'
    + adata_ref_latent.obs['kit'].astype(str)
)

In [None]:
surgery_model = scarches.models.SCANVI.load_query_data(
    adata_query,
    ref_model_dir,
    freeze_dropout=True,
)

In [None]:
surgery_model.registry_["setup_args"]

There are three setup arguments that were used for building the reference model, and that should be used for preparing scArches surgery as well: 1. batch_key: this key is used to specify from which batch your query dataset comes. The HLCA reference model was set up to retain variation between individuals, and so rather than treating each sample or individual as a separate batch, each dataset was considered one batch. We therefore recommend using the same logic for an HLCA query, and set an entire dataset to a single batch. If your data has further splits that could result in specific batch effects, split your data into separate batches accordingly (e.g. if part of your data was generated with 10X 3’, and the rest with 10X 5’). 2. labels_key: as the HLCA has a scANVI reference model, it used cell type labels as input for the training. These cell type labels were stored in a column named ‘scanvi_label’. We recommend not using cell type labels for surgery, and so advise to set this column to ‘unlabeled’ (see below). 3. unlabeled_category: this variable specifies how cells without label were named for this specific model. As you can see, they were in this case set to the string ‘unlabeled’.

In [None]:
#We will furthermore set the cell type key to the unlabeled_category for all our cells, and recommend doing the same for any dataset mapped to the HLCA:

adata_query.obs["scanvi_label"] = "unlabeled"

In [None]:
# Training settings for the scArches surgery step.
surgery_epochs = 500  # upper bound on epochs; early stopping may end training sooner
early_stopping_kwargs_surgery = {
    # The monitor/patience/min_delta settings below only take effect when early
    # stopping is switched on explicitly; it is off by default in the trainer.
    "early_stopping": True,
    "early_stopping_monitor": "elbo_train",
    "early_stopping_patience": 10,
    "early_stopping_min_delta": 0.001,
    "plan_kwargs": {"weight_decay": 0.0},
}

In [None]:
surgery_model.train(max_epochs=surgery_epochs, **early_stopping_kwargs_surgery)

Now that we have the updated model, we can calculate the low-dimensional representation or “embedding” of our query data. Importantly, this embedding is in the same space as the HLCA core/reference embedding that you loaded in the beginning of the script. Hence, we can combine the two embeddings afterwards (HLCA + your new data), and do joint clustering, UMAP embedding, label transfer etc.! The latent embedding will be stored in a new anndata under .X with the following command:

In [None]:
# Embed the query cells with the surgery model; the latent matrix becomes .X.
adata_query_latent = sc.AnnData(surgery_model.get_latent_representation(adata_query))

# Copy over .obs metadata from our query data. A plain copy keeps the rows
# one-to-one; indexing obs with its own index via .loc would multiply rows
# whenever a barcode occurs more than once.
adata_query_latent.obs = adata_query.obs.copy()

Now that we have our query embedding, we can combine it with the pre-existing reference embedding that we downloaded at the top of this notebook. Once we have that joint embedding, we can do all kinds of analyses on the combined reference and query, including clustering, visualization, and label transfer (see below).

Before joining the reference and the query, let’s specify for the cells from each whether they came from the reference or the query:

In [None]:
# Tag every cell with its origin. Both tags are required: later cells select
# rows with ref_or_query == 'query' / 'ref' from the combined embedding, so
# leaving the query untagged would make the query selection come back empty.
adata_query_latent.obs["ref_or_query"] = "query"
adata_ref_latent.obs["ref_or_query"] = "ref"

We will now combine the two embeddings to enable joint clustering etc. If you expect non-unique barcodes (.obs index), set index_unique to e.g. “_” (this will add a suffix to your barcodes to ensure we can keep apart reference and query barcodes) and batch_key to the obs column that you want to use as barcode suffix (e.g. “ref_or_query”).

In [None]:
adata_query_latent.write('adata_diseased_latent.h5ad')

In [None]:
# Store the latent embedding of the reference cells in adata_ref.obsm, with rows
# selected from adata_ref_latent by adata_ref's obs index.
# NOTE(review): adata_ref is deleted in an earlier cell (`del adata_ref`); when
# the notebook is run top to bottom this cell needs adata_ref to be re-loaded
# first — confirm the intended execution order.
adata_ref.obsm["X_scarches_emb"] = adata_ref_latent[
    adata_ref.obs.index, :
].X  # copy over scArches/reference-based embedding

In [None]:
adata_ref.write('adata_ref_latent.h5ad')

In [None]:
adata_query_latent = sc.read_h5ad('adata_diseased_latent.h5ad') #start here

In [None]:
del adata_ref
gc.collect()

## concat to combined embedding

In [None]:
# Stack the reference and query latent AnnData objects (reference first) with an
# outer join; index_unique is left unset, so barcodes are kept as they are.
embeddings_to_merge = (adata_ref_latent, adata_query_latent)
combined_emb = sc.concat(embeddings_to_merge, join="outer")  # ,index_unique="_")

In [None]:
# Cast every obs column that is neither categorical nor float to str before writing.
for cat in combined_emb.obs.columns:
    obs_column = combined_emb.obs[cat]
    keep_as_is = isinstance(obs_column.values, pd.Categorical) or (
        pd.api.types.is_float_dtype(obs_column)
    )
    if keep_as_is:
        continue
    print(
        f"Setting obs column {cat} (not categorical neither float) to strings to prevent writing error."
    )
    combined_emb.obs[cat] = obs_column.astype(str)

In [None]:
# Remove the 'Internal ID' column; errors='ignore' keeps the cell re-runnable
# when the column has already been dropped or is absent.
combined_emb.obs.drop(columns=['Internal ID'], errors='ignore', inplace=True)

In [None]:
combined_emb.write_h5ad("combined_embedding_diseased_healthy_scarches.h5ad")

In [None]:
combined_emb = sc.read_h5ad('combined_embedding_diseased_healthy_scarches.h5ad')

In [None]:
adata_query_latent = combined_emb[combined_emb.obs['ref_or_query']=='query'].copy()

In [None]:
adata_query_latent.obs

In [None]:
adata_query_latent

In [None]:
adata_ref_latent = combined_emb[combined_emb.obs['ref_or_query']=='ref'].copy()

In [None]:
# Read only the cell_type_annotation_lv1 obs column from the annotated reference
# file instead of loading the whole AnnData object.
from anndata._io.specs import read_elem

# Use the configured absolute path rather than a bare file name, so that the
# cell does not depend on the notebook's current working directory.
with h5py.File(path_reference_emb, 'r') as f:
    # read_elem decodes a single stored element
    # (see https://github.com/scverse/anndata/issues/436).
    anno_obs = read_elem(f["obs/cell_type_annotation_lv1"])

In [None]:
# Attach the annotation read from the reference file to the reference latent
# obs and store it as a categorical column.
# NOTE(review): presumably anno_obs is in the same cell order as
# adata_ref_latent.obs — verify before relying on the result.
adata_ref_latent.obs['cell_type_annotation_lv1'] = anno_obs
adata_ref_latent.obs['cell_type_annotation_lv1'] = adata_ref_latent.obs['cell_type_annotation_lv1'].astype('category')

In [None]:
adata_ref_latent.X

## Label transfer
Next, we use a knn classifier to transfer the labels from the reference to the query. As the HLCA includes 5 levels of annotations (from coarse to fine), we will do the label transfer for every level of annotation. Note that some cell types don’t have annotations for higher levels, e.g. mast cells do not have level 4 or 5 annotations. For those cell types, we will “propagate” to the higher levels, i.e. you will see “3_Mast cells” in level 4 and 5 annotations. (Most cell types don’t have a level 5 annotation!) Therefore, all highest level annotations can be found under level 5.

In [None]:
knn_transformer = scarches.utils.knn.weighted_knn_trainer(
    train_adata=adata_ref_latent,
    train_adata_emb="X",  # location of our joint embedding
    n_neighbors=50,
)

Now let’s perform label transfer for the 5 levels of labels in the reference (“ann_level_1” to “ann_level_5”)

In [None]:
combined_emb

In [None]:
adata_query_latent

In [None]:
adata_ref_latent

In [None]:
adata_query_latent.obs['cell_type_annotation_lv1'] = adata_query_latent.obs['initial_cell_type']

In [None]:
adata_ref_latent

In [None]:
# Transfer cell_type_annotation_lv1 from the reference to the query cells with
# the previously trained knn model; returns the transferred labels and the
# matching per-cell uncertainties.
labels, uncert = scarches.utils.knn.weighted_knn_transfer(
    query_adata=adata_query_latent,
    query_adata_emb="X",  # location of our embedding, query_adata.X in this case
    label_keys="cell_type_annotation_lv1",  # (start of) obs column name(s) for which to transfer labels, if issue: make sure you did not already add the further steps to adata
    knn_model=knn_transformer,
    ref_adata_obs=adata_ref_latent.obs,
)

In [None]:
labels

With the commands above, we labeled every cell from the query (labels dataframe). Moreover, for each query cell we get an uncertainty score that tells you how confidently the label was assigned to the cell (uncert dataframe). This uncertainty score is based on how consistent the reference labels were among the nearest neighbors of the query cell. High label transfer uncertainty can indicate a number of things: 1. The cell lies in between two cellular phenotypes, e.g. in the case of a continuous transition of one cell type into another. 2. The cell is of a cell type or subtype not present in the reference. For example, the HLCA does not include erythrocytes. Erythrocytes in a query dataset will therefore likely be labeled with high uncertainty. Similarly, disease samples might include disease-affected cell types that look different from the cells in a healthy reference. These also likely have high label transfer uncertainty. 3. The mapping did not successfully remove batch-effects in the query data from the embedding. Query cells do not mix with the reference in the joint embedding, complicating confident label transfer. To distinguish low-uncertainty from high-uncertainty transferred labels, we will set our high-uncertainty labels to “unknown” instead of giving them a cell type label. Cells with high uncertainty should be looked into in downstream analysis.

We set the uncertainty threshold to 0.2, limiting the false positive rate to <0.5 (as per Sikkema et al., bioRxiv 2022). If you are dealing with data that you expect to look very different from your reference (e.g. mouse data or cell line data), you could consider setting this threshold higher.

In [None]:
# Remove transfer columns left over from an earlier run so that the join in the
# next cell does not collide with them. errors='ignore' makes this a no-op on a
# first run, when these columns do not exist yet.
combined_emb.obs.drop(
    columns=[
        'cell_type_annotation_lv1_transferred_label_unfiltered',
        'cell_type_annotation_lv1_transfer_uncert',
        'cell_type_annotation_lv1_transferred_label',
    ],
    errors='ignore',
    inplace=True,
)

In [None]:
uncertainty_threshold = 0.2 #in the HLCA it is 0.2

# Give the transferred-label and uncertainty columns descriptive suffixes, then
# attach both tables to the obs of the combined (reference + query) embedding.
labels.rename(
    columns={
        level: level + "_transferred_label_unfiltered"
        for level in ['cell_type_annotation_lv1']
    },
    inplace=True,
)
uncert.rename(
    columns={
        level: level + "_transfer_uncert"
        for level in ['cell_type_annotation_lv1']
    },
    inplace=True,
)

combined_emb.obs = combined_emb.obs.join(labels).join(uncert)



In [None]:
uncert

In [None]:
combined_emb

In [None]:
combined_emb.obs.cell_type_annotation_lv1_transfer_uncert

In [None]:
# Build a filtered label column: transferred labels whose uncertainty exceeds
# the threshold are replaced by "Unknown".
uncertainty_threshold = 0.2 #in the HLCA it is 0.2

for anno in ['cell_type_annotation_lv1']:
    unfiltered_labels = combined_emb.obs[f"{anno}_transferred_label_unfiltered"]
    too_uncertain = combined_emb.obs[f"{anno}_transfer_uncert"] > uncertainty_threshold
    combined_emb.obs[f"{anno}_transferred_label"] = unfiltered_labels.mask(
        too_uncertain, "Unknown"
    )

# Report the share of "Unknown" labels relative to the number of query cells.
print(
    f"Percentage of unknown per level, with uncertainty_threshold={uncertainty_threshold}:"
)
for anno in ['cell_type_annotation_lv1']:
    n_unknown = sum(combined_emb.obs[f'{anno}_transferred_label'] =='Unknown')
    pct_unknown = np.round(n_unknown/adata_query_latent.n_obs*100,2)
    print(f"{anno}: {pct_unknown}%")



Important note! In some environments with older versions of scanpy/scvi-tools/scarches, there is a bug in the above code that we have not been able to properly pinpoint and fix. If you observe percentages of (close to) 100% of unknown above, you likely have the same bug and should update your packages. The transfered labels will then also be shuffled/random. (See also note at the top of this notebook).

## Visualization of the joint reference and query embedding
We will use a UMAP plot of our data to visually inspect the results of the mapping and label transfer. Calculating this will take a while on the HLCA (>.5M cells) + query.

In [None]:
combined_emb

In [None]:
sc.pp.neighbors(combined_emb, n_neighbors=30)
sc.tl.umap(combined_emb)

Let’s first take a look at where our query cells are located in the umap. If they are completely separate from the reference, this could be a sign that something went wrong in the mapping. In our case, the query cells are largely mixing with or close to the reference cells in the UMAP.


In [None]:
sc.pl.umap(combined_emb, color="ref_or_query", frameon=False,legend_fontsize=9, save='Umap_transfer_learned_superposed3.png', title= 'Joint embedding')

In [None]:
combined_emb #

In [None]:
sc.pl.umap(combined_emb, color=metadata+["ref_or_query"], frameon=False,legend_fontsize=8.5, ncols = 3, wspace = 0.9,save='Umap_transfer_learned_superposed_metadata.png')

In [None]:
# UMAP of the combined embedding coloured by marker genes, saved to the figure directory.
# NOTE(review): `marker_genes` and the 'log_dca_counts' layer are not defined
# in the visible part of this notebook (possibly provided by utils.ipynb);
# combined_emb is built from latent embeddings — confirm that it actually
# carries these genes and this layer before running the cell.
sc.pl.umap(
    combined_emb,
    color=marker_genes,
    #vmax="p99",
    cmap = mymap,
    layer = 'log_dca_counts',
    #wspace=0.7,
    ncols=4,
    save = 'umap_markers_transfered_embedding_joint_imputed.png'
)

### metadata actualisation

In [None]:
## Load the sample sheet and reduce it to the columns used as covariates
metadata_df = read_excel_metadata('/mnt/hdd/data/metadata_mouse_gut.xlsx')
# Discard the Multiome ATAC rows
atac_rows = metadata_df.index[metadata_df['kit'] == 'Multiome_ATAC_v1']
metadata_df.drop(atac_rows, inplace=True)
#metadata_df.drop(metadata_df[metadata_df['condition'].isin(['Ctr','Ctr/WT'])].index, inplace=True)
# Index by folder name so that rows can be looked up by sample
metadata_df.set_index('folder name', inplace=True)
unused_columns = [
    'Sample Pooling - confounded with Project?',
    'date',
    'Project Name',
    'Link_id',
    'sample name',
    'Cell Count [cells/µl]',
    'Viable Cells [%]',
    'Lib. Concentration [ng/µl]',
    'Lib. Molarity [nM]',
    'Average Lib. Size [bp]',
    'cDNA Cycles',
    'Lib. Cycles',
    '10x Sample Index',
    'Sequencing Depth [reads/cell]',
    'exclusion, reason',
]
metadata_df.drop(unused_columns, axis=1, inplace=True)

In [None]:
# Copy every metadata column onto the cells of combined_emb by looking up each
# cell's 'sample' value in metadata_df (indexed by 'folder name').
for col in metadata_df.columns:
    try:
        combined_emb.obs[col] = combined_emb.obs['sample'].apply(lambda x: metadata_df.at[x, col])
    except KeyError as err:
        # A sample without a metadata row aborts this column only; it is
        # reported and the loop moves on to the next column.
        print(f'no such key: {err} in col {col}')

### label transfer

Now let’s take a look at the label transfer uncertainties per level. Regions with high uncertainty can highlight interesting cell types/states, not present in the reference. Note that uncertainties will get higher, the more detailed we go. Note that as we only used very few cells in the query here, they are more difficult to see in the joint embedding.


Now let’s take a look at the transferred labels, at every level. Note that the color for “Unknown” switches per plot, and that all cells from the reference are set to NA.

In [None]:
# UMAP coloured by the label-transfer uncertainty. The column is named
# explicitly instead of relying on the loop variable `anno` left over from an
# earlier cell, so this cell also works when that cell was not run in this session.
sc.pl.umap(
    combined_emb,
    color=["cell_type_annotation_lv1_transfer_uncert"],
    na_color="grey",
    ncols=2,
    size=2,
    wspace=1,
    save='Umap_transfer_learned_superposed_label_uncertainty.png'
)

In [None]:
# obs column holding the unfiltered transferred labels. Spelled out explicitly
# instead of being derived from the loop variable `anno` left over from an
# earlier cell.
annotation_key = "cell_type_annotation_lv1_transferred_label_unfiltered"

In [None]:
combined_emb.obs[annotation_key] = combined_emb.obs[annotation_key].astype('category')

In [None]:
# Put the transferred-label categories into a fixed display order.
# NOTE(review): 'TA (prox.))' has a doubled closing parenthesis — confirm it
# matches the label as spelled in the data. reorder_categories also expects the
# listed categories to match the existing ones exactly, so a label missing from
# the transfer result (or an extra one) will make this call fail.
combined_emb.obs[annotation_key] = combined_emb.obs[annotation_key].cat.reorder_categories(['ISC', 'TA', 'TA (prox.))', 'early Enterocyte', 'Enterocyte', 
'Tuft prog.', 'Tuft prog. 2', 'Tuft', 
'Goblet/EEC prog. (early)', 'EEC prog. (mid)', 'EEC prog. (late/Peptide)', 'EEC (Peptide/immature)', 'X-cell (Ghrl+)',  'K-cell (Gip+)', 'L/I-cell (Glp1+/Cck+)', 'D-cell (Sst+)',
'EC prog. (late)', 'EC (immature)', 'EC (mature)','EC 2', 
 'Goblet prog. (late)', 'Goblet',  'Paneth prog.', 'Paneth', 'unknown0'])

In [None]:
combined_emb.uns[f'{annotation_key}' + '_colors'] = ['#d0d0d0',  # ISC
 '#eebcbc',  # TA
 '#fee0d2',  # TA prox
 '#c67a84',  # early Enterocyte
 '#bb4353',  # Enterocyte
 '#eca4d0',  # Tuft prog.
 '#df65b0',  # Tuft prog. 2
 '#e7298a',  # Tuft
 '#e1f3bf',  # Goblet/EEC prog.
 '#d9edf7',  # EEC prog
 '#85c6e6',  # EEC prog. (late/Peptide)
 '#46a8d9',  # EEC (peptide/immature)
 '#339a98',  # X-cell (Ghrl+)
 '#368cbf',  # K-cell (Gip+)
 '#5a72dd',  # L/I-cell (Glp1+/Cck+)
 '#243dae',  # D-cell (Sst+)
 '#d0d1e6',  # EC prog.
 '#aa9dce',  # EC (imm.)
 '#594495',  # EC (mature)
 '#725dae',  # EC 2
 '#fec44f',  # Goblet prog.
 '#dd894e',  # Goblet
 '#7BB98F',  # Paneth prog.
 '#238b45',  # Paneth
 '#ac9470'   # unknown0
]

In [None]:
# UMAP coloured by the unfiltered transferred labels. The column is addressed
# through annotation_key, the same key the category order and colours were set
# for, instead of being rebuilt from the leftover loop variable `anno`.
sc.pl.umap(
    combined_emb,
    color=[annotation_key],
    na_color="grey",
    ncols=2,
    size=2,
    wspace=1,
    legend_fontsize =9,
    save='Umap_transfer_learned_superposed_transferred_labels.png',
    title = 'Transfered cell types on diseased query'
)

In [None]:
del adata_ref_latent
gc.collect()

## explore query

In [None]:
# Start from a copy of the original query AnnData (gene counts included).
adata_query_final = adata_query_unprep.copy()

# Add the scArches/reference-based embedding, rows ordered like adata_query_final.
query_embedding = adata_query_latent[adata_query_final.obs.index, :]
adata_query_final.obsm["X_scarches_emb"] = query_embedding.X

In [None]:
# Keep only the query cells (reference and query barcodes overlap). The subset
# is materialised with .copy() because later cells write into its .obs, which
# should not happen on an AnnData view.
# NOTE(review): from here on combined_emb no longer contains reference cells,
# although later sections are titled "combined" — confirm this is intended.
combined_emb = combined_emb[combined_emb.obs["ref_or_query"]=='query'].copy() #because of barcode overlapping -.-

In [None]:
# Copy the label-transfer columns onto the query AnnData.
# Only the query rows are used for the lookup: reference and query barcodes can
# overlap, and a label-based lookup on duplicated barcodes would return extra
# rows. The filter is a no-op when combined_emb already holds query cells only.
query_obs = combined_emb.obs[combined_emb.obs["ref_or_query"] == "query"]
for col in query_obs.columns:
    if col.startswith("cell_type") and "transfer" in col:
        print(col)
        adata_query_final.obs[col] = query_obs.loc[
            adata_query_final.obs.index, col
        ]

In [None]:
# Make the transfer columns writable: the label columns become plain strings,
# while the uncertainty is cast to float so that it stays numeric instead of
# being stored as text.
for ctanno in ['cell_type_annotation_lv1_transferred_label_unfiltered','cell_type_annotation_lv1_transferred_label']:
    adata_query_final.obs[ctanno] = adata_query_final.obs[ctanno].astype(str)
adata_query_final.obs['cell_type_annotation_lv1_transfer_uncert'] = (
    adata_query_final.obs['cell_type_annotation_lv1_transfer_uncert'].astype(float)
)
adata_query_final.write('adata_query_final.h5ad')

In [None]:
adata_query_final = sc.read_h5ad('adata_query_final.h5ad')

In [None]:
del adata_query_unprep
gc.collect()

In [None]:
adata2 = sc.read_h5ad('/mnt/hdd/data/Diseased/Dbtl_detected_velocyto_scran_diseased_sct_imputed_subsetted.h5ad')

In [None]:
# Variable names of the imputed dataset, as a plain list.
vars_imputed = list(adata2.var_names)

In [None]:
dca = adata2.layers['log_dca_counts']

In [None]:
del adata2
gc.collect()

In [None]:
# Restrict the query to the imputed variables. .copy() turns the resulting view
# into a real AnnData, since a layer is assigned to it in a following cell.
adata_query_final = adata_query_final[:, vars_imputed].copy()

In [None]:
adata_query_final

In [None]:
adata_query_final.layers['log_dca_counts']= dca

In [None]:
del dca
gc.collect()

In [None]:
## Load the sample sheet and reduce it to the columns used as covariates
metadata_df = read_excel_metadata('/mnt/hdd/data/metadata_mouse_gut.xlsx')
# Discard the Multiome ATAC rows
atac_rows = metadata_df.index[metadata_df['kit'] == 'Multiome_ATAC_v1']
metadata_df.drop(atac_rows, inplace=True)
# Discard the control conditions
control_rows = metadata_df.index[metadata_df['condition'].isin(['Ctr','Ctr/WT'])]
metadata_df.drop(control_rows, inplace=True)
# Index by folder name so that rows can be looked up by sample
metadata_df.set_index('folder name', inplace=True)
unused_columns = [
    'Sample Pooling - confounded with Project?',
    'date',
    'Project Name',
    'Link_id',
    'sample name',
    'Cell Count [cells/µl]',
    'Viable Cells [%]',
    'Lib. Concentration [ng/µl]',
    'Lib. Molarity [nM]',
    'Average Lib. Size [bp]',
    'cDNA Cycles',
    'Lib. Cycles',
    '10x Sample Index',
    'Sequencing Depth [reads/cell]',
    'exclusion, reason',
]
metadata_df.drop(unused_columns, axis=1, inplace=True)

In [None]:
# Copy every metadata column onto the cells of adata_query_final by looking up
# each cell's 'sample' value in metadata_df (indexed by 'folder name').
for col in metadata_df.columns:
    try:
        adata_query_final.obs[col] = adata_query_final.obs['sample'].apply(lambda x: metadata_df.at[x, col])
    except KeyError as err:
        # A sample without a metadata row aborts this column only; it is
        # reported and the loop moves on to the next column.
        print(f'no such key: {err} in col {col}')

In [None]:
adata_query_final

## add phase to diseased

In [None]:
all_cc_genes, s_genes_regev, g2m_genes_regev, cc_genes_regev, cc_genes_macosko, s_genes_macosko, g2m_genes_macosko, m_genes_macosko, mg1_genes_macosko, g1s_genes_macosko = load_cell_cycle_genes(adata_query_final, genome='mus_musculus')

In [None]:
sc.tl.score_genes_cell_cycle(adata_query_final, s_genes=s_genes_regev, g2m_genes=g2m_genes_regev)

In [None]:
# Label cells in S or G2M phase as 'Cycling', all others as 'Non-Cycling'.
# A single vectorised assignment replaces the chained-indexing writes
# (obs[col][mask] = ...), which pandas does not guarantee to write through.
is_cycling = adata_query_final.obs['phase'].isin(['G2M', 'S'])
adata_query_final.obs['proliferation'] = np.where(is_cycling, 'Cycling', 'Non-Cycling')

### add diseased cell cycle

In [None]:
# Copy the cell-cycle phase of the query cells into the combined embedding.
# The write goes through .loc with a boolean mask instead of chained indexing,
# and the phase is looked up per barcode among the selected query rows only.
query_mask = (combined_emb.obs['ref_or_query'] == 'query').to_numpy()
if 'phase' in combined_emb.obs:
    # A categorical column would reject values outside its categories.
    combined_emb.obs['phase'] = combined_emb.obs['phase'].astype(object)
query_phase = adata_query_final.obs['phase'].astype(object)
combined_emb.obs.loc[query_mask, 'phase'] = (
    combined_emb.obs.index[query_mask].map(query_phase).to_numpy()
)

### covariates combined

In [None]:
# PCA on the combined embedding; the covariate analysis below uses n_pcs components.
sc.tl.pca(combined_emb)

n_pcs = 49

# obs columns whose association with the principal components is quantified below
covariates = [
    "sample",
    'doublet_calls',
    'final_doublets',
    'final_doublets_cat',
    'phase',
    'proliferation',
    'initial_cell_type',
    'Project',
    'sequencing',
    'condition',
    'kit',
    'line',
    'strain',
    'enriched',
    'enrichment proportion',
    'diet',
    'Index Type',
    'sequencing machine',
    'cell_type_annotation_lv1_transferred_label_unfiltered',
]

Create shuffled assignment of single cell platform (and processing site if included), to compare actual variance explained to variance explained expected by random. We will assign all cells of the same sample to the same value.

In [None]:
include_processing_site =True

In [None]:
# Create shuffled per-sample versions of 'Project' (and of 'sequencing machine'
# when include_processing_site is set) as a random baseline. All cells of one
# sample receive the same shuffled value.
shuffle_columns = ["Project"]
if include_processing_site:
    shuffle_columns.append("sequencing machine")
sample_to_scplatform = combined_emb.obs.groupby("sample").agg(
    {column: "first" for column in shuffle_columns}
)
for i in range(10):
    for column in shuffle_columns:
        shuffled_name = f"{column}_shuffled_{i}"
        # np.random.permutation returns a shuffled copy of the values;
        # shuffling a pandas Series in place with np.random.shuffle is not
        # reliable (label-based item access, possibly a temporary object).
        shuffled_values = np.random.permutation(sample_to_scplatform[column].to_numpy())
        combined_emb.obs[shuffled_name] = combined_emb.obs["sample"].map(
            dict(zip(sample_to_scplatform.index, shuffled_values))
        )
        # Guard against duplicate entries when the cell is executed again.
        if shuffled_name not in covariates:
            covariates.append(shuffled_name)

Now check for every covariate, for every PC how much variance among the cells' PC scores the covariate can explain. Add this variance explained per PC up across PCs for every covariate. This will give us the total amount of variance explained per covariate.

In [None]:
from sklearn.linear_model import LinearRegression

def check_if_nan(value):
    """Return True if value represents a missing entry, otherwise False.

    Treated as missing: None, a floating-point NaN (Python float or any
    NumPy floating type, e.g. np.float32), and the strings "nan" and "ND".

    Example use:
    none_entries = subadata.obs.applymap(check_if_nan)
    subadata.obs = subadata.obs.mask(none_entries.values)
    """
    if value is None:
        return True
    # np.float32 is not a subclass of float, so np.floating is checked as well.
    if isinstance(value, (float, np.floating)) and np.isnan(value):
        return True
    if isinstance(value, str):
        return value in ("nan", "ND")
    return False

In [None]:
# For every covariate, fit a linear model of the PC scores on that covariate and
# record the variance of the fitted values per PC; the sum over PCs is the total
# variance explained by the covariate.
pc_scores = combined_emb.obsm["X_pca"][:, :n_pcs]
var_explained = pd.DataFrame(
    np.nan, index=range(n_pcs), columns=covariates + ["overall"], dtype=float
)
var_explained["overall"] = np.var(pc_scores, axis=0)
# otypes is given explicitly so that an empty column does not make np.vectorize fail.
is_missing = np.vectorize(check_if_nan, otypes=[bool])
for cov in covariates:
    x = combined_emb.obs[cov].values.copy()
    observed = ~is_missing(x)
    x = x[observed]
    if len(x) == 0:
        continue  # nothing to fit; the column stays NaN
    if pd.api.types.is_float_dtype(x.dtype):
        # Numeric covariate: a single regressor. (The string conversion of
        # column names below applies to the dummy table only; an ndarray has
        # no .columns.)
        design = np.asarray(x, dtype=float).reshape(-1, 1)
    else:
        if len(set(x)) == 1:
            continue  # a constant covariate explains nothing; the column stays NaN
        design = pd.get_dummies(x)
        design.columns = design.columns.astype(str)
    # The design matrix is built once per covariate and all PCs are fitted
    # together as a multi-output regression.
    lrf = LinearRegression(fit_intercept=True).fit(design, pc_scores[observed])
    var_explained[cov] = np.var(lrf.predict(design), axis=0)
total_variance_explained = var_explained.sum(axis=0).sort_values(ascending=False)
total_variance_explained_fractions = (
    total_variance_explained / total_variance_explained["overall"]
)

Do the same for the shuffled covariates. Calculate mean over shuffling instances, add as one value to clean fractions:

In [None]:
# Collapse the shuffled covariates into one mean value each (keeping their
# standard deviation separately) and drop the individual shuffles.
fraction_names = total_variance_explained_fractions.index
shuffled_prefixes = ("sequencing machine_shuffled", "Project_shuffled")
total_variance_explained_clean = total_variance_explained_fractions[
    [name for name in fraction_names if not name.startswith(shuffled_prefixes)]
]

project_shuffle_fractions = total_variance_explained_fractions[
    [name for name in fraction_names if name.startswith("Project_")]
]
total_variance_explained_clean["Project_shuffled"] = np.mean(project_shuffle_fractions)
stdev_Project_shuffled = np.std(project_shuffle_fractions)

if include_processing_site:
    machine_shuffle_fractions = total_variance_explained_fractions[
        [
            name
            for name in fraction_names
            if name.startswith("sequencing machine_shuffled")
        ]
    ]
    total_variance_explained_clean["sequencing machine_shuffled"] = np.mean(
        machine_shuffle_fractions
    )
    stdev_processing_site_shuffled = np.std(machine_shuffle_fractions)


Sort results:

In [None]:
total_variance_explained_clean.sort_values(ascending=False, inplace=True)

Plot:

In [None]:
# Bar plot of the fraction of variance explained per covariate, smallest first.
plt.figure(figsize=(8, 4))
plt.bar(
    total_variance_explained_clean[::-1].index,
    total_variance_explained_clean[::-1].values,
)
# The number of PCs in the title follows n_pcs instead of a hard-coded value
# that can drift away from the analysis above.
plt.title(
    f"covariate correlation with first {n_pcs} PCs of healthy and diseased samples combined",
    fontsize=14,
)
plt.xticks(rotation=90)
plt.show()

## main covariates diseased

In [None]:
# PCA on the query data; the covariate analysis below uses n_pcs components.
sc.tl.pca(adata_query_final)

n_pcs = 50

# obs columns whose association with the principal components is quantified below
covariates = [
    "sample",
    'doublet_calls',
    'final_doublets',
    'final_doublets_cat',
    'phase',
    'proliferation',
    'initial_cell_type',
    'Project',
    'sequencing',
    'condition',
    'kit',
    'line',
    'strain',
    'enriched',
    'enrichment proportion',
    'diet',
    'Index Type',
    'sequencing machine',
    'cell_type_annotation_lv1_transferred_label_unfiltered',
]

Create shuffled assignment of single cell platform (and processing site if included), to compare actual variance explained to variance explained expected by random. We will assign all cells of the same sample to the same value.

In [None]:
include_processing_site =True

In [None]:
# Create shuffled per-sample versions of 'Project' (and of 'sequencing machine'
# when include_processing_site is set) as a random baseline. All cells of one
# sample receive the same shuffled value.
shuffle_columns = ["Project"]
if include_processing_site:
    shuffle_columns.append("sequencing machine")
sample_to_scplatform = adata_query_final.obs.groupby("sample").agg(
    {column: "first" for column in shuffle_columns}
)
for i in range(10):
    for column in shuffle_columns:
        shuffled_name = f"{column}_shuffled_{i}"
        # np.random.permutation returns a shuffled copy of the values;
        # shuffling a pandas Series in place with np.random.shuffle is not
        # reliable (label-based item access, possibly a temporary object).
        shuffled_values = np.random.permutation(sample_to_scplatform[column].to_numpy())
        adata_query_final.obs[shuffled_name] = adata_query_final.obs["sample"].map(
            dict(zip(sample_to_scplatform.index, shuffled_values))
        )
        # Guard against duplicate entries when the cell is executed again.
        if shuffled_name not in covariates:
            covariates.append(shuffled_name)

Now check for every covariate, for every PC how much variance among the cells' PC scores the covariate can explain. Add this variance explained per PC up across PCs for every covariate. This will give us the total amount of variance explained per covariate.

In [None]:
from sklearn.linear_model import LinearRegression

def check_if_nan(value):
    """Return True if ``value`` represents a missing entry, else False.

    The following are treated as missing: ``None``, a floating-point NaN
    (Python ``float`` or any NumPy floating scalar such as ``np.float32``),
    and the strings ``"nan"`` and ``"ND"``.

    Example use:
    none_entries = subadata.obs.applymap(check_if_nan)
    subadata.obs = subadata.obs.mask(none_entries.values)
    """
    if value is None:
        return True
    # np.floating covers NumPy scalars that do not subclass the builtin float
    # (e.g. np.float32), which a plain isinstance(value, float) would miss.
    if isinstance(value, (float, np.floating)):
        return bool(np.isnan(value))
    return isinstance(value, str) and value in ("nan", "ND")

In [None]:
# For every covariate, regress the cells' PC scores on the covariate and record
# the variance of the fitted values per PC (= variance explained by the covariate).
# The "overall" column holds the total variance of each PC.
pc_scores = np.asarray(adata_query_final.obsm["X_pca"])[:, :n_pcs]
# dict.fromkeys drops duplicate covariate names while preserving their order,
# so the frame never ends up with duplicate columns.
var_explained = pd.DataFrame(
    index=range(n_pcs),
    columns=list(dict.fromkeys(covariates)) + ["overall"],
    dtype=float,
)
var_explained["overall"] = np.var(pc_scores, axis=0)
for cov in var_explained.columns.drop("overall"):
    x = adata_query_final.obs[cov].values.copy()
    # otypes makes np.vectorize well-defined even for zero-length input.
    x_nans = np.vectorize(check_if_nan, otypes=[bool])(x)
    x = x[~x_nans]
    if len(x) == 0:
        # No usable values: leave the column as NaN.
        continue
    if x.dtype in ["float32", "float", "float64"]:
        # Continuous covariate: single regressor column.
        x = x.reshape(-1, 1)
    else:
        if len(set(x)) == 1:
            # A constant covariate cannot explain any variance: leave as NaN.
            continue
        # Categorical covariate: one-hot encode. The column names are cast to
        # str only here, because the continuous branch yields a plain ndarray
        # that has no `.columns` attribute.
        x = pd.get_dummies(x)
        x.columns = x.columns.astype(str)
    # The design matrix does not depend on the PC, so fit all PCs at once
    # (multi-target least squares) instead of rebuilding it for every PC.
    y_true = pc_scores[~x_nans]
    lrf = LinearRegression(fit_intercept=True).fit(x, y_true)
    var_explained[cov] = np.var(lrf.predict(x), axis=0)
total_variance_explained = var_explained.sum(axis=0).sort_values(ascending=False)
total_variance_explained_fractions = (
    total_variance_explained / total_variance_explained["overall"]
)

Do the same for the shuffled covariates. Calculate mean over shuffling instances, add as one value to clean fractions:

In [None]:
# Split the per-covariate fractions into real covariates and shuffled controls,
# then summarise each group of shuffled controls by its mean (added to the
# clean Series as a single entry) and its standard deviation.
fraction_labels = list(total_variance_explained_fractions.index)

# Real covariates: everything that is not one of the shuffled control columns.
# .copy() makes the Series independent before new entries are added to it.
total_variance_explained_clean = total_variance_explained_fractions[
    [
        label
        for label in fraction_labels
        if not label.startswith(("sequencing machine_shuffled", "Project_shuffled"))
    ]
].copy()

# Use the full "Project_shuffled" prefix (the same one used for the filter
# above) so that an unrelated covariate starting with "Project_" can never be
# mistaken for a shuffled control.
project_shuffled_fractions = total_variance_explained_fractions[
    [label for label in fraction_labels if label.startswith("Project_shuffled")]
]
total_variance_explained_clean["Project_shuffled"] = np.mean(
    project_shuffled_fractions
)
stdev_Project_shuffled = np.std(project_shuffled_fractions)

if include_processing_site:
    processing_site_shuffled_fractions = total_variance_explained_fractions[
        [
            label
            for label in fraction_labels
            if label.startswith("sequencing machine_shuffled")
        ]
    ]
    total_variance_explained_clean["sequencing machine_shuffled"] = np.mean(
        processing_site_shuffled_fractions
    )
    stdev_processing_site_shuffled = np.std(processing_site_shuffled_fractions)


Sort results:

In [None]:
# Order covariates from most to least variance explained.
total_variance_explained_clean = total_variance_explained_clean.sort_values(
    ascending=False
)

Plot:

In [None]:
# Bar plot of the fraction of variance explained per covariate. The Series is
# reversed once so that bars are drawn in the opposite order to the sorted Series.
fractions_to_plot = total_variance_explained_clean[::-1]
plt.figure(figsize=(8, 4))
plt.bar(fractions_to_plot.index, fractions_to_plot.values)
# Take the PC count from n_pcs so the title cannot drift from the analysis.
plt.title(
    f"covariate correlation with first {n_pcs} PCs of diseased samples",
    fontsize=14,
)
plt.xticks(rotation=90)
plt.show()

## plot query only with covariates

In [None]:
# Build the neighbourhood graph on the "X_scarches_emb" representation,
# then compute the UMAP embedding from that graph.
sc.pp.neighbors(
    adata_query_final,
    use_rep="X_scarches_emb",
)
sc.tl.umap(adata_query_final)

In [None]:
# Prefix shared by the annotation-related .obs columns used below.
anno = "cell_type_annotation_lv1"

In [None]:
# .obs column holding the transferred labels.
annotation_key = anno + "_transferred_label"

In [None]:
# Number of cells per transferred label.
adata_query_final.obs.loc[:, annotation_key].value_counts()

In [None]:
# Put the categories of the label column into a fixed order. The list must
# contain exactly the categories present in the column.
adata_query_final.obs[annotation_key] = adata_query_final.obs[
    annotation_key
].cat.reorder_categories(
    [
        "ISC",
        "TA",
        "TA (prox.))",
        "early Enterocyte",
        "Enterocyte",
        "Tuft prog.",
        "Tuft prog. 2",
        "Tuft",
        "Goblet/EEC prog. (early)",
        "EEC prog. (mid)",
        "EEC prog. (late/Peptide)",
        "EEC (Peptide/immature)",
        "X-cell (Ghrl+)",
        "K-cell (Gip+)",
        "L/I-cell (Glp1+/Cck+)",
        "D-cell (Sst+)",
        "EC prog. (late)",
        "EC (immature)",
        "EC (mature)",
        "EC 2",
        "Goblet prog. (late)",
        "Goblet",
        "Paneth prog.",
        "Paneth",
        "unknown0",
        "Unknown",
    ]
)

In [None]:
# Colour palette for the label column, one hex colour per category, listed in
# the same order as the reordered categories.
adata_query_final.uns[f"{annotation_key}_colors"] = [
    "#d0d0d0",  # ISC
    "#eebcbc",  # TA
    "#fee0d2",  # TA prox
    "#c67a84",  # early Enterocyte
    "#bb4353",  # Enterocyte
    "#eca4d0",  # Tuft prog.
    "#df65b0",  # Tuft prog. 2
    "#e7298a",  # Tuft
    "#e1f3bf",  # Goblet/EEC prog.
    "#d9edf7",  # EEC prog
    "#85c6e6",  # EEC prog. (late/Peptide)
    "#46a8d9",  # EEC (peptide/immature)
    "#339a98",  # X-cell (Ghrl+)
    "#368cbf",  # K-cell (Gip+)
    "#5a72dd",  # L/I-cell (Glp1+/Cck+)
    "#243dae",  # D-cell (Sst+)
    "#d0d1e6",  # EC prog.
    "#aa9dce",  # EC (imm.)
    "#594495",  # EC (mature)
    "#725dae",  # EC 2
    "#fec44f",  # Goblet prog.
    "#dd894e",  # Goblet
    "#7BB98F",  # Paneth prog.
    "#238b45",  # Paneth
    "#ac9470",  # unknown0
    "#808080",  # Unknown
]

In [None]:
# Switch to the .obs column holding the unfiltered transferred labels.
annotation_key = anno + "_transferred_label_unfiltered"

In [None]:
# Put the categories of the unfiltered label column into a fixed order. The
# list must contain exactly the categories present in the column.
adata_query_final.obs[annotation_key] = adata_query_final.obs[
    annotation_key
].cat.reorder_categories(
    [
        "ISC",
        "TA",
        "TA (prox.))",
        "early Enterocyte",
        "Enterocyte",
        "Tuft prog.",
        "Tuft prog. 2",
        "Tuft",
        "Goblet/EEC prog. (early)",
        "EEC prog. (mid)",
        "EEC prog. (late/Peptide)",
        "EEC (Peptide/immature)",
        "X-cell (Ghrl+)",
        "K-cell (Gip+)",
        "L/I-cell (Glp1+/Cck+)",
        "D-cell (Sst+)",
        "EC prog. (late)",
        "EC (immature)",
        "EC (mature)",
        "EC 2",
        "Goblet prog. (late)",
        "Goblet",
        "Paneth prog.",
        "Paneth",
        "unknown0",
    ]
)

In [None]:
# Colour palette for the unfiltered label column, one hex colour per category,
# listed in the same order as the reordered categories.
adata_query_final.uns[f"{annotation_key}_colors"] = [
    "#d0d0d0",  # ISC
    "#eebcbc",  # TA
    "#fee0d2",  # TA prox
    "#c67a84",  # early Enterocyte
    "#bb4353",  # Enterocyte
    "#eca4d0",  # Tuft prog.
    "#df65b0",  # Tuft prog. 2
    "#e7298a",  # Tuft
    "#e1f3bf",  # Goblet/EEC prog.
    "#d9edf7",  # EEC prog
    "#85c6e6",  # EEC prog. (late/Peptide)
    "#46a8d9",  # EEC (peptide/immature)
    "#339a98",  # X-cell (Ghrl+)
    "#368cbf",  # K-cell (Gip+)
    "#5a72dd",  # L/I-cell (Glp1+/Cck+)
    "#243dae",  # D-cell (Sst+)
    "#d0d1e6",  # EC prog.
    "#aa9dce",  # EC (imm.)
    "#594495",  # EC (mature)
    "#725dae",  # EC 2
    "#fec44f",  # Goblet prog.
    "#dd894e",  # Goblet
    "#7BB98F",  # Paneth prog.
    "#238b45",  # Paneth
    "#ac9470",  # unknown0
]

In [None]:
# Make sure both transferred-label columns are stored as categoricals.
for label_column in (
    "cell_type_annotation_lv1_transferred_label_unfiltered",
    "cell_type_annotation_lv1_transferred_label",
):
    adata_query_final.obs[label_column] = adata_query_final.obs[
        label_column
    ].astype("category")

In [None]:
# Store the label-transfer uncertainty as a float column.
adata_query_final.obs["cell_type_annotation_lv1_transfer_uncert"] = (
    adata_query_final.obs["cell_type_annotation_lv1_transfer_uncert"].astype(float)
)

In [None]:
# UMAP panels: unfiltered transferred labels, transferred labels, and the
# transfer uncertainty; each panel gets its own title.
sc.pl.umap(
    adata_query_final,
    color=[
        anno + "_transferred_label_unfiltered",
        anno + "_transferred_label",
        anno + "_transfer_uncert",
    ],
    title=[
        "Transfered cell types on diseased query unfiltered",
        "Transfered cell types on diseased query",
        "Transfered cell types uncertainty",
    ],
    cmap=mymap,
    ncols=2,
    wspace=0.65,
    legend_fontsize=9,
    save="Umap_transfer_learned_superposed_transferred_labels2.png",
)

In [None]:
# Marker genes to plot on the UMAP. Each gene is listed once: the original
# list contained 'Dmbt1' twice, which plotted the same panel twice.
marker_genes = [
    'Lgr5', 'Olfm4', 'Dmbt1', 'Arg2', 'Sis', 'Dclk1', 'Sox4', 'Pou2f3',
    'Muc2', 'Dll1', 'Ccl25', 'Lyz1', 'Neurog3', 'Neurod1', 'Arx', 'Pax4',
    'Spdef', 'Lmx1a', 'Reg4', 'Isl1', 'Sst', 'Gcg', 'Cck', 'Gip', 'Ghrl',
    'Sct', 'Fev', 'Lbh', 'Rnase4', 'Ctse', 'Slc12a8', 'Reg1', 'Slc2a2',
    'Ada', 'Golm1', 'Tff2', 'Muc1', 'Insr',
]

In [None]:
# UMAP panels coloured by the marker genes, reading values from the
# 'log_dca_counts' layer; four panels per row.
# (Optional arguments that were previously tried: vmax="p99", wspace=0.7)
sc.pl.umap(
    adata_query_final,
    color=marker_genes,
    layer="log_dca_counts",
    cmap=mymap,
    ncols=4,
    save="umap_markers_transfered_embedding_diseased_imputed.png",
)

In [None]:
# .obs columns to visualise on the UMAP.
metadata = [
    "pretty name",
    "Project",
    "kit",
    "enriched",
    "diet",
    "condition",
    "line",
    "strain",
    "phase",
]

In [None]:
# UMAP panels coloured by the metadata columns; two panels per row.
# (Optional argument that was previously tried: vmax="p99")
sc.pl.umap(
    adata_query_final,
    color=metadata,
    cmap=mymap,
    ncols=2,
    wspace=0.75,
    legend_fontsize=9,
    save="umap_metadata_transfered_embedding_diseased.png",
)

In [None]:
# Drop a fixed set of columns from .obs in place.
adata_query_final.obs.drop(
    columns=[
        "modality (confounded with 'sequencing'",
        "Internal ID",
        "SeqID",
        "target cell number",
        "Read Length",
        "_scvi_batch",
        "_scvi_labels",
        "leiden_2",
        "leiden_3",
        "leiden_sub1",
        "sample number Minas",
    ],
    inplace=True,
)

In [None]:
# Display the AnnData summary (last expression of the cell) to inspect the object.
adata_query_final

In [None]:
# Save the AnnData object as an .h5ad file (path is relative to the working directory).
adata_query_final.write_h5ad("adata_diseased_integrated_annotated.h5ad")