# prep environment

In [None]:
#Import relevant packages
import os

import numpy as np
import pandas as pd
import scanpy as sc

import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib import gridspec
from matplotlib.lines import Line2D

#For nice color schemes
import cmocean

#For barplots
import seaborn as sns

In [None]:
#Import scVI
import scvi
from scvi.model.utils import mde

# Set the scvi-tools logging verbosity to 40 (the numeric value of Python's logging.ERROR level)
scvi.settings.verbosity = 40

In [None]:
# Default matplotlib font size for every figure in this notebook
plt.rcParams['font.size'] = 20

In [None]:
# Make the project directory the working directory so the relative
# file paths used in later cells resolve against it
project_dir = '/hpc/group/goldsteinlab/Python'
os.chdir(project_dir)

In [None]:
# Let pandas display up to 100 columns and 100 rows before truncating a dataframe
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 100)

In [None]:
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline

# read in datasets

First dataset from Zunitch et al. 2023 (GSE166612)

In [None]:
# Load the 10x Cell Ranger counts matrix stored under 'Zunitch_ONB/'.
# Genes are indexed by symbol; cache=True lets scanpy reuse a cached copy on re-runs.
adata_ONB_A = sc.read_10x_mtx(
    'Zunitch_ONB/',
    var_names='gene_symbols',
    cache=True,
)

In [None]:
#Add sample-level metadata to adata_ONB_A (the same value for every cell)
onb_a_metadata = {
    'orig_ident': 'ONB_A',
    'source': 'Zunitch',
    'patient': 'ONB_A',
    'cond': 'ONB',
    'orig_patients': 'ONB_A',
}
for column, value in onb_a_metadata.items():
    adata_ONB_A.obs[column] = value

In [None]:
#QC metrics (no cells are removed in this cell)
# Flag mitochondrial genes (symbols starting with 'MT-') in the boolean var column 'mito'
onb_a_is_mito = adata_ONB_A.var_names.str.startswith('MT-')
adata_ONB_A.var['mito'] = onb_a_is_mito

# Adds per-cell and per-gene QC columns in place, including pct_counts_mito
sc.pp.calculate_qc_metrics(
    adata_ONB_A,
    qc_vars=['mito'],
    percent_top=None,
    log1p=False,
    inplace=True,
)

In [None]:
# QC overview for ONB_A: violin plots of the three metrics, then total counts
# against mitochondrial percentage and against detected genes
onb_a_qc_metrics = ['n_genes_by_counts', 'total_counts', 'pct_counts_mito']
sc.pl.violin(adata_ONB_A, onb_a_qc_metrics, jitter=0.4, multi_panel=True)
for y_metric in ('pct_counts_mito', 'n_genes_by_counts'):
    sc.pl.scatter(adata_ONB_A, x='total_counts', y=y_metric)

The next dataset is from this paper (Finlay & Ireland et al.).

In [None]:
# Load the second ONB sample from its saved AnnData (.h5ad) file
onb_b_path = 'H2023_6_multivi_1.3.h5ad'
adata_ONB_B = sc.read_h5ad(onb_b_path)

In [None]:
# Add sample-level metadata to adata_ONB_B (the same value for every cell)
onb_b_metadata = {
    'source': 'Finlay_Ireland',
    'patient': 'ONB_B',
    'orig_patients': 'ONB_B',
    'orig_ident': 'ONB_B',
    'cond': 'ONB',
}
for column, value in onb_b_metadata.items():
    adata_ONB_B.obs[column] = value

In [None]:
#QC metrics (no cells are removed in this cell)
# Flag mitochondrial genes (symbols starting with 'MT-') in the boolean var column 'mito'
onb_b_is_mito = adata_ONB_B.var_names.str.startswith('MT-')
adata_ONB_B.var['mito'] = onb_b_is_mito

# Adds per-cell and per-gene QC columns in place, including pct_counts_mito
sc.pp.calculate_qc_metrics(
    adata_ONB_B,
    qc_vars=['mito'],
    percent_top=None,
    log1p=False,
    inplace=True,
)

In [None]:
# QC overview for ONB_B: violin plots of the three metrics, then total counts
# against mitochondrial percentage and against detected genes
onb_b_qc_metrics = ['n_genes_by_counts', 'total_counts', 'pct_counts_mito']
sc.pl.violin(adata_ONB_B, onb_b_qc_metrics, jitter=0.4, multi_panel=True)
for y_metric in ('pct_counts_mito', 'n_genes_by_counts'):
    sc.pl.scatter(adata_ONB_B, x='total_counts', y=y_metric)

The next dataset is an atlas of all normosmic patients from Durante et al. 2020 (GSE139522) and Oliva et al. 2022 (GSE184117). This atlas is derived from the human OE single-cell atlas created for Finlay et al. 2023 (Sci Transl Med). Please see the paper (PMC10317309) and the accompanying GitHub page for further details.

In [None]:
#Read in the normal human OE atlas
atlas_path = 'Human_normal_OE_only_atlas.h5ad'
adata_hu = sc.read_h5ad(atlas_path)

# concatenate datasets

In [None]:
#Concatenate the atlas with the two tumor datasets
# join="outer" keeps the union of genes; index_unique=None leaves cell barcodes unchanged
tumor_datasets = [adata_ONB_A, adata_ONB_B]
adata = adata_hu.concatenate(
    tumor_datasets,
    index_unique=None,
    join="outer",
)

In [None]:
#Calculate QC statistics on the concatenated object
# Flag mitochondrial genes (symbols starting with 'MT-') in the boolean var column 'mito'
combined_is_mito = adata.var_names.str.startswith('MT-')
adata.var['mito'] = combined_is_mito

# Adds per-cell and per-gene QC columns in place, including pct_counts_mito
sc.pp.calculate_qc_metrics(
    adata,
    qc_vars=['mito'],
    percent_top=None,
    log1p=False,
    inplace=True,
)

In [None]:
#Plot QC metrics for the concatenated object
combined_qc_metrics = ['n_genes_by_counts', 'total_counts', 'pct_counts_mito']
sc.pl.violin(adata, combined_qc_metrics, jitter=0.4, multi_panel=True)
for y_metric in ('pct_counts_mito', 'n_genes_by_counts'):
    sc.pl.scatter(adata, x='total_counts', y=y_metric)

In [None]:
#Filter cells on QC thresholds
# Keep cells with fewer than 9000 detected genes, more than 500 total counts
# and less than 40% mitochondrial counts.
# All thresholds are combined into one boolean mask and the result is copied:
# chaining three slices leaves `adata` as a view, and the later writes to
# adata.obs / adata.layers would then trigger an implicit copy with a warning.
passes_qc = (
    (adata.obs['n_genes_by_counts'] < 9000)
    & (adata.obs['total_counts'] > 500)
    & (adata.obs['pct_counts_mito'] < 40)
)
adata = adata[passes_qc.to_numpy(), :].copy()

In [None]:
#Prep for HVG and scvi
#Store log1p of each cell's total counts as an extra QC covariate
total_counts = adata.obs["total_counts"]
adata.obs["log1p_total_counts"] = np.log1p(total_counts)

#Create layers
# "counts": untouched copy of .X
adata.layers["counts"] = adata.X.copy()
# "norm": copy of .X rescaled so every cell sums to 1e4 (no log transform is applied here)
adata.layers['norm'] = adata.X.copy()
sc.pp.normalize_total(adata, target_sum=1e4, layer="norm")

# set up and train scvi model

In [None]:
#HVG via Scanpy (seurat_v3 flavor on the raw "counts" layer, computed per orig_ident batch)
#Note: if a batch_key group contains only a few cells, this call fails with a
#b'reciprocal condition number error
hvg_settings = dict(
    n_top_genes=5000,
    subset=False,          # annotate adata.var only; keep all genes
    layer="counts",
    flavor="seurat_v3",
    batch_key="orig_ident",
)
sc.pp.highly_variable_genes(adata, **hvg_settings)

In [None]:
# Per-gene mean of .X and fraction of cells in which the gene is zero.
# np.asarray(...).ravel() flattens both the 1 x n_genes matrix returned for a
# sparse .X and the 1-D array returned for a dense .X; indexing with [0]
# would reduce the dense case to a single scalar that is then silently
# broadcast to every gene.
adata.var['mean_'] = np.asarray(adata.X.mean(0)).ravel()
adata.var['frac_zero'] = 1 - np.asarray((adata.X > 0).sum(0)).ravel() / adata.shape[0]

# Scatter of zero fraction against mean (log-scaled x axis), one point per gene
fig, ax = plt.subplots(figsize=(9,6))
ax.scatter(adata.var.mean_, adata.var.frac_zero, s=1)
ax.set_xscale("log")
ax.set_xlabel("mean of .X per gene")
ax.set_ylabel("fraction of cells with zero value")
In [None]:
#Calculate Poisson gene selection (scvi-tools), batch-aware via orig_ident
# inplace=False returns the per-gene results as a dataframe instead of writing to adata
df_poisson = scvi.data.poisson_gene_selection(
    adata,
    n_top_genes=5000,
    batch_key="orig_ident",
    inplace=False,
)

# Inspection tables: selected genes ordered by rank, and overlap with the Scanpy HVG call.
# As mid-cell expressions they are evaluated but not displayed.
df_poisson[df_poisson.highly_variable].sort_values('prob_zero_enrichment_rank')
pd.crosstab(df_poisson.highly_variable, adata.var.highly_variable)

# Keep the per-gene results on the full object
adata.varm['df_poisson'] = df_poisson

# Subset to the Poisson-selected genes; this copy is what the scVI model is built on
is_hvg = df_poisson.highly_variable
adata_query = adata[:, is_hvg].copy()
print(adata_query)

In [None]:
#Set up scvi model

# Register the fields scVI reads from adata_query:
#   - raw counts from the "counts" layer
#   - pct_counts_mito as a continuous covariate
#   - orig_ident as the batch key (drop this argument to train without batch correction)
# Fix: the comma after layer="counts" was missing, which made this cell a SyntaxError.
scvi.model.SCVI.setup_anndata(
    adata_query,
    layer="counts",
    continuous_covariate_keys=["pct_counts_mito"],
    batch_key='orig_ident',
)

# Negative-binomial gene likelihood
model = scvi.model.SCVI(adata_query, gene_likelihood="nb")

# Print a summary of the registered fields
model.view_anndata_setup()

In [None]:
#Train and run scvi

#Training parameters: up to 500 epochs, stopping early after 20 epochs of patience
train_kwargs = {
    "early_stopping": True,
    "early_stopping_patience": 20,
    "enable_model_summary": True,
    "enable_progress_bar": True,
    "enable_checkpointing": True,
    "max_epochs": 500,
}

#Train the model
#Be sure a GPU is enabled before running this cell
model.train(**train_kwargs)

In [None]:
#Plot training and validation ELBO on the same axes
training_history = model.history
train_elbo = training_history['elbo_train'][1:]   # first recorded row is dropped
test_elbo = training_history['elbo_validation']

ax = train_elbo.plot()
test_elbo.plot(ax=ax)

In [None]:
#Embed and cluster cells using the trained scVI model

latent_key = "X_scVI_1.1"
leiden_key = "leiden_scVI_1.1"

#Latent representation from the model, stored on the full (all-gene) object
latent = model.get_latent_representation()
adata.obsm[latent_key] = latent

#Neighbor graph on the scVI latent space, then UMAP
sc.pp.neighbors(adata, use_rep=latent_key)
sc.tl.umap(adata, min_dist=0.5)

#Leiden clustering on that neighbor graph
sc.tl.leiden(adata, key_added=leiden_key, resolution=2.0)

In [None]:
#QC UMAPs: color the embedding by each per-cell QC metric (2 panels per row)
qc_umap_keys = [
    "n_genes_by_counts",
    "total_counts",
    "pct_counts_mito",
    "log1p_total_counts",
]
sc.pl.umap(adata, color=qc_umap_keys, cmap="cubehelix_r", s=3, ncols=2)

In [None]:
# assess batch effects: one 8x8 UMAP each for leiden cluster, sample of origin and cluster names
# each entry is (obs key, legend placement, point size)
batch_umap_panels = [
    ("leiden_scVI_1.1", "on data", 10),
    ("orig_ident", "right margin", 4),
    ("cluster_names", "right margin", 4),
]
for color_key, legend_position, point_size in batch_umap_panels:
    fig, ax = plt.subplots(figsize=(8, 8))
    sc.pl.umap(adata, color=color_key, legend_loc=legend_position, ax=ax,
               s=point_size, frameon=False, save=False)

In [None]:
# feature plots
# leiden clusters plus a panel of known genes, plotted from the "norm" layer
genes = [
    'leiden_scVI_1.1',
    'SOX9', 'ERMN',
    'GPX6', 'PLP1', 'TRPM5',
    'CFTR', 'PTPRC', 'CD68',
    'SOX2', 'DCN',
    'OLIG2', 'DCX',
]

feature_plot_settings = dict(
    use_raw=False,
    legend_loc="on data",
    color_map="cmo.matter",
    ncols=3,
    frameon=False,
    vmax="p99.5",          # cap the color scale at the 99.5th percentile
    layer="norm",
    save=False,
)
sc.pl.umap(adata, color=genes, **feature_plot_settings)

In [None]:
#Additional QC graphs per leiden cluster
# Copy the cluster labels from the full object onto adata_query
adata_query.obs['cluster'] = adata.obs["leiden_scVI_1.1"].copy()

# One wide boxen plot per metric: log1p total counts, then percent mitochondrial counts
for qc_column in ("log1p_total_counts", "pct_counts_mito"):
    fig, ax = plt.subplots(figsize=(30,6))
    sns.boxenplot(data=adata_query.obs, x="cluster", y=qc_column, ax=ax)

In [None]:
# Find marker genes for each leiden cluster (Wilcoxon test on the "norm" layer)
# Used to confirm that each cluster contains high quality cells
sc.tl.rank_genes_groups(
    adata,
    'leiden_scVI_1.1',
    method='wilcoxon',
    layer='norm',
    use_raw=False,
)

# Top 50 ranked gene names, one column per cluster
ranked_gene_names = adata.uns['rank_genes_groups']['names']
pd.DataFrame(ranked_gene_names).head(50)

In [None]:
#Identify and remove low quality clusters (e.g. doublets with high gene counts, or
#clusters with low average gene counts that do not match known marker genes)

# leiden cluster labels to drop
bad_clust = ['26']

# True for every cell that is NOT in a bad cluster
in_bad_cluster = adata.obs['leiden_scVI_1.1'].isin(bad_clust)
to_keep = ~in_bad_cluster

# Replace adata with an independent copy of the retained cells
adata = adata[to_keep].copy()

As in the creation of the mouse integration, repeat model training each time a cell cluster is removed, until no low-quality clusters remain.

In [None]:
# once all rounds of cluster removal are finished, save the integrated object
full_atlas_path = 'full_human_OE_atlas_with_2_ONBs.h5ad'
adata.write(full_atlas_path)

# subsetting out tumors

In [None]:
# Reload the saved integrated object
adata = sc.read_h5ad('full_human_OE_atlas_with_2_ONBs.h5ad')

In [None]:
# first keep only the cells that came from the two tumor samples
clust = ['ONB_A', 'ONB_B']

# True for every cell whose orig_ident is one of the tumor samples
to_keep = adata.obs['orig_ident'].isin(clust)

# Independent copy holding just those cells
adata_f = adata[to_keep].copy()

In [None]:
# next step: pick the leiden clusters that do not contain normal, overlying epithelial or stromal cells

# to make that choice visually, draw the global UMAP by leiden cluster and by cluster name,
# then the tumor-sample subset by leiden cluster
comparison_panels = [
    (adata, "leiden_scVI_1.1"),
    (adata, "cluster_names"),
    (adata_f, "leiden_scVI_1.1"),
]
for panel_adata, color_key in comparison_panels:
    fig, ax = plt.subplots(figsize=(6, 4))
    sc.pl.umap(panel_adata, color=color_key, legend_loc="on data", ax=ax,
               s=30, frameon=False, save=False)

In [None]:
# can now reliably select leiden clusters that are distinct from normal OE and respiratory cells
# for example:

# leiden clusters unique to tumor
clust=['0', '22', '44', '32', '49']

# True for every tumor-sample cell that belongs to one of those clusters
to_keep=(adata_f.obs['leiden_scVI_1.1'].isin(clust))

#Copy over to new anndata object
adata_tumor = adata_f[to_keep].copy()

# The plotting and scoring cells further down group cells by
# obs['tumor_cluster_names'] with the categories 'ONB A' and 'ONB B', but no
# cell in this notebook creates that column, so a fresh run fails there.
# NOTE(review): the labels are derived here from orig_ident on the assumption
# that they simply name the tumor of origin - confirm this matches the
# original annotation. An existing column is left untouched.
if 'tumor_cluster_names' not in adata_tumor.obs:
    tumor_labels = {'ONB_A': 'ONB A', 'ONB_B': 'ONB B'}
    adata_tumor.obs['tumor_cluster_names'] = (
        adata_tumor.obs['orig_ident'].astype(str).map(tumor_labels).astype('category')
    )

In [None]:
# save the tumor-only object
tumor_only_path = 'human_ONB_tumors_only.h5ad'
adata_tumor.write(tumor_only_path)

# plot generation

In [None]:
# to specifically label only cells from ONBs A and B on the global UMAP
adata=sc.read_h5ad('full_human_OE_atlas_with_2_ONBs.h5ad')

# Create a 1x2 grid layout (two equally wide panels).
# gridspec and Line2D come from the notebook's top import cell; this cell
# previously used both names without any import and raised NameError.
fig = plt.figure(figsize=(16, 6))
gs = gridspec.GridSpec(1, 2, width_ratios=[1, 1])

# Left panel: UMAP with only the tumor samples highlighted
ax0 = plt.subplot(gs[0])
umap_coords = adata.obsm['X_umap']
condition_A = (adata.obs['orig_ident'] == 'ONB_A').to_numpy()
condition_B = (adata.obs['orig_ident'] == 'ONB_B').to_numpy()
condition_other = ~condition_A & ~condition_B

# Draw the gray background cells first so the tumor cells are plotted on top
for mask, color, label in (
    (condition_other, 'lightgray', 'Other'),
    (condition_A, 'tab:red', 'ONB-A'),
    (condition_B, 'tab:blue', 'ONB-B'),
):
    ax0.scatter(umap_coords[mask, 0], umap_coords[mask, 1], s=2, c=color, label=label)

ax0.set_title("UMAP Plot")
ax0.axis('off')

# Custom legend handles: the scatter points (s=2) are too small to read in a legend
legend_handles = [
    Line2D([0], [0], marker='o', color='w', label='ONB-A', markerfacecolor='tab:red', markersize=8),
    Line2D([0], [0], marker='o', color='w', label='Other', markerfacecolor='lightgray', markersize=8),
    Line2D([0], [0], marker='o', color='w', label='ONB-B', markerfacecolor='tab:blue', markersize=8)
]
legend0 = ax0.legend(handles=legend_handles, loc='center left', bbox_to_anchor=(1, 0.5), title='Conditions')

# Right panel: the same UMAP colored by cluster name
# show=False keeps scanpy from rendering before the layout is finalized
ax1 = plt.subplot(gs[1])
sc.pl.umap(adata, color="cluster_names", legend_loc="on data", ax=ax1, s=4, frameon=False, show=False)

# Adjust the layout
plt.tight_layout()
plt.show()

In [None]:
# to plot gene expression matrix across ONB-A and ONB-B

# use tumors only adata object
# NOTE(review): this cell reads obs['tumor_cluster_names'] with the categories
# 'ONB A' / 'ONB B' from the loaded file - confirm the saved object carries it
adata = sc.read_h5ad('human_ONB_tumors_only.h5ad')

# work on a separate copy of the normalized layer so 'norm' itself stays untouched
adata.layers['norm_scale'] = adata.layers['norm'].copy()

# scale each gene for visualization, then clip the scaled values to the range [0, 1]
sc.pp.scale(adata, layer='norm_scale')
adata.layers['norm_scale'] = adata.layers['norm_scale'].clip(0,1)

# genes to display, in plotting order
neuronal_like_genes = ['NEUROD1', 'SOX11', 'GNG8', 'LHX2', 'CHGA', 'HES6', 'SYP', 'UCHL1', 'GRP']
non_neuronal_like_genes = ['CFTR', 'FOXI1', 'POU2F3', 'KRT8', 'KRT18', 'IGF1R',
                           'WWTR1', 'GRHL1', 'FGFR2', 'ITGA6', 'GRHL2']

# generate plot: genes as rows (swap_axes=True), one column per tumor
fig, ax = plt.subplots(figsize=(3,8))
sc.pl.matrixplot(
    adata,
    neuronal_like_genes + non_neuronal_like_genes,
    'tumor_cluster_names',
    dendrogram=False,
    cmap='Reds',
    colorbar_title='mean expression\nin group',
    save=False,
    swap_axes=True,
    layer='norm_scale',
    ax=ax,
    categories_order=['ONB A', 'ONB B'],
    vmin=0,
    vmax=1,
)

To assess RPM and RPMA gene set scores in ONB-A and ONB-B:

In [None]:
# reload the tumors only adata object
tumor_only_path = 'human_ONB_tumors_only.h5ad'
adata = sc.read_h5ad(tumor_only_path)

# point .X at the normalized layer so the gene set scores are computed on normalized data
normalized_layer = adata.layers['norm']
adata.X = normalized_layer

In [None]:
# Read in gene set lists (these are derived from top genes in RPM vs RPMA DE)

def _clean_gene_list(gene_column):
    """Squeeze the column, strip whitespace from every entry and return a plain list."""
    return gene_column.squeeze().str.strip().to_list()

#RPM: gene names are read from the 'x' column of the CSV
RPM_up_list = pd.read_csv('/hpc/group/goldsteinlab/R/Working_directory/mouse_edgeR_RPM_up_human_versions.csv')['x']
RPM_up_targets = _clean_gene_list(RPM_up_list)

#RPMA: same layout as the RPM file
RPMA_up_list = pd.read_csv('/hpc/group/goldsteinlab/R/Working_directory/mouse_edgeR_RPMA_up_human_versions.csv')['x']
RPMA_up_targets = _clean_gene_list(RPMA_up_list)

In [None]:
# keep the first 50 genes of each list
n_top_genes = 50
RPM_up_targets_f = RPM_up_targets[:n_top_genes]
RPMA_up_targets_f = RPMA_up_targets[:n_top_genes]

In [None]:
# calculate one gene set score per cell for each list (stored in adata.obs under the score name)
gene_sets_to_score = {
    'RPM_ONB_f': RPM_up_targets_f,
    'RPMA_ONB_f': RPMA_up_targets_f,
}
for score_name, gene_list in gene_sets_to_score.items():
    sc.tl.score_genes(adata, gene_list, score_name=score_name, use_raw=False)

In [None]:
# Collect both scores and the tumor label into one dataframe for plotting
score_columns = ['RPM_ONB_f', 'RPMA_ONB_f', 'tumor_cluster_names']
df_score = sc.get.obs_df(adata, keys=score_columns)

In [None]:
from statannot import add_stat_annotation

In [None]:
# plot violin plot of one gene set score per tumor, with a significance annotation
gene='RPM_ONB_f'
tumor_order = ['ONB A', 'ONB B']

# Set Seaborn style to plain
sns.set(style="white")

#mpl figure
fig, ax = plt.subplots(figsize=(3,4))

# Violin outlines, one color per tumor.
# `ci` and `capsize` were dropped: they are error-bar options of other seaborn
# plots, not violinplot parameters. Seaborn releases that accept arbitrary
# keywords here either ignore them or forward them to matplotlib, which
# rejects them.
ax = sns.violinplot(data=df_score, x='tumor_cluster_names', y=gene, ax=ax,
                    inner=None, order=tumor_order, palette=['tab:red', 'tab:blue'])

# Individual cells overlaid as small black points
sns.stripplot(data=df_score,
    x="tumor_cluster_names",
    y=gene,
    dodge=True, alpha=1, ax=ax, palette=['black', 'black'], size=2, order=tumor_order
)

# Final axis styling (applied once, after both layers are drawn); the x label is left blank
sns.despine()
plt.xticks(rotation=0)
ax.set_xlabel('')

#Stats: Mann-Whitney test between the two tumors, drawn as stars above the axes
ax, test_results=add_stat_annotation(ax, data=df_score, x='tumor_cluster_names', y=gene, box_pairs=[('ONB A', 'ONB B')],
                                     test='Mann-Whitney', text_format='star', loc='outside', verbose=2, order=tumor_order)