# prep workspace

In [None]:
# Import relevant packages
import numpy as np
import pandas as pd
from matplotlib import rcParams
import os
import scanpy as sc
import anndata as ad

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.cm as cm

#For nice color schemes
import cmocean

#For barplots
import seaborn as sns
from statannot import add_stat_annotation

In [None]:
# Import scVI
import scvi
from scvi.model.utils import mde

# 40 is the numeric value of logging.ERROR in the standard library; presumably
# this limits scvi-tools output to errors -- confirm against the scvi.settings docs
scvi.settings.verbosity = 40

In [None]:
# Work from the GeoMx project directory so that relative paths used later in the
# notebook (e.g. 'GeoMx_count_corrected.h5ad') are resolved against it
PROJECT_DIR = '/hpc/group/goldsteinlab/Python/ONB/GeoMx'
os.chdir(PROJECT_DIR)

In [None]:
# Let pandas show up to 100 columns and 100 rows before truncating a dataframe
for option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option, 100)

# Default font size for every matplotlib figure
plt.rcParams['font.size'] = 15

In [None]:
# IPython line magic: render matplotlib figures inline, below the cell that draws them
%matplotlib inline

# read in data

First, read in the anndata object from GeoMx pre-processing (R pipeline, notebook 9). This is important because low-quality ROIs have already been filtered out of this object using standardized, established methods. We will then filter the count-corrected anndata object so that it only contains these high-quality ROIs.

In [None]:
# QC-filtered GeoMx object from the R pipeline; further down, its obs_names decide
# which ROI segments of the count-corrected object are kept
adata=sc.read_h5ad('/hpc/group/goldsteinlab/R/Working_directory/Nanostring/GeoMx_82_ROI.h5ad')

In [None]:
# Metadata table for the ROI segments; its columns are copied into adata.obs in the next cell
df_meta=pd.read_csv('/hpc/group/goldsteinlab/R/Working_directory/Nanostring/Geomx_metadata.csv')

In [None]:
# ensure correct .obs
# Each metadata column is copied into adata.obs as a plain array, i.e. by
# position rather than by index label.
# NOTE(review): this assumes df_meta rows are in the same order as adata.obs;
# nothing here verifies it -- confirm against the R export.

# metadata column -> name of the adata.obs column it is stored under
meta_to_obs = {
    'segment': 'segment',
    'ROI_name': 'ROI',
    'Scan_ID': 'Slide',
    'area': 'area',
    'nuclei': 'nuclei',
    'Tumor': 'tumor',
    'Grade': 'grade',
    'Grade_hi_low': 'grade_hi_low',
    'orig.ident': 'obs_name',
}
for meta_col, obs_col in meta_to_obs.items():
    adata.obs[obs_col] = np.asanyarray(df_meta[meta_col])

# Strip a trailing '.dcc' extension. The dot is escaped so that only a literal
# '.' matches; unescaped, the pattern would also strip e.g. 'Xdcc' from a name.
adata.obs['obs_name'] = adata.obs['obs_name'].str.replace(r'\.dcc$', '', regex=True)

# make sure .obs_names are in list format
adata.obs_names = adata.obs['obs_name'].tolist()

Next, read in the count-corrected anndata object.

In [None]:
# Relative path: resolved against the working directory set with os.chdir above
adata_cc = sc.read_h5ad('GeoMx_count_corrected.h5ad')

In [None]:
# Restrict the count-corrected object to the ROI segments that passed QC

# obs_names present in both objects (i.e. ROI segments found in each anndata)
common_obs_names = pd.Index.intersection(adata_cc.obs_names, adata.obs_names)

# keep only those observations, stored as an independent copy rather than a view
adata_cc = adata_cc[common_obs_names, :].copy()

In [None]:
# From here on `adata` names the filtered count-corrected object (the same object
# as `adata_cc`, not a copy); the QC object loaded earlier is no longer bound to it
adata = adata_cc

In [None]:
# check to make sure that there are 82; object now contains only properly QC filtered ROI segments
# (bare expression: the AnnData summary is shown as the cell output)
adata

# gene set score violin plots

In [None]:
# The plots in this section do not include normal tissue: drop the ROI segments
# taken from areas of normal OE

is_normal = adata.obs['Grade_hi_low'].isin(['Normal'])
to_keep = ~is_normal
# Rebind adata to an independent copy holding only the retained segments
adata = adata[to_keep].copy()

In [None]:
# Some plots are OE specific; for those the CD45+ ROI segments can be dropped

is_cd45 = adata.obs['segment'].isin(['CD45'])
to_keep = ~is_cd45
# Rebind adata to an independent copy holding only the retained segments
adata = adata[to_keep].copy()

In [None]:
# define module scores
# Gene lists behind each module score (cell type signatures)

# individual cell-type signatures
iOSN = 'OLIG2 GNG8 EBF4 TUBB3'.split()
INP = 'LHX2 EBF1 SOX11 NEUROD1'.split()
GBC = 'HES6 KIT CXCR4 ASCL1 SOX2'.split()

# broad programs; the neuronal list is the iOSN genes plus three further markers
non_neuronal = 'KRT8 KRT18 SOX9 POU2F3 FOXI1'.split()
neuronal = iOSN + ['CHGA', 'SYP', 'INSM1']

# immune signatures
myeloid = 'CD68 C1QA C1QB C1QC'.split()
lymphoid = 'CD3G CD3E CD4 CD8A MS4A1'.split()

In [None]:
# set .X to corrected quantile_normalized layer
# (done before the scoring cell below so that the module scores are computed on
# normalized values; .X now refers to the layer's array, no copy is made here)
adata.X = adata.layers['corrected_quantile_norm']

In [None]:
# Add one module score per gene list to adata.obs

# score name (adata.obs column) -> gene list defined in the cell above
module_gene_sets = {
    'iOSN_enriched': iOSN,
    'INP_enriched': INP,
    'GBC_enriched': GBC,
    'non_neuronal_enriched': non_neuronal,
    'neuronal_enriched': neuronal,
    'myeloid_enriched': myeloid,
    'lymphoid_enriched': lymphoid,
}

for score_name, gene_list in module_gene_sets.items():
    # use_raw=False: always score on .X (set to the quantile-normalized layer in
    # the previous cell), even if adata.raw happens to be populated. This matches
    # the RPM/RPMA scoring calls later in the notebook.
    sc.tl.score_genes(adata, gene_list, score_name=score_name, use_raw=False)

In [None]:
# create df with scores
# Pull the module scores and the grouping columns used for plotting out of the
# scored object. This must be `adata`: no object named `adata_f` is created
# anywhere above, so referencing it fails on a fresh kernel.
df_score = sc.get.obs_df(adata, keys=['segment', 'iOSN_enriched', 'INP_enriched',
                                      'GBC_enriched', 'non_neuronal_enriched', 'neuronal_enriched',
                                      'myeloid_enriched', 'lymphoid_enriched',
                                      'Grade_hi_low', 'Grade', 'Tumor'])

In [None]:
# aesthetic changes for plotting: swap these labels for the spelling used on the figures
display_labels = {'Tuj1': 'TUBB3', 'Ki-67': 'KI-67'}
for old_label, new_label in display_labels.items():
    df_score = df_score.replace(old_label, new_label)

In [None]:
# Plot Violinplot: one module score per ROI segment type, split by tumor grade

score='myeloid_enriched'

# df_score holds the grade under 'Grade_hi_low' (the key requested from
# sc.get.obs_df); the lower-case spelling is not a column of df_score
grade_col = 'Grade_hi_low'
segment_order = ['KI-67', 'TUBB3', 'negative']
grade_order = ['Low', 'High']

fig, ax = plt.subplots(figsize=(5,5))
# ci / capsize are not violinplot parameters, so they are not passed
ax = sns.violinplot(data=df_score, x='segment', y=score, hue=grade_col, ax=ax,
                    palette=['tab:blue', 'tab:orange'], inner=None,
                    order=segment_order, hue_order=grade_order)
# Individual ROI segments as black points; the same order/hue_order as the violins
# keeps every point on top of its own violin
sns.stripplot(data=df_score, x='segment', y=score, hue=grade_col,
              dodge=True, alpha=1, ax=ax, palette=['black', 'black'], size=4,
              order=segment_order, hue_order=grade_order)
sns.despine()
plt.xticks(rotation=0)
ax.set_xlabel('ROI Segment')

# Specify legend
# NOTE(review): the first two handles presumably belong to the violins and the
# last two to the strip plot, so [2:] shows the (black) strip markers; use
# handles[:2] if the coloured violin patches are wanted -- confirm.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles[2:], labels[2:], title='ONB Grade', bbox_to_anchor=(1, 1.02), loc='upper left')

# Stats: Low vs High grade within each segment type
ax, test_results = add_stat_annotation(
    ax, data=df_score, x='segment', y=score, hue=grade_col,
    box_pairs=[(('KI-67', 'Low'), ('KI-67', 'High')),
               (('TUBB3', 'Low'), ('TUBB3', 'High')),
               (('negative', 'High'), ('negative', 'Low'))],
    test='Mann-Whitney', text_format='star', loc='outside', verbose=2,
    order=segment_order, hue_order=grade_order)

# generation of unbiased tumor clusters

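Use the same `adata` as in the previous section (derived from the QC-filtered object with 82 ROI segments). Note that the previous section already removed the normal and CD45+ segments and set `.X` to the normalized layer; the cells below repeat the filtering and reset `.X` to the raw corrected counts.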

In [None]:
# Tumor clusters are the target here, so CD45+ segments and normal-tissue ROIs
# are removed, one filter at a time (adata is re-copied after each)

# 1) CD45 segments
is_cd45 = adata.obs['segment'].isin(['CD45'])
to_keep = ~is_cd45
adata = adata[to_keep].copy()

# 2) ROIs graded as normal
is_normal = adata.obs['Grade_hi_low'].isin(['Normal'])
to_keep = ~is_normal
adata = adata[to_keep].copy()

In [None]:
# set .X to raw count corrected layer since scvi requires raw counts
# (replaces the quantile-normalized values that .X was pointed at in the
# gene set score section)
adata.X = adata.layers['raw_corrected']

In [None]:
# Flag highly variable genes (HVG) with Scanpy; subset=False keeps every gene in adata
# Caveat from the original analysis: if you run this with a batch_key that has
# few cells, you will get a b'reciprocal condition number error
hvg_kwargs = dict(n_top_genes=3000, flavor="seurat_v3", subset=False)
sc.pp.highly_variable_genes(adata, **hvg_kwargs)

In [None]:
#Calculate Poisson gene selection

df_poisson = scvi.data.poisson_gene_selection(
    adata, n_top_genes=3000, batch_key="Tumor", inplace=False
)

# Only the last expression of a cell is rendered automatically, so the two
# intermediate tables are shown explicitly (display is available as a builtin
# inside IPython/Jupyter kernels)
display(df_poisson[df_poisson.highly_variable].sort_values('prob_zero_enrichment_rank'))

# Agreement between the Poisson selection and the Scanpy HVG flags
display(pd.crosstab(df_poisson.highly_variable, adata.var.highly_variable))

is_hvg = df_poisson.highly_variable

# Keep the full selection table alongside the genes
adata.varm['df_poisson']= df_poisson

# NOTE(review): adata_query (genes restricted to the Poisson selection) is not
# referenced by the cells that follow -- the scVI model below is set up on the
# full `adata`. Confirm whether that is intended.
adata_query = adata[:, is_hvg].copy()
print(adata_query)

In [None]:
# Register the fields scVI should use, then build the model

# 'Tumor' is used as the batch; 'slide' enters as a categorical covariate and
# 'area' / 'nuclei' as continuous covariates.
# NOTE(review): the metadata cell near the top stored the slide under 'Slide'
# (capital S) on the QC object; confirm that the count-corrected object really
# has a lower-case 'slide' column in .obs.
# NOTE(review): the model is set up on the full `adata`, not on the
# Poisson-selected `adata_query` -- confirm this is intended.
setup_kwargs = dict(
    batch_key='Tumor',
    categorical_covariate_keys=['slide'],
    continuous_covariate_keys=['area', 'nuclei'],
)
scvi.model.SCVI.setup_anndata(adata, **setup_kwargs)

model = scvi.model.SCVI(adata, gene_likelihood="nb")

# Show which fields were registered
model.view_anndata_setup()

In [None]:
# Train scVI
# Be sure a GPU is enabled before running this cell

# Options handed to model.train(): at most 500 epochs, early stopping with a
# patience of 20, plus progress bar / model summary / checkpointing switches
train_kwargs = {
    'max_epochs': 500,
    'early_stopping': True,
    'early_stopping_patience': 20,
    'enable_model_summary': True,
    'enable_progress_bar': True,
    'enable_checkpointing': True,
}

model.train(**train_kwargs)

In [None]:
# Training vs validation ELBO on shared axes (the first training entry is skipped)
history = model.history
train_elbo = history['elbo_train'][1:]
test_elbo = history['elbo_validation']

ax = train_elbo.plot()
test_elbo.plot(ax=ax)

In [None]:
# Embed and cluster the ROI segments using the trained scVI model

# Latent representation from the model, stored in .obsm for the steps below
latent = model.get_latent_representation()
latent_key = "X_scVI_1.1"
adata.obsm[latent_key] = latent

# Neighbor graph computed on the scVI latent space, then UMAP
sc.pp.neighbors(adata, n_neighbors=10, use_rep=latent_key)
sc.tl.umap(adata, min_dist=1.0)

# Leiden clustering based on those neighbors
sc.tl.leiden(adata, resolution=0.6, key_added="leiden_scVI_1.1")

In [None]:
# assess batch effects: one UMAP panel per annotation
umap_colors = ['Tumor', 'segment', 'Grade_hi_low', 'leiden_scVI_1.1']
sc.pl.umap(
    adata,
    color=umap_colors,
    cmap="cmo.matter",
    s=800,
    vmax="p99.99",
    frameon=False,
    save=False,
)

In [None]:
# Quality of the ROI segments per cluster: QC metrics are added to adata in
# place, then two of them are shown as violins grouped by Leiden cluster
sc.pp.calculate_qc_metrics(adata, inplace=True, log1p=False, percent_top=None)

qc_keys = ['n_genes_by_counts', 'total_counts']
sc.pl.violin(adata, qc_keys, groupby='leiden_scVI_1.1', jitter=0.4, multi_panel=True)

In [None]:
# Mean total_counts of the ROI segments in each Leiden cluster
counts_by_cluster = adata.obs.groupby('leiden_scVI_1.1')['total_counts']
average_counts_per_cluster = counts_by_cluster.mean()

# Printed so it can be compared across clusters
print(average_counts_per_cluster)

In [None]:
# Can check expression of various marker genes in a heatmap across leiden clusters

# first, for heatmap plotting purposes, create log normalized layer
# (log1p is applied to a copy, so 'corrected_quantile_norm' itself is left unchanged)
adata.layers["log_corrected_quantile_norm"] = adata.layers['corrected_quantile_norm'].copy()
sc.pp.log1p(adata, layer='log_corrected_quantile_norm')


# gene list
genes = ['EZH2', 'KRT8', 'KRT18', 'MKI67', 'RUNX1', 'RUNX1T1', 'LHX2', 'NEUROD1', 'KIT',
         'ASCL1', 'MYC', 'CHGB', 'OLFM1', 'SOX11', 'SOX9', 'SOX2', 'TOP2A', 'DLL3',
         'SEZ6', 'POU2F3', 'FOXI1']

# Extract data for heatmap (rows: ROI segments, columns: genes)
df_heatmap = pd.DataFrame(adata[:, genes].layers['log_corrected_quantile_norm'], columns=genes, index=adata.obs_names)

# Cluster label of every ROI segment as a plain integer array
cluster_per_roi = adata.obs['leiden_scVI_1.1'].astype(int).to_numpy()

# Reorder rows by leiden cluster; the stable sort keeps the existing row order
# within each cluster
leiden_order = np.argsort(cluster_per_roi, kind='stable')
df_heatmap_ordered = df_heatmap.iloc[leiden_order]

# Cluster labels in the same (sorted) row order, as stored and as integers
leiden_labels = adata.obs['leiden_scVI_1.1'].values[leiden_order]
leiden_labels_int = cluster_per_roi[leiden_order]

# One colour per cluster, looked up by cluster label instead of by list
# position, so labels that do not run 0..k-1 cannot index past the palette
cluster_ids = np.unique(leiden_labels_int)
palette = sns.color_palette('tab10', n_colors=len(cluster_ids))
cluster_colors = dict(zip(cluster_ids, palette))

# Plot clustermap: genes (columns) are clustered, rows keep the leiden ordering
sns.set(font_scale=0.8)  # Adjust font size for legend
clustermap = sns.clustermap(df_heatmap_ordered, cmap='viridis', method='ward', col_cluster=True, row_cluster=False,
                            row_colors=[cluster_colors[c] for c in leiden_labels_int], figsize=(10, 8))

# Create legend from the same label -> colour mapping as the row colours
legend_labels = [f'Cluster {c}' for c in cluster_ids]
legend_handles = [plt.Line2D([0], [0], marker='o', color=cluster_colors[c], label=label, markersize=10)
                  for c, label in zip(cluster_ids, legend_labels)]
legend = plt.legend(handles=legend_handles, title='Leiden Clusters', loc='upper right', bbox_to_anchor=(11, 1))

# Create colorbar
cbar = clustermap.ax_heatmap.collections[0].colorbar
cbar.set_label('Expression Level', rotation=270, labelpad=15)

# Adjust the position of the color bar legend
cbar.ax.set_position([1.05, 0.4, 0.02, 0.2])  # Adjust the values as needed

# Show the plot
plt.show()

In [None]:
# Drop Leiden clusters that combine a low average count with little or no
# expression of the tumor markers shown in the heatmap.
# In this example that is cluster 2, so clusters 0 and 1 are retained.
clusters_to_keep = ['0', '1']
to_keep = adata.obs['leiden_scVI_1.1'].isin(clusters_to_keep)
# Rebind adata to an independent copy holding only the retained clusters
adata = adata[to_keep].copy()

# For plotting purposes, relabel clusters "0" and "1" as "1" and "2"
new_cluster_names = ['1', '2']
adata.rename_categories('leiden_scVI_1.1', new_cluster_names)

# unbiased cluster plots

In [None]:
# Stacked bar chart to assess population proportions

# set seaborn style
sns.set(style="white")

# One row per observation: its Leiden cluster and its grade
df_stacked_bar = pd.DataFrame({'Leiden Cluster': adata.obs['leiden_scVI_1.1'].values,
                               'Grade_hi_low': adata.obs['Grade_hi_low'].values})

# Cross-tab normalised per row: fraction of each grade within a cluster
cross_tab = pd.crosstab(df_stacked_bar['Leiden Cluster'], df_stacked_bar['Grade_hi_low'], normalize='index')
# Make sure both plotted grades exist as columns even if one of them does not
# occur in the data (it is then drawn as 0)
cross_tab = cross_tab.reindex(columns=['High', 'Low'], fill_value=0)
plot_df = cross_tab.reset_index()

# Plot on explicit axes: 'High' bars first, 'Low' bars stacked on top of them
fig, ax = plt.subplots(figsize=(2, 6))
sns.barplot(data=plot_df, x='Leiden Cluster', y='High', color='tab:red', label='High', ax=ax)
sns.barplot(data=plot_df, x='Leiden Cluster', y='Low', color='tab:green',
            bottom=plot_df['High'].to_numpy(), label='Low', ax=ax)

# Labels
ax.set_title('Proportion of Cells in Leiden Clusters by Grade')
ax.set_xlabel('Leiden Cluster')
ax.set_ylabel('Proportion')

# Set y-axis limit to end at 1
ax.set_ylim(0, 1)

# Single legend, placed outside of the plot
ax.legend(title='Grade_hi_low', bbox_to_anchor=(1.05, 1), loc='upper left')

# Show the plot
plt.show()

In [None]:
# generate enriched gene lists per leiden cluster
sc.tl.rank_genes_groups(adata, 'leiden_scVI_1.1', method='wilcoxon', layer='corrected_quantile_norm', use_raw=False)

# First 100 ranked gene names per cluster (one column per cluster). The table has
# to be bound to a name: the next statement reads `top_genes`.
top_genes = pd.DataFrame(adata.uns['rank_genes_groups']['names']).head(100)

# Flatten to a single list; repeated names are dropped (first occurrence kept)
# so that each gene appears once in the heatmap built from this list
top_genes_list = list(dict.fromkeys(top_genes.values.flatten().tolist()))

In [None]:
# plot heatmap

# genes (using top genes per cluster)
genes = top_genes_list

# extract df (rows: ROI segments, columns: genes)
df_heatmap = pd.DataFrame(adata[:, genes].layers['log_corrected_quantile_norm'], columns=genes, index=adata.obs_names)

# Cluster label of every ROI segment as a plain integer array
cluster_per_roi = adata.obs['leiden_scVI_1.1'].astype(int).to_numpy()

# reorder rows based on leiden clusters; the stable sort keeps the existing row
# order within each cluster
leiden_order = np.argsort(cluster_per_roi, kind='stable')
df_heatmap_ordered = df_heatmap.iloc[leiden_order]

# cluster labels for each ROI in the same (sorted) row order, as stored and as integers
leiden_labels = adata.obs['leiden_scVI_1.1'].values[leiden_order]
leiden_labels_int = cluster_per_roi[leiden_order]

# One colour per cluster, looked up by cluster label. The clusters were renamed
# to "1" and "2" above, so using the label as a list position would run past
# the end of a two-colour palette.
cluster_ids = np.unique(leiden_labels_int)
palette = sns.color_palette('tab10', n_colors=len(cluster_ids))
cluster_colors = dict(zip(cluster_ids, palette))

# plot: genes (columns) are clustered, rows keep the leiden ordering
sns.set(font_scale=0.8)  # Adjust font size for legend
clustermap = sns.clustermap(df_heatmap_ordered, cmap='viridis', method='ward', col_cluster=True, row_cluster=False,
                            row_colors=[cluster_colors[c] for c in leiden_labels_int], figsize=(10, 8))

# legend for leiden clusters, built from the same label -> colour mapping as the row colours
legend_labels = [f'Cluster {c}' for c in cluster_ids]
legend_handles = [plt.Line2D([0], [0], marker='o', color=cluster_colors[c], label=label, markersize=10)
                  for c, label in zip(cluster_ids, legend_labels)]
legend = plt.legend(handles=legend_handles, title='Leiden Clusters', loc='upper right', bbox_to_anchor=(11, 1))

# color bar
cbar = clustermap.ax_heatmap.collections[0].colorbar
cbar.set_label('Expression Level', rotation=270, labelpad=15)

# Adjust positioning
cbar.ax.set_position([1.05, 0.4, 0.02, 0.2])  # Adjust the values as needed

# show
plt.show()

In [None]:
# Per-gene violin plots: build one table with expression values and metadata

# .X must hold the normalized layer, since the expression values are read from it
adata.X = adata.layers['corrected_quantile_norm']

# genes that can be queried in the plotting cell below
genes = [
    'KRT8', 'KRT18', 'RUNX1', 'RUNX1T1', 'EZH2', 'KIT', 'POU2F3', 'FOXI1',
    'TUBB3', 'MKI67', 'ASCL1', 'CXCR4', 'NEUROD1', 'EBF1', 'SOX11', 'GAP43',
    'LHX2', 'OLIG2', 'GNG8', 'SOX2', 'ERMN', 'KITLG', 'LUM', 'DCN', 'RUNX2',
    'KRT1', 'KRT2', 'KRT4', 'KRT5', 'KRT7', 'KRT9', 'KRT10',
    'KRT12', 'KRT13', 'KRT14', 'KRT15', 'KRT16', 'KRT17', 'KRT19',
    'MUC1', 'MUC5AC', 'UCHL1', 'ENO2', 'CRMP1', 'DLL3', 'SSTR2', 'SEZ6', 'HES1',
    'MYC', 'YAP1', 'CFTR', 'GPX6', 'MYCN', 'INSM1', 'TOP2A', 'NEURL1', 'PLCH2',
    'SYT7', 'DLK1', 'MEX3A', 'KDM2B', 'MYCL', 'BCL2', 'GRHL1', 'CHGA', 'CHGB',
    'GRP', 'SYP',
]

# expression of those genes per ROI segment, joined with all of adata.obs
gene_expression = sc.get.obs_df(adata, genes)
df_gene1 = gene_expression.join(adata.obs)

In [None]:
# choose gene from list above
gene='UCHL1'

# clusters shown on the x axis, in plotting order
cluster_order = ['1', '2']

# set style
sns.set(style="white")

# plot figure: one violin per Leiden cluster
fig, ax = plt.subplots(figsize=(2,4))
# ci / capsize are not violinplot parameters, so they are not passed
ax = sns.violinplot(data=df_gene1, x='leiden_scVI_1.1', y=gene, ax=ax,
                    inner=None, order=cluster_order, palette='tab10')

# individual ROI segments as black points (no hue, so no dodge is needed)
sns.stripplot(data=df_gene1, x="leiden_scVI_1.1", y=gene,
              alpha=1, ax=ax, color='black', size=4, order=cluster_order)

sns.despine()
plt.xticks(rotation=0)
# the x axis shows Leiden clusters, not ROI segment types
ax.set_xlabel('Leiden cluster')

# stats: cluster 1 vs cluster 2
ax, test_results = add_stat_annotation(ax, data=df_gene1, x='leiden_scVI_1.1', y=gene, box_pairs=[('1', '2')],
                                       test='Mann-Whitney', text_format='star', loc='outside', verbose=2,
                                       order=cluster_order)

To calculate gene set scores derived from mouse RPM and RPMA ONB models:

In [None]:
# Read in humanized gene lists derived from edgeR enriched genes for RPM and RPMA tumors
# For each file the 'x' column is read and whitespace around every entry is stripped.
# The column is used directly as a Series: calling .squeeze() on it changes
# nothing for multi-row files and turns a one-row file into a bare string, on
# which the .str accessor is not available.

#RPM
RPM_up_list = pd.read_csv('/hpc/group/goldsteinlab/R/Working_directory/mouse_edgeR_RPM_up_human_versions.csv')['x']
RPM_up_targets = RPM_up_list.str.strip().to_list()

#RPMA
RPMA_up_list = pd.read_csv('/hpc/group/goldsteinlab/R/Working_directory/mouse_edgeR_RPMA_up_human_versions.csv')['x']
RPMA_up_targets = RPMA_up_list.str.strip().to_list()

In [None]:
# keep the first 100 entries of each list
# (presumably the lists are already ranked, best first -- TODO confirm)
n_top = 100
RPM_up_targets_f = RPM_up_targets[:n_top]
RPMA_up_targets_f = RPMA_up_targets[:n_top]

In [None]:
# Score the RPM and RPMA gene sets; each score is added to the anndata object
# .X has to be set to the normalized layer before this cell is run
score_inputs = (
    ('RPM_ONB_f', RPM_up_targets_f),
    ('RPMA_ONB_f', RPMA_up_targets_f),
)
for score_name, gene_list in score_inputs:
    sc.tl.score_genes(adata, gene_list, score_name=score_name, use_raw=False)

In [None]:
# Table with the two gene set scores and the Leiden cluster, for plotting
# (this rebinds df_score, replacing the module-score table built earlier)
score_keys = ['RPM_ONB_f', 'RPMA_ONB_f', 'leiden_scVI_1.1']
df_score = sc.get.obs_df(adata, keys=score_keys)

In [None]:
# column of df_score to plot (a gene set score here, despite the variable name)
gene='RPMA_ONB_f'

# clusters shown on the x axis, in plotting order
cluster_order = ['1', '2']

# Set Seaborn style to plain
sns.set(style="white")

# plot violinplot: one violin per Leiden cluster
fig, ax = plt.subplots(figsize=(3,4))
# ci / capsize are not violinplot parameters, so they are not passed
ax = sns.violinplot(data=df_score, x='leiden_scVI_1.1', y=gene, ax=ax,
                    inner=None, order=cluster_order, palette='tab10')

# individual ROI segments as black points (no hue, so no dodge is needed)
sns.stripplot(data=df_score, x="leiden_scVI_1.1", y=gene,
              alpha=1, ax=ax, color='black', size=4, order=cluster_order)

sns.despine()
plt.xticks(rotation=0)
# the x axis shows Leiden clusters, not ROI segment types
ax.set_xlabel('Leiden cluster')

# stats: cluster 1 vs cluster 2
ax, test_results = add_stat_annotation(ax, data=df_score, x='leiden_scVI_1.1', y=gene, box_pairs=[('1', '2')],
                                       test='Mann-Whitney', text_format='star', loc='outside', verbose=2,
                                       order=cluster_order)