# prep workspace

In [None]:
#Import relevant packages
import numpy as np
import pandas as pd
from matplotlib import rcParams
import os
import scanpy as sc

import matplotlib as mpl
import matplotlib.pyplot as plt

#For nice color schemes
import cmocean

#For barplots
import seaborn as sns

import matplotlib as mpl
import matplotlib.pyplot as plt

In [None]:
#Import scVI
import scvi
from scvi.model.utils import mde

# Logging threshold for scvi-tools; 40 is the numeric value of Python's logging.ERROR level.
scvi.settings.verbosity = 40

In [None]:
# Default matplotlib font size for every figure in this notebook
plt.rcParams['font.size'] = 20

In [None]:
# Set the working directory; every relative data / figure path below resolves against it.
# The ONB_WORKDIR environment variable overrides the location so the notebook can run
# outside the original cluster. When it is unset, the original path is used.
os.chdir(os.environ.get('ONB_WORKDIR', '/hpc/group/goldsteinlab/Python/ONB'))

In [None]:
# Let pandas render up to 100 columns and 100 rows before truncating a displayed dataframe
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 100)

In [None]:
%matplotlib inline

# read data

In [None]:
# Load the 10x filtered feature-barcode matrix (.h5) of the RPM allograft sample
rpm_allograft_h5 = 'RPM_allograft_filtered_feature_bc_matrix.h5'
adata_RPM_allo = sc.read_10x_h5(rpm_allograft_h5)

In [None]:
# Sample-level metadata, broadcast to every cell of the RPM allograft
rpm_allograft_metadata = {
    'genotype': 'ONB',
    'mouse_ident': 'RPM_allograft',
    'tumor_type': 'RPM',
}
for obs_column, obs_value in rpm_allograft_metadata.items():
    adata_RPM_allo.obs[obs_column] = obs_value

In [None]:
# QC metrics for the RPM allograft
# Flag genes whose name starts with 'mt-' in .var['mito'], then compute per-cell / per-gene QC metrics
# (including the percentage of counts in the 'mito' gene set).
is_mito_gene = adata_RPM_allo.var_names.str.startswith('mt-')
adata_RPM_allo.var['mito'] = is_mito_gene
sc.pp.calculate_qc_metrics(
    adata_RPM_allo,
    qc_vars=['mito'],
    percent_top=None,
    log1p=False,
    inplace=True,
)

In [None]:
# Violin plots of the three QC metrics, then total counts against each of the other two
qc_metrics = ['n_genes_by_counts', 'total_counts', 'pct_counts_mito']
sc.pl.violin(adata_RPM_allo, qc_metrics, jitter=0.4, multi_panel=True)
for y_metric in ('pct_counts_mito', 'n_genes_by_counts'):
    sc.pl.scatter(adata_RPM_allo, x='total_counts', y=y_metric)

In [None]:
# Load the 10x filtered feature-barcode matrix (.h5) of the RPMA allograft sample
rpma_allograft_h5 = 'RPMA_allograft_filtered_feature_bc_matrix.h5'
adata_RPMA_allo = sc.read_10x_h5(rpma_allograft_h5)

In [None]:
# Sample-level metadata, broadcast to every cell of the RPMA allograft
rpma_allograft_metadata = {
    'genotype': 'ONB',
    'mouse_ident': 'RPMA_allograft',
    'tumor_type': 'RPMA',
}
for obs_column, obs_value in rpma_allograft_metadata.items():
    adata_RPMA_allo.obs[obs_column] = obs_value

In [None]:
# QC metrics for the RPMA allograft
# Flag genes whose name starts with 'mt-' in .var['mito'], then compute per-cell / per-gene QC metrics
# (including the percentage of counts in the 'mito' gene set).
is_mito_gene = adata_RPMA_allo.var_names.str.startswith('mt-')
adata_RPMA_allo.var['mito'] = is_mito_gene
sc.pp.calculate_qc_metrics(
    adata_RPMA_allo,
    qc_vars=['mito'],
    percent_top=None,
    log1p=False,
    inplace=True,
)

In [None]:
# Violin plots of the three QC metrics, then total counts against each of the other two
qc_metrics = ['n_genes_by_counts', 'total_counts', 'pct_counts_mito']
sc.pl.violin(adata_RPMA_allo, qc_metrics, jitter=0.4, multi_panel=True)
for y_metric in ('pct_counts_mito', 'n_genes_by_counts'):
    sc.pl.scatter(adata_RPMA_allo, x='total_counts', y=y_metric)

In [None]:
# Load the full mouse OE atlas that already contains the primary RPM and RPMA tumors
atlas_h5ad = 'OE_atlas_with_ONB_tumors_all_cells.h5ad'
adata = sc.read_h5ad(atlas_h5ad)

# concatenate and prep

In [None]:
# Append the two allograft samples to the atlas object.
# join="outer" is requested for the gene axis and index_unique=None is passed so that
# no suffix is added to the cell barcodes.
# NOTE(review): with index_unique=None, barcodes shared between samples would stay duplicated - confirm they are unique.
allograft_samples = [adata_RPM_allo, adata_RPMA_allo]
adata = adata.concatenate(allograft_samples, index_unique=None, join="outer")

In [None]:
# QC metrics and plots on the combined object (the thresholds are applied in the next cell)
adata.var['mito'] = adata.var_names.str.startswith('mt-')  # flag genes whose name starts with 'mt-'
sc.pp.calculate_qc_metrics(
    adata,
    qc_vars=['mito'],
    percent_top=None,
    log1p=False,
    inplace=True,
)

qc_metrics = ['n_genes_by_counts', 'total_counts', 'pct_counts_mito']
sc.pl.violin(adata, qc_metrics, jitter=0.4, multi_panel=True)
for y_metric in ('pct_counts_mito', 'n_genes_by_counts'):
    sc.pl.scatter(adata, x='total_counts', y=y_metric)

In [None]:
# Perform rough filtering
# A cell is kept only if it passes every threshold below (all bounds are exclusive).
qc_obs = adata.obs
passes_qc = (
    (qc_obs.n_genes_by_counts > 1500)
    & (qc_obs.n_genes_by_counts < 8000)
    & (qc_obs.total_counts > 2500)
    & (qc_obs.total_counts < 100000)
    & (qc_obs.pct_counts_mito < 30)
)

# Apply one combined mask and take an explicit copy. Slicing alone returns a view,
# and the following cells write new obs columns and layers into `adata`.
adata = adata[passes_qc.to_numpy(), :].copy()

In [None]:
# Prep for HVG selection and scVI

# Per-cell log1p of the total counts, stored as an obs column (the expression matrix itself is not transformed here)
adata.obs["log1p_total_counts"] = np.log1p(adata.obs["total_counts"])

# Two independent copies of X: "counts" is kept as-is, "norm" is normalised to 1e4 total counts per cell.
# NOTE(review): assumes X still holds raw counts at this point - confirm for the atlas object.
for layer_name in ("counts", "norm"):
    adata.layers[layer_name] = adata.X.copy()
sc.pp.normalize_total(adata, target_sum=1e4, layer="norm")

# scvi all cells

In [None]:
# Highly variable genes via Scanpy (seurat_v3 flavor on the "counts" layer, batched by mouse_ident).
# subset=False: only annotates adata.var, no genes are dropped here.
hvg_kwargs = dict(
    n_top_genes=3000,
    subset=False,
    layer="counts",
    flavor="seurat_v3",
    batch_key="mouse_ident",
)
sc.pp.highly_variable_genes(adata, **hvg_kwargs)

In [None]:
# Per-gene mean expression and fraction of cells with zero counts, plotted against each other.
# The [0] takes the first row of the column-wise reductions.
# NOTE(review): presumably X is a scipy sparse matrix so these come back as 1 x n_genes - confirm.
n_cells = adata.shape[0]
gene_means = np.array(adata.X.mean(0))[0]
n_cells_expressing = np.array((adata.X > 0).sum(0))[0]

adata.var['mean_'] = gene_means
adata.var['frac_zero'] = 1 - n_cells_expressing / n_cells

fig, ax = plt.subplots(figsize=(9, 6))
ax.scatter(adata.var['mean_'], adata.var['frac_zero'], s=1)
ax.set_xscale("log")

In [None]:
# Poisson gene selection (scvi-tools), batched by mouse_ident; the result is returned as a dataframe
df_poisson = scvi.data.poisson_gene_selection(
    adata, n_top_genes=3000, batch_key="mouse_ident", inplace=False
)
is_hvg = df_poisson.highly_variable

# NOTE(review): the next two expressions are not the last statement of the cell, so their
# results are computed but never rendered.
df_poisson[is_hvg].sort_values('prob_zero_enrichment_rank')
pd.crosstab(is_hvg, adata.var.highly_variable)

# Keep the full selection table on the object, then subset to the selected genes for scVI
adata.varm['df_poisson'] = df_poisson
adata_query = adata[:, is_hvg].copy()
print(adata_query)

In [None]:
# Set up the scVI model on the gene-subset object

# Register the "counts" layer with mouse_ident as the batch key and pct_counts_mito as a continuous covariate
setup_kwargs = dict(
    layer="counts",
    continuous_covariate_keys=["pct_counts_mito"],
    batch_key='mouse_ident',
)
scvi.model.SCVI.setup_anndata(adata_query, **setup_kwargs)

# Negative-binomial gene likelihood
model = scvi.model.SCVI(adata_query, gene_likelihood="nb")

model.view_anndata_setup()

In [None]:
# Train scVI

# Training parameters: at most 500 epochs, early stopping with a patience of 20, on GPU
train_kwargs = {
    "early_stopping": True,
    "early_stopping_patience": 20,
    "enable_model_summary": True,
    "enable_progress_bar": True,
    "enable_checkpointing": True,
    "max_epochs": 500,
    "use_gpu": True,
}

# use_gpu=True above: a GPU must be available to run this cell
model.train(**train_kwargs)

In [None]:
# Training vs validation ELBO curves on one axis (the first training entry is dropped by the [1:] slice)
training_history = model.history
train_elbo = training_history['elbo_train'][1:]
test_elbo = training_history['elbo_validation']

ax = train_elbo.plot()
test_elbo.plot(ax=ax)

In [None]:
# Embed and cluster the cells using the scVI latent space
scvi_rep_key = "X_scVI_all_allograft_1.1"

# The latent representation comes from the model trained on adata_query and is stored on the full-gene object
latent = model.get_latent_representation()
adata.obsm[scvi_rep_key] = latent

# Neighbor graph on the scVI representation, then UMAP
sc.pp.neighbors(adata, use_rep=scvi_rep_key)
sc.tl.umap(adata, min_dist=0.5)

# Leiden clustering on that neighbor graph
sc.tl.leiden(adata, key_added="leiden_scVI_all_allograft_1.1", resolution=2.0)

In [None]:
# UMAPs coloured by each QC metric, two panels per row
qc_umap_colors = ["n_genes_by_counts", "total_counts", "pct_counts_mito", "log1p_total_counts"]
sc.pl.umap(adata, color=qc_umap_colors, cmap="cubehelix_r", s=3, ncols=2)

In [None]:
# Assess batch effects: one 8x8 UMAP per annotation, each drawn on its own figure
umap_panels = [
    dict(color="genotype", cmap="cmo.matter", vmax="p99.99"),
    dict(color="leiden_scVI_all_allograft_1.1", legend_loc="on data"),
    dict(color="tumor_type", legend_loc="right margin", palette='tab10'),
    dict(color="cluster_names", legend_loc="right margin", palette='tab20'),
]
for panel_kwargs in umap_panels:
    fig, ax = plt.subplots(figsize=(8, 8))
    sc.pl.umap(adata, ax=ax, s=4, frameon=False, save=False, **panel_kwargs)

In [None]:
# Additional QC plots per Leiden cluster
# Copy the cluster labels onto the gene-subset object, whose obs table is used for plotting
cluster_labels = adata.obs["leiden_scVI_all_allograft_1.1"].copy()
adata_query.obs['cluster'] = cluster_labels

# log1p total counts per cluster
fig, ax = plt.subplots(figsize=(25, 6))
sns.boxenplot(x="cluster", y="log1p_total_counts", data=adata_query.obs, ax=ax)

In [None]:
# Percentage of mitochondrial counts per cluster
fig, ax = plt.subplots(figsize=(25, 6))
sns.boxenplot(x="cluster", y="pct_counts_mito", data=adata_query.obs, ax=ax)

In [None]:
# Feature plots to assess the validity of gene expression across clusters
genes = [
    'Ptprc', 'Sox9', 'Dcn', 'Lum', 'Vwf',
    'Tagln', 'Krt8', 'Krt18', 'Ascl1', 'Lhx2',
    'Neurod1', 'Kit', 'Hes6', 'Hes1', 'Ezh2',
    'Sox11', 'Insm1', 'Chga', 'Mki67', 'Top2a',
]

# Expression is read from the "norm" layer; the colour scale is capped at the 99.5th percentile
feature_plot_kwargs = dict(
    use_raw=False,
    legend_loc="on data",
    color_map="cmo.matter",
    ncols=3,
    frameon=False,
    vmax="p99.5",
    layer="norm",
    save=False,
)
sc.pl.umap(adata, color=genes, **feature_plot_kwargs)

As before for the integration of other atlases, we removed low-quality cell clusters based on gene expression patterns, average counts, and average percentage of mitochondrial counts. Each time a cluster was removed, the model was re-trained, starting from highly variable gene selection. For the RPM and RPMA allografts, it took only two iterations to remove the low-quality clusters.

# full atlas plots

In [None]:
# Palette for the UMAP that colours only the primary and allograft tumor clusters.
# One entry per cluster; the position-to-cluster mapping noted below is the one given by the author.
background = 'lightgray'
palette = (
    # BG, Fibroblast, GBC, HBC, INP, Lymphoid, MV ionocyte, MV tuft, Myeloid, OEC, Pericyte
    [background] * 11
    + [
        '#984ea3',    # RPM tumor
        '#ff7f0e',    # RPMA tumor
        'turquoise',  # RPMA allograft
        'tab:blue',   # RPM allograft
    ]
    # sus, iOSN, mOSN
    + [background] * 3
)

# UMAP by cluster name, drawn with the custom palette
fig, ax = plt.subplots(figsize=(8, 8))
sc.pl.umap(
    adata,
    color="cluster_names",
    palette=palette,
    legend_loc="right margin",
    s=4,
    frameon=False,
    save=False,
    ax=ax,
)

For optimal transport, the same code was followed as presented earlier in notebook 5, sub-section "Optimal transport".

# scvi tumors only

In [None]:
# Subset the global adata object created above to the tumor clusters only

# Cluster names to retain (primary tumors and allografts)
keep = ['RPM', 'RPMA', 'RPM_GBC_Allograft', 'RPMA_GBC_Allograft']

# Boolean mask over cells: True where the cell's cluster name is in `keep`
to_keep = adata.obs['cluster_names'].isin(keep)

# Materialise the selection as a new, independent AnnData object (rebinds `adata`)
adata = adata[to_keep].copy()

In [None]:
# The scVI model is re-trained on the tumor subset to get a new embedding; start with gene selection.

# Highly variable genes via Scanpy (seurat_v3 flavor on the "counts" layer, batched by mouse_ident).
# subset=False: only annotates adata.var, no genes are dropped here.
hvg_kwargs = dict(
    n_top_genes=10000,
    subset=False,
    layer="counts",
    flavor="seurat_v3",
    batch_key="mouse_ident",
)
sc.pp.highly_variable_genes(adata, **hvg_kwargs)

In [None]:
# Poisson gene selection (scvi-tools) on the tumor subset, batched by mouse_ident
df_poisson = scvi.data.poisson_gene_selection(
    adata, n_top_genes=10000, batch_key="mouse_ident", inplace=False
)
is_hvg = df_poisson.highly_variable

# NOTE(review): the next two expressions are not the last statement of the cell, so their
# results are computed but never rendered.
df_poisson[is_hvg].sort_values('prob_zero_enrichment_rank')
pd.crosstab(is_hvg, adata.var.highly_variable)

# Keep the full selection table on the object, then subset to the selected genes for scVI
adata.varm['df_poisson'] = df_poisson
adata_query = adata[:, is_hvg].copy()
print(adata_query)

In [None]:
# Set up the scVI model on the tumor-only gene-subset object

# Register the "counts" layer with mouse_ident as the batch key and pct_counts_mito as a continuous covariate
setup_kwargs = dict(
    layer="counts",
    continuous_covariate_keys=["pct_counts_mito"],
    batch_key='mouse_ident',
)
scvi.model.SCVI.setup_anndata(adata_query, **setup_kwargs)

# Negative-binomial gene likelihood
model = scvi.model.SCVI(adata_query, gene_likelihood="nb")

model.view_anndata_setup()

In [None]:
# Train scVI on the tumor subset

# Training parameters: at most 500 epochs, early stopping with a patience of 20
# NOTE(review): use_gpu is False here although the all-cells run above uses True and the
# original comment asked for a GPU - confirm which setting is intended.
train_kwargs = {
    "early_stopping": True,
    "early_stopping_patience": 20,
    "enable_model_summary": True,
    "enable_progress_bar": True,
    "enable_checkpointing": True,
    "max_epochs": 500,
    "use_gpu": False,
}

model.train(**train_kwargs)

In [None]:
# Training vs validation ELBO curves on one axis (the first training entry is dropped by the [1:] slice)
training_history = model.history
train_elbo = training_history['elbo_train'][1:]
test_elbo = training_history['elbo_validation']

ax = train_elbo.plot()
test_elbo.plot(ax=ax)

In [None]:
# Embed and cluster the tumor cells using the re-trained scVI latent space.
# The same obsm / obs keys as in the all-cells pass are used, now on the tumor-only object.
scvi_rep_key = "X_scVI_all_allograft_1.1"

latent = model.get_latent_representation()
adata.obsm[scvi_rep_key] = latent

# Neighbor graph on the scVI representation, then UMAP
sc.pp.neighbors(adata, use_rep=scvi_rep_key)
sc.tl.umap(adata, min_dist=0.8)

# Leiden clustering on that neighbor graph
sc.tl.leiden(adata, key_added="leiden_scVI_all_allograft_1.1", resolution=1.2)

In [None]:
# UMAPs coloured by each QC metric, two panels per row
qc_umap_colors = ["n_genes_by_counts", "total_counts", "pct_counts_mito", "log1p_total_counts"]
sc.pl.umap(adata, color=qc_umap_colors, cmap="cubehelix_r", s=3, ncols=2)

In [None]:
# Verify that the remaining cells are high quality: per-cluster QC distributions

# Copy the cluster labels onto the gene-subset object, whose obs table is used for plotting
cluster_labels = adata.obs["leiden_scVI_all_allograft_1.1"].copy()
adata_query.obs['cluster'] = cluster_labels

# One wide boxen plot per QC metric: log1p total counts, then percentage of mitochondrial counts
for qc_column in ("log1p_total_counts", "pct_counts_mito"):
    fig, ax = plt.subplots(figsize=(25, 6))
    sns.boxenplot(data=adata_query.obs, x="cluster", y=qc_column, ax=ax)

In [None]:
# save adata object
# NOTE(review): this write happens before the PAGA / draw_graph / pseudotime cells below, so the
# file written here does not contain their results. The "Celltag plots" section reloads this file.
adata.write('ONB_primary_allografts_only_scvi.h5ad')

# PAGA and diffusion pseudotime

In [None]:
# Start from the tumor adata object generated in the section above (tumor-specific scVI embedding).
# Rebuild the neighbor graph with 35 neighbors, then compute the UMAP and the force-directed graph layout.
embedding_key = 'X_scVI_all_allograft_1.1'
sc.pp.neighbors(adata, use_rep=embedding_key, n_neighbors=35)
sc.tl.umap(adata)
sc.tl.draw_graph(adata)

In [None]:
# Force-directed graph layout coloured by the scVI-based Leiden clusters
sc.pl.draw_graph(adata, color='leiden_scVI_all_allograft_1.1', legend_loc='right margin')

In [None]:
# run paga
# Leiden clustering on the current (35-neighbor) graph; stored under the default key 'leiden',
# which is separate from 'leiden_scVI_all_allograft_1.1' computed earlier.
sc.tl.leiden(adata, resolution=1.2)
# PAGA graph over the 'leiden' groups
sc.tl.paga(adata, groups='leiden')
# NOTE(review): sc.pl.paga runs before draw_graph(init_pos='paga'); presumably the node positions
# it computes are what init_pos='paga' reads - keep this order and confirm against the scanpy docs.
sc.pl.paga(adata, threshold=0.03, show=False)
sc.tl.draw_graph(adata, init_pos='paga')
# Layout coloured by the new clusters, the sample of origin and the earlier scVI-based clusters
sc.pl.draw_graph(adata, color=['leiden', 'mouse_ident',
                              'leiden_scVI_all_allograft_1.1'], legend_loc='right margin')

In [None]:
# check gene expression across plot
# this also generates feature plots for the FA plots used in the paper figures
# Each gene is listed exactly once, in its original first-occurrence order. The list previously
# repeated 'Pou2f3' and 'Krt5', which drew those two panels twice.
genes = ['Tubb3', 'Neurod1',
         'Lhx2', 'Sox11',
         'Foxi1', 'Cftr',
         'Vim', 'Pou2f3', 'Trpm5',
         'Chat', 'Avil',
         'Sox2', 'Cyp2j6', 'Cxcl17',
         'Sox9', 'Sox10', 'Runx1',
         'Runx1t1', 'Krt8', 'Krt18',
         'Krt5', 'Trp63',
         'Ezh2', 'Kit', 'Myc',
         'Hes6', 'Mki67', 'Top2a',
         'Epcam', 'Ncam1', 'Ascl1',
         'Foxj1',
         'Cfap126', 'fLuc', 'CellTag.UTR',
         'GFP.CDS']


# Feature plots on the force-directed layout: expression from the "norm" layer,
# colour scale capped at the 99.5th percentile, three panels per row
fa_feature_kwargs = dict(
    legend_loc='right margin',
    vmax='p99.5',
    color_map='cmo.dense',
    ncols=3,
    frameon=False,
    layer='norm',
)
sc.pl.draw_graph(adata, color=genes, **fa_feature_kwargs)

In [None]:
# Root cell for pseudotime: the first cell (in obs order) that belongs to 'leiden' cluster '7'
root_cluster = '7'
root_candidates = np.flatnonzero(adata.obs['leiden'] == root_cluster)
adata.uns['iroot'] = root_candidates[0]

In [None]:
# Diffusion map with 30 components, then diffusion pseudotime with up to 5 branchings
n_diffusion_components = 30
sc.tl.diffmap(adata, n_comps=n_diffusion_components)
sc.tl.dpt(adata, n_branchings=5)

In [None]:
# Pseudotime on the force-directed layout; the colour scale is capped at 0.5
pseudotime_plot_kwargs = dict(
    color=['dpt_pseudotime'],
    legend_loc='right margin',
    s=10,
    frameon=False,
    save=False,
    vmax=0.5,
)
with plt.rc_context({'figure.figsize': (6, 5)}):
    sc.pl.draw_graph(adata, **pseudotime_plot_kwargs)

In [None]:
# Marker genes per scVI-based Leiden cluster (Wilcoxon test on the "norm" layer)
sc.tl.rank_genes_groups(
    adata,
    'leiden_scVI_all_allograft_1.1',
    method='wilcoxon',
    layer='norm',
    use_raw=False,
)

# Top 30 ranked gene names, one column per cluster
ranked_gene_names = pd.DataFrame(adata.uns['rank_genes_groups']['names'])
ranked_gene_names.head(30)

Next, generate pseudotime heatmap plot

In [None]:
# Cluster paths for each branch: (branch label, ordered list of cluster ids), every path starting at cluster 7.
# NOTE(review): the ids are consumed by paga_path with groups_key='leiden_scVI_all_allograft_1.1', while the
# pseudotime root was chosen from the 'leiden' key - confirm both clusterings use the same numbering.
branch_nodes = {
    'Neuronal': [7, 10, 3, 12],
    'MV2-like': [7, 8],
    'MV1/glandular-like': [7, 11, 13, 1, 9],
    'Mesenchymal': [7, 11, 13, 2],
    'Stem-like': [7, 11, 0, 5, 6],
}
paths = list(branch_nodes.items())

In [None]:
# Expose the DPT pseudotime under the column name 'distance', which the heatmap cell below passes as an annotation
adata.obs['distance'] = adata.obs['dpt_pseudotime']

In [None]:
# csr_matrix is imported here because this cell uses it; the notebook's separate import cell
# comes after this one, so a top-to-bottom run would otherwise fail with a NameError.
from scipy.sparse import csr_matrix

# set .X to the normalized layer (for plotting purposes below)
# A copy is assigned so that the in-place log1p below cannot also alter layers['norm'].
adata.X = adata.layers['norm'].copy()

# and log the data for plotting purposes
sc.pp.log1p(adata)

# finally clip the values to the range [0, 1] (values above 1 become 1; this is a clip, not a rescale)
# NOTE(review): .toarray() assumes X is a scipy sparse matrix and densifies the whole matrix.
adata.X = csr_matrix(np.clip(adata.X.toarray(), 0, 1))

In [None]:
# import
from scipy.sparse import csr_matrix

In [None]:
# Genes shown as rows of the pseudotime heatmaps, in display order
gene_names = (
    ['Ascl1', 'Kit']
    + ['Neurod1', 'Lhx2', 'Tubb3', 'Runx1t1']
    + ['Foxi1', 'Runx1', 'Cftr']
    + ['Krt8', 'Krt18']
    + ['Sox9', 'Sox10']
    + ['Vim', 'Cd44']
    + ['Sox2', 'Mecom']
)

In [None]:
# One pseudotime heatmap per branch path, side by side, saved as a single PDF

# Arguments shared by every panel
heatmap_kwargs = dict(
    show_node_names=False,
    ytick_fontsize=12,
    left_margin=0.15,
    n_avg=200,
    annotations=['distance'],
    show_colorbar=True,
    color_map='Purples',
    groups_key='leiden_scVI_all_allograft_1.1',
    color_maps_annotations={'distance': 'cmo.haline'},
    return_data=True,
    normalize_to_zero_one=False,
    show=False,
    xlim=[0, 0.8],
)

_, axs = plt.subplots(ncols=5, figsize=(24, 9), gridspec_kw={'wspace': 0.05, 'left': 0.12})
plt.subplots_adjust(left=0.05, right=0.98, top=0.82, bottom=0.2)

for ipath, (descr, path) in enumerate(paths):
    # gene labels (y ticks) are drawn on the left-most panel only
    _, data = sc.pl.paga_path(
        adata, path, gene_names,
        ax=axs[ipath],
        show_yticks=(ipath == 0),
        title='{} path'.format(descr),
        **heatmap_kwargs,
    )

plt.savefig('Pseudotime_celltag_paths_RPM_and_RPMA_allografts.pdf')
plt.show()

# Celltag plots

Prior to running this section, please run the celltag clone calling described in the R script (8_celltag_clone_calling_analysis). 

In [None]:
# start with same adata object (primary and allograft tumors only) generated above
# NOTE(review): the cells below read obsm['X_draw_graph_fa'], uns['iroot'] and obs['broad_cluster_names'].
# As far as this notebook shows, the file is written before the draw_graph / pseudotime steps run and
# 'broad_cluster_names' is never created here - confirm the file on disk was re-saved with these fields.
adata = sc.read_h5ad('ONB_primary_allografts_only_scvi.h5ad')

In [None]:
# read in celltag clone info df
# The cells below use its 'Cell barcodes' and 'CellTag_Clone' columns.
df_ct = pd.read_csv('RPM_RPMA_celltag_clones.csv')

In [None]:
# Index the clone table by cell barcode so it lines up with adata.obs; the barcode column itself is kept
df_ct = df_ct.set_index('Cell barcodes', drop=False)

In [None]:
# Left-join the clone assignment onto the cell metadata by barcode index; cells without a clone get NaN
clone_columns = df_ct[['CellTag_Clone', 'Cell barcodes']]
adata.obs = adata.obs.merge(clone_columns, how='left', left_index=True, right_index=True)

In [None]:
# Number of cells assigned to each CellTag clone
adata.obs['CellTag_Clone'].value_counts()

In [None]:
# FA plot (coordinates from obsm['X_draw_graph_fa']) with the cells of one clone drawn on top of all others

# import
from matplotlib import gridspec

# clone to highlight
clone = 'RPMA_clone_42'

fa_coords = adata.obsm['X_draw_graph_fa']
is_clone_cell = adata.obs['CellTag_Clone'] == clone

# 1 x 2 grid; only the left slot is used
fig = plt.figure(figsize=(10, 5))
gs = gridspec.GridSpec(1, 2, width_ratios=[1, 1])
ax0 = fig.add_subplot(gs[0])

# Draw order matters: background cells first, clone cells afterwards so they sit on top
scatter_layers = [
    (~is_clone_cell, dict(s=2, c='lightgray', label='Other')),
    (is_clone_cell, dict(s=10, c='tab:blue', label=clone)),
]
for cell_mask, point_style in scatter_layers:
    ax0.scatter(fa_coords[cell_mask, 0], fa_coords[cell_mask, 1], **point_style)

ax0.set_title(clone)
ax0.axis('off')

# Adjust the layout
plt.tight_layout()
plt.show()

In [None]:
# Background cells only (FA plot): every mouse_ident category gets the same colour.
# Six entries, annotated by the author as: RPMA allograft, RPM allograft, RPM tumor, RPMA 1, RPMA 2, RPMA 3
background_palette = ['lavender'] * 6

with plt.rc_context({'figure.figsize': (4, 4)}):
    sc.pl.draw_graph(
        adata,
        color=['mouse_ident'],
        frameon=False,
        legend_loc='right margin',
        s=50,
        save=True,
        palette=background_palette,
    )

In [None]:
# Select only the cells of one clone and build the PAGA graph used for the connectivity FA plot

# Clone to analyse (same clone as highlighted in the FA plot above)
clone_id = 'RPMA_clone_42'

# Select by the clone column merged into adata.obs rather than by the barcodes listed in df_ct.
# Barcodes in the clone table that are absent from adata (for example, removed by QC filtering)
# would otherwise raise a KeyError when indexing adata.
in_clone = (adata.obs['CellTag_Clone'] == clone_id).to_numpy()
indices_clone = adata.obs_names[in_clone].tolist()

# Explicit copy: the steps below modify .uns and the neighbor graph, which must not happen on a view
adata_clone = adata[in_clone, :].copy()

# Drop the pseudotime root index inherited from the full object, if there is one
adata_clone.uns.pop('iroot', None)

sc.pp.neighbors(adata_clone, n_neighbors=35, use_rep='X_scVI_all_allograft_1.1')
sc.tl.paga(adata_clone, groups='broad_cluster_names')

In [None]:
# Connectivity graph for the selected clone, grouped by broad cluster name; the figure is saved to disk
paga_compare_kwargs = dict(
    threshold=0.5,
    title='',
    right_margin=0.2,
    edge_width_scale=0.5,
    legend_fontsize=12,
    fontsize=12,
    frameon=False,
    edges=True,
    save=True,
    legend_loc='',
    color='broad_cluster_names',
    size=1500,
)
with plt.rc_context({'figure.figsize': (9, 6)}):
    sc.pl.paga_compare(adata_clone, **paga_compare_kwargs)