# Prep environment

In [None]:
#Import relevant packages
import numpy as np
import pandas as pd
from matplotlib import rcParams
import os
import scanpy as sc

import matplotlib as mpl
import matplotlib.pyplot as plt

#For nice color schemes
import cmocean

#For barplots
import seaborn as sns

In [None]:
#Import scVI
import scvi
from scvi.model.utils import mde

# Only emit scvi-tools log records at level 40 (logging.ERROR) or above
scvi.settings.verbosity = 40

# Fix the global seed so that model training below is reproducible across kernel restarts
scvi.settings.seed = 0

In [None]:
# Work from the shared project directory; relative sample paths used later resolve against it
PROJECT_DIR = '/hpc/group/goldsteinlab/Python'
os.chdir(PROJECT_DIR)

In [None]:
# Let pandas render up to 100 rows and 100 columns before truncating a displayed frame
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 100)

In [None]:
# Default font size for every matplotlib figure produced in this notebook
plt.rcParams['font.size'] = 20

In [None]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline

# Load in data

In [None]:
# Load the previously built reference atlas from disk.
# It was created with normal mouse OE from Ko et al 2023 and Horgue et al 2022
# (GSE185168 and GSE224894).
ATLAS_PATH = '/hpc/group/goldsteinlab/tbk13_Python/EEDKOHET_WT_anndata_scVI1.6.1.h5ad'
adata_m = sc.read_h5ad(ATLAS_PATH)

In [None]:
# Number of cells per genotype in the reference atlas.
# .size() counts rows directly instead of calling len() on each group through apply().
adata_m.obs.groupby('genotype', observed=True).size()

In [None]:
# Number of cells contributed by each mouse in the reference atlas.
# .size() counts rows directly instead of calling len() on each group through apply().
adata_m.obs.groupby('mouse_ident', observed=True).size()

In [None]:
# Flag cells that come from knockout (mutant) mice
is_knockout = adata_m.obs['genotype'].isin(['ko'])
to_keep = ~is_knockout

# Keep everything else, materialised as an independent AnnData rather than a view
adata_m = adata_m[to_keep].copy()

In [None]:
# Load the 10x Cell Ranger count matrix for the RPM_Cas9 sample, indexing genes by symbol.
# cache=True asks scanpy to cache the parsed matrix for faster reloads.
adata_RPM = sc.read_10x_mtx(
    'RPM_Cas9/',
    var_names='gene_symbols',
    cache=True,
)

In [None]:
# Stamp every RPM_Cas9 cell with its sample-level metadata
for obs_key, obs_value in {
    'genotype': 'ONB',
    'mouse_ident': 'RPM_Cas9',
    'tumor_type': 'RPM',
}.items():
    adata_RPM.obs[obs_key] = obs_value

In [None]:
# Per-cell QC metrics for RPM_Cas9.
# Genes whose symbol starts with 'mt-' are flagged in .var['mito'] so that
# calculate_qc_metrics also reports the mitochondrial share (pct_counts_mito).
is_mito_gene = adata_RPM.var_names.str.startswith('mt-')
adata_RPM.var['mito'] = is_mito_gene
sc.pp.calculate_qc_metrics(
    adata_RPM,
    qc_vars=['mito'],
    percent_top=None,
    log1p=False,
    inplace=True,
)

In [None]:
# Distribution of the three QC metrics across RPM_Cas9 cells
qc_fields = ['n_genes_by_counts', 'total_counts', 'pct_counts_mito']
sc.pl.violin(adata_RPM, qc_fields, jitter=0.4, multi_panel=True)

# Library size against mitochondrial fraction, then against detected genes
for y_field in ('pct_counts_mito', 'n_genes_by_counts'):
    sc.pl.scatter(adata_RPM, x='total_counts', y=y_field)

In [None]:
# Load the filtered 10x Cell Ranger count matrix (HDF5 format) for the RPMA_1 sample
adata_RPMA_1 = sc.read_10x_h5('RPMA_1/filtered_feature_bc_matrix.h5')

# read_10x_h5 does not de-duplicate gene symbols (read_10x_mtx, used for the other
# samples, does so by default). Make them unique here so the later outer-join
# concatenation can align genes across samples.
adata_RPMA_1.var_names_make_unique()

In [None]:
# Stamp every RPMA_1 cell with its sample-level metadata
for obs_key, obs_value in {
    'genotype': 'ONB',
    'mouse_ident': 'RPMA_1',
    'tumor_type': 'RPMA',
}.items():
    adata_RPMA_1.obs[obs_key] = obs_value

In [None]:
# Per-cell QC metrics for RPMA_1.
# Genes whose symbol starts with 'mt-' are flagged in .var['mito'] so that
# calculate_qc_metrics also reports the mitochondrial share (pct_counts_mito).
is_mito_gene = adata_RPMA_1.var_names.str.startswith('mt-')
adata_RPMA_1.var['mito'] = is_mito_gene
sc.pp.calculate_qc_metrics(
    adata_RPMA_1,
    qc_vars=['mito'],
    percent_top=None,
    log1p=False,
    inplace=True,
)

In [None]:
# Distribution of the three QC metrics across RPMA_1 cells
qc_fields = ['n_genes_by_counts', 'total_counts', 'pct_counts_mito']
sc.pl.violin(adata_RPMA_1, qc_fields, jitter=0.4, multi_panel=True)

# Library size against mitochondrial fraction, then against detected genes
for y_field in ('pct_counts_mito', 'n_genes_by_counts'):
    sc.pl.scatter(adata_RPMA_1, x='total_counts', y=y_field)

In [None]:
# Load the 10x Cell Ranger count matrix for the RPMA_2 sample, indexing genes by symbol.
# cache=True asks scanpy to cache the parsed matrix for faster reloads.
adata_RPMA_2 = sc.read_10x_mtx(
    'RPMA_2/',
    var_names='gene_symbols',
    cache=True,
)

In [None]:
# Stamp every RPMA_2 cell with its sample-level metadata
for obs_key, obs_value in {
    'genotype': 'ONB',
    'mouse_ident': 'RPMA_2',
    'tumor_type': 'RPMA',
}.items():
    adata_RPMA_2.obs[obs_key] = obs_value

In [None]:
# Per-cell QC metrics for RPMA_2.
# Genes whose symbol starts with 'mt-' are flagged in .var['mito'] so that
# calculate_qc_metrics also reports the mitochondrial share (pct_counts_mito).
is_mito_gene = adata_RPMA_2.var_names.str.startswith('mt-')
adata_RPMA_2.var['mito'] = is_mito_gene
sc.pp.calculate_qc_metrics(
    adata_RPMA_2,
    qc_vars=['mito'],
    percent_top=None,
    log1p=False,
    inplace=True,
)

In [None]:
# Distribution of the three QC metrics across RPMA_2 cells
qc_fields = ['n_genes_by_counts', 'total_counts', 'pct_counts_mito']
sc.pl.violin(adata_RPMA_2, qc_fields, jitter=0.4, multi_panel=True)

# Library size against mitochondrial fraction, then against detected genes
for y_field in ('pct_counts_mito', 'n_genes_by_counts'):
    sc.pl.scatter(adata_RPMA_2, x='total_counts', y=y_field)

In [None]:
# Load the 10x Cell Ranger count matrix for the RPMA_3 sample, indexing genes by symbol.
# cache=True asks scanpy to cache the parsed matrix for faster reloads.
adata_RPMA_3 = sc.read_10x_mtx(
    'RPMA_3/',
    var_names='gene_symbols',
    cache=True,
)

In [None]:
# Stamp every RPMA_3 cell with its sample-level metadata
for obs_key, obs_value in {
    'genotype': 'ONB',
    'mouse_ident': 'RPMA_3',
    'tumor_type': 'RPMA',
}.items():
    adata_RPMA_3.obs[obs_key] = obs_value

In [None]:
# Per-cell QC metrics for RPMA_3.
# Genes whose symbol starts with 'mt-' are flagged in .var['mito'] so that
# calculate_qc_metrics also reports the mitochondrial share (pct_counts_mito).
is_mito_gene = adata_RPMA_3.var_names.str.startswith('mt-')
adata_RPMA_3.var['mito'] = is_mito_gene
sc.pp.calculate_qc_metrics(
    adata_RPMA_3,
    qc_vars=['mito'],
    percent_top=None,
    log1p=False,
    inplace=True,
)

In [None]:
# Distribution of the three QC metrics across RPMA_3 cells
qc_fields = ['n_genes_by_counts', 'total_counts', 'pct_counts_mito']
sc.pl.violin(adata_RPMA_3, qc_fields, jitter=0.4, multi_panel=True)

# Library size against mitochondrial fraction, then against detected genes
for y_field in ('pct_counts_mito', 'n_genes_by_counts'):
    sc.pl.scatter(adata_RPMA_3, x='total_counts', y=y_field)

# Concatenate

In [None]:
# Append the four tumor samples to the reference atlas.
# join="outer" keeps the union of genes; index_unique=None leaves cell barcodes unmodified.
tumor_samples = [adata_RPMA_2, adata_RPMA_3, adata_RPMA_1, adata_RPM]
adata = adata_m.concatenate(tumor_samples, index_unique=None, join="outer")

In [None]:
# Fix var metadata categories
#
# After concatenation the per-dataset var annotations appear as suffixed columns
# ("gene_ids-0", "gene_ids-1", ...). For each annotation, collapse them into one
# column holding, per gene, the value from the first dataset where it is not missing,
# then drop the suffixed columns.
#
# "gene_ids" is processed first and "feature_types" second; because each result is
# inserted at position 0, "feature_types" ends up as the first column.
for var_key in ("gene_ids", "feature_types"):
    # Raw string so that "\d" is a regex digit class, not an (invalid) string escape
    batch_cols = adata.var.columns[adata.var.columns.str.match(rf"{var_key}-\d+")]
    if len(batch_cols) == 0:
        # Nothing to merge for this annotation
        continue

    per_batch = adata.var.loc[:, batch_cols]
    # For each gene, the name of the first suffixed column with a non-missing value
    first_valid_col = per_batch.T.notna().idxmax()
    # Long format: one row per (gene, suffixed column); assumes the var index is unnamed
    # so reset_index() yields a column called "index"
    long_form = per_batch.reset_index().melt("index")
    merged = (
        long_form.set_index(["index", "variable"])
        .loc[list(zip(first_valid_col.index, first_valid_col.values)), :]
        .droplevel("variable")
    )
    adata.var.insert(0, var_key, merged)
    adata.var.drop(batch_cols, inplace=True, axis=1)


In [None]:
# Recompute per-cell QC metrics on the combined object.
# Genes whose symbol starts with 'mt-' are flagged in .var['mito'] so that
# calculate_qc_metrics also reports the mitochondrial share (pct_counts_mito).
is_mito_gene = adata.var_names.str.startswith('mt-')
adata.var['mito'] = is_mito_gene
sc.pp.calculate_qc_metrics(
    adata,
    qc_vars=['mito'],
    percent_top=None,
    log1p=False,
    inplace=True,
)

In [None]:
# Distribution of the three QC metrics across all cells of the combined object
qc_fields = ['n_genes_by_counts', 'total_counts', 'pct_counts_mito']
sc.pl.violin(adata, qc_fields, jitter=0.4, multi_panel=True)

# Library size against mitochondrial fraction, then against detected genes
for y_field in ('pct_counts_mito', 'n_genes_by_counts'):
    sc.pl.scatter(adata, x='total_counts', y=y_field)

In [None]:
# Apply cell-level QC filtering.
# A cell is kept only if it passes all four thresholds:
#   - fewer than 8000 detected genes
#   - between 2500 and 80000 total counts (exclusive)
#   - less than 30% of counts from mitochondrial genes
keep_cell = (
    (adata.obs['n_genes_by_counts'] < 8000)
    & (adata.obs['total_counts'] < 80000)
    & (adata.obs['total_counts'] > 2500)
    & (adata.obs['pct_counts_mito'] < 30)
)

# One mask and an explicit copy: avoids leaving `adata` as a view of a view, which
# would force an implicit copy (and a warning) as soon as .obs or .layers is written to
adata = adata[keep_cell.to_numpy(), :].copy()

In [None]:
# Prepare inputs for HVG selection and scVI

# Per-cell log1p of the library size, stored in .obs (the expression matrix is not log-transformed)
adata.obs["log1p_total_counts"] = np.log1p(adata.obs["total_counts"])

# Two snapshots of X: "counts" stays untouched, "norm" is rescaled below.
# NOTE(review): this presumes adata.X holds raw counts at this point - confirm.
for layer_name in ("counts", "norm"):
    adata.layers[layer_name] = adata.X.copy()

# Scale each cell in the "norm" layer to a total of 10,000 counts
sc.pp.normalize_total(adata, layer="norm", target_sum=1e4)

In [None]:
# Display the AnnData summary to inspect the object after filtering and layer creation
adata

# set up and train scvi model

In [None]:
# Highly variable genes with Scanpy (seurat_v3 flavor, computed on the "counts" layer).
# Selection is done per mouse via batch_key; subset=False only annotates adata.var
# without dropping any genes.
hvg_kwargs = {
    "flavor": "seurat_v3",
    "layer": "counts",
    "batch_key": "mouse_ident",
    "n_top_genes": 5000,
    "subset": False,
}
sc.pp.highly_variable_genes(adata, **hvg_kwargs)

In [None]:
# Per-gene mean of X and the fraction of cells in which the gene has a zero entry.
# np.asarray(...).ravel() gives a 1-D vector of length n_genes whether X is a dense
# array, a scipy sparse matrix, or a scipy sparse array; indexing the reduction with [0]
# only works for sparse matrices and silently yields a scalar otherwise.
gene_mean = np.asarray(adata.X.mean(axis=0)).ravel()
n_cells_expressing = np.asarray((adata.X > 0).sum(axis=0)).ravel()

adata.var['mean_'] = gene_mean
adata.var['frac_zero'] = 1 - n_cells_expressing / adata.shape[0]

# One point per gene; mean on a log-scaled x axis
fig, ax = plt.subplots(figsize=(9,6))
ax.scatter(adata.var.mean_, adata.var.frac_zero, s=1)
ax.set_xscale("log")
ax.set_xlabel("Mean of adata.X per gene")
ax.set_ylabel("Fraction of cells with zero")

In [None]:
# Poisson-based gene selection from scvi-tools, computed per mouse.
# inplace=False returns the per-gene results as a DataFrame instead of writing to adata.var.
df_poisson = scvi.data.poisson_gene_selection(
    adata, n_top_genes=5000, batch_key="mouse_ident", inplace=False
)

# Only the last expression of a cell is rendered automatically, so display the
# intermediate tables explicitly instead of computing and discarding them.
# Selected genes, ordered by their zero-enrichment rank
display(df_poisson[df_poisson.highly_variable].sort_values('prob_zero_enrichment_rank'))

# Agreement between the Poisson selection and the Scanpy HVG flags
display(pd.crosstab(df_poisson.highly_variable, adata.var.highly_variable))

# Boolean mask over genes; the Poisson selection is the one used for the model
is_hvg = df_poisson.highly_variable

# Keep the full selection table alongside the genes
adata.varm['df_poisson']= df_poisson

# Gene-subset copy that is fed to scVI
adata_query = adata[:, is_hvg].copy()
print(adata_query)

In [None]:
# Register the HVG-subset object with scVI.
# The model reads raw counts from the "counts" layer, treats each mouse as a batch,
# and receives genotype (categorical) and mitochondrial percentage (continuous) as covariates.
setup_kwargs = {
    "layer": "counts",
    "batch_key": 'mouse_ident',
    "categorical_covariate_keys": ['genotype'],
    "continuous_covariate_keys": ["pct_counts_mito"],
}
scvi.model.SCVI.setup_anndata(adata_query, **setup_kwargs)

# Negative-binomial gene likelihood
model = scvi.model.SCVI(adata_query, gene_likelihood="nb")

# Print the registered fields as a sanity check
model.view_anndata_setup()

In [None]:
# Train the scVI model

# Training options: at most 500 epochs with early stopping (patience 20),
# plus model summary, progress bar and checkpointing switched on
train_kwargs = {
    "max_epochs": 500,
    "early_stopping": True,
    "early_stopping_patience": 20,
    "enable_model_summary": True,
    "enable_progress_bar": True,
    "enable_checkpointing": True,
}

# Be sure a GPU is enabled before running this
model.train(**train_kwargs)

In [None]:
# Training vs. validation ELBO per epoch, drawn on one set of axes
history = model.history
train_elbo = history['elbo_train'][1:]  # first training record is left out of the plot
test_elbo = history['elbo_validation']

ax = train_elbo.plot()
test_elbo.plot(ax=ax)

In [None]:
# Embed and cluster using the trained model

# Store the scVI latent space on the full-gene object
latent_key = "X_scVI_1.1"
latent = model.get_latent_representation()
adata.obsm[latent_key] = latent

# Neighbor graph on the latent space, then a UMAP of that graph
sc.pp.neighbors(adata, use_rep=latent_key)
sc.tl.umap(adata, min_dist=0.5)

# Leiden clustering on the same neighbor graph
sc.tl.leiden(adata, resolution=2.0, key_added="leiden_scVI_1.1")

In [None]:
# UMAPs colored by each QC metric, laid out two per row
qc_colors = ["n_genes_by_counts", "total_counts", "pct_counts_mito", "log1p_total_counts"]
sc.pl.umap(adata, color=qc_colors, ncols=2, s=3, cmap="cubehelix_r")

In [None]:
# Assess batch effects: one 8x8 UMAP each for genotype, Leiden cluster and mouse identity
for umap_kwargs in (
    dict(color="genotype", cmap="cmo.matter", vmax="p99.99"),
    dict(color="leiden_scVI_1.1", legend_loc="on data"),
    dict(color="mouse_ident", legend_loc="right margin"),
):
    fig, ax = plt.subplots(figsize=(8, 8))
    sc.pl.umap(adata, ax=ax, s=4, frameon=False, save=False, **umap_kwargs)

In [None]:
# Per-cluster QC distributions
# Copy the Leiden labels onto the HVG-subset object whose .obs is plotted
adata_query.obs['cluster'] = adata.obs["leiden_scVI_1.1"].copy()

# One wide boxen plot per metric: log1p total counts, then mitochondrial percentage
for qc_metric in ("log1p_total_counts", "pct_counts_mito"):
    fig, ax = plt.subplots(figsize=(20,6))
    sns.boxenplot(data=adata_query.obs, x="cluster", y=qc_metric, ax=ax)

In [None]:
# Feature plots of known marker genes (plus the Leiden clusters) to help judge cluster identity
genes = [
    'leiden_scVI_1.1',
    'Sox9', 'Ermn', 'Gpx6', 'Plp1', 'Trpm5', 'Cftr',
    'Ptprc', 'Cd68', 'Ascl3', 'Sox2', 'Dcn', 'Pou2f3',
]

# Expression is read from the "norm" layer; the color scale is capped at the 99.5th percentile
umap_kwargs = dict(
    layer="norm",
    use_raw=False,
    vmax="p99.5",
    color_map="cmo.matter",
    legend_loc="on data",
    ncols=3,
    frameon=False,
    save=False,
)
sc.pl.umap(adata, color=genes, **umap_kwargs)

In [None]:
# Wilcoxon marker genes for every Leiden cluster, computed on the "norm" layer.
# Used to confirm that each cluster contains high quality cells.
sc.tl.rank_genes_groups(
    adata,
    groupby='leiden_scVI_1.1',
    method='wilcoxon',
    use_raw=False,
    layer='norm',
)

# Top 50 ranked gene names, one column per cluster
marker_names = pd.DataFrame(adata.uns['rank_genes_groups']['names'])
marker_names.iloc[:50]

In [None]:
# Drop low quality clusters (e.g. doublets with high gene counts, or clusters with low
# average gene counts that do not match known marker genes)
bad_clust = ['9', '23']

# True for every cell that is NOT in one of the listed clusters
in_bad_cluster = adata.obs['leiden_scVI_1.1'].isin(bad_clust)
to_keep = ~in_bad_cluster

# Replace adata with an independent copy of the retained cells
adata = adata[to_keep].copy()

From here, keep iterating: re-run the scVI model and re-cluster until no clearly low-quality cell clusters remain.

For each iteration, start back at "set up and train scvi model" and run through the step that subsets out bad clusters.

Each time clusters were removed, the model was trained again on the remaining cells.

In [None]:
# Once the iterations are finished, annotate cell types in adata.obs['cluster_names'],
# then write the combined object to disk
adata.write_h5ad('OE_atlas_with_ONB_tumors_all_cells.h5ad')

# tumor cells only scvi

In [None]:
# Keep only cells that come from the tumor mice

# mouse_ident values assigned to the four tumor samples when they were loaded
keep=['RPMA_1', 'RPMA_2', 'RPMA_3', 'RPM_Cas9']

# The tumor samples were concatenated into `adata`; `adata_m` is the reference atlas
# loaded at the top of the notebook and never received them, so selecting these
# mouse_ident values from `adata_m` would return no cells on a fresh run.
to_keep=(adata.obs['mouse_ident'].isin(keep))

# Independent copy of the tumor-only cells (name kept for the cells that follow)
adata_m_filter = adata[to_keep].copy()

In [None]:
# Rebind `adata` to the tumor-only subset so the cells below operate on it
# (this is the same object as adata_m_filter, not a copy)
adata=adata_m_filter

In [None]:
# Highly variable genes with Scanpy (seurat_v3 flavor, computed on the "counts" layer).
# No batch_key is passed here: with a batch that has few cells this call fails with a
# "reciprocal condition number" error.
hvg_kwargs = {
    "flavor": "seurat_v3",
    "layer": "counts",
    "n_top_genes": 10000,
    "subset": False,
}
sc.pp.highly_variable_genes(adata, **hvg_kwargs)

In [None]:
# Per-gene mean of X and the fraction of cells in which the gene has a zero entry.
# np.asarray(...).ravel() gives a 1-D vector of length n_genes whether X is a dense
# array, a scipy sparse matrix, or a scipy sparse array; indexing the reduction with [0]
# only works for sparse matrices and silently yields a scalar otherwise.
gene_mean = np.asarray(adata.X.mean(axis=0)).ravel()
n_cells_expressing = np.asarray((adata.X > 0).sum(axis=0)).ravel()

adata.var['mean_'] = gene_mean
adata.var['frac_zero'] = 1 - n_cells_expressing / adata.shape[0]

# One point per gene; mean on a log-scaled x axis
fig, ax = plt.subplots(figsize=(9,6))
ax.scatter(adata.var.mean_, adata.var.frac_zero, s=1)
ax.set_xscale("log")
ax.set_xlabel("Mean of adata.X per gene")
ax.set_ylabel("Fraction of cells with zero")

In [None]:
# Poisson-based gene selection from scvi-tools, computed per mouse.
# inplace=False returns the per-gene results as a DataFrame instead of writing to adata.var.
df_poisson = scvi.data.poisson_gene_selection(
    adata, n_top_genes=10000, batch_key="mouse_ident", inplace=False
)

# Only the last expression of a cell is rendered automatically, so display the
# intermediate tables explicitly instead of computing and discarding them.
# Selected genes, ordered by their zero-enrichment rank
display(df_poisson[df_poisson.highly_variable].sort_values('prob_zero_enrichment_rank'))

# Agreement between the Poisson selection and the Scanpy HVG flags
display(pd.crosstab(df_poisson.highly_variable, adata.var.highly_variable))

# Boolean mask over genes; the Poisson selection is the one used for the model
is_hvg = df_poisson.highly_variable

# Keep the full selection table alongside the genes
adata.varm['df_poisson']= df_poisson

# Gene-subset copy that is fed to scVI
adata_query = adata[:, is_hvg].copy()
print(adata_query)

In [None]:
# Register the HVG-subset tumor object with scVI.
# The model reads raw counts from the "counts" layer, treats each mouse as a batch,
# and receives mitochondrial percentage as a continuous covariate
# (no categorical covariates for this run).
setup_kwargs = {
    "layer": "counts",
    "batch_key": 'mouse_ident',
    "continuous_covariate_keys": ["pct_counts_mito"],
}
scvi.model.SCVI.setup_anndata(adata_query, **setup_kwargs)

# Negative-binomial gene likelihood
model = scvi.model.SCVI(adata_query, gene_likelihood="nb")

# Print the registered fields as a sanity check
model.view_anndata_setup()

In [None]:
# Train the tumor-only scVI model

# Training options: at most 500 epochs with early stopping (patience 20),
# plus model summary, progress bar and checkpointing switched on
train_kwargs = {
    "max_epochs": 500,
    "early_stopping": True,
    "early_stopping_patience": 20,
    "enable_model_summary": True,
    "enable_progress_bar": True,
    "enable_checkpointing": True,
}

# Be sure a GPU is enabled before running this
model.train(**train_kwargs)

In [None]:
# Training vs. validation ELBO per epoch, drawn on one set of axes
history = model.history
train_elbo = history['elbo_train'][1:]  # first training record is left out of the plot
test_elbo = history['elbo_validation']

ax = train_elbo.plot()
test_elbo.plot(ax=ax)

In [None]:
# Embed and cluster the tumor cells using the trained model

# Store the scVI latent space on the full-gene object
latent_key = "X_scVI_tumor_1.1"
latent = model.get_latent_representation()
adata.obsm[latent_key] = latent

# Neighbor graph on the latent space, then a UMAP of that graph
sc.pp.neighbors(adata, use_rep=latent_key)
sc.tl.umap(adata, min_dist=0.5)

# Leiden clustering on the same neighbor graph, at resolution 4.0
sc.tl.leiden(adata, resolution=4.0, key_added="leiden_scVI_tumor_1.1")

In [None]:
# UMAPs colored by each QC metric, laid out two per row
qc_colors = ["n_genes_by_counts", "total_counts", "pct_counts_mito", "log1p_total_counts"]
sc.pl.umap(adata, color=qc_colors, ncols=2, cmap="cubehelix_r")

In [None]:
# One 8x8 UMAP each for genotype, Leiden cluster and mouse identity
for umap_kwargs in (
    dict(color="genotype", cmap="cmo.matter", vmax="p99.99",
         palette=['#1f77b4', '#ff7f0e', '#2ca02c']),
    dict(color="leiden_scVI_tumor_1.1", legend_loc="on data"),
    dict(color="mouse_ident", legend_loc="right margin"),
):
    fig, ax = plt.subplots(figsize=(8, 8))
    sc.pl.umap(adata, ax=ax, frameon=False, save=False, **umap_kwargs)

In [None]:
# Per-cluster QC distributions for the tumor-only clustering
# Copy the Leiden labels onto the HVG-subset object whose .obs is plotted
adata_query.obs['cluster'] = adata.obs["leiden_scVI_tumor_1.1"].copy()

# One extra-wide boxen plot per metric: log1p total counts, then mitochondrial percentage
for qc_metric in ("log1p_total_counts", "pct_counts_mito"):
    fig, ax = plt.subplots(figsize=(40,6))
    sns.boxenplot(data=adata_query.obs, x="cluster", y=qc_metric, ax=ax)

In [None]:
# Feature plots used to assess which cells are stromal or immune rather than tumor
genes = [
    'leiden_scVI_tumor_1.1',
    'Sox9', 'Ermn', 'Gpx6', 'Plp1', 'Trpm5', 'Cftr',
    'Ptprc', 'Cd68', 'Ascl3', 'Sox2', 'Dcn', 'Pou2f3',
    'Sox10', 'Pax6', 'Foxj1', 'Cyp2a5', 'Bcl11b',
]

# Expression is read from the "norm" layer; the color scale is capped at the 99.5th percentile
umap_kwargs = dict(
    layer="norm",
    use_raw=False,
    vmax="p99.5",
    color_map="cmo.matter",
    legend_loc="on data",
    ncols=3,
    frameon=False,
    save=False,
)
sc.pl.umap(adata, color=genes, **umap_kwargs)

In [None]:
# Leiden clusters alone, drawn with small points (s=1) and labels placed on the data
genes = ['leiden_scVI_tumor_1.1']

umap_kwargs = dict(
    s=1,
    layer="norm",
    use_raw=False,
    vmax="p99.5",
    color_map="cmo.matter",
    legend_loc="on data",
    ncols=3,
    frameon=False,
    save=False,
)
sc.pl.umap(adata, color=genes, **umap_kwargs)

In [None]:
# Remove every cluster that is not tumor (i.e. stromal/immune)

# Example list of non-tumor Leiden clusters
bad_clust = [
    '53', '48', '52', '12', '32', '15', '54', '55', '57',
    '39', '37', '9', '2', '56', '38', '16', '6', '42',
    '36', '22', '25',
]

# True for every cell that is NOT in one of the listed clusters
in_bad_cluster = adata.obs['leiden_scVI_tumor_1.1'].isin(bad_clust)
to_keep = ~in_bad_cluster

# Tumor-only cells as an independent AnnData object
adata_f = adata[to_keep].copy()

In [None]:
# Feature plots on the tumor-only object: tumor type plus selected genes
genes = ['tumor_type', 'Chga', 'Chgb', 'Dll3', 'Sez6', 'Sstr2', 'Bcl2']

# Expression is read from the "norm" layer; the color scale runs from 0 to the 99.5th percentile
umap_kwargs = dict(
    layer="norm",
    use_raw=False,
    vmin=0,
    vmax='p99.5',
    color_map="cmo.matter",
    legend_loc="on data",
    ncols=3,
    frameon=False,
    save=False,
)
sc.pl.umap(adata_f, color=genes, **umap_kwargs)

In [None]:
# Write the tumor-only object to disk
adata_f.write_h5ad('Primary_ONB_tumors_only_scvi.h5ad')