# Figure 1

In [None]:
#Import relevant packages
import numpy as np
import pandas as pd
from matplotlib import rcParams
import os
import scanpy as sc

import matplotlib as mpl
import matplotlib.pyplot as plt

#For nice color schemes
import cmocean

#For barplots
import seaborn as sns

from scipy.stats import median_abs_deviation

import anndata as ad

#Import scVI
import scvi
from scvi.model.utils import mde

# Only emit scvi-tools log records at level 40 (logging.ERROR) or above.
scvi.settings.verbosity = 40

# Fix the global seed: scVI training is stochastic, and Leiden cluster IDs
# derived from its latent space are hard-coded for removal further down,
# so they are only meaningful if the run is reproducible.
scvi.settings.seed = 0

#Set fontsize
plt.rcParams.update({'font.size': 20})

In [None]:
# Load the concatenated 22-sample AnnData object. Every later cell reads the
# global `adata`, so on a fresh kernel this cell must define it.
# Point ADATA_PATH at your local copy of the file.
ADATA_PATH = 'file_with_concatenated_22_samples.h5ad'
adata = sc.read_h5ad(ADATA_PATH)

In [None]:
# Boolean per-gene flags for the QC gene sets, keyed by the .var column they fill.
qc_gene_flags = {
    # mitochondrial genes
    "mt": adata.var_names.str.startswith("MT-"),
    # ribosomal genes
    "ribo": adata.var_names.str.startswith(("RPS", "RPL")),
    # hemoglobin genes: "HB" followed by any character other than "P", "(" or ")"
    "hb": adata.var_names.str.contains(("^HB[^(P)]")),
}
for flag_name, flag_values in qc_gene_flags.items():
    adata.var[flag_name] = flag_values

# Compute QC metrics in place, including the percentage of counts in each flagged gene set.
sc.pp.calculate_qc_metrics(
    adata,
    qc_vars=list(qc_gene_flags),
    percent_top=[20],
    log1p=True,
    inplace=True,
)
adata

In [None]:
# QC overview of the current adata.
# Histogram of total counts per cell.
p1 = sns.displot(adata.obs["total_counts"], kde=False, bins=100)

# Violin plots of total counts and of percent mitochondrial counts.
p2 = sc.pl.violin(adata, keys='total_counts')
p3 = sc.pl.violin(adata, keys="pct_counts_mt")

# Total counts vs. number of detected genes, colored by percent mitochondrial counts.
p4 = sc.pl.scatter(
    adata,
    x="total_counts",
    y="n_genes_by_counts",
    color="pct_counts_mt",
)

In [None]:
#Filter data by slicing anndata object
#Filter cells with one combined boolean mask, then materialize the subset.
# Chained slicing would leave `adata` as a view of the unfiltered object, and
# the later writes to adata.obs / adata.layers would force an implicit copy
# (with an ImplicitModificationWarning). An explicit .copy() avoids that.
obs_qc = adata.obs
keep_cells = (
    (obs_qc["n_genes_by_counts"] < 8000)    # nonzero genes, upper bound
    & (obs_qc["n_genes_by_counts"] > 500)   # nonzero genes, lower bound
    & (obs_qc["total_counts"] > 1000)       # UMI
    & (obs_qc["pct_counts_mt"] < 30)        # percent mitochondrial counts
)
adata = adata[keep_cells.to_numpy(), :].copy()

In [None]:
# QC panels redrawn on the current adata (run after the filtering cell).
# Histogram of total counts per cell.
p1 = sns.displot(adata.obs["total_counts"], kde=False, bins=100)

# Violin plots of total counts and of percent mitochondrial counts.
p2 = sc.pl.violin(adata, keys='total_counts')
p3 = sc.pl.violin(adata, keys="pct_counts_mt")

# Total counts vs. number of detected genes, colored by percent mitochondrial counts.
p4 = sc.pl.scatter(
    adata,
    x="total_counts",
    y="n_genes_by_counts",
    color="pct_counts_mt",
)

In [None]:
# Render the AnnData summary as the cell output to inspect its current contents.
adata

In [None]:
#Prep for HVG and scvi
# store log1p(total counts) in .obs and build the "counts" / "norm" layers

#log1p of the per-cell total counts (not of the expression matrix)
adata.obs["log1p_total_counts"] = np.log1p(adata.obs["total_counts"])

#Keep an untouched copy of X as the "counts" layer
adata.layers["counts"] = adata.X.copy()

#Second copy of X, normalized in the layer only (X itself is left unchanged)
adata.layers['norm'] = adata.X.copy()
# this is relative counts normalized per cell
sc.pp.normalize_total(adata, layer="norm", target_sum=1e4)

In [None]:
#HVG via Scanpy
# Flags (subset=False, so no genes are dropped) the top 5000 genes using the
# "counts" layer, with 'orig_patients' as the batch key.
sc.pp.highly_variable_genes(
    adata,
    layer="counts",
    flavor="seurat_v3",
    batch_key='orig_patients',
    n_top_genes=5000,
    subset=False,
)

In [None]:
# Per-gene mean of X and fraction of cells in which the gene is zero.
# np.asarray(...).ravel() gives a length-n_vars vector for sparse X (whose
# reductions return a 1 x n_vars matrix) and for dense X alike. Indexing the
# dense 1-D result with [0] would instead pick one scalar and silently
# broadcast it to every gene.
adata.var['mean_'] = np.asarray(adata.X.mean(axis=0)).ravel()
n_cells_expressing = np.asarray((adata.X > 0).sum(axis=0)).ravel()
adata.var['frac_zero'] = 1 - n_cells_expressing / adata.shape[0]

In [None]:
# Per-gene mean expression (log-scaled x axis) against the fraction of cells
# with zero expression. Axes are labeled so the figure stands on its own.
fig, ax = plt.subplots(figsize=(9,6))

ax.scatter(adata.var.mean_, adata.var.frac_zero, s=1)
ax.set_xscale("log")
ax.set_xlabel("Mean expression per gene")
ax.set_ylabel("Fraction of cells with zero expression");

In [None]:
#Calculate Poisson gene selection
# NOTE(review): no batch_key is passed here, whereas the second selection pass
# later in this notebook passes batch_key='orig_patients' -- confirm this is intended.
df_poisson = scvi.data.poisson_gene_selection(
    adata, n_top_genes=5000, inplace=False
)

# A bare expression in the middle of a cell is evaluated and discarded, so
# display() the inspection tables explicitly to actually render them.
# Selected genes ordered by their zero-enrichment rank:
display(df_poisson[df_poisson.highly_variable].sort_values('prob_zero_enrichment_rank'))
# Overlap between the Poisson selection and the scanpy HVG flag in adata.var:
display(pd.crosstab(df_poisson.highly_variable, adata.var.highly_variable))

is_hvg = df_poisson.highly_variable

# Keep the full selection table alongside the genes.
adata.varm['df_poisson']= df_poisson

# Independent copy restricted to the Poisson-selected genes; used to train scVI.
adata_query = adata[:, is_hvg].copy()
print(adata_query)

In [None]:
#Set up scvi model

# Register which layer and .obs columns scVI should read from adata_query.
scvi.model.SCVI.setup_anndata(
    adata_query,
    batch_key="orig_patients",
    continuous_covariate_keys=["pct_counts_mt"],
    layer="counts",
)

# Build the model on the registered object with the "nb" gene likelihood.
model = scvi.model.SCVI(adata_query, gene_likelihood="nb")

# Print the registered setup for a visual sanity check.
model.view_anndata_setup()

In [None]:
#Train and run scvi

#Training parameters, forwarded as keyword arguments to model.train
train_kwargs = {
    "max_epochs": 500,
    "early_stopping": True,
    "early_stopping_patience": 20,
    "enable_checkpointing": True,
    "enable_model_summary": True,
    "enable_progress_bar": True,
}

#Train and run model
model.train(**train_kwargs)

In [None]:
#Plot model results
# Training ELBO (first recorded row dropped) and validation ELBO on shared axes.
history = model.history
train_elbo = history['elbo_train'][1:]
test_elbo = history['elbo_validation']

ax = train_elbo.plot()
test_elbo.plot(ax=ax)

In [None]:
# Extract the latent representation from the trained model and store it on adata
rep_key = "X_scVI_4.0"
latent = model.get_latent_representation()
adata.obsm[rep_key] = latent

# Neighbor graph on the scVI latent space, then UMAP on that graph
sc.pp.neighbors(adata, use_rep=rep_key)
sc.tl.umap(adata, min_dist=0.5)

# Leiden clustering at each resolution; one .obs column per resolution
resolutions = [1.0, 3.0]
for res in resolutions:
    sc.tl.leiden(adata, resolution=res, key_added=f"leiden_scVI_4.0_res{res}")



In [None]:
# UMAP colored by "cluster_map".
# NOTE(review): "cluster_map" is not created anywhere in this notebook --
# presumably it is an annotation shipped with the loaded object; confirm.
sc.pl.umap(adata, color="cluster_map")

In [None]:
# UMAP colored by "orig_patients", the key used as batch_key for HVG selection and scVI.
sc.pl.umap(adata, color="orig_patients")

In [None]:
# UMAP colored by the resolution-1.0 Leiden clusters, with labels drawn on the plot.
sc.pl.umap(adata, color="leiden_scVI_4.0_res1.0", legend_loc="on data")

In [None]:
# Marker genes to plot, keyed by the label printed before each plot.
markers_dict = {
    "OSN": ["LHX2"],
    "SUS": ["ERMN"],
    "resp ciliated": ["FOXJ1"],
    "resp sec": ['CYP4B1'],
    "MV": ["CFTR"],
    "T": ["CD3D"],
    "Myeloid": ["CD74"],
}

# One UMAP per entry, colored by the marker's values in the "norm" layer.
for cell_type, marker_genes in markers_dict.items():
    print("examining",cell_type,"markers\n")
    sc.pl.umap(
        adata,
        color=marker_genes,
        layer="norm",
        use_raw=False,
        vmax="p99.5",
        color_map="cmo.matter",
        legend_loc= "on data",
        ncols=2,
        frameon=False,
        save=False,
    )

In [None]:
# Define clusters to remove
# NOTE(review): these IDs refer to one particular resolution-1.0 Leiden run;
# verify them against the UMAP above if the clustering is ever recomputed.
remove_clusters = ['14', '8', '24', '23']

cluster_key = "leiden_scVI_4.0_res1.0"

# Make sure cluster labels are strings
adata.obs[cluster_key] = adata.obs[cluster_key].astype(str)

# Drop every cell in a listed cluster and materialize the remainder as a copy.
in_removed_cluster = adata.obs[cluster_key].isin(remove_clusters)
adata = adata[~in_removed_cluster].copy()

In [None]:
# Redraw the resolution-1.0 cluster UMAP on the current adata (after cluster removal).
sc.pl.umap(adata, color="leiden_scVI_4.0_res1.0", legend_loc="on data")

In [None]:
#Prep for HVG and scvi
# rebuild the .obs column and the "counts" / "norm" layers on the current adata

#log1p of the per-cell total counts (not of the expression matrix)
adata.obs["log1p_total_counts"] = np.log1p(adata.obs["total_counts"])

#Keep an untouched copy of X as the "counts" layer
adata.layers["counts"] = adata.X.copy()

#Second copy of X, normalized in the layer only (X itself is left unchanged)
adata.layers['norm'] = adata.X.copy()
# this is relative counts normalized per cell
sc.pp.normalize_total(adata, layer="norm", target_sum=1e4)

In [None]:
# Re-flag (subset=False, so no genes are dropped) the top 5000 genes on the
# current adata, using the "counts" layer and 'orig_patients' as the batch key.
sc.pp.highly_variable_genes(
    adata,
    layer="counts",
    flavor="seurat_v3",
    batch_key='orig_patients',
    n_top_genes=5000,
    subset=False,
)

In [None]:
# Per-gene mean of X and fraction of cells in which the gene is zero.
# np.asarray(...).ravel() gives a length-n_vars vector for sparse X (whose
# reductions return a 1 x n_vars matrix) and for dense X alike. Indexing the
# dense 1-D result with [0] would instead pick one scalar and silently
# broadcast it to every gene.
adata.var['mean_'] = np.asarray(adata.X.mean(axis=0)).ravel()
n_cells_expressing = np.asarray((adata.X > 0).sum(axis=0)).ravel()
adata.var['frac_zero'] = 1 - n_cells_expressing / adata.shape[0]

In [None]:
# Per-gene mean expression (log-scaled x axis) against the fraction of cells
# with zero expression. Axes are labeled so the figure stands on its own.
fig, ax = plt.subplots(figsize=(9,6))

ax.scatter(adata.var.mean_, adata.var.frac_zero, s=1)
ax.set_xscale("log")
ax.set_xlabel("Mean expression per gene")
ax.set_ylabel("Fraction of cells with zero expression");

In [None]:
#Calculate Poisson gene selection
# Select the top 5000 genes with 'orig_patients' as the batch key.
df_poisson = scvi.data.poisson_gene_selection(
    adata, n_top_genes=5000, batch_key= 'orig_patients', inplace=False
)

# A bare expression in the middle of a cell is evaluated and discarded, so
# display() the inspection tables explicitly to actually render them.
# Selected genes ordered by their zero-enrichment rank:
display(df_poisson[df_poisson.highly_variable].sort_values('prob_zero_enrichment_rank'))
# Overlap between the Poisson selection and the scanpy HVG flag in adata.var:
display(pd.crosstab(df_poisson.highly_variable, adata.var.highly_variable))

is_hvg = df_poisson.highly_variable

# Keep the full selection table alongside the genes.
adata.varm['df_poisson']= df_poisson

# Independent copy restricted to the Poisson-selected genes; used to train scVI.
adata_query = adata[:, is_hvg].copy()
print(adata_query)

In [None]:
#Set up scvi model

# Register which layer and .obs columns scVI should read from adata_query.
scvi.model.SCVI.setup_anndata(
    adata_query,
    batch_key="orig_patients",
    continuous_covariate_keys=["pct_counts_mt"],
    layer="counts",
)

# Build a new model on the registered object with the "nb" gene likelihood.
model = scvi.model.SCVI(adata_query, gene_likelihood="nb")

# Print the registered setup for a visual sanity check.
model.view_anndata_setup()

In [None]:
#Train and run scvi

#Training parameters, forwarded as keyword arguments to model.train
train_kwargs = {
    "max_epochs": 500,
    "early_stopping": True,
    "early_stopping_patience": 20,
    "enable_checkpointing": True,
    "enable_model_summary": True,
    "enable_progress_bar": True,
}

model.train(**train_kwargs)

In [None]:
#Plot model results
# Training ELBO (first recorded row dropped) and validation ELBO on shared axes.
history = model.history
train_elbo = history['elbo_train'][1:]
test_elbo = history['elbo_validation']

ax = train_elbo.plot()
test_elbo.plot(ax=ax)

In [None]:
# Extract the latent representation from the trained model and store it on adata
rep_key = "X_scVI_5.0"
latent = model.get_latent_representation()
adata.obsm[rep_key] = latent

# Neighbor graph on the scVI latent space, then UMAP on that graph
sc.pp.neighbors(adata, use_rep=rep_key)
sc.tl.umap(adata, min_dist=0.5)

# Leiden clustering at each resolution; one .obs column per resolution
resolutions = [1.0, 3.0]
for res in resolutions:
    sc.tl.leiden(adata, resolution=res, key_added=f"leiden_scVI_5.0_res{res}")



In [None]:
# UMAP colored by the resolution-3.0 Leiden clusters, with labels drawn on the plot.
sc.pl.umap(adata, color="leiden_scVI_5.0_res3.0", legend_loc="on data")