In [None]:
import scanpy as sc
import anndata as ad
import scvi
import pandas as pd
import numpy as np
import time

# Random seed; passed as `random_state` to the Leiden clustering calls later in this notebook.
SEED = 0

In [None]:
# Load the input AnnData object from disk.
INPUT_H5AD = '/n/eddy_lab/Lab/mckinley/data/2025_04_15_AmhrGsD_Final.h5ad'
adata = sc.read_h5ad(INPUT_H5AD)

In [None]:
# Flag mitochondrial and ribosomal genes in adata.var by gene-name prefix, then
# build a copy of the data without them to use for highly-variable gene (HVG)
# selection. Each prefix is listed in lower-, title- and upper-case spelling.

# identify and label mitochondrial genes
print("Identifying mitochondrial genes...")
MITO_PREFIXES = ('mt-', 'Mt-', 'MT-')
mitos = np.array(adata.var.index[adata.var.index.str.startswith(MITO_PREFIXES)])
adata.var['mito'] = adata.var.index.isin(mitos)
mito_count = adata.var['mito'].sum()
print(f"  Found {mito_count} mitochondrial genes.")

# identify and label ribosomal genes (cytosolic Rps/Rpl and mitochondrial Mrps/Mrpl)
print("Identifying ribosomal genes...")
RIBO_PREFIXES = (
    'rps', 'rpl', 'mrps', 'mrpl',
    'Rps', 'Rpl', 'Mrps', 'Mrpl',
    # upper-case row previously read 'MRPLS', which left 'MRPS*' genes unflagged
    'RPS', 'RPL', 'MRPS', 'MRPL',
)
ribos = np.array(adata.var.index[adata.var.index.str.startswith(RIBO_PREFIXES)])
adata.var['ribo'] = adata.var.index.isin(ribos)
ribo_count = adata.var['ribo'].sum()
print(f"  Found {ribo_count} ribosomal genes.")

# filter out mitochondrial and ribosomal genes for HVG selection
print("Filtering out mitochondrial and ribosomal genes for HVG selection...")
keep_for_hvg = ~(adata.var['mito'] | adata.var['ribo'])
adata_hvg = adata[:, keep_for_hvg].copy()
print(f"  Retained {adata_hvg.shape[0]} cells and {adata_hvg.shape[1]} genes for HVG selection.")

In [None]:
# Highly-variable gene selection on the mito/ribo-filtered copy.
# Selection reads the 'raw' layer and is batch-aware (batch_key='sample');
# subset=True asks scanpy to subset adata_hvg to the selected genes.
print("Performing highly-variable gene selection (flavor='seurat_v3') on the aggregate data...")
hvg_kwargs = dict(
    flavor='seurat_v3',
    layer='raw',
    batch_key='sample',
    n_top_genes=3000,
    subset=True,
)
sc.pp.highly_variable_genes(adata_hvg, **hvg_kwargs)

hvg_count = adata_hvg.var['highly_variable'].sum()
print(f"  Selected {int(hvg_count)} highly variable genes.")

# Copy the HVG results onto the full object. Genes absent from adata_hvg
# (removed mito/ribo genes and non-selected genes) get False / NaN.
print("Mapping HVG selection results back to the full dataset...")
adata.uns['hvg'] = adata_hvg.uns['hvg'].copy()

hvg_names = adata_hvg.var.index[adata_hvg.var['highly_variable'].to_numpy()]
adata.var['highly_variable'] = adata.var.index.isin(list(hvg_names))

for hvg_col in ('highly_variable_rank', 'highly_variable_nbatches'):
    gene_to_value = dict(zip(adata_hvg.var.index, adata_hvg.var[hvg_col]))
    adata.var[hvg_col] = adata.var.index.map(gene_to_value)

In [None]:
# Rebuild adata.X as log-normalized expression, starting again from the raw
# counts layer so this cell can be re-run without compounding transformations.
print("Resetting adata.X using raw counts...")
adata.X = adata.layers['raw'].copy()

# scale each cell to a total of 1e4 counts
print("Normalizing total counts to a target sum of 1e4...")
normalize_kwargs = dict(target_sum=1e4, exclude_highly_expressed=False, inplace=True)
sc.pp.normalize_total(adata, **normalize_kwargs)

# log(1 + x) transform of the normalized values
print("Applying log1p transformation...")
sc.pp.log1p(adata)

In [None]:
# Train an scVI model on the HVG-filtered subset (raw counts, 'sample' as the
# batch covariate) and store its latent representation on the full object.

# Seed scvi-tools before the model is created so that weight initialization and
# training are reproducible across runs.
scvi.settings.seed = SEED

print("setting up scVI on the HVG-filtered data...")
scvi.model.SCVI.setup_anndata(adata_hvg, layer="raw", batch_key="sample")

print("initializing and training the scVI model...")
model = scvi.model.SCVI(adata_hvg, n_layers=2, n_latent=30, gene_likelihood="nb")
print("  starting model training with early stopping...")
model.train(early_stopping=True)
print("  scVI model training complete.")

# extract the latent representation and store it in adata.obsm.
SCVI_LATENT_KEY = "X_scVI"
print("Extracting latent representation from the trained model...")
# The latent matrix is computed for adata_hvg's cells; make sure they line up
# row-for-row with adata before attaching it there.
if not adata_hvg.obs_names.equals(adata.obs_names):
    raise ValueError("adata_hvg and adata contain different cells or a different cell order.")
adata.obsm[SCVI_LATENT_KEY] = model.get_latent_representation()
print(f"  latent representation stored in adata.obsm['{SCVI_LATENT_KEY}'].")

In [None]:
# Build a neighbor graph on the scVI latent space, then run Leiden clustering
# on that graph once per resolution. Each clustering is stored in its own
# adata.obs column named 'leiden_scVI_<resolution>'.
n_neighbors = 100
resolutions = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.5, 2.0, 2.5, 3.0]
SCVI_NEIGHBORS_KEY = f"neighbors_scVI_{n_neighbors}"

print(f"Preparing to compute neighbors using key: {SCVI_NEIGHBORS_KEY}")
print("*****************************")
print(f"Computing neighbors with n_neighbors = {n_neighbors} (key: {SCVI_NEIGHBORS_KEY})...")
sc.pp.neighbors(
    adata,
    n_neighbors=n_neighbors,
    use_rep=SCVI_LATENT_KEY,
    key_added=SCVI_NEIGHBORS_KEY,
)
print("  Neighbor graph computed.")

for res in resolutions:
    LEIDEN_SCVI_KEY = f"leiden_scVI_{res}"
    print(f"Starting Leiden clustering at resolution {res} (neighbors key: {SCVI_NEIGHBORS_KEY})...")

    # time each clustering run individually
    start_time = time.time()
    sc.tl.leiden(
        adata,
        resolution=res,
        random_state=SEED,
        flavor="igraph",
        neighbors_key=SCVI_NEIGHBORS_KEY,
        key_added=LEIDEN_SCVI_KEY,
    )
    elapsed = time.time() - start_time

    print(f"  Leiden clustering '{LEIDEN_SCVI_KEY}' completed in {elapsed:.4f} seconds.")

In [None]:
# Compute a UMAP embedding from the scVI neighbor graph and keep it under a
# scVI-specific obsm key instead of the generic 'X_umap' slot.
SCVI_UMAP_KEY = 'X_umap_scVI'

print("Computing UMAP using the neighbor graph from scVI...")
sc.tl.umap(adata, neighbors_key=SCVI_NEIGHBORS_KEY)

# move the freshly computed coordinates from 'X_umap' to the scVI-specific key
umap_coords = adata.obsm['X_umap'].copy()
del adata.obsm['X_umap']
adata.obsm[SCVI_UMAP_KEY] = umap_coords
print(f"  UMAP stored under key '{SCVI_UMAP_KEY}'.\n")

In [None]:
# Plot the scVI UMAP colored by the Leiden clustering at resolution 0.1.
# The basis uses SCVI_UMAP_KEY (defined where the UMAP is stored) so the plot
# cannot drift out of sync with the obsm key.
sc.pl.embedding(adata, basis=SCVI_UMAP_KEY, color='leiden_scVI_0.1')

In [None]:
# Save the annotated object (HVG flags, scVI latent space, clusterings, UMAP) to disk.
OUTPUT_H5AD = '/n/eddy_lab/Lab/mckinley/data/2025_04_15_AmhrGsD_Final_scVI.h5ad'
adata.write_h5ad(OUTPUT_H5AD)