# Figure 2

In [None]:
#Import relevant packages
import numpy as np
import pandas as pd
from matplotlib import rcParams
import os
import scanpy as sc

import matplotlib as mpl
import matplotlib.pyplot as plt

#For nice color schemes
import cmocean

#For barplots
import seaborn as sns

from scipy.stats import median_abs_deviation

import anndata as ad

#Import scVI
import scvi
from scvi.model.utils import mde

# Only let scvi-tools log records at level 40 or above through.
scvi.settings.verbosity = 40

# Notebook-wide matplotlib font size.
plt.rcParams['font.size'] = 20

In [None]:
#new OE data
adata = sc.read_h5ad('/hpc/group/goldsteinlab/vmd13/Python/251022_AD_22_samples_+qc_scVI_3.5_biomarker.h5ad')

In [None]:
adata.obs["source"].value_counts()

In [None]:
#CSF data
adata2 = sc.read_h5ad("/hpc/group/goldsteinlab/vmd13/Python/250503_AD_17_samples_plus_18_CSF_+qc_scVI_1.1.h5ad")

In [None]:
adata2.obs["source"].value_counts()

In [None]:
# Take the OE cells from the first object and the CSF cells from the second.
adata_OE    = adata[adata.obs["source"] == "OE"].copy()
adata2_CSF  = adata2[adata2.obs["source"] == "CSF"].copy()

# Re-write the label as a plain string so both parts carry the same dtype.
adata_OE.obs["source"]   = "OE"
adata2_CSF.obs["source"] = "CSF"

# Gene names must be unique before concatenating. A pandas Index has no
# make_unique() method, so use the AnnData helper, which renames duplicates
# in place.
if not adata_OE.var_names.is_unique:
    adata_OE.var_names_make_unique()
if not adata2_CSF.var_names.is_unique:
    adata2_CSF.var_names_make_unique()

# Prefix barcodes with their source so cell names cannot collide.
adata_OE.obs_names   = "OE_"  + adata_OE.obs_names.astype(str)
adata2_CSF.obs_names = "CSF_" + adata2_CSF.obs_names.astype(str)

adata_final = sc.concat(
    [adata_OE, adata2_CSF],
    join="outer",      # union of genes
    merge="first",     # for annotations not on the concat axis (e.g. .var), keep the first value seen
    label=None, keys=None,
    index_unique=None
)

print(adata_final)
print(adata_final.obs["source"].value_counts())

In [None]:
# From here on `adata` names the merged OE + CSF object. It is the same object
# as `adata_final`, not a copy; the OE-only object loaded earlier is no longer
# reachable under this name.
adata = adata_final

In [None]:
# Sanity check: cells per tissue source after the merge.
adata.obs["source"].value_counts()

In [None]:
#QC UMAPs
# Colour the existing embedding by per-cell QC metrics, two panels per row.
qc_metrics = ["n_genes_by_counts", "total_counts", "pct_counts_mt", "log1p_total_counts"]
sc.pl.umap(adata, color=qc_metrics, ncols=2, s=3, cmap="cubehelix_r")

In [None]:
# assess batch effects
# One 8x8 figure per annotation; cluster labels are drawn on the embedding,
# every other annotation gets a legend in the right margin.
for obs_key, legend_pos in [
    ("leiden_scVI_1.1_res3.0", "on data"),
    ("orig_patients", "right margin"),
    ("Alz_status", "right margin"),
    ("source", "right margin"),
]:
    fig, ax = plt.subplots(figsize=(8, 8))
    sc.pl.umap(adata, color=obs_key, legend_loc=legend_pos, ax=ax, s=4, frameon=False, save=False)
 

In [None]:
immune = adata

In [None]:
#Prep data for HVG
#logp the data for HVG
immune.obs["log1p_total_counts"] = np.log1p(immune.obs["total_counts"])

#Create layers
immune.layers["counts"] = immune.X.copy()
immune.layers["norm"] = immune.X.copy()
sc.pp.normalize_total(immune, target_sum=1e4, layer="norm")


In [None]:
sc.pp.filter_genes(immune, min_cells=1)

In [None]:
df_poisson = scvi.data.poisson_gene_selection(
    immune,
    n_top_genes=5000,
    batch_key="orig_patients",
    inplace=False,
)

In [None]:
immune.varm['df_poisson_T']= df_poisson

In [None]:
is_hvg = df_poisson.highly_variable

In [None]:
adata_query1 = immune[:, is_hvg].copy()

In [None]:
scvi.model.SCVI.setup_anndata(
    adata_query1,
    layer="counts",
    batch_key='orig_patients',
    categorical_covariate_keys=["source"],
    continuous_covariate_keys=["pct_counts_mt"]
)

In [None]:
model = scvi.model.SCVI(adata_query1, gene_likelihood="nb")

In [None]:
train_kwargs = dict(
    early_stopping=True,
    early_stopping_patience=50,
    enable_model_summary=True,
    enable_progress_bar=True,
    enable_checkpointing=True,
    max_epochs=500
)

In [None]:
model.train(**train_kwargs)

In [None]:
# Per-cell latent coordinates from the model trained on `adata_query1`.
latent = model.get_latent_representation()

In [None]:
# Store on the full-gene object; `adata_query1` was sliced by genes only, so
# the cells (rows) line up with `immune`.
immune.obsm["scVI_T"] = latent

In [None]:
# Neighbour graph on the scVI latent space, then a UMAP on that graph.
# Note: this overwrites any existing X_umap on `immune` (and thus `adata`).
sc.pp.neighbors(immune, use_rep="scVI_T")
sc.tl.umap(immune, min_dist=0.5)

In [None]:
# neighbors were already computed using scVI
# High resolution (3) deliberately over-clusters so unwanted populations can
# be removed cluster by cluster further down.
sc.tl.leiden(immune, key_added="leiden_scVI_T", resolution=3)

In [None]:
# assess batch effects
# One 8x8 figure per annotation. The last panel (source) uses smaller,
# semi-transparent points so overlapping tissues stay visible.
for obs_key, panel_kwargs in [
    ("leiden_scVI_T", dict(legend_loc="on data", s=40, save=False)),
    ("orig_patients", dict(legend_loc="right margin", s=40, save=False)),
    ("Alz_status", dict(legend_loc="right margin", s=40, save=False)),
    ("source", dict(legend_loc="right margin", s=20, alpha=0.5)),
]:
    fig, ax = plt.subplots(figsize=(8, 8))
    sc.pl.umap(immune, color=obs_key, ax=ax, frameon=False, **panel_kwargs)


In [None]:
# generate featureplots to assess expression of known marker genes
genes = ['leiden_scVI_T', 'PTPRC','CD3D']

sc.pl.umap(
    immune,
    color=genes,
    use_raw=False,
    legend_loc= "on data",
    color_map="cmo.matter",
    ncols=3,
    frameon=False,
    vmax="p99.5",
    layer="norm",
    save=False
)

In [None]:
#QC UMAPs
sc.pl.umap(
    immune,
    color=["n_genes_by_counts", "total_counts", "pct_counts_mt", "log1p_total_counts", "source"],
    cmap="cubehelix_r",
    s=3,
    ncols=2,
)

In [None]:
# Leiden clusters (pass 1, resolution 3) removed before the T-cell pass.
# NOTE(review): these IDs are only meaningful for the clustering run that
# produced them; re-check them if the model or clustering is re-run.
clusters_to_drop = (
    "40 21 47 51 46 37 1 19 61 54 43 11 "
    "45 60 48 56 0 16 14 50 53 42 52 44 "
    "59 10 3 38 58 39"
).split()

is_dropped = immune.obs['leiden_scVI_T'].isin(clusters_to_drop)
adata_T = immune[~is_dropped].copy()

In [None]:
# Same marker panel on the retained cells, now including CD3G.
genes = ['leiden_scVI_T', 'PTPRC', 'CD3G', 'CD3D']

sc.pl.umap(
    adata_T,
    color=genes,
    layer="norm",
    use_raw=False,
    vmax="p99.5",
    color_map="cmo.matter",
    legend_loc="on data",
    frameon=False,
    ncols=3,
    save=False,
)

In [None]:
#Prep data for HVG
#logp the data for HVG
adata_T.obs["log1p_total_counts"] = np.log1p(adata_T.obs["total_counts"])

#Create layers
adata_T.layers["counts"] = adata_T.X.copy()
adata_T.layers["norm"] = adata_T.X.copy()
sc.pp.normalize_total(adata_T, target_sum=1e4, layer="norm")


In [None]:
sc.pp.filter_genes(adata_T, min_cells=1)

In [None]:
df_poisson = scvi.data.poisson_gene_selection(
    adata_T,
    n_top_genes=10000,
    batch_key="orig_patients",
    inplace=False,
)

In [None]:
adata_T.varm['df_poisson_T']= df_poisson

In [None]:
is_hvg = df_poisson.highly_variable

In [None]:
adata_query2 = adata_T[:, is_hvg].copy()

In [None]:
scvi.model.SCVI.setup_anndata(
    adata_query2,
    layer="counts",
    batch_key='orig_patients',
    #categorical_covariate_keys=["source"], <- commented out intentionally
    continuous_covariate_keys=["pct_counts_mt"]
)

In [None]:
model = scvi.model.SCVI(adata_query2, gene_likelihood="nb")

In [None]:
train_kwargs = dict(
    early_stopping=True,
    early_stopping_patience=50,
    enable_model_summary=True,
    enable_progress_bar=True,
    enable_checkpointing=True,
    max_epochs=500
)

In [None]:
model.train(**train_kwargs)

In [None]:
# Per-cell latent coordinates from the model trained on `adata_query2`.
latent = model.get_latent_representation()

In [None]:
# Store on the full-gene subset; `adata_query2` was sliced by genes only, so
# the cells (rows) line up with `adata_T`.
adata_T.obsm["scVI_T2"] = latent

In [None]:
# Neighbour graph on the new latent space, then a fresh UMAP on that graph.
sc.pp.neighbors(adata_T, use_rep="scVI_T2")
sc.tl.umap(adata_T, min_dist=0.5)

In [None]:
# neighbors were already computed using scVI
# Over-cluster again (resolution 3) so residual contaminants form their own clusters.
sc.tl.leiden(adata_T, key_added="leiden_scVI_T2", resolution=3)

In [None]:
# assess batch effects
# One 8x8 figure per annotation. The last panel (source) uses smaller,
# semi-transparent points so overlapping tissues stay visible.
for obs_key, panel_kwargs in [
    ("leiden_scVI_T2", dict(legend_loc="on data", s=40, save=False)),
    ("orig_patients", dict(legend_loc="right margin", s=40, save=False)),
    ("Alz_status", dict(legend_loc="right margin", s=40, save=False)),
    ("source", dict(legend_loc="right margin", s=20, alpha=0.5)),
]:
    fig, ax = plt.subplots(figsize=(8, 8))
    sc.pl.umap(adata_T, color=obs_key, ax=ax, frameon=False, **panel_kwargs)


In [None]:
genes = ['leiden_scVI_T2', 'PTPRC', 'CD3G', 'CD3D', 'CD8A', 'CD4']

sc.pl.umap(
    adata_T,
    color=genes,
    use_raw=False,
    legend_loc= "on data",
    color_map="cmo.matter",
    ncols=3,
    frameon=False,
    vmax="p99.5",
    layer="norm",
    save=False
)

In [None]:
#QC UMAPs
sc.pl.umap(
    adata_T,
    color=["n_genes_by_counts", "total_counts", "pct_counts_mt", "log1p_total_counts", "source", "Alz_status"],
    cmap="cubehelix_r",
    s=3,
    ncols=2,
)

In [None]:
# Remove one more pass-2 cluster. `adata` is rebound here: from this point it
# is an independent copy of the filtered T-cell subset, no longer the merged
# OE + CSF object.
# NOTE(review): the cluster ID is specific to the clustering run above.
clusters_to_drop = ['67']
adata = adata_T[~adata_T.obs['leiden_scVI_T2'].isin(clusters_to_drop)].copy()


In [None]:

# Visual check of the remaining pass-2 clusters.
fig, ax = plt.subplots(figsize=(8, 8))
sc.pl.umap(adata, color="leiden_scVI_T2", legend_loc="on data", ax=ax, s=40, frameon=False, save=False)

In [None]:
#Prep for HVG and scvi
# Rebuild the QC column and both layers on the final cell set.

# log1p of the per-cell library size
adata.obs["log1p_total_counts"] = np.log1p(adata.obs["total_counts"])

# "counts" is an untouched copy of X.
adata.layers["counts"] = adata.X.copy()

# "norm" holds relative counts: each cell rescaled to a total of 1e4.
adata.layers['norm'] = adata.X.copy()
sc.pp.normalize_total(adata, layer="norm", target_sum=1e4)

In [None]:
#HVG via Scanpy
# Flags the top 12000 genes in adata.var["highly_variable"] without subsetting
# the object (subset=False). The flag is only used further down for a
# cross-tabulation against the Poisson selection.
sc.pp.highly_variable_genes(
    adata,
    n_top_genes=12000,
    subset=False,
    layer="counts",     # seurat_v3 is computed on this layer rather than on X
    flavor="seurat_v3",
)

In [None]:
# Per-gene mean of X and fraction of cells with zero expression.
# np.asarray(...).ravel() gives a 1-D vector whether X is a sparse matrix
# (column reductions return a 1 x n_genes matrix), a sparse array or a dense
# ndarray (both return 1-D). Indexing the result with [0] would collapse the
# 1-D cases to a single scalar broadcast to every gene.
adata.var['mean_'] = np.asarray(adata.X.mean(axis=0)).ravel()
adata.var['frac_zero'] = 1 - np.asarray((adata.X > 0).sum(axis=0)).ravel() / adata.shape[0]

In [None]:
# Zero fraction against mean expression, one point per gene, log-scaled x axis.
fig, ax = plt.subplots(figsize=(9, 6))
ax.scatter(adata.var["mean_"], adata.var["frac_zero"], s=1)
ax.set_xscale("log")
ax.set_xscale("log")

In [None]:
#Calculate Poisson gene selection
# Top 12000 genes by Poisson zero-enrichment on the final cell set.
# NOTE(review): no batch_key is passed here, unlike the two earlier passes -
# confirm that is intended.
df_poisson = scvi.data.poisson_gene_selection(
    adata, n_top_genes=12000, inplace=False
)

# Only the last expression of a cell is rendered automatically, so these two
# diagnostics are printed explicitly instead of being computed and discarded.
print(df_poisson[df_poisson.highly_variable].sort_values('prob_zero_enrichment_rank'))

# Agreement between the Poisson selection (rows) and the seurat_v3 flag (columns).
print(pd.crosstab(df_poisson.highly_variable, adata.var.highly_variable))

is_hvg = df_poisson.highly_variable

# Keep the full selection table alongside the genes.
adata.varm['df_poisson']= df_poisson

# Independent copy restricted to the selected genes, used to train scVI.
adata_query = adata[:, is_hvg].copy()
print(adata_query)

In [None]:
#Set up scvi model

#Can insert batch_key here if desired
scvi.model.SCVI.setup_anndata(
    adata_query,
    layer="counts",
    batch_key="orig_patients",
    #categorical_covariate_keys=['source'], #<<this is left out intentionally here
    continuous_covariate_keys=["pct_counts_mt"],
)

model = scvi.model.SCVI(adata_query, gene_likelihood="nb")

model.view_anndata_setup()

In [None]:
#Train and run scvi

#Training parameters
train_kwargs = dict(
    early_stopping=True,
    early_stopping_patience=20,
    enable_model_summary=True,
    enable_progress_bar=True,
    enable_checkpointing=True,
    max_epochs=500
)

#Train and run model
#Be sure GPU is enabled to run this
model.train(**train_kwargs)

In [None]:
#Plot model results
# Training and validation ELBO per epoch on shared axes.
# The first training entry is skipped ([1:]) - presumably because the initial
# value dwarfs the rest of the curve; TODO confirm.
train_elbo = model.history['elbo_train'][1:]
test_elbo = model.history['elbo_validation']

ax = train_elbo.plot()
test_elbo.plot(ax = ax)

In [None]:
# Extract the latent representation from the trained model (no fitting
# happens here) and store it on the full-gene object; `adata_query` was sliced
# by genes only, so the cells line up with `adata`.
latent = model.get_latent_representation()
adata.obsm["X_scVI_3.1"] = latent

# Calculate neighbors using scVI latent representation
sc.pp.neighbors(adata, use_rep="X_scVI_3.1")
sc.tl.umap(adata, min_dist=0.5)

# Run Leiden clustering at multiple resolutions
# Keys are built from the float, e.g. "leiden_scVI_3.1_res4.0".
resolutions = [4.0, 5.0]
for res in resolutions:
    sc.tl.leiden(adata, key_added=f"leiden_scVI_3.1_res{res}", resolution=res)


## feature plots and analysis

In [None]:
# generate featureplots to assess expression of known marker genes
# Final clusters (resolution 4.0) next to immune, T-cell and CD8/CD4 markers.
genes = ['leiden_scVI_3.1_res4.0', 'PTPRC', 'CD3G', 'CD3D', 'CD8A', 'CD4']

sc.pl.umap(
    adata,
    color=genes,
    layer="norm",
    use_raw=False,
    vmax="p99.5",
    color_map="cmo.matter",
    legend_loc="on data",
    frameon=False,
    ncols=3,
    save=False,
)

In [None]:
# Fix the display order of disease stage. Any value not listed in `categories`
# becomes NaN - NOTE(review): confirm the three labels cover every cell.
adata.obs["Alz_status"] = pd.Categorical(
    adata.obs["Alz_status"],
    categories=["Control", "Pre-Clinical", "Clinical"],
    ordered=True,
)
# Discard any stored patient palette; give the three stages fixed colours
# (one per category, in category order).
adata.uns.pop("orig_patients_colors", None)
adata.uns["Alz_status_colors"] = ["#8eb1dd", "#b49fcc", "#dcacd0"]
# NOTE(review): set_figure_params also applies scanpy's matplotlib defaults,
# which may override the font size set at the top of the notebook - confirm.
sc.settings.set_figure_params(dpi_save=300,            # 300 dpi PNG output
                              format="png")

# Directory that scanpy writes saved figures to.
sc.settings.figdir = "/work/vmd13/AD_figs"

sc.pl.umap(
    adata,
    color=["PTPRC", "CD3D", "CD8A", "CD4", "orig_patients", "Alz_status"],
    legend_loc="none",
    color_map="cmo.matter",
    ncols=3,
    frameon=False,
    vmax="p98",
    layer="norm",
    save="umap_marker_genes",   # scanpy appends this string to its default "umap" stem, so the file is likely umapumap_marker_genes.png (300 dpi) - NOTE(review): confirm the intended name
    show=False,
)


In [None]:
# number of cells from each tissue source (this counts `source`, not Alzheimer status)
adata.obs["source"].value_counts()



In [None]:
# number of distinct patients in each Alzheimer-status group
adata.obs.groupby("Alz_status")["orig_patients"].nunique()


In [None]:
#QC UMAPs
# QC metrics plus the three grouping variables on the final embedding.
sc.pl.umap(
    adata,
    color=["n_genes_by_counts", "total_counts", "pct_counts_mt", "log1p_total_counts", "source","orig_patients", "Alz_status"],
    cmap="cubehelix_r",
    s=3,
    ncols=2,
)

In [None]:
import pandas as pd
import numpy as np
import scanpy as sc
import os
import matplotlib.pyplot as plt
import sys
import seaborn as sns
from starcat import starCAT
import starcat

In [None]:
# starCAT annotator using the 'TCAT.V1' reference; reference files are cached
# under ./cache relative to the working directory.
tcat = starCAT(reference='TCAT.V1', cachedir='./cache')

In [None]:
# Returns two per-cell tables that the next cell merges into adata.obs by index.
# NOTE(review): presumably expects raw counts in adata.X - confirm.
usage, scores = tcat.fit_transform(adata)

In [None]:
# Merge usages and scores with cell metadata
# Re-running this cell must not stack suffixed duplicates (e.g. ASA_x / ASA_y)
# onto adata.obs, so columns left by a previous merge are removed first. This
# also replaces any pre-existing obs column that shares a name with a starCAT
# output column.
starcat_cols = usage.columns.union(scores.columns)
obs_base = adata.obs.drop(columns=adata.obs.columns.intersection(starcat_cols))
obs_base = pd.merge(left=obs_base, right=usage, how='left', left_index=True, right_index=True)

# The *_binary score columns are stored as strings so they are treated as
# categories rather than numbers when plotted.
binary_cols = scores.columns[scores.columns.str.contains('_binary')]
scores[binary_cols] = scores[binary_cols].astype('str')
adata.obs = pd.merge(left=obs_base, right=scores, how='left', left_index=True, right_index=True)

In [None]:
# Figure output directory; created up front if missing.
sc.settings.figdir = "/work/vmd13/AD_figs"
os.makedirs(sc.settings.figdir, exist_ok=True)
# UMAP coloured by the per-cell starCAT label. scanpy appends the `save`
# string to its default "umap" file stem.
sc.pl.umap(
    adata,
    color="Multinomial_Label",
    palette="tab10",
    size=10, alpha=0.3, edgecolor=None,
    frameon=False, legend_loc="right margin",
    save="250702_OE_CSF_T_cell_clusters.svg"   
)

# Same embedding coloured by tissue source.
sc.pl.umap(
    adata,
    color="source",
    palette="Paired",
    size=6, alpha=0.5, edgecolor=None,
    frameon=False, legend_loc="right margin",
    save="250702_OE_CSF_source.svg"            
)

In [None]:
# Stacked bar chart: for every (source, disease stage) group, the mean over
# patients of each patient's label fractions.

# Fix category order so bars appear as CSF then OE, and Control -> Clinical.
adata.obs['source']     = pd.Categorical(adata.obs['source'], ['CSF', 'OE'], ordered=True)
adata.obs['Alz_status'] = pd.Categorical(
    adata.obs['Alz_status'],
    ['Control', 'Pre-Clinical', 'Clinical'],
    ordered=True
)

# Cell counts per patient x source x stage x label.
# NOTE(review): with categorical keys and pandas' default `observed` setting
# this can also emit combinations that never occur (n == 0); they appear to be
# removed again further down - confirm on the pandas version in use.
counts = (
    adata.obs
        .groupby(['orig_patients', 'source', 'Alz_status', 'Multinomial_Label'])
        .size()
        .rename('n')
        .reset_index()
)
# Within each patient x source x stage, convert counts to fractions.
# A group whose total is 0 yields NaN (0 / 0).
counts['fraction'] = (
    counts.groupby(['orig_patients', 'source', 'Alz_status'])['n']
          .transform(lambda x: x / x.sum())
)

# One row per patient (labels as columns, missing labels filled with 0), then
# averaged across patients within each source x stage; groups whose row sums
# are not positive are dropped.
plot_df = (
    counts
        .pivot_table(index=['source', 'Alz_status', 'orig_patients'],
                     columns='Multinomial_Label',
                     values='fraction',
                     fill_value=0)
        .groupby(level=['source', 'Alz_status'])
        .mean()
        .loc[lambda df: df.sum(axis=1) > 0]
)

fig, ax = plt.subplots(figsize=(7, 5))

# width=1.0 makes adjacent bars touch; thin white edges separate the segments.
plot_df.plot(
    kind='bar',
    stacked=True,
    ax=ax,
    width=1.0,          
    cmap='tab20',
    edgecolor='white',  
    linewidth=0.3
)

# White vertical separators between neighbouring bars.
for x in range(len(plot_df.index) - 1):
    ax.axvline(x + 0.5, color='white', linewidth=0.6)

ax.set_ylabel('Mean fraction per patient')
ax.set_ylim(0, 1)
ax.set_title('T-cell program composition by source and disease stage')
ax.grid(False)
ax.margins(x=0)
ax.legend(bbox_to_anchor=(1, 1))
plt.tight_layout()

fig.savefig("/work/vmd13/AD_figs/tcell_composition.svg", format="svg", bbox_inches="tight")


In [None]:
import hashlib

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


# A cell counts as ASA-positive when adata.obs["ASA"] is strictly above this value.
ASA_CUTOFF = 0.10
# Base seed; mixed with string tags to derive the RNGs for replicate sampling and jitter.
random_seed = 729242177
OUT_SVG = "/work/vmd13/CD8_ASApos.svg"

# Multinomial_Label values analysed individually, plus a pooled "CD8 memory"
# panel that is the union of the three.
memory_sets = ["CD8_CM", "CD8_EM", "CD8_TEMRA"]
subset_order = memory_sets + ["CD8 memory"]
sources = ["CSF", "OE"]
alz_full = ["Control", "Pre-Clinical", "Clinical"]
# Disease stages drawn per source; the CSF panels omit Pre-Clinical.
order_oe = ["Control", "Pre-Clinical", "Clinical"]
order_csf = ["Control", "Clinical"]

# Each pseudo-replicate holds N_PER_REP cells; at most MAX_REPS are drawn per
# patient x source x stage x subset.
N_PER_REP = 10
MAX_REPS = 3

# Bar colour per tissue source, taken from seaborn's "Paired" palette.
color_src = {"OE": sns.color_palette("Paired")[5], "CSF": sns.color_palette("Paired")[1]}

# Re-assert ordered categoricals so grouping and plotting follow the orders above.
adata.obs["source"] = pd.Categorical(adata.obs["source"], sources, ordered=True)
adata.obs["Alz_status"] = pd.Categorical(adata.obs["Alz_status"], alz_full, ordered=True)

def _mask_subset(name):
    """Return a boolean Series over adata.obs selecting the cells of one subset.

    "CD8 memory" selects every label in `memory_sets`; any other name selects
    cells whose Multinomial_Label equals that name.
    """
    labels = adata.obs["Multinomial_Label"]
    if name == "CD8 memory":
        return labels.isin(memory_sets)
    return labels.eq(name)

def _stable_int(*parts):
    s = "|".join(map(str, parts)).encode("utf-8")
    return int.from_bytes(hashlib.md5(s).digest()[:4], "little")

def make_reps(cutoff, seed):
    """Build fixed-size pseudo-replicates of ASA positivity.

    For every subset in `subset_order` and every observed
    (patient, source, stage) group with at least N_PER_REP cells, the cells
    are shuffled and cut into up to MAX_REPS disjoint chunks of N_PER_REP
    cells. Each chunk becomes one row.

    Parameters
    ----------
    cutoff : cells with adata.obs["ASA"] > cutoff count as positive.
    seed   : base seed; combined with the cutoff into a single RNG that is
             shared across all groups, so results depend on group order.

    Returns
    -------
    DataFrame with columns orig_patients, source, Alz_status, Subset,
    replicate, k (positives), n (= N_PER_REP) and percent. `percent` is the
    fraction k / n in [0, 1], not a value out of 100. A column-less empty
    frame is returned when no group is large enough.
    """
    rng = np.random.default_rng(_stable_int(seed, "distinct_fixed", float(cutoff)))
    asa = (adata.obs["ASA"] > cutoff)

    rows = []
    for sb in subset_order:
        df = adata.obs.loc[_mask_subset(sb), ["orig_patients", "source", "Alz_status"]].copy()
        df["ASA_pos"] = asa.loc[df.index].astype(bool)

        for (pt, src, st), grp in df.groupby(["orig_patients", "source", "Alz_status"], observed=True):
            # Work on a private copy: to_numpy() may hand back the index's own
            # buffer (or a read-only view), and shuffle() works in place.
            idx = grp.index.to_numpy(copy=True)
            if idx.size < N_PER_REP:
                continue

            rng.shuffle(idx)
            n_full = min(MAX_REPS, idx.size // N_PER_REP)

            for rep in range(1, n_full + 1):
                # Consecutive, non-overlapping slices of the shuffled cells.
                take = idx[(rep - 1) * N_PER_REP : rep * N_PER_REP]
                k = int(df.loc[take, "ASA_pos"].sum())
                rows.append(
                    dict(
                        orig_patients=pt,
                        source=src,
                        Alz_status=st,
                        Subset=sb,
                        replicate=rep,
                        k=k,
                        n=N_PER_REP,
                        percent=k / N_PER_REP,
                    )
                )

    rep_df = pd.DataFrame(rows)
    if rep_df.empty:
        return rep_df

    rep_df["Subset"] = pd.Categorical(rep_df["Subset"], subset_order, ordered=True)
    rep_df["source"] = pd.Categorical(rep_df["source"], sources, ordered=True)
    rep_df["Alz_status"] = pd.Categorical(rep_df["Alz_status"], categories=alz_full, ordered=True)
    return rep_df

def add_fallback(rep_df, cutoff):
    """Add one pooled row for every group that produced no fixed-size replicate.

    A (patient, source, stage, subset) combination that has cells but is
    missing from `rep_df` (fewer than N_PER_REP cells) gets a single row with
    replicate=1, built from all of its cells (n = cells available).

    Parameters
    ----------
    rep_df : output of make_reps(); may be empty. It is not modified.
    cutoff : cells with adata.obs["ASA"] > cutoff count as positive.

    Returns
    -------
    New DataFrame: the rows of `rep_df` followed by the fallback rows, with
    Subset / source / Alz_status as ordered categoricals.
    """
    asa = (adata.obs["ASA"] > cutoff)

    # Combinations already covered, collected once instead of re-scanning the
    # whole table for every group.
    if rep_df.empty:
        covered = set()
    else:
        covered = set(
            zip(rep_df["orig_patients"], rep_df["source"], rep_df["Alz_status"], rep_df["Subset"])
        )

    extra_rows = []
    for sb in subset_order:
        sub = adata.obs.loc[_mask_subset(sb), ["orig_patients", "source", "Alz_status"]].copy()
        sub["ASA_pos"] = asa.loc[sub.index].astype(bool).values

        for (pt, src, st), grp in sub.groupby(["orig_patients", "source", "Alz_status"], observed=True):
            if (pt, src, st, sb) in covered:
                continue

            n_avail = int(len(grp))
            if n_avail == 0:
                continue

            k = int(grp["ASA_pos"].sum())
            extra_rows.append(
                dict(orig_patients=pt, source=src, Alz_status=st, Subset=sb, replicate=1, k=k, n=n_avail, percent=k / n_avail)
            )

    # Build the result with a single concat rather than growing it row by row.
    if not extra_rows:
        base = rep_df.copy()
    elif rep_df.empty:
        base = pd.DataFrame(extra_rows)
    else:
        base = pd.concat([rep_df, pd.DataFrame(extra_rows)], ignore_index=True)

    if not base.empty:
        base["Subset"] = pd.Categorical(base["Subset"], subset_order, ordered=True)
        base["source"] = pd.Categorical(base["source"], sources, ordered=True)
        base["Alz_status"] = pd.Categorical(base["Alz_status"], alz_full, ordered=True)
    return base

def plot_bars(rep_df, headroom_frac=0.12):
    """Draw a grid of bar charts: mean ASA-positive fraction with SEM.

    Rows of the grid are sources, columns are subsets; within a panel there is
    one bar per disease stage.

    Parameters
    ----------
    rep_df        : replicate table with source, Alz_status, Subset, percent.
    headroom_frac : extra space added above each panel's current y-range, as a
                    fraction of that range.

    Returns
    -------
    The seaborn FacetGrid holding the figure.

    NOTE(review): `percent` holds fractions (k / n) while the axis label and
    title say "%" - confirm whether values should be scaled by 100.
    """
    # Mean and standard error over replicate rows; SEM is 0 for a single row.
    stats = (
        rep_df.groupby(["source", "Alz_status", "Subset"], observed=True)["percent"]
        .agg(mean="mean", sem=lambda x: x.std(ddof=1) / np.sqrt(len(x)) if len(x) > 1 else 0.0)
        .reset_index()
    )
    stats["Subset"] = pd.Categorical(stats["Subset"], subset_order, ordered=True)

    sns.set_style("whitegrid")
    g = sns.FacetGrid(stats, row="source", col="Subset", row_order=sources, col_order=subset_order, height=3.0, aspect=0.6, sharey=False)

    def draw(data, **_):
        # Called once per facet by map_dataframe; draws on the current axes.
        ax = plt.gca()
        src = data.source.iat[0]
        # CSF panels omit the Pre-Clinical stage.
        order = order_csf if src == "CSF" else order_oe

        # Arrange stages in display order and drop stages with no data.
        d = data.set_index("Alz_status").reindex(order).dropna(subset=["mean"])
        if d.empty:
            ax.set_axis_off()
            return

        x = np.arange(len(d))
        ax.bar(x, d["mean"].to_numpy(), color=color_src[src], edgecolor="k", width=0.8, zorder=1)
        ax.errorbar(x, d["mean"].to_numpy(), d["sem"].to_numpy(), fmt="none", ecolor="black", capsize=3, linewidth=1, zorder=2)

        ax.set_xticks(x)
        ax.set_xticklabels(d.index, rotation=90)
        ax.set_xlabel("")
        # Only the left-most column carries the y-axis label.
        if ax.get_subplotspec().colspan.start == 0:
            ax.set_ylabel("% ASA+ (mean ± SEM)")
        else:
            ax.set_ylabel("")
        ax.grid(False)

        # Extend the upper limit to leave room above the bars.
        y0, y1 = ax.get_ylim()
        ax.set_ylim(y0, y1 + headroom_frac * (y1 - y0))

    g.map_dataframe(draw)
    g.set_titles(row_template="{row_name}", col_template="{col_name}")
    g.fig.subplots_adjust(right=0.8, top=0.9)
    # The title reads the module-level ASA_CUTOFF, not a function argument.
    g.fig.suptitle(f"CD8 subsets — % ASA+ (ASA>{ASA_CUTOFF:.3f})", fontsize=13)
    plt.tight_layout(rect=[0, 0, 0.79, 0.95])
    return g

def overlay_dots(g, rep_df, seed, jitter=0.15, alpha=0.55):
    """Overlay one jittered dot per patient on the bar panels of `g`.

    Each patient's value is pooled over its replicate rows (sum of k divided
    by sum of n). Dots inside the panel's y-range are circles; values above or
    below the range are pinned to the nearest limit as up / down triangles.
    Axis limits are restored afterwards, so the bar scaling does not change.

    Parameters
    ----------
    g      : FacetGrid returned by plot_bars().
    rep_df : replicate table with k, n, source, Subset, Alz_status, orig_patients.
    seed   : base seed; each panel derives its own jitter RNG from it.
    jitter : half-width of the uniform horizontal jitter.
    alpha  : opacity of marker face and edge.
    """
    donor_tbl = (
        rep_df.groupby(["source", "Subset", "Alz_status", "orig_patients"], observed=True)
        .agg(k=("k", "sum"), n=("n", "sum"))
        .reset_index()
    )
    donor_tbl["value"] = donor_tbl["k"] / donor_tbl["n"]

    face_rgba = (1, 1, 1, alpha)
    edge_rgba = (0, 0, 0, alpha)

    for row_i, src in enumerate(g.row_names):
        for col_j, sb in enumerate(g.col_names):
            ax = g.axes[row_i, col_j]
            if ax is None or not ax.axison:
                continue

            # Tick labels give the x slot of every disease stage in this panel.
            tick_labels = [tick.get_text() for tick in ax.get_xticklabels()]
            if not tick_labels:
                continue
            slot = {label: position for position, label in enumerate(tick_labels)}

            in_panel = (
                (donor_tbl["source"] == src)
                & (donor_tbl["Subset"] == sb)
                & (donor_tbl["Alz_status"].isin(slot.keys()))
            )
            pts = donor_tbl[in_panel]
            if pts.empty:
                continue

            rng = np.random.default_rng(_stable_int(seed, "jitter", src, sb))
            xlim = ax.get_xlim()
            ylim = ax.get_ylim()
            ylo, yhi = ylim

            x = pts["Alz_status"].map(slot).to_numpy(dtype=float)
            x = x + rng.uniform(-jitter, jitter, size=len(x))
            y = pts["value"].to_numpy(dtype=float)

            hi = y > yhi
            lo = y < ylo
            mid = ~(hi | lo)

            # (selector, y to draw, marker, size, z-order): in range first,
            # then the clipped-high and clipped-low markers.
            marker_layers = (
                (mid, y[mid], "o", 24, 10),
                (hi, np.full(hi.sum(), yhi), "^", 30, 11),
                (lo, np.full(lo.sum(), ylo), "v", 30, 11),
            )
            for selector, y_drawn, marker, size, z in marker_layers:
                if not selector.any():
                    continue
                ax.scatter(
                    x[selector],
                    y_drawn,
                    s=size,
                    marker=marker,
                    facecolors=face_rgba,
                    edgecolors=edge_rgba,
                    linewidths=0.8,
                    zorder=z,
                    clip_on=False,
                )

            # Scatter may autoscale; put the original limits back.
            ax.set_xlim(xlim)
            ax.set_ylim(ylim)

# Build the replicate table (fixed-size replicates plus fallback rows), draw
# the bars, overlay the per-patient dots and write the figure to disk.
seed = int(random_seed)
rep_df = add_fallback(make_reps(ASA_CUTOFF, seed), ASA_CUTOFF)

g = plot_bars(rep_df, headroom_frac=0.12)
overlay_dots(g, rep_df, seed=seed, jitter=0.15, alpha=0.55)

g.fig.savefig(OUT_SVG, format="svg", bbox_inches="tight")
print(f"SVG saved to: {OUT_SVG}")

plt.show()
plt.close(g.fig)

## stats

In [None]:
import hashlib
import warnings

import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.tools.sm_exceptions import SpecificationWarning

# Silence statsmodels' SpecificationWarning for this cell onwards.
warnings.filterwarnings("ignore", category=SpecificationWarning)

# A cell counts as ASA-positive when adata.obs["ASA"] is strictly above this value.
ASA_CUTOFF = 0.10
# Threshold applied to the (optionally adjusted) p-value.
SIG_LEVEL = 0.05
# When True, p-values are Bonferroni-adjusted by the number of contrasts
# tested within the same subset x source panel.
USE_WITHIN_PANEL_BONF = True
# Same seed value as the plotting cell.
random_seed = 729242177

# These settings repeat the plotting cell's values; keep the two in sync.
memory_sets = ["CD8_CM", "CD8_EM", "CD8_TEMRA"]
subset_order = memory_sets + ["CD8 memory"]
sources = ["CSF", "OE"]
alz_full = ["Control", "Pre-Clinical", "Clinical"]

# Cells per pseudo-replicate and maximum replicates per group.
N_PER_REP = 10
MAX_REPS = 3

# Decimal places used when printing p-values and estimates.
PVAL_DIGITS = 10
EST_DIGITS = 4

# Re-assert ordered categoricals so grouping follows the orders above.
adata.obs["source"] = pd.Categorical(adata.obs["source"], sources, ordered=True)
adata.obs["Alz_status"] = pd.Categorical(adata.obs["Alz_status"], alz_full, ordered=True)


def mask_for_subset(sb: str) -> pd.Series:
    """Return a boolean Series over adata.obs selecting the cells of subset `sb`.

    "CD8 memory" selects every label in `memory_sets`; any other name selects
    cells whose Multinomial_Label equals that name.
    """
    labels = adata.obs["Multinomial_Label"]
    if sb == "CD8 memory":
        return labels.isin(memory_sets)
    return labels.eq(sb)


def stable_seed(*parts) -> int:
    """Derive a reproducible 32-bit integer seed from arbitrary values.

    The values are stringified, joined with "|", hashed with MD5, and the first
    four digest bytes are read as a little-endian unsigned integer.
    """
    joined = "|".join(str(part) for part in parts)
    digest = hashlib.md5(joined.encode("utf-8")).digest()
    return int.from_bytes(digest[:4], byteorder="little")


def make_reps_distinct_fixed(cutoff: float, seed: int) -> pd.DataFrame:
    """Build fixed-size pseudo-replicates of ASA positivity for the statistics.

    For every subset in `subset_order` and every observed
    (patient, source, stage) group with at least N_PER_REP cells, the cells
    are shuffled and cut into up to MAX_REPS disjoint chunks of N_PER_REP
    cells. Each chunk becomes one row. Smaller groups contribute nothing.

    Parameters
    ----------
    cutoff : cells with adata.obs["ASA"] > cutoff count as positive.
    seed   : base seed; combined with the cutoff into a single RNG that is
             shared across all groups, so results depend on group order.

    Returns
    -------
    DataFrame with columns orig_patients, source, Alz_status, Subset,
    replicate, k (positives), n (= N_PER_REP) and percent. `percent` is the
    fraction k / n in [0, 1]. A column-less empty frame is returned when no
    group is large enough.
    """
    rows = []
    rng = np.random.default_rng(stable_seed(seed, "distinct_fixed", float(cutoff)))
    ASA_pos = adata.obs["ASA"] > cutoff

    for sb in subset_order:
        df = adata.obs.loc[mask_for_subset(sb), ["orig_patients", "source", "Alz_status"]].copy()
        df["ASA_pos"] = ASA_pos.loc[df.index].astype(bool)

        for (pt, src, st), grp in df.groupby(["orig_patients", "source", "Alz_status"], observed=True):
            # Work on a private copy: to_numpy() may hand back the index's own
            # buffer (or a read-only view), and shuffle() works in place.
            idx = grp.index.to_numpy(copy=True)
            if len(idx) < N_PER_REP:
                continue

            rng.shuffle(idx)
            n_full = min(MAX_REPS, len(idx) // N_PER_REP)

            for rep in range(1, n_full + 1):
                # Consecutive, non-overlapping slices of the shuffled cells.
                take = idx[(rep - 1) * N_PER_REP : rep * N_PER_REP]
                k = int(df.loc[take, "ASA_pos"].sum())
                rows.append(
                    dict(
                        orig_patients=pt,
                        source=src,
                        Alz_status=st,
                        Subset=sb,
                        replicate=rep,
                        k=k,
                        n=N_PER_REP,
                        percent=k / N_PER_REP,
                    )
                )

    rep_df = pd.DataFrame(rows)
    if rep_df.empty:
        return rep_df

    rep_df["Subset"] = pd.Categorical(rep_df["Subset"], subset_order, ordered=True)
    rep_df["source"] = pd.Categorical(rep_df["source"], sources, ordered=True)
    rep_df["Alz_status"] = pd.Categorical(rep_df["Alz_status"], categories=alz_full, ordered=True)
    return rep_df


def glm_binom_cluster_panel(d: pd.DataFrame):
    """Fit a binomial GLM of ASA-positive fraction on disease stage for one panel.

    The response is the `percent` column, weighted by `n` through
    freq_weights, with "Control" as the reference stage. Standard errors are
    cluster-robust with one cluster per patient (`orig_patients`).

    Returns the fitted statsmodels results object.
    """
    formula = 'percent ~ C(Alz_status, Treatment(reference="Control"))'
    model = smf.glm(
        formula,
        data=d,
        family=sm.families.Binomial(),
        freq_weights=d["n"],
    )
    cluster_opts = {"groups": d["orig_patients"]}
    return model.fit(cov_type="cluster", cov_kwds=cluster_opts)


def coef_name(level: str) -> str:
    """Return the GLM parameter name for `level` contrasted against "Control"."""
    term = 'C(Alz_status, Treatment(reference="Control"))'
    return "{}[T.{}]".format(term, level)


def donor_corrected_contrasts_binom(rep_df_in: pd.DataFrame) -> pd.DataFrame:
    """Test the planned stage contrasts in every subset x source panel.

    One patient-clustered binomial GLM is fitted per panel. Planned contrasts
    are Control vs Clinical for CSF, and Control vs Pre-Clinical plus Control
    vs Clinical for OE. With USE_WITHIN_PANEL_BONF, p-values are multiplied by
    the number of contrasts available in the panel (capped at 1).

    Parameters
    ----------
    rep_df_in : replicate table with Subset, source, Alz_status (categorical),
                orig_patients, percent and n. May be empty.

    Returns
    -------
    DataFrame with one row per tested contrast: Subset, Source, Contrast,
    LogOddsDiff (GLM coefficient of the second level relative to Control),
    SE, z, p_raw, p_adj_panel, sig, n_donors, n_reps. Empty when nothing
    could be tested.
    """
    planned = {"CSF": [("Control", "Clinical")], "OE": [("Control", "Pre-Clinical"), ("Control", "Clinical")]}
    rows = []

    # An empty replicate table may have no columns at all; nothing to fit.
    if rep_df_in.empty:
        return pd.DataFrame(rows)

    for sb in subset_order:
        for src in sources:
            d = rep_df_in[(rep_df_in["Subset"] == sb) & (rep_df_in["source"] == src)].copy()
            if d.empty:
                continue

            if src == "CSF":
                # CSF panels are restricted to the two stages that are compared.
                d = d[d["Alz_status"].isin(["Control", "Clinical"])].copy()
                if d.empty:
                    continue

            d["Alz_status"] = d["Alz_status"].cat.remove_unused_categories()
            if d["Alz_status"].nunique() < 2:
                continue

            avail = list(d["Alz_status"].cat.categories)
            pairs_here = [pair for pair in planned[src] if pair[0] in avail and pair[1] in avail]
            if not pairs_here:
                # No planned contrast is estimable (e.g. the Control reference
                # is missing), and the formula could not be built without it.
                continue
            m_panel = len(pairs_here)

            res = glm_binom_cluster_panel(d)

            for a, b in pairs_here:
                nm = coef_name(b)
                if nm not in res.params.index:
                    continue

                est = float(res.params[nm])
                se = float(res.bse[nm])
                p = float(res.pvalues[nm])
                p_adj = min(p * m_panel, 1.0) if USE_WITHIN_PANEL_BONF else p

                rows.append(
                    dict(
                        Subset=sb,
                        Source=src,
                        Contrast=f"{a} vs {b}",
                        LogOddsDiff=est,
                        SE=se,
                        z=(est / se if se > 0 else np.nan),
                        p_raw=p,
                        p_adj_panel=p_adj,
                        sig=(p_adj < SIG_LEVEL),
                        n_donors=int(d["orig_patients"].nunique()),
                        n_reps=int(len(d)),
                    )
                )

    out = pd.DataFrame(rows)
    if out.empty:
        return out

    out["Subset"] = pd.Categorical(out["Subset"], subset_order, ordered=True)
    out["Source"] = pd.Categorical(out["Source"], sources, ordered=True)
    return out.sort_values(["Subset", "Source", "Contrast"]).reset_index(drop=True)


def fmt_float(x, digits: int) -> str:
    """Format `x` with `digits` decimal places; missing values become ""."""
    return "" if pd.isna(x) else format(float(x), f".{digits}f")


def fmt_p(x, digits: int) -> str:
    """Format a p-value with `digits` decimal places.

    Missing values become "". A non-zero value below 10**-digits is shown as
    "<1e-<digits>" rather than as a row of zeros.
    """
    if pd.isna(x):
        return ""
    value = float(x)
    smallest_shown = 10 ** (-digits)
    underflows = value != 0 and value < smallest_shown
    return f"<1e-{digits}" if underflows else f"{value:.{digits}f}"


# Build the replicate table, run the per-panel contrasts and print them.
rep_df = make_reps_distinct_fixed(ASA_CUTOFF, int(random_seed))

# Check for an empty replicate table BEFORE running the contrasts: an empty
# table has no columns, so the contrast step would fail on it and the message
# below would never be reached.
if rep_df.empty:
    tbl = pd.DataFrame()
    print("No replicates produced (no donor×panel had ≥ N_PER_REP cells).")
else:
    tbl = donor_corrected_contrasts_binom(rep_df)
    if tbl.empty:
        print("No valid panels.")
    else:
        # Format numeric columns as fixed-width strings for printing only;
        # `tbl` keeps the numeric values.
        disp = tbl.copy()
        for col in ("LogOddsDiff", "SE", "z"):
            disp[col] = disp[col].map(lambda x: fmt_float(x, EST_DIGITS))
        for col in ("p_raw", "p_adj_panel"):
            disp[col] = disp[col].map(lambda x: fmt_p(x, PVAL_DIGITS))
        print(disp.to_string(index=False))