# Figure 3

In [None]:
#Import relevant packages
import numpy as np
import pandas as pd
from matplotlib import rcParams
import os
import scanpy as sc

import matplotlib as mpl
import matplotlib.pyplot as plt

import cmocean

import seaborn as sns

from scipy.stats import median_abs_deviation

import anndata as ad

#Import scVI
import scvi
from scvi.model.utils import mde

scvi.settings.verbosity = 40

#Set fontsize
plt.rcParams.update({'font.size': 20})

In [None]:
# Load the atlas AnnData object from an .h5ad file on disk.
# NOTE(review): absolute cluster path — consider a configurable data directory so the notebook runs elsewhere.
adata_OE = sc.read_h5ad('/hpc/group/goldsteinlab/vmd13/Python/251022_AD_22_samples_+qc_scVI_5.0.h5ad')

In [None]:
# Subset to myeloid cells only: keep cells whose "cluster_map" label is "Macrophage/DC".
myeloid_mask = adata_OE.obs["cluster_map"] == "Macrophage/DC"
# .copy() turns the sliced view into an independent AnnData; this overwrites adata_OE,
# so the full atlas must be re-read from disk to get it back.
adata_OE = adata_OE[myeloid_mask].copy() 
adata_OE

In [None]:
# Working handle for the analysis below. This is an alias, not a copy: `adata` and
# `adata_OE` refer to the same object until `adata` is reassigned.
adata = adata_OE

In [None]:
# Prepare inputs for HVG selection and scVI:
#   * log1p of the per-cell total counts, stored in .obs
#   * a "counts" layer and a "norm" layer, both derived from .X

# log1p-transformed total counts per cell
adata.obs["log1p_total_counts"] = np.log1p(adata.obs["total_counts"])

# Snapshot of .X kept under "counts" (consumed by the HVG and scVI steps)
adata.layers["counts"] = adata.X.copy()

# Second copy of .X, rescaled in place so every cell sums to 1e4 (relative counts per cell)
adata.layers["norm"] = adata.X.copy()
sc.pp.normalize_total(adata, target_sum=1e4, layer="norm")

In [None]:
# Highly-variable-gene selection with Scanpy (seurat_v3 flavor) on the "counts" layer.
# subset=False only annotates adata.var (e.g. `highly_variable`); no genes are removed here.
sc.pp.highly_variable_genes(
    adata,
    n_top_genes=10000,
    subset=False,
    layer="counts",
    flavor="seurat_v3",
)

In [None]:
# Per-gene mean expression and fraction of cells with a zero entry, both computed from .X.
# np.asarray(...).ravel() flattens the (1, n_genes) matrix returned by sparse reductions
# as well as the (n_genes,) array returned by dense ones. The previous `np.array(...)[0]`
# only worked for sparse input: on a dense .X it picked a single scalar and silently
# broadcast it to every gene.
adata.var['mean_'] = np.asarray(adata.X.mean(axis=0)).ravel()
adata.var['frac_zero'] = 1 - np.asarray((adata.X > 0).sum(axis=0)).ravel() / adata.shape[0]

In [None]:
# Diagnostic scatter: per-gene mean (x, log scale) against fraction of zero entries (y).
fig, ax = plt.subplots(figsize=(9,6))

ax.scatter(adata.var.mean_, adata.var.frac_zero, s=1)
ax.set_xscale("log")

In [None]:
# Poisson-based gene selection from scvi-tools; inplace=False returns the per-gene table
# instead of writing it into adata.var.
df_poisson = scvi.data.poisson_gene_selection(adata, n_top_genes=10000, inplace=False)

# The next two expressions are inspection helpers; their results are not stored
# (and are not displayed, since they are not the last expression of the cell).
df_poisson.loc[df_poisson["highly_variable"]].sort_values("prob_zero_enrichment_rank")
pd.crosstab(df_poisson["highly_variable"], adata.var["highly_variable"])

# Boolean gene mask taken from the Poisson selection
is_hvg = df_poisson["highly_variable"]

# Keep the full selection table alongside the genes
adata.varm["df_poisson"] = df_poisson

# Gene-subset copy that the scVI model is set up and trained on
adata_query = adata[:, is_hvg].copy()
print(adata_query)

In [None]:
# Set up the scVI model on the gene-subset object.

# Register the "counts" layer as model input, with "orig_patients" as the batch key
# and "pct_counts_mt" as a continuous covariate.
scvi.model.SCVI.setup_anndata(
    adata_query,
    layer="counts",
    batch_key="orig_patients",
    continuous_covariate_keys=["pct_counts_mt"],
)

# Negative-binomial gene likelihood
model = scvi.model.SCVI(adata_query, gene_likelihood="nb")

model.view_anndata_setup()

In [None]:
# Train the scVI model.

# Keyword arguments passed straight through to model.train():
# early stopping with a patience of 20 epochs, capped at 500 epochs.
train_kwargs = {
    "early_stopping": True,
    "early_stopping_patience": 20,
    "enable_model_summary": True,
    "enable_progress_bar": True,
    "enable_checkpointing": True,
    "max_epochs": 500,
}

# Make sure a GPU is enabled before running this step
model.train(**train_kwargs)

In [None]:
# Plot training vs. validation ELBO from the model history.
# The first training entry is dropped via [1:] (presumably to keep the y-axis readable — TODO confirm).
train_elbo = model.history['elbo_train'][1:]
test_elbo = model.history['elbo_validation']

# Draw both curves on the same axes
ax = train_elbo.plot()
test_elbo.plot(ax = ax)

In [None]:
# Embed the cells with the trained model and cluster on that embedding.
rep_key = "X_scVI_1.1_myeloid"

# Latent representation from the trained scVI model, stored on the full-gene object
latent = model.get_latent_representation()
adata.obsm[rep_key] = latent

# Neighbour graph on the scVI latent space, then UMAP
sc.pp.neighbors(adata, use_rep=rep_key)
sc.tl.umap(adata, min_dist=0.5)

# Leiden clustering at each resolution; one .obs column per resolution
resolutions = [1.0, 3.0]
for res in resolutions:
    sc.tl.leiden(adata, resolution=res, key_added=f"leiden_scVI_1.1_myeloid_res{res}")



In [None]:
# UMAP coloured by the resolution-1.0 Leiden clusters, labels drawn on the data.
fig, ax = plt.subplots(figsize=(4, 4))
sc.pl.umap(adata, color="leiden_scVI_1.1_myeloid_res1.0", legend_loc="on data", color_map="cmo.matter", vmax="p95", layer="norm", ax=ax, s=100, frameon=False, save=False)

In [None]:
# Clusters to drop, given as strings because Leiden labels are categorical strings
to_drop = {'13', '12', '10', '17'}

# Keep every cell whose resolution-1.0 cluster is not in that set
cluster_labels = adata.obs['leiden_scVI_1.1_myeloid_res1.0'].astype(str)
mask = ~cluster_labels.isin(to_drop)
adata = adata[mask].copy()

# Prune the now-empty categories so they no longer show up in legends and counts
if adata.obs['leiden_scVI_1.1_myeloid_res1.0'].dtype.name == 'category':
    pruned = adata.obs['leiden_scVI_1.1_myeloid_res1.0'].cat.remove_unused_categories()
    adata.obs['leiden_scVI_1.1_myeloid_res1.0'] = pruned



In [None]:
# UMAP coloured by patient after removing the dropped clusters.
fig, ax = plt.subplots(figsize=(4, 4))
sc.pl.umap(adata, color="orig_patients", legend_loc="right margin", color_map="cmo.matter", vmax="p95", layer="norm", ax=ax, s=100, frameon=False, save=False)

In [None]:
# UMAP coloured by the remaining resolution-1.0 Leiden clusters.
fig, ax = plt.subplots(figsize=(4, 4))
sc.pl.umap(adata, color="leiden_scVI_1.1_myeloid_res1.0", legend_loc="on data", color_map="cmo.matter", vmax="p95", layer="norm", ax=ax, s=100, frameon=False, save=False)

In [None]:
# QC UMAPs: one panel per QC metric / metadata column, laid out two per row.
sc.pl.umap(
    adata,
    color=["n_genes_by_counts", "total_counts", "pct_counts_mt", "log1p_total_counts", "orig_patients", "Alz_status"],
    cmap="cubehelix_r",
    s=3,
    ncols=2,
)

In [None]:
# Second pass (after cluster removal): rebuild the inputs for HVG selection and scVI.
#   * log1p of the per-cell total counts, stored in .obs
#   * a "counts" layer and a "norm" layer, both derived from .X

# log1p-transformed total counts per cell
adata.obs["log1p_total_counts"] = np.log1p(adata.obs["total_counts"])

# Snapshot of .X kept under "counts" (consumed by the HVG and scVI steps)
adata.layers["counts"] = adata.X.copy()

# Second copy of .X, rescaled in place so every cell sums to 1e4 (relative counts per cell)
adata.layers["norm"] = adata.X.copy()
sc.pp.normalize_total(adata, target_sum=1e4, layer="norm")

In [None]:
# Highly-variable-gene selection with Scanpy (seurat_v3 flavor) on the "counts" layer,
# repeated on the filtered cells. subset=False only annotates adata.var.
sc.pp.highly_variable_genes(
    adata,
    n_top_genes=10000,
    subset=False,
    layer="counts",
    flavor="seurat_v3",
)

In [None]:
# Per-gene mean expression and fraction of cells with a zero entry, both computed from .X.
# np.asarray(...).ravel() flattens the (1, n_genes) matrix returned by sparse reductions
# as well as the (n_genes,) array returned by dense ones. The previous `np.array(...)[0]`
# only worked for sparse input: on a dense .X it picked a single scalar and silently
# broadcast it to every gene.
adata.var['mean_'] = np.asarray(adata.X.mean(axis=0)).ravel()
adata.var['frac_zero'] = 1 - np.asarray((adata.X > 0).sum(axis=0)).ravel() / adata.shape[0]

In [None]:
# Diagnostic scatter: per-gene mean (x, log scale) against fraction of zero entries (y).
fig, ax = plt.subplots(figsize=(9,6))

ax.scatter(adata.var.mean_, adata.var.frac_zero, s=1)
ax.set_xscale("log")

In [None]:
# Poisson-based gene selection from scvi-tools on the filtered cells; inplace=False returns
# the per-gene table instead of writing it into adata.var.
df_poisson = scvi.data.poisson_gene_selection(adata, n_top_genes=10000, inplace=False)

# The next two expressions are inspection helpers; their results are not stored
# (and are not displayed, since they are not the last expression of the cell).
df_poisson.loc[df_poisson["highly_variable"]].sort_values("prob_zero_enrichment_rank")
pd.crosstab(df_poisson["highly_variable"], adata.var["highly_variable"])

# Boolean gene mask taken from the Poisson selection
is_hvg = df_poisson["highly_variable"]

# Keep the full selection table alongside the genes
adata.varm["df_poisson"] = df_poisson

# Gene-subset copy that the scVI model is set up and trained on
adata_query = adata[:, is_hvg].copy()
print(adata_query)

In [None]:
# Set up the scVI model on the gene-subset object (second pass).
# Registers the "counts" layer as input, "orig_patients" as the batch key and
# "pct_counts_mt" as a continuous covariate.
scvi.model.SCVI.setup_anndata(
    adata_query,
    layer="counts",
    batch_key="orig_patients",
    continuous_covariate_keys=["pct_counts_mt"],
)

# Negative-binomial gene likelihood; this rebinds `model`, replacing the first-pass model
model = scvi.model.SCVI(adata_query, gene_likelihood="nb")

model.view_anndata_setup()

In [None]:
# Train the second-pass scVI model.

# Keyword arguments passed straight through to model.train():
# early stopping with a patience of 20 epochs, capped at 500 epochs.
train_kwargs = {
    "early_stopping": True,
    "early_stopping_patience": 20,
    "enable_model_summary": True,
    "enable_progress_bar": True,
    "enable_checkpointing": True,
    "max_epochs": 500,
}

# Make sure a GPU is enabled before running this step
model.train(**train_kwargs)

In [None]:
# Plot training vs. validation ELBO from the model history.
# The first training entry is dropped via [1:] (presumably to keep the y-axis readable — TODO confirm).
train_elbo = model.history['elbo_train'][1:]
test_elbo = model.history['elbo_validation']

# Draw both curves on the same axes
ax = train_elbo.plot()
test_elbo.plot(ax = ax)

In [None]:
# Embed the filtered cells with the retrained model and cluster on that embedding.
rep_key = "X_scVI_1.2_myeloid"

# Latent representation from the trained scVI model, stored on the full-gene object
latent = model.get_latent_representation()
adata.obsm[rep_key] = latent

# Neighbour graph on the scVI latent space, then UMAP
sc.pp.neighbors(adata, use_rep=rep_key)
sc.tl.umap(adata, min_dist=0.5)

# Leiden clustering at each resolution; one .obs column per resolution
resolutions = [1.0, 3.0]
for res in resolutions:
    sc.tl.leiden(adata, resolution=res, key_added=f"leiden_scVI_1.2_myeloid_res{res}")



In [None]:
# UMAP of the second-pass embedding, coloured by patient.
fig, ax = plt.subplots(figsize=(4, 4))
sc.pl.umap(adata, color="orig_patients", legend_loc="right margin", color_map="cmo.matter", vmax="p95", layer="norm", ax=ax, s=100, frameon=False, save=False)

In [None]:
# List the reference sets exposed by the `starcat` object.
# NOTE(review): `starcat` is not imported in any visible cell — this raises NameError on a
# fresh kernel unless an import exists outside this view.
starcat.available_refs

In [None]:
# Instantiate starCAT with the 'MYELOID.GLIOMA.V1' reference, caching under ./cache.
# NOTE(review): `starCAT` is not imported in any visible cell — add the import to the top
# import cell so Restart & Run All works.
tcat = starCAT(reference='MYELOID.GLIOMA.V1', cachedir='./cache')

In [None]:
# Run the reference against the cells; returns two objects, bound here as `usage` and `scores`.
usage, scores = tcat.fit_transform(adata)

In [None]:
# Attach the per-cell `usage` table to the cell metadata (left join on the cell index).
# Only `usage` is merged; `scores` is not added to .obs.
# Columns that already exist in .obs under the same names (e.g. from a previous run of
# this cell) are dropped first. Otherwise pandas would emit "_x"/"_y" suffixed duplicates
# and the plain column names used by later cells would disappear.
# Assumes `usage` is a DataFrame indexed by cell barcode — TODO confirm.
stale_cols = adata.obs.columns.intersection(usage.columns)
adata.obs = pd.merge(
    left=adata.obs.drop(columns=stale_cols),
    right=usage,
    how='left',
    left_index=True,
    right_index=True,
)

In [None]:
# Display the AnnData summary to check the newly merged .obs columns.
adata

In [None]:
# Calculate neighbors using scVI latent representation
# NOTE(review): the same neighbors/UMAP calls were already run on "X_scVI_1.2_myeloid"
# right after training; only .obs has changed since, so this recomputation looks redundant.
sc.pp.neighbors(adata, use_rep="X_scVI_1.2_myeloid")
sc.tl.umap(adata, min_dist=0.5)

In [None]:
# Make Alz_status an ordered categorical so legends follow disease stage.
# Values not listed in `categories` become NaN.
adata.obs["Alz_status"] = pd.Categorical(
    adata.obs["Alz_status"],
    categories=["Control", "Pre-Clinical", "Clinical"],
    ordered=True,
)

# Three colours, one per stage
palette = sns.color_palette("tab20", 3)

# UMAP coloured by stage; return_fig=True hands back the Figure so it can be closed explicitly
fig = sc.pl.umap(
    adata,
    color="Alz_status",
    palette=palette,
    legend_loc="right margin",
    size=100,
    frameon=False,
    show=False,
    return_fig=True,
)

plt.show()
plt.close(fig)

In [None]:
# NOTE(review): 19 colours are hard-coded here, while a later cell sizes the palette with
# adata.obs["orig_patients"].nunique() — confirm 19 matches the number of patients.
palette = sns.color_palette("tab20", 19)

# UMAP coloured by patient, legend suppressed
fig = sc.pl.umap(
    adata,
    color="orig_patients",
    palette=palette,
    legend_loc="none",
    size=100,
    frameon=False,
    show=False,
    return_fig=True,
)

# Save as SVG before showing/closing the figure
fig.savefig("/work/vmd13/250708_myeloid_orig_patients.svg", format="svg", bbox_inches="tight")

plt.show()
plt.close(fig)

In [None]:
# Number of cells per donor, sorted by count (value_counts default order)
adata.obs["orig_patients"].value_counts()

In [None]:
# Light-to-copper sequential colormap (the variable keeps its historical name `blue_cmap`)
blue_cmap = sns.light_palette("xkcd:copper", n_colors=8, as_cmap=True)

# UMAP of the "Macrophage" .obs column, colour scale capped at the 95th percentile
score_umap_kwargs = dict(
    color="Macrophage",
    cmap=blue_cmap,
    legend_loc="right margin",
    vmax="p95",
    size=40,
    frameon=False,
    show=False,
    return_fig=True,
)
fig = sc.pl.umap(adata, **score_umap_kwargs)

# Write the SVG first, then display and release the figure
fig.savefig("/work/vmd13/250708_myeloid_mac_score.svg", format="svg", bbox_inches="tight")

plt.show()
plt.close(fig)



In [None]:
# Light-to-copper sequential colormap (the variable keeps its historical name `blue_cmap`)
blue_cmap = sns.light_palette("xkcd:copper", n_colors=8, as_cmap=True)

# UMAP of the "Microglia" .obs column, colour scale capped at the 95th percentile
score_umap_kwargs = dict(
    color="Microglia",
    cmap=blue_cmap,
    legend_loc="right margin",
    vmax="p95",
    size=40,
    frameon=False,
    show=False,
    return_fig=True,
)
fig = sc.pl.umap(adata, **score_umap_kwargs)

# Write the SVG first, then display and release the figure
fig.savefig("/work/vmd13/250708_myeloid_microglia_score.svg", format="svg", bbox_inches="tight")

plt.show()
plt.close(fig)



In [None]:
# Light-to-copper sequential colormap (the variable keeps its historical name `blue_cmap`)
blue_cmap = sns.light_palette("xkcd:copper", n_colors=8, as_cmap=True)

# UMAP of the "cDC" .obs column; unlike the other score maps this one caps at the 90th percentile
score_umap_kwargs = dict(
    color="cDC",
    cmap=blue_cmap,
    legend_loc="right margin",
    vmax="p90",
    size=40,
    frameon=False,
    show=False,
    return_fig=True,
)
fig = sc.pl.umap(adata, **score_umap_kwargs)

# Write the SVG first, then display and release the figure
fig.savefig("/work/vmd13/250708_myeloid_cDC_score.svg", format="svg", bbox_inches="tight")

plt.show()
plt.close(fig)



In [None]:
# One UMAP per marker gene, each saved as an SVG under `outdir`.
genes = ["CD68", "CD207", "P2RY13"]           # microglia, cDC, homeostatic microglia
outdir = "/work/vmd13/250708_single_gene_umaps"
os.makedirs(outdir, exist_ok=True)

for g in genes:
    fig = sc.pl.umap(
        adata,
        color=g,
        # The notebook imports `cmocean`, not an alias `cmo`, so the previous
        # `cmo.matter` raised NameError; reference the colormap through the module.
        cmap=cmocean.cm.matter,
        vmax="p90",
        size=100,
        frameon=False,
        return_fig=True,
        show=False,
    )

    fig.savefig(
        f"{outdir}/{g}_umap_p90.svg",
        format="svg",
        bbox_inches="tight"
    )
    # Close each figure so the loop does not accumulate open figures
    plt.close(fig)



In [None]:
# Rule-based cell labelling from the five program columns in .obs.
# Rules are applied in order; a cell keeps the first label it receives. Cells that match
# no rule fall back to their highest-scoring program. Labels are then collapsed with
# `rename_map` and stored as the categorical column .obs["myeloid_names"].
TH = 0.10  # minimum value for a program to count as "present"
prog_cols = ["Microglia", "Macrophage", "Monocyte", "cDC", "Neutrophil"]

mg  = adata.obs["Microglia"].to_numpy()
mac = adata.obs["Macrophage"].to_numpy()
mon = adata.obs["Monocyte"].to_numpy()
cdc = adata.obs["cDC"].to_numpy()
neu = adata.obs["Neutrophil"].to_numpy()

labels = np.full(adata.n_obs, "Unassigned", dtype=object)

# Microglia plus at least one of macrophage/monocyte/cDC, with neutrophil below both
partner_max = np.vstack([mac, mon, cdc]).max(axis=0)
m = (mg >= TH) & (partner_max >= TH) & (neu < mg) & (neu < partner_max)
labels[m] = "Microglia-like"

# Macrophage and monocyte both present, macrophage dominant
m = (labels == "Unassigned") & (mac >= TH) & (mon >= TH) & (mg < mac) & (cdc < mac) & (neu < mac)
labels[m] = "Mono_macro"

# Microglia present without macrophage, microglia dominant
m = (labels == "Unassigned") & (mg >= TH) & (mac < TH) & (mon < mg) & (cdc < mg) & (neu < mg)
labels[m] = "Microglia"

# Macrophage present without monocyte, macrophage dominant
m = (labels == "Unassigned") & (mac >= TH) & (mon < TH) & (mg < mac) & (cdc < mac) & (neu < mac)
labels[m] = "Macrophage"

# Single dominant program above threshold
m = (labels == "Unassigned") & (mon >= TH) & (mg < mon) & (mac < mon) & (cdc < mon) & (neu < mon)
labels[m] = "Monocyte"

m = (labels == "Unassigned") & (cdc >= TH) & (mg < cdc) & (mac < cdc) & (mon < cdc) & (neu < cdc)
labels[m] = "cDC"

m = (labels == "Unassigned") & (neu >= TH) & (mg < neu) & (mac < neu) & (mon < neu) & (cdc < neu)
labels[m] = "Neutrophil"

# Fallback: anything still unassigned takes the name of its highest program column.
# Done on the plain object array, so no categorical bookkeeping is needed.
m = labels == "Unassigned"
if m.any():
    labels[m] = adata.obs.loc[m, prog_cols].idxmax(axis=1).to_numpy()

# Collapse labels. This used to be `adata.obs[col].replace(..., inplace=True)` on a
# categorical column: an in-place call on a column selection is not guaranteed to write
# back to the DataFrame (it is a no-op under pandas copy-on-write), and replace() that
# changes categories is deprecated. Mapping before building the Categorical avoids both.
rename_map = {
    "Microglia": "Microglia-like",
    "Microglia-like Macrophages": "Microglia-like",
    "Mono_macro": "Macrophage",
    "Neutrophils": "Neutrophil",
}
labels = np.array([rename_map.get(lbl, lbl) for lbl in labels], dtype=object)

# Categories are inferred from the values present, so there are no unused categories to prune
adata.obs["myeloid_names"] = pd.Categorical(labels)
print(adata.obs["myeloid_names"].value_counts())

In [None]:
# Remove cells labelled "Neutrophil" and prune the now-empty category.
mask = adata.obs["myeloid_names"] != "Neutrophil"
adata = adata[mask].copy()

adata.obs["myeloid_names"] = adata.obs["myeloid_names"].cat.remove_unused_categories()

print(adata.obs["myeloid_names"].value_counts())

In [None]:
# Merge the "Monocyte" and "Macrophage" labels into a single "Mono+Macro" label.
# The replacement runs on an object-dtype copy: calling replace() on a categorical column
# to introduce or merge categories is deprecated in pandas, so the categorical is rebuilt
# from the renamed values instead (categories are inferred from the values present).
merged_names = adata.obs["myeloid_names"].astype(object).replace(
    {"Monocyte": "Mono+Macro", "Macrophage": "Mono+Macro"}
)
adata.obs["myeloid_names"] = pd.Categorical(merged_names)

print(adata.obs["myeloid_names"].value_counts())

In [None]:
# Custom three-colour palette for the UMAP below (one colour per Alz_status category)
pal = sns.color_palette(["#8BC9BD", "#625B89", "#6795B6"])

fig = sc.pl.umap(
    adata,
    color="Alz_status",      
    palette=pal,
    size=100,
    legend_loc="right margin",
    show=False,              
    return_fig=True,         
)

# Save first, then display and release the figure
fig.savefig("/work/vmd13/umap_AD_status_myeloid_night.svg",
            format="svg", dpi=300, bbox_inches="tight")

plt.show()
plt.close(fig)



In [None]:
# Re-imports of modules already loaded at the top of the notebook
import seaborn as sns, matplotlib.pyplot as plt, scanpy as sc

# NOTE(review): `pal_use` is built here but never used — the plot below passes `pal`,
# which is defined in a different cell (the three-colour Alz_status palette). Confirm
# which palette is intended; as written this cell fails if that other cell has not run.
pal_use = sns.color_palette("tab10",
                            adata.obs["myeloid_names"].nunique())

# UMAP coloured by the assigned myeloid labels
fig = sc.pl.umap(
    adata,
    color="myeloid_names",
    palette=pal,
    size=100,
    frameon=False,
    legend_loc="right margin",
    show=False,
    return_fig=True,
)

plt.show()
plt.close(fig)



In [None]:
# One UMAP per marker gene with a fixed colour scale from 0 to the gene's 90th percentile.
genes_plot = ["P2RY13", "CD68", "CD207"]

for gene in genes_plot:
    if gene not in adata.var_names:
        print(f"{gene} not found in adata.var_names")
        continue

    # Densify only when .X is sparse; a dense ndarray has no .toarray() and used to crash here
    expr = adata[:, gene].X
    expr = expr.toarray() if hasattr(expr, "toarray") else np.asarray(expr)
    expr = expr.ravel()

    # Upper colour limit = 90th percentile. For a gene detected in fewer than 10% of cells
    # this is 0, which would collapse the scale to vmin == vmax and draw every cell in the
    # same colour. Fall back to the maximum, or to scanpy's automatic limit if all zero.
    vmax_val = np.percentile(expr, 90)
    if vmax_val <= 0:
        gene_max = float(expr.max()) if expr.size else 0.0
        vmax_val = gene_max if gene_max > 0 else None

    fig = sc.pl.umap(
        adata,
        color=gene,
        cmap=cmocean.cm.matter,
        vmax=vmax_val,
        vmin=0,
        frameon=False,
        size=60,
        show=False,
        return_fig=True,
    )

    fig.suptitle(gene, fontsize=12, y=0.93)
    # Lay out this specific figure instead of whatever pyplot considers current
    fig.tight_layout()
    fig.savefig(f"/work/vmd13/umap_{gene}_cmo_matter.svg", format="svg", dpi=300, bbox_inches="tight")
    plt.show()
    plt.close(fig)

In [None]:
# One tab20 colour per distinct patient present in the data
pal_use = sns.color_palette("tab20",
                            adata.obs["orig_patients"].nunique())

fig = sc.pl.umap(
    adata,
    color="orig_patients",
    palette=pal_use,
    size=100,
    frameon=False,
    legend_loc="right margin",
    show=False,
    return_fig=True,
)

# NOTE(review): the file name says "patientss" and "tab10" while the palette is tab20;
# left unchanged because other files may reference this exact path.
fig.savefig("/work/vmd13/umap_orig_patientss_tab10.svg",
            format="svg", dpi=300, bbox_inches="tight")

plt.show()
plt.close(fig)



In [None]:
# UMAP where every cell is drawn as a small pie chart of four program usages.
# Cell-local imports: `math`, `Wedge` and `Patch` are not imported in any earlier cell,
# so this cell raised NameError on a fresh kernel.
import math
from matplotlib.patches import Patch, Wedge

outdir = "/hpc/group/goldsteinlab/vmd13/Python"
# NOTE(review): `fname` is built but this cell never saves the figure — confirm whether
# a savefig call is missing or this is meant as a preview only.
fname = os.path.join(outdir, "250601_umap_pies.svg")
dot_sz = 100  # scatter marker area (points^2) that the pies are sized to match

gep_cols = [
    "Complement_Immunosuppressive",
    "Scavenger_Immunosuppressive",
    "IL1B_CD83_TNF_Inflammatory",
    "Inflammatory_microglia",
]
slice_colors = ["#7fc97f", "#beaed4", "#fdc086", "#386cb0"]

os.makedirs(outdir, exist_ok=True)

# Let scanpy set up the axes and limits, then hide its scatter so only the pies remain
fig = sc.pl.umap(
    adata,
    color="myeloid_names",
    size=dot_sz,
    show=False,
    return_fig=True,
)
ax = fig.axes[0]
ax.collections[0].set_visible(False)

# Row-normalise the four columns so each cell's slices sum to 1;
# cells with a zero or non-finite total are skipped
vals = adata.obs[gep_cols].to_numpy()
row_sum = vals.sum(1)
m = (row_sum > 0) & np.isfinite(row_sum)
vals = vals[m] / row_sum[m][:, None]
xy = adata.obsm["X_umap"][m]

# Radius (in points) of a circle whose area equals the marker area
r_pts = math.sqrt(dot_sz / math.pi)
(x0, x1), (y0, y1) = ax.get_xlim(), ax.get_ylim()
x_mid, y_mid = 0.5 * (x0 + x1), 0.5 * (y0 + y1)

# Convert that radius from points to data units using the x-axis scale at the plot centre
px_per_dx = ax.transData.transform((x_mid + 1, y_mid))[0] - ax.transData.transform((x_mid, y_mid))[0]
pt_per_dx = px_per_dx * 72 / fig.dpi
r_data = r_pts / pt_per_dx

# One wedge per non-zero slice, laid out counter-clockwise starting at 0 degrees
for (xi, yi), frac in zip(xy, vals):
    start = 0.0
    for f, c in zip(frac, slice_colors):
        if f <= 0:
            continue
        ax.add_patch(Wedge((xi, yi), r_data, 360 * start, 360 * (start + f), facecolor=c, edgecolor="none"))
        start += f

ax.set_xticks([])
ax.set_yticks([])
ax.set_xlabel("UMAP1")
ax.set_ylabel("UMAP2")

# Replace scanpy's category legend with one describing the pie slices
leg = ax.get_legend()
if leg is not None:
    leg.remove()

handles = [Patch(fc=c, label=l.replace("_", " ")) for c, l in zip(slice_colors, gep_cols)]
ax.legend(handles=handles, frameon=False, loc="upper right", title="GEP slice")

plt.tight_layout()
plt.show()

In [None]:
# Same pie-chart UMAP as above with the pie radius scaled by `size_scale`, saved as SVG.
# Cell-local imports: `math`, `Wedge` and `Patch` are not imported in any earlier cell,
# so this cell raised NameError on a fresh kernel.
import math
from matplotlib.patches import Patch, Wedge

outdir = "/work/vmd13"
fname = os.path.join(outdir, "250601_umap_pies_halfsize.svg")
dot_sz = 100      # scatter marker area (points^2) that the pies are sized against
size_scale = 0.5  # multiplier applied to the pie radius

gep_cols = [
    "Complement_Immunosuppressive",
    "Scavenger_Immunosuppressive",
    "IL1B_CD83_TNF_Inflammatory",
    "Inflammatory_microglia",
]
slice_colors = ["#7fc97f", "#beaed4", "#fdc086", "#386cb0"]

os.makedirs(outdir, exist_ok=True)

# Let scanpy set up the axes and limits, then hide its scatter so only the pies remain
fig = sc.pl.umap(adata, color="myeloid_names", size=dot_sz, show=False, return_fig=True)
ax = fig.axes[0]
ax.collections[0].set_visible(False)

# Row-normalise the four columns so each cell's slices sum to 1;
# cells with a zero or non-finite total are skipped
vals = adata.obs[gep_cols].to_numpy()
row_sum = vals.sum(1)
m = (row_sum > 0) & np.isfinite(row_sum)
vals = vals[m] / row_sum[m][:, None]
xy = adata.obsm["X_umap"][m]

# Radius (in points) of a circle whose area equals the marker area
r_pts = math.sqrt(dot_sz / math.pi)
(x0, x1), (y0, y1) = ax.get_xlim(), ax.get_ylim()
x_mid, y_mid = 0.5 * (x0 + x1), 0.5 * (y0 + y1)

# Convert that radius from points to data units, then apply the size multiplier
px_per_dx = ax.transData.transform((x_mid + 1, y_mid))[0] - ax.transData.transform((x_mid, y_mid))[0]
pt_per_dx = px_per_dx * 72 / fig.dpi
r_data = (r_pts / pt_per_dx) * size_scale

# One wedge per non-zero slice, laid out counter-clockwise starting at 0 degrees
for (xi, yi), frac in zip(xy, vals):
    start = 0.0
    for f, c in zip(frac, slice_colors):
        if f <= 0:
            continue
        ax.add_patch(Wedge((xi, yi), r_data, 360 * start, 360 * (start + f), facecolor=c, edgecolor="none"))
        start += f

ax.set_xticks([])
ax.set_yticks([])
ax.set_xlabel("UMAP1")
ax.set_ylabel("UMAP2")

# Replace scanpy's category legend with one describing the pie slices
leg = ax.get_legend()
if leg is not None:
    leg.remove()

handles = [Patch(fc=c, label=l.replace("_", " ")) for c, l in zip(slice_colors, gep_cols)]
ax.legend(handles=handles, frameon=False, loc="upper right", title="GEP slice")

plt.tight_layout()
fig.savefig(fname, format="svg", dpi=300, bbox_inches="tight")
plt.show()
print(f"Saved figure to: {fname}")

In [None]:
# Stacked bar chart: for each Alzheimer stage, the mean (over patients) fraction of cells
# carrying each myeloid label.
stage_order = ["Control", "Pre-Clinical", "Clinical"]
adata.obs["Alz_status"] = pd.Categorical(adata.obs["Alz_status"], categories=stage_order, ordered=True)

# Cells per (patient, stage, label)
counts = (
    adata.obs.groupby(["orig_patients", "Alz_status", "myeloid_names"], observed=True)
    .size()
    .rename("n")
    .reset_index()
)
# Fraction of each label within a patient
patient_totals = counts.groupby(["orig_patients", "Alz_status"], observed=True)["n"].transform("sum")
counts["fraction"] = counts["n"] / patient_totals

# Patients x labels table of fractions (0 where a label is absent), averaged per stage
plot_df = (
    counts.pivot_table(
        index=["Alz_status", "orig_patients"],
        columns="myeloid_names",
        values="fraction",
        fill_value=0,
        observed=True,
    )
    .groupby(level="Alz_status", observed=True)
    .mean()
    .reindex(stage_order)
)

# Reuse the colours scanpy stored for "myeloid_names" only if they line up one-to-one with
# the current categories. A stale list (categories were merged or removed since it was
# written) would otherwise be zipped against the wrong labels or leave labels without a colour.
categories = list(adata.obs["myeloid_names"].cat.categories)
stored_colors = adata.uns["myeloid_names_colors"] if "myeloid_names_colors" in adata.uns else None
if stored_colors is not None and len(stored_colors) == len(categories):
    colors = dict(zip(categories, stored_colors))
else:
    cols = sns.color_palette("tab20", n_colors=plot_df.shape[1])
    colors = dict(zip(plot_df.columns, cols))

fig, ax = plt.subplots(figsize=(6, 4))
plot_df.plot(
    kind="bar",
    stacked=True,
    ax=ax,
    width=1.0,
    color=[colors[c] for c in plot_df.columns],
    edgecolor="white",
    linewidth=0.5,
)

# Thin white separators between adjacent stage bars
for x in range(len(plot_df.index) - 1):
    ax.axvline(x + 0.5, color="white", linewidth=0.8)

ax.set_ylabel("Mean fraction per patient")
ax.set_ylim(0, 1)
ax.set_xlabel("")
ax.set_xticklabels(plot_df.index, rotation=0)
ax.set_title("Myeloid composition by Alzheimer stage (OE only)")
ax.grid(False)
ax.margins(x=0)
ax.legend(title="myeloid_names", bbox_to_anchor=(1.02, 0.5), loc="center left", frameon=False)

plt.tight_layout()

# Save before showing so the file is written regardless of how the backend handles show()
out_file = "/work/vmd13/250708_myeloid_proportions.svg"
fig.savefig(out_file, format="svg", dpi=300, bbox_inches="tight")
print("Saved to:", out_file)

plt.show()

In [None]:
# Highest-scoring program label per cell
top_prog = adata.obs[prog_cols].idxmax(axis=1)

# Overwrite any cell still labelled "Unassigned" with its top program.
# NOTE(review): the labelling cell already resolves "Unassigned" cells, so this mask is
# presumably empty here; if it were not, assigning a label that is not an existing
# category of "myeloid_names" would fail — confirm whether this cell is still needed.
mask = adata.obs["myeloid_names"] == "Unassigned"
adata.obs.loc[mask, "myeloid_names"] = top_prog[mask]



In [None]:
# Bar plot (one facet per program): per-patient fraction of cells whose normalised usage of
# that program exceeds `thr`, grouped by Alzheimer stage, with per-patient points overlaid.
programs = [
    "Complement_Immunosuppressive",
    "Scavenger_Immunosuppressive",
    "IL1B_CD83_TNF_Inflammatory",
    "Inflammatory_microglia",
]
thr = 0.50  # a cell is "positive" for a program when its normalised usage is above this

stage_order = ["Control", "Pre-Clinical", "Clinical"]
stage_palette = {"Control": "#3468a3", "Pre-Clinical": "#64429a", "Clinical": "#a03a96"}

# Target figure width/height ratio
TARGET_W_OVER_H = 973.4199 / 375.6228
# Upper y-limit per program; points above it are drawn as triangles at the limit
ymax_map = dict(zip(programs, [0.40, 0.40, 0.40, 0.40]))

DOT_ALPHA = 0.60
DOT_SIZE = 28
TRI_SIZE = 46
JITTER = 0.18      # half-width of the uniform horizontal jitter
random_seed = 0    # seeds the jitter so the figure is reproducible

# Padding above/below the y-range, as a fraction of ymax
PAD_TOP_FRAC = 0.08
PAD_BOTTOM_FRAC = 0.03

adata.obs["Alz_status"] = pd.Categorical(adata.obs["Alz_status"], categories=stage_order, ordered=True)

# Renormalise the four program columns so they sum to 1 per cell.
# NOTE(review): this overwrites the columns in adata.obs, and a cell whose four values sum
# to 0 becomes NaN (and therefore never counts as positive below).
vals = adata.obs[programs].to_numpy()
adata.obs[programs] = vals / vals.sum(1, keepdims=True)

# 0/1 flag per cell and program
flags = (adata.obs[programs] > thr).astype(int)
df = adata.obs[["orig_patients", "Alz_status"]].join(flags)

# Mean of the flags per patient = fraction of positive cells
patient_frac = df.groupby(["orig_patients", "Alz_status"], observed=True)[programs].mean().reset_index()

# Long format: one row per (patient, program)
long = patient_frac.melt(id_vars=["orig_patients", "Alz_status"], var_name="Program", value_name="Frac_pos")

# Bars show the mean across patients with standard-error bars
g = sns.catplot(
    data=long,
    kind="bar",
    x="Alz_status",
    y="Frac_pos",
    col="Program",
    col_order=programs,
    order=stage_order,
    palette=[stage_palette[s] for s in stage_order],
    estimator="mean",
    errorbar="se",
    capsize=0.06,
    errwidth=0.6,
    width=0.7,
    height=3.0,
    aspect=0.7,
    sharey=False,
    margin_titles=True,
    legend=False,
)

# Keep the width seaborn chose and set the height from the target aspect ratio
w_in, _ = g.fig.get_size_inches()
g.fig.set_size_inches(w_in, w_in / TARGET_W_OVER_H)
g.fig.subplots_adjust(bottom=0.22, wspace=0.12)

# Fix the y-range per facet and keep only ticks inside [0, ymax]
for ax, prog in zip(g.axes.flat, programs):
    ymax = ymax_map[prog]
    ax.set_ylim(-ymax * PAD_BOTTOM_FRAC, ymax + ymax * PAD_TOP_FRAC)
    ticks = ax.get_yticks()
    ax.set_yticks([t for t in ticks if 0 <= t <= ymax + 1e-12])

# Stage name -> x position of its bar
x_map = {s: i for i, s in enumerate(stage_order)}
rng = np.random.default_rng(random_seed)

# Overlay one jittered point per patient on each facet
for ax, prog in zip(g.axes.flat, programs):
    ymax = ymax_map[prog]
    d = long[long["Program"] == prog].sort_values(["Alz_status", "orig_patients"]).copy()

    x = d["Alz_status"].map(x_map).astype(float).to_numpy()
    y = d["Frac_pos"].to_numpy()
    xj = x + rng.uniform(-JITTER, JITTER, size=len(d))

    in_mask = y <= ymax
    out_mask = y > ymax

    # Circles for in-range patients; triangles pinned at ymax for patients above the axis limit
    ax.scatter(xj[in_mask], y[in_mask], s=DOT_SIZE, marker="o", color="black", alpha=DOT_ALPHA, linewidths=0, zorder=10, clip_on=False)
    ax.scatter(xj[out_mask], np.full(out_mask.sum(), ymax), s=TRI_SIZE, marker="^", color="black", alpha=DOT_ALPHA, linewidths=0, zorder=11, clip_on=False)

g.set_axis_labels("", "Mean fraction\npositive per patient")
g.set_xticklabels(rotation=30, ha="right", fontsize=9)
g.set_titles("{col_name}", fontsize=10)
g.despine(left=True)

# Manual legend: one coloured square per stage
handles = [
    plt.Line2D([0], [0], marker="s", linestyle="", markersize=10,
               markerfacecolor=stage_palette[s], markeredgecolor="k")
    for s in stage_order
]
g.fig.legend(handles, stage_order, title="Alz_status", frameon=False, bbox_to_anchor=(1.03, 0.5), loc="center left")

plt.tight_layout()

svg_path = "/work/vmd13/myeloid_frac_pos_barplot.svg"
g.fig.savefig(svg_path, format="svg", dpi=300, bbox_inches="tight")
print(f"SVG saved to: {svg_path}")

plt.show()
plt.close(g.fig)

In [None]:
# Non-parametric tests on the per-patient positive fractions: Kruskal-Wallis across the
# three stages, then pairwise two-sided Mann-Whitney U tests with Bonferroni correction
# (applied within each program).
import scipy.stats as st
from itertools import combinations
from statsmodels.stats.multitest import multipletests

stage_order = ["Control", "Pre-Clinical", "Clinical"]
pairs = list(combinations(stage_order, 2))

# One row per patient: mean of the 0/1 flags (= fraction of positive cells) plus the
# patient's stage. observed=True matches the grouping used for the bar plot and keeps
# patients with no remaining cells (unused categories) from appearing as all-NaN rows.
props = (
    flags.groupby(adata.obs["orig_patients"], observed=True)[programs].mean()
    .join(adata.obs.groupby("orig_patients", observed=True)["Alz_status"].first())
    .reset_index()
)

results = []

for prog in programs:
    # Per-stage samples of patient-level fractions for this program
    by_stage = {s: props.loc[props["Alz_status"] == s, prog] for s in stage_order}
    H, p_kw = st.kruskal(*(by_stage[s] for s in stage_order))

    # Named `pair_labels` so the cell-label array `labels` built earlier is not clobbered
    p_raw, U_vals, pair_labels = [], [], []
    for g1, g2 in pairs:
        U, p = st.mannwhitneyu(by_stage[g1], by_stage[g2], alternative="two-sided")
        pair_labels.append(f"{g1} vs {g2}")
        U_vals.append(U)
        p_raw.append(p)

    _, p_adj, _, _ = multipletests(p_raw, method="bonferroni")

    for lbl, U, p0, p1 in zip(pair_labels, U_vals, p_raw, p_adj):
        results.append(
            {
                "Programme": prog,
                "Comparison": lbl,
                "U_stat": U,
                "p_raw": p0,
                "p_adj": p1,
                "KW_H": H,
                "KW_p": p_kw,
            }
        )

stats_df = pd.DataFrame(results)
print(stats_df)

In [None]:
# Number of cells per patient
cell_counts = (
    adata.obs["orig_patients"]
         .value_counts()     # pandas Series: index = patient, value = #cells
         .sort_index()       # optional: order by patient label
)

print(cell_counts)



In [None]:
from scipy import sparse

# Pseudobulk configuration
N_CELLS = 15       # cells summed into each pseudobulk replicate
N_REPS = 3         # replicates drawn per patient (without overlap)
OUTDIR = "/hpc/group/goldsteinlab/vmd13/Python/250708_myeloid_bulks_v3"
master_seed = 42   # combined with the patient id to derive a per-patient RNG seed

os.makedirs(OUTDIR, exist_ok=True)

# Count matrix to aggregate: the "raw" layer if one exists, otherwise .X.
# NOTE(review): the visible cells only create "counts" and "norm" layers, so this falls
# back to .X unless the loaded file already contains a "raw" layer — confirm .X holds raw counts.
mat = adata.layers["raw"] if "raw" in adata.layers else adata.X
genes = adata.var_names.to_numpy()

def seed_for_patient(pid: str, master: int) -> int:
    """Derive a reproducible 32-bit RNG seed from a patient id and a master seed.

    The seed is the first four bytes (little-endian) of the MD5 digest of
    ``"{master}|{pid}"``, so it depends only on the two inputs and not on the
    order in which patients are processed.

    Args:
        pid: Patient identifier.
        master: Master seed shared by all patients.

    Returns:
        An integer in ``[0, 2**32)``.
    """
    # Imported here because `hashlib` is not imported anywhere else in the notebook;
    # without it the function raised NameError on first call.
    import hashlib

    h = hashlib.md5(f"{master}|{pid}".encode("utf-8")).digest()
    return int.from_bytes(h[:4], "little")

# Build N_REPS pseudobulk samples per patient by summing N_CELLS randomly chosen cells each.
cols = []   # one summed count vector (length n_genes) per pseudobulk
meta = []   # one metadata record per pseudobulk, in the same order as `cols`

groups = adata.obs.groupby("orig_patients", sort=True).groups
for pid, idx in groups.items():
    # Positional row indices of this patient's cells
    idx_int = adata.obs_names.get_indexer(idx)
    n_tot = idx_int.size
    if n_tot < N_CELLS * N_REPS:
        print(f"· skip {pid}: only {n_tot} cells")
        continue

    # Per-patient RNG so the draw does not depend on which other patients are present
    rng = np.random.default_rng(seed_for_patient(str(pid), master_seed))
    # Shuffle, take the first N_CELLS * N_REPS cells, cut into N_REPS disjoint blocks
    sel = rng.permutation(idx_int)[: N_CELLS * N_REPS]
    rep_blocks = np.split(sel, N_REPS)

    # Stage label is read from the patient's first cell
    alz_status = adata.obs.loc[idx[0], "Alz_status"]

    for r, rep_idx in enumerate(rep_blocks, 1):
        # The same reductions work for sparse and dense matrices (the former sparse/dense
        # branches were identical); the rows are sliced once and reused for both sums.
        block = mat[rep_idx]
        cols.append(np.asarray(block.sum(axis=0)).ravel())
        meta.append(
            dict(
                bulk_id=f"{pid}_rep{r}",
                orig_patient=pid,
                replicate=r,
                Alz_status=alz_status,
                n_cells=int(rep_idx.size),
                lib_umis=int(block.sum()),
            )
        )

# Fail with a clear message instead of np.vstack's "need at least one array to concatenate"
if not cols:
    raise ValueError(
        f"No patient has at least {N_CELLS * N_REPS} cells; no pseudobulks were generated."
    )

# Genes x pseudobulks count table and the matching sample sheet
counts_df = pd.DataFrame(np.vstack(cols).T, index=genes, columns=[m["bulk_id"] for m in meta])
meta_df = pd.DataFrame(meta).set_index("bulk_id")

counts_df.to_csv(f"{OUTDIR}/pseudobulk_counts.csv")
meta_df.to_csv(f"{OUTDIR}/pseudobulk_meta.csv")

print(f"✔ saved {counts_df.shape[1]} pseudobulks from {meta_df['orig_patient'].nunique()} patients → {OUTDIR}")

# edgeR differential expression

In [None]:
# Once switched to the proper conda environment, load necessary packages
import warnings

warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import pandas as pd
import numpy as np
import random
import sc_toolbox
import anndata


import os
os.environ['R_HOME']='/hpc/group/goldsteinlab/envs/vmd13_Python_R_4_env/lib/R'

import rpy2.rinterface_lib.callbacks
import anndata2ri
import logging

from rpy2.robjects import pandas2ri
from rpy2.robjects import r

# Quiet scanpy and only surface rpy2 messages at ERROR level
sc.settings.verbosity = 0
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# Turn on the pandas and AnnData converters between Python and R
pandas2ri.activate()
anndata2ri.activate()

# Enable the %%R cell magic used by the cells below
%load_ext rpy2.ipython

In [None]:
# Show up to 100 columns and 100 rows when a pandas DataFrame is displayed
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [None]:
%%R
# Prepend the environment's R directory to the library search path.
# NOTE(review): packages normally live in a "library" subdirectory of R home — confirm this path.
.libPaths( c( "/hpc/group/goldsteinlab/envs/vmd13_Python_R_4_env/lib/R" , .libPaths() ) )

In [None]:
%%R
# Working directory for the R session; relative paths in later write.csv calls resolve here
setwd('/work/vmd13')

In [None]:
%%R
# R packages loaded for the differential-expression section
library("Rcpp")
library("ggplot2")
library("ggrepel")
library("repr")
library('edgeR')

In [None]:
%%R
# edgeR quasi-likelihood differential expression on the pseudobulk counts:
# PreClinical vs Control and Clinical vs Control.
suppressPackageStartupMessages({
  library(edgeR)
  library(limma)
})

indir <- "/hpc/group/goldsteinlab/vmd13/Python/250708_myeloid_bulks_v3"

# Genes x pseudobulks counts and the matching sample sheet written by the Python cell
counts <- read.csv(file.path(indir, "pseudobulk_counts.csv"),
                   row.names = 1, check.names = FALSE)
meta <- read.csv(file.path(indir, "pseudobulk_meta.csv"),
                 row.names = 1, check.names = FALSE)

# Put the sample sheet in the same order as the count columns
meta <- meta[colnames(counts), , drop = FALSE]

# Drop pseudobulks without a stage label from both tables
na_bulk <- rownames(meta)[is.na(meta$Alz_status)]
if (length(na_bulk)) {
  counts <- counts[, !colnames(counts) %in% na_bulk, drop = FALSE]
  meta <- meta[!rownames(meta) %in% na_bulk, , drop = FALSE]
}

y <- DGEList(counts = as.matrix(counts), samples = meta)
# Strip the hyphen ("Pre-Clinical" -> "PreClinical") so the level is a valid name in contrasts
y$samples$Alz_status <- factor(gsub("-", "", y$samples$Alz_status),
                               levels = c("Control", "PreClinical", "Clinical"))

# Group-means design: one coefficient per stage, no intercept.
# NOTE(review): the sample sheet holds several replicates per patient, and the design has
# no patient term, so replicates of one patient are treated as independent samples — confirm
# this is intended.
design <- model.matrix(~ 0 + Alz_status, data = y$samples)
colnames(design) <- levels(y$samples$Alz_status)

# Filter low-expression genes, recompute library sizes, normalise, fit
keep <- filterByExpr(y, design = design)
y <- y[keep, , keep.lib.sizes = FALSE]
y <- calcNormFactors(y)
y <- estimateDisp(y, design)
fit <- glmQLFit(y, design)

# Contrasts against the Control group
con_PC <- makeContrasts(PreClinical - Control, levels = design)
con_CL <- makeContrasts(Clinical - Control, levels = design)

# Full result tables (n = Inf keeps every gene)
de_PC <- topTags(glmQLFTest(fit, contrast = con_PC), n = Inf)$table
de_CL <- topTags(glmQLFTest(fit, contrast = con_CL), n = Inf)$table

write.csv(de_PC, file.path(indir, "DE_preclinical_vs_control.csv"))
write.csv(de_CL, file.path(indir, "DE_clinical_vs_control.csv"))

cat("✓ finished —", nrow(design), "bulks analysed; results in", indir, "\n")

In [None]:
%%R
# Gene-set testing with limma::fry on log-CPM values, reusing `y`, `design`, `con_PC`
# and `con_CL` from the edgeR cell.
gmt_file <- "/hpc/group/goldsteinlab/vmd13/Python/250625_myeloid_bulks/c5.hpo.v2024.1.Hs.symbols.gmt.txt"

lines <- readLines(gmt_file)

# Parse one GMT line: field 1 is the set name, field 2 is skipped, the rest are gene symbols
split_line <- function(x) {
  v <- strsplit(x, "\t")[[1]]
  list(name = v[1], genes = v[-(1:2)])
}

# Named list: set name -> character vector of genes
tmp <- lapply(lines, split_line)
pathways <- setNames(lapply(tmp, `[[`, "genes"),
                     vapply(tmp, `[[`, "", "name"))

logCPM <- cpm(y, log = TRUE, prior.count = 2)

# Row indices of each set's genes in logCPM; keep sets with at least 15 genes present
idx <- lapply(pathways, function(g) which(rownames(logCPM) %in% g))
idx <- idx[vapply(idx, length, 1L) >= 15]

# NOTE(review): the GMT file name says HPO (c5.hpo), but this message and the output file
# names below say "GO". The Python cells that follow read "fry_HPO_*.csv" from a different
# directory, so they do not pick up the files written here — confirm the intended names/paths.
cat("GO gene sets retained:", length(idx), "\n")

fry_PC <- fry(logCPM, index = idx, design = design, contrast = con_PC)
fry_CL <- fry(logCPM, index = idx, design = design, contrast = con_CL)

# Most significant sets first
fry_PC <- fry_PC[order(fry_PC$FDR), ]
fry_CL <- fry_CL[order(fry_CL$FDR), ]

# Relative paths: written to the R working directory
write.csv(fry_PC, "fry_GO_PreClinical_vs_Control.csv")
write.csv(fry_CL, "fry_GO_Clinical_vs_Control.csv")

cat("✓ fry results written (PreClinical & Clinical vs Control)\n")
print(head(fry_PC[, c("NGenes","Direction","PValue","FDR")], 10))
print(head(fry_CL[, c("NGenes","Direction","PValue","FDR")], 10))

In [None]:
import re
import matplotlib.pyplot as plt
import seaborn.objects as so

# Load the fry result tables (gene-set name as index).
# NOTE(review): this reads "fry_HPO_*.csv" from the "_v2" directory, while the R cell in
# this notebook writes "fry_GO_*.csv" to its working directory and the DE cell uses "_v3" —
# confirm these are the intended files.
indir = "/hpc/group/goldsteinlab/vmd13/Python/250708_myeloid_bulks_v2"
fry_PC = pd.read_csv(f"{indir}/fry_HPO_PreClinical_vs_Control.csv", index_col=0)
fry_CL = pd.read_csv(f"{indir}/fry_HPO_Clinical_vs_Control.csv", index_col=0)

# Case-insensitive keyword filter for gene-set names. The group is non-capturing:
# pandas' str.contains emits a UserWarning for patterns that contain capture groups.
pattern = re.compile(r"(?:brain|csf|mri|neuro)", flags=re.IGNORECASE)

def filter_hpo(df):
    """Return the rows of `df` whose index label matches `pattern`.

    Args:
        df: DataFrame indexed by gene-set name.

    Returns:
        The matching rows, in their original order.
    """
    # na=False treats a missing index label as "no match" instead of propagating NaN into
    # the boolean mask, which .loc cannot index with.
    matches = df.index.to_series().str.contains(pattern, na=False)
    return df.loc[matches]

def fry_bar_up(df, title, top_n=20):
    """Draw a horizontal bar chart of -log10(FDR) for the top up-regulated gene sets.

    Rows are first restricted with `filter_hpo` and to ``Direction == 'Up'``; the
    `top_n` rows with the smallest FDR are plotted, most significant at the top.

    Args:
        df: fry result table indexed by gene-set name, with "Direction" and "FDR" columns.
        title: Text placed before "(Up pathways)" in the plot title.
        top_n: Maximum number of gene sets to show.

    Returns:
        The current matplotlib figure, or None when no row passes the filters.
    """
    up_sets = filter_hpo(df).query("Direction == 'Up'").copy()
    if up_sets.empty:
        return None

    # An FDR of exactly 0 would give an infinite bar; clamp it to a tiny positive value
    up_sets["FDR"] = up_sets["FDR"].replace(0, 1e-300)
    up_sets["-log10FDR"] = -np.log10(up_sets["FDR"])

    # Smallest FDR first, then reversed so the most significant set is drawn at the top
    ranked = up_sets.sort_values("FDR").head(top_n)
    top = ranked.iloc[::-1]

    # Figure height grows with the number of bars
    canvas = plt.figure(figsize=(7, 0.35 * len(top)))
    chart = so.Plot(top, y=top.index, x="-log10FDR")
    chart = chart.add(so.Bar(width=0.8, color="#D95F02"))
    chart = chart.label(x="-log10 FDR", y="", title=f"{title}  (Up pathways)")
    chart = chart.theme({"axes.grid": False})
    chart.on(canvas).plot()

    plt.tight_layout()
    return plt.gcf()

# One chart per contrast; both figures are displayed by the single plt.show() below
fry_bar_up(fry_PC, "Pre-Clinical vs Control")
fry_bar_up(fry_CL, "Clinical vs Control")
plt.show()

In [None]:
# Reload the fry result tables (gene-set name as index).
# NOTE(review): same "_v2" / "fry_HPO_*" path question as the other loading cells in this section.
indir = "/hpc/group/goldsteinlab/vmd13/Python/250708_myeloid_bulks_v2"
fry_PC = pd.read_csv(f"{indir}/fry_HPO_PreClinical_vs_Control.csv", index_col=0)
fry_CL = pd.read_csv(f"{indir}/fry_HPO_Clinical_vs_Control.csv",    index_col=0)

# Case-insensitive keyword filter for gene-set names. The group is non-capturing:
# pandas' str.contains emits a UserWarning for patterns that contain capture groups.
pattern = re.compile(r"(?:brain|csf|mri|neuro)", flags=re.IGNORECASE)
def filter_hpo(df):
    """Return the rows of `df` whose index label matches `pattern`.

    Args:
        df: DataFrame indexed by gene-set name.

    Returns:
        The matching rows, in their original order.
    """
    # na=False treats a missing index label as "no match" instead of propagating NaN into
    # the boolean mask, which .loc cannot index with.
    matches = df.index.to_series().str.contains(pattern, na=False)
    return df.loc[matches]

def fry_bar(df, title, svg_path, top_n=10):
    """Save a horizontal bar chart of -log10(FDR) for the 'Up' rows kept by ``filter_hpo``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``Direction`` and ``FDR`` columns; its index labels are
        filtered by ``filter_hpo`` and used as the bar labels.
    title : str
        Plot title (also used in the "no matching" message).
    svg_path : str
        Destination of the SVG file.
    top_n : int, default 10
        Number of lowest-FDR rows to draw.

    Returns
    -------
    None
        Prints a message and returns early when no row survives the filters.
    """
    # Filter first and copy last, so the column writes below hit a frame this
    # function owns (avoids SettingWithCopyWarning on the filtered result).
    df = filter_hpo(df.query("Direction == 'Up'")).copy()
    if df.empty:
        print(f"No matching Up pathways for {title}")
        return

    # Assign the result back: `df["FDR"].replace(..., inplace=True)` is a chained
    # in-place call that pandas warns about and that does not update `df` under
    # Copy-on-Write, which would leave zeros that turn into inf after the log.
    df["FDR"] = df["FDR"].replace(0, 1e-300)
    df["-log10FDR"] = -np.log10(df["FDR"])

    top = df.sort_values("FDR").head(top_n)

    # Figure height scales with the number of bars drawn.
    fig = plt.figure(figsize=(7, 0.38 * len(top)))
    try:
        (
            so.Plot(top, y=top.index, x="-log10FDR")
            .add(so.Bar(width=0.8, color="#D95F02"))
            .label(x="-log10 FDR", y="", title=title)
            .theme({"axes.grid": False})
            .on(fig)
            .plot()
        )
        plt.xticks([0, 0.5, 1.0, 1.5, 2.0])
        plt.tight_layout()
        fig.savefig(svg_path, format="svg", dpi=300, bbox_inches="tight")
        print(f"✓ saved {svg_path}")
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)

# Export one SVG per contrast.
for frame, heading, out_svg in (
    (fry_PC, "Pre-Clinical vs Control  (Top 10 Up pathways)",
     "/work/vmd13/fry_HPO_brain_csf_PreClinical_vs_Control.svg"),
    (fry_CL, "Clinical vs Control  (Top 10 Up pathways)",
     "/work/vmd13/fry_HPO_brain_csf_Clinical_vs_Control.svg"),
):
    fry_bar(frame, heading, out_svg)


In [None]:
# Directory holding the fry result tables; the first CSV column becomes the index.
indir = "/hpc/group/goldsteinlab/vmd13/Python/250708_myeloid_bulks_v2"
pc_csv = f"{indir}/fry_HPO_PreClinical_vs_Control.csv"
cl_csv = f"{indir}/fry_HPO_Clinical_vs_Control.csv"
fry_PC = pd.read_csv(pc_csv, index_col=0)
fry_CL = pd.read_csv(cl_csv, index_col=0)

# Case-insensitive keyword filter applied to the row labels of the fry tables.
pattern = re.compile(r"(brain|csf|mri|neuro)", flags=re.IGNORECASE)

def filter_hpo(df):
    """Return the rows of ``df`` whose index label matches ``pattern``.

    The compiled regex is applied directly with ``pattern.search`` instead of
    ``Series.str.contains``. That avoids pandas' "pattern has match groups"
    UserWarning triggered by the capturing group, and it treats non-string
    labels (e.g. a NaN index entry) as non-matching instead of producing a
    NaN mask that ``.loc`` rejects.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose index labels are tested against ``pattern``.

    Returns
    -------
    pandas.DataFrame
        The matching rows, in their original order.
    """
    keep = np.array(
        [isinstance(term, str) and pattern.search(term) is not None for term in df.index],
        dtype=bool,
    )
    return df.loc[keep]

def fry_bar(df, title, svg_path, top_n=10):
    """Save a horizontal bar chart of -log10(FDR) for the 'Up' rows kept by ``filter_hpo``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``Direction`` and ``FDR`` columns; its index labels are
        filtered by ``filter_hpo`` and used as the bar labels.
    title : str
        Plot title.
    svg_path : str
        Destination of the SVG file.
    top_n : int, default 10
        Number of lowest-FDR rows to draw.

    Returns
    -------
    None
        Returns early, without writing a file, when no row survives the filters.
    """
    hits = filter_hpo(df).query("Direction == 'Up'").copy()
    if hits.empty:
        return

    # Exact zeros are swapped for a tiny positive value so the log stays finite.
    hits["FDR"] = hits["FDR"].replace(0, 1e-300)
    hits["-log10FDR"] = -np.log10(hits["FDR"])
    ranked = hits.sort_values("FDR")
    top = ranked.head(top_n)

    # Figure height scales with the number of bars drawn.
    fig = plt.figure(figsize=(7, 0.38 * len(top)))
    bars = so.Plot(top, y=top.index, x="-log10FDR")
    bars = bars.add(so.Bar(width=0.8, color="#D95F02"))
    bars = bars.label(x="-log10 FDR", y="", title=title)
    bars = bars.theme({"axes.grid": False})
    bars.on(fig).plot()

    plt.xticks([0, 0.5, 1.0, 1.5, 2.0])
    plt.tight_layout()
    fig.savefig(svg_path, format="svg", dpi=300, bbox_inches="tight")
    plt.close(fig)

# Export one SVG per contrast.
fry_bar(
    fry_PC,
    title="Pre-Clinical vs Control (Top 10 Up pathways)",
    svg_path="/work/vmd13/fry_HPO_brain_csf_PreClinical_vs_Control.svg",
)
fry_bar(
    fry_CL,
    title="Clinical vs Control (Top 10 Up pathways)",
    svg_path="/work/vmd13/fry_HPO_brain_csf_Clinical_vs_Control.svg",
)

In [None]:
%%R
suppressPackageStartupMessages({
  library(ggplot2)
  library(ggrepel)
})

# Thresholds: FDR cutoff for significance, |logFC| cutoff for labelling.
sig_cut <- 0.05
lab_cut <- 1.0

# Output files for the quadrant plot.
out_pdf <- "/work/vmd13/quadrant_DE_preclin_clinical_crop2.5.pdf"
out_png <- "/work/vmd13/quadrant_DE_preclin_clinical_crop2.5.png"

# Join the two DE tables on their row names; the shared columns get
# _PC / _CL suffixes and the row-name column is renamed to "gene".
de_cols <- c("logFC", "FDR")
both <- merge(de_PC[, de_cols], de_CL[, de_cols],
              by = "row.names", suffixes = c("_PC", "_CL"))
names(both)[1] <- "gene"

# Drop genes whose name is UTY, contains a dot, or starts with ENSG, MT-, H2 or H3.
bad <- grepl("^UTY$|\\.|^ENSG|^MT-|^H2|^H3", both$gene, perl = TRUE)
both <- both[!bad, ]

# Classify every gene by significance (FDR < sig_cut) and direction (sign of
# logFC) in the two contrasts.
sig_PC  <- both$FDR_PC < sig_cut
sig_CL  <- both$FDR_CL < sig_cut
up_PC   <- both$logFC_PC > 0
up_CL   <- both$logFC_CL > 0
down_PC <- both$logFC_PC < 0
down_CL <- both$logFC_CL < 0

both$cat <- "Not sig"
both$cat[sig_PC & sig_CL & up_PC & up_CL]     <- "Up in both"
both$cat[sig_PC & sig_CL & down_PC & down_CL] <- "Down in both"
# Significant in both contrasts but with opposite signs. Previously these genes
# matched no rule and were left as "Not sig" (grey) even though they pass the
# FDR cutoff twice and could still receive a text label.
both$cat[sig_PC & sig_CL & ((up_PC & down_CL) | (down_PC & up_CL))] <- "Discordant"
both$cat[sig_PC & both$FDR_CL >= sig_cut & up_PC]   <- "Up PC only"
both$cat[sig_PC & both$FDR_CL >= sig_cut & down_PC] <- "Down PC only"
both$cat[sig_CL & both$FDR_PC >= sig_cut & up_CL]   <- "Up CL only"
both$cat[sig_CL & both$FDR_PC >= sig_cut & down_CL] <- "Down CL only"

# Fixed level order; it drives the legend order in the plot.
both$cat <- factor(both$cat, levels = c("Up in both", "Down in both", "Discordant",
                                       "Up PC only", "Down PC only",
                                       "Up CL only", "Down CL only", "Not sig"))

# One colour per category (names must match the factor levels above).
pal <- c("Up in both" = "#e41a1c",
         "Down in both" = "#377eb8",
         "Discordant" = "#f781bf",
         "Up PC only" = "#984ea3",
         "Down PC only" = "#4daf4a",
         "Up CL only" = "#ff7f00",
         "Down CL only" = "#a65628",
         "Not sig" = "grey75")

# Label a gene when |logFC| >= lab_cut in at least one contrast and it is
# significant in at least one contrast; all other genes get NA (no label).
both$label <- ifelse(
  (abs(both$logFC_PC) >= lab_cut | abs(both$logFC_CL) >= lab_cut) &
    (both$FDR_PC < sig_cut | both$FDR_CL < sig_cut),
  both$gene, NA
)

# Repelled text labels for the genes flagged in both$label.
gene_labels <- geom_text_repel(
  aes(label = label),
  max.overlaps = 800,
  box.padding = 0.3,
  point.padding = 0.2,
  min.segment.length = 0,
  size = 2.6,
  seed = 42
)

# Axis titles and plot title.
# NOTE(review): the axis titles contain U+2212 (−); confirm the default PDF
# device used by ggsave renders it correctly on this system.
plot_text <- labs(x = "log2 FC  (Pre-Clinical − Control)",
                  y = "log2 FC  (Clinical − Control)",
                  title = "Quadrant plot of differential expression")

# Scatter of the two log fold changes, coloured by category, with reference
# lines at zero and the view cropped to [-2.5, 2.5] on both axes.
p <- ggplot(both, aes(x = logFC_PC, y = logFC_CL, colour = cat)) +
  geom_hline(yintercept = 0, colour = "grey70") +
  geom_vline(xintercept = 0, colour = "grey70") +
  geom_point(size = 1.2, alpha = 0.8) +
  gene_labels +
  scale_colour_manual(values = pal, name = NULL, drop = FALSE) +
  coord_cartesian(xlim = c(-2.5, 2.5), ylim = c(-2.5, 2.5)) +
  plot_text +
  theme_classic() +
  theme(legend.position = "right")

# Save as PDF and as 300-dpi PNG, both 6 x 6 inches.
ggsave(out_pdf, plot = p, width = 6, height = 6, units = "in")
ggsave(out_png, plot = p, width = 6, height = 6, units = "in", dpi = 300)

print(p)

In [None]:
%%R
# Count the genes assigned to each concordant category.
count_cat <- function(level) sum(both$cat == level)
n_up_both   <- count_cat("Up in both")
n_down_both <- count_cat("Down in both")

# Print the two counts together with the FDR cutoff in use.
cat(
  "\nConcordant DE genes (FDR <", sig_cut, "in both contrasts):\n",
  "  Up in both   :", n_up_both, "\n",
  "  Down in both :", n_down_both, "\n"
)


In [None]:
%%R
suppressPackageStartupMessages({
  library(edgeR)
  library(ggplot2)
  library(reshape2)
})

# log-CPM matrix computed from `y` with a prior count of 2.
logCPM <- cpm(y, log = TRUE, prior.count = 2)

# Genes of interest, restricted to those that are rows of logCPM.
genes <- c("HLA-DQA2", "EGR3")
present <- intersect(genes, rownames(logCPM))

# Transpose so each selected gene becomes a column, then attach the stage
# annotation taken from y$samples.
gene_mat <- logCPM[present, , drop = FALSE]
expr_df <- as.data.frame(t(gene_mat))
expr_df$Alz_status <- y$samples$Alz_status

# Long format: one row per (Alz_status, Gene) value.
expr_long <- melt(expr_df, id.vars = "Alz_status",
                  variable.name = "Gene", value.name = "logCPM")

# Fix the stage order and relabel "PreClinical" as "Pre-Clinical" for display.
stage_levels <- c("Control", "PreClinical", "Clinical")
stage_labels <- c("Control", "Pre-Clinical", "Clinical")
expr_long$Alz_status <- factor(expr_long$Alz_status,
                               levels = stage_levels,
                               labels = stage_labels)

# Every unordered pair of stage labels, as a list of length-2 character vectors.
stage_pairs <- combn(levels(expr_long$Alz_status), 2, simplify = FALSE)

# Pairwise Wilcoxon tests between stages, per gene, with BH adjustment across
# the stage pairs of each gene.
# Only genes that are actually in logCPM (`present`) are tested: looping over
# `genes` would hand wilcox.test empty vectors for an absent gene and stop
# with an error.
missing_genes <- setdiff(genes, present)
if (length(missing_genes) > 0) {
  cat("\nSkipping genes absent from logCPM: ",
      paste(missing_genes, collapse = ", "), "\n", sep = "")
}

for (g in present) {
  cat("\n", g, "\n", sep = "")
  gene_dat <- expr_long[expr_long$Gene == g, ]
  p_raw <- sapply(stage_pairs, function(pr) {
    # Local names chosen so they do not shadow the DGEList `y`.
    vals_a <- gene_dat$logCPM[gene_dat$Alz_status == pr[1]]
    vals_b <- gene_dat$logCPM[gene_dat$Alz_status == pr[2]]
    wilcox.test(vals_a, vals_b, exact = FALSE)$p.value
  })
  p_adj <- p.adjust(p_raw, method = "BH")
  for (i in seq_along(stage_pairs)) {
    pr <- stage_pairs[[i]]
    cat(sprintf("  %s vs %s : raw p = %.3g  adj p = %.3g\n",
                pr[1], pr[2], p_raw[i], p_adj[i]))
  }
}

# Fill colours, matched to the stage factor levels in order.
stage_fill <- c("#bfbfbf", "#625c86", "#7295b4")

# Text styling: facet strip size and tilted x tick labels.
text_theme <- theme(strip.text = element_text(size = 11),
                    axis.text.x = element_text(angle = 30, hjust = 1))

# One boxplot panel per gene (free y scale), outlier points hidden.
p <- ggplot(expr_long, aes(x = Alz_status, y = logCPM, fill = Alz_status)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.8, width = 0.65) +
  scale_fill_manual(values = stage_fill, guide = "none") +
  facet_wrap(~ Gene, scales = "free_y") +
  labs(x = "", y = "log2 CPM", title = "Pseudobulk expression by Alzheimer stage") +
  theme_classic() +
  text_theme

# Save as a 6 x 4 inch PNG at 300 dpi.
out_png <- "/work/vmd13/pseudobulk_HLADQA2_EGR3_boxplots.png"
ggsave(out_png, plot = p, width = 6, height = 4, units = "in", dpi = 300)

print(p)