In [None]:
!pip install pymc arviz pandas matplotlib -q
# --- Imports ---
import pymc as pm
import numpy as np
import arviz as az
import pandas as pd
import matplotlib.pyplot as plt

# --- 1. Data Simulation with Structured Noise ---
# Seed NumPy's global RNG so the simulated counts are identical on every run.
np.random.seed(42)
n_gene_families = 20
genes_per_family = 500
base_proportions = np.array([0.70, 0.15, 0.15])


def _simulate_family_counts():
    """Draw one gene family's topology counts around ``base_proportions``.

    The second and third proportions receive (almost) opposite shifts, which
    is what makes their fluctuations anti-correlated; the first proportion
    gets its own independent shift. Taking the absolute value and
    renormalising keeps the perturbed vector a valid probability vector
    before it is handed to the multinomial draw.

    The order of the random draws matters for reproducibility and is kept
    fixed: shared shift, opposing jitter, majority shift, multinomial.
    """
    minority_shift = np.random.normal(0, 0.02)
    opposing_shift = -minority_shift + np.random.normal(0, 0.005)
    majority_shift = np.random.normal(0, 0.01)
    perturbed = np.abs(
        base_proportions + np.array([majority_shift, minority_shift, opposing_shift])
    )
    return np.random.multinomial(genes_per_family, perturbed / perturbed.sum())


# One row of three topology counts per gene family.
observed_counts = np.array(
    [_simulate_family_counts() for _ in range(n_gene_families)], dtype=int
)

print("--- Using Data with Anti-Correlated Fluctuation ---")
counts_preview = pd.DataFrame(
    observed_counts, columns=['Topology 1', 'Topology 2', 'Topology 3']
).head()
print(counts_preview)
print("-" * 60)

# --- 2. Define the Three Competing Models ---

# MODEL 1: Standard Hierarchical ILS Model (Baseline)
# Every gene family gets its own vector of three topology proportions, all
# drawn from one Dirichlet whose concentration vector is learned from the data.
with pm.Model() as model_ils:
    # Lognormal prior with median 10 on each of the three concentration entries.
    concentration = pm.Lognormal('concentration', mu=np.log(10), sigma=1, shape=3)
    proportions = pm.Dirichlet('proportions', a=concentration, shape=(n_gene_families, 3))
    pm.Multinomial(
        'obs',
        n=genes_per_family,
        p=proportions,
        observed=observed_counts,
        shape=(n_gene_families, 3),
    )
    # random_seed pins the sampler. Seeding NumPy's global RNG (as the data
    # simulation does) is not enough: recent PyMC versions do not take their
    # sampling seeds from it, so without this every run gives different draws.
    # log_likelihood=True stores the pointwise log-likelihood that the later
    # model comparison needs.
    trace_ils = pm.sample(
        2000,
        tune=2500,
        chains=4,
        target_accept=0.99,
        random_seed=42,
        idata_kwargs={'log_likelihood': True},
    )

# MODEL 2: Refined IIM 2.0 "Soft Constraint" Model
# The first proportion (p0) is modelled on its own; the remaining mass is then
# split between the other two topologies by a per-family fraction whose prior
# is symmetric around an even split.
with pm.Model() as model_iim_refined:
    # Beta in mean/concentration form: alpha = mean * kappa, beta = (1 - mean) * kappa.
    # The Beta(7, 3) hyper-prior puts the prior mean of mu_p0 at 0.7.
    mu_p0 = pm.Beta('mu_p0', alpha=7, beta=3)
    kappa_p0 = pm.HalfNormal('kappa_p0', sigma=10)
    p0 = pm.Beta('p0', alpha=mu_p0 * kappa_p0, beta=(1 - mu_p0) * kappa_p0, shape=n_gene_families)

    remaining_prob = 1 - p0
    # Equal alpha and beta centre prop_p1 on 0.5; a larger kappa_minority pulls
    # the split harder towards 50/50 (the "soft constraint").
    kappa_minority = pm.HalfNormal('kappa_minority', sigma=20)
    prop_p1 = pm.Beta(
        'prop_p1',
        alpha=0.5 * kappa_minority,
        beta=0.5 * kappa_minority,
        shape=n_gene_families,
    )

    p1 = remaining_prob * prop_p1
    p2 = remaining_prob * (1 - prop_p1)

    # Rows are (p0, p1, p2) per gene family and sum to 1 by construction.
    proportions = pm.math.stack([p0, p1, p2], axis=1)
    pm.Multinomial(
        'obs',
        n=genes_per_family,
        p=proportions,
        observed=observed_counts,
        shape=(n_gene_families, 3),
    )
    # random_seed pins the sampler. Seeding NumPy's global RNG (as the data
    # simulation does) is not enough: recent PyMC versions do not take their
    # sampling seeds from it, so without this every run gives different draws.
    trace_iim_refined = pm.sample(
        2000,
        tune=2500,
        chains=4,
        target_accept=0.99,
        random_seed=42,
        idata_kwargs={'log_likelihood': True},
    )

# MODEL 3: Advanced IIM 3.0 "Correlated Fluctuation" Model
# Per-family logits follow a multivariate normal with a learned covariance
# (LKJ prior on the correlations); a softmax turns each row of logits into
# three topology proportions.
with pm.Model() as model_iim_correlated:
    mu_logits = pm.Normal("mu_logits", mu=0, sigma=1.5, shape=3)
    sd_dist = pm.HalfNormal.dist(sigma=2.5, shape=3)
    # compute_corr=True returns the Cholesky factor together with the
    # correlation matrix and the standard deviations.
    chol, corrs, stds = pm.LKJCholeskyCov(
        "chol", n=3, eta=2.0, sd_dist=sd_dist, compute_corr=True
    )
    logits = pm.MvNormal("logits", mu=mu_logits, chol=chol, shape=(n_gene_families, 3))
    # axis=1 normalises across the three topologies within each gene family.
    proportions = pm.Deterministic("proportions", pm.math.softmax(logits, axis=1))
    pm.Multinomial(
        'obs',
        n=genes_per_family,
        p=proportions,
        observed=observed_counts,
        shape=(n_gene_families, 3),
    )
    # random_seed pins the sampler. Seeding NumPy's global RNG (as the data
    # simulation does) is not enough: recent PyMC versions do not take their
    # sampling seeds from it, so without this every run gives different draws.
    trace_iim_correlated = pm.sample(
        2000,
        tune=2500,
        chains=4,
        target_accept=0.99,
        random_seed=42,
        idata_kwargs={'log_likelihood': True},
    )


# --- 4. The Final Model Showdown ---
# Print a banner, then rank the three fitted models against each other by WAIC.
banner_rule = "=" * 60
print("\n" + banner_rule)
print("           FINAL MODEL SHOWDOWN")
print(banner_rule)

# Display label -> fitted trace, in the order the models were defined.
comparison_data = dict(zip(
    ('Standard ILS', 'Refined IIM 2.0', 'Advanced IIM 3.0 (Correlated)'),
    (trace_ils, trace_iim_refined, trace_iim_correlated),
))
compare_df = az.compare(comparison_data, ic='waic')
print(compare_df)

# --- 5. Examine the Correlation Parameter ---
# Histograms are drawn instead of the default smooth KDE curves: the KDE
# rendering raised an error for this variable. (chol_corr is a correlation
# matrix, so its diagonal entries are constant at 1 and have no spread to
# estimate a density from.)
az.plot_posterior(trace_iim_correlated, var_names=["chol_corr"], kind='hist')
plt.show()

In [None]:
!pip install pymc arviz pandas matplotlib -q
# --- Imports ---
import pymc as pm
import numpy as np
import arviz as az
import pandas as pd
import matplotlib.pyplot as plt

# --- 1. Data Simulation with Structured Noise ---
# Seed NumPy's global RNG so the simulated counts are identical on every run.
np.random.seed(42)
n_gene_families = 20
genes_per_family = 500
base_proportions = np.array([0.70, 0.15, 0.15])


def _simulate_family_counts():
    """Draw one gene family's topology counts around ``base_proportions``.

    The second and third proportions receive (almost) opposite shifts, which
    is what makes their fluctuations anti-correlated; the first proportion
    gets its own independent shift. Taking the absolute value and
    renormalising keeps the perturbed vector a valid probability vector
    before it is handed to the multinomial draw.

    The order of the random draws matters for reproducibility and is kept
    fixed: shared shift, opposing jitter, majority shift, multinomial.
    """
    minority_shift = np.random.normal(0, 0.02)
    opposing_shift = -minority_shift + np.random.normal(0, 0.005)
    majority_shift = np.random.normal(0, 0.01)
    perturbed = np.abs(
        base_proportions + np.array([majority_shift, minority_shift, opposing_shift])
    )
    return np.random.multinomial(genes_per_family, perturbed / perturbed.sum())


# One row of three topology counts per gene family.
observed_counts = np.array(
    [_simulate_family_counts() for _ in range(n_gene_families)], dtype=int
)

print("--- Using Data with Anti-Correlated Fluctuation ---")
counts_preview = pd.DataFrame(
    observed_counts, columns=['Topology 1', 'Topology 2', 'Topology 3']
).head()
print(counts_preview)
print("-" * 60)

# --- 2. Define the Three Competing Models ---

# MODEL 1: Standard Hierarchical ILS Model (Baseline)
# Every gene family gets its own vector of three topology proportions, all
# drawn from one Dirichlet whose concentration vector is learned from the data.
with pm.Model() as model_ils:
    # Lognormal prior with median 10 on each of the three concentration entries.
    concentration = pm.Lognormal('concentration', mu=np.log(10), sigma=1, shape=3)
    proportions = pm.Dirichlet('proportions', a=concentration, shape=(n_gene_families, 3))
    pm.Multinomial(
        'obs',
        n=genes_per_family,
        p=proportions,
        observed=observed_counts,
        shape=(n_gene_families, 3),
    )
    # random_seed pins the sampler. Seeding NumPy's global RNG (as the data
    # simulation does) is not enough: recent PyMC versions do not take their
    # sampling seeds from it, so without this every run gives different draws.
    # log_likelihood=True stores the pointwise log-likelihood that the later
    # model comparison needs.
    trace_ils = pm.sample(
        2000,
        tune=2500,
        chains=4,
        target_accept=0.99,
        random_seed=42,
        idata_kwargs={'log_likelihood': True},
    )

# MODEL 2: Refined IIM 2.0 "Soft Constraint" Model
# The first proportion (p0) is modelled on its own; the remaining mass is then
# split between the other two topologies by a per-family fraction whose prior
# is symmetric around an even split.
with pm.Model() as model_iim_refined:
    # Beta in mean/concentration form: alpha = mean * kappa, beta = (1 - mean) * kappa.
    # The Beta(7, 3) hyper-prior puts the prior mean of mu_p0 at 0.7.
    mu_p0 = pm.Beta('mu_p0', alpha=7, beta=3)
    kappa_p0 = pm.HalfNormal('kappa_p0', sigma=10)
    p0 = pm.Beta('p0', alpha=mu_p0 * kappa_p0, beta=(1 - mu_p0) * kappa_p0, shape=n_gene_families)

    remaining_prob = 1 - p0
    # Equal alpha and beta centre prop_p1 on 0.5; a larger kappa_minority pulls
    # the split harder towards 50/50 (the "soft constraint").
    kappa_minority = pm.HalfNormal('kappa_minority', sigma=20)
    prop_p1 = pm.Beta(
        'prop_p1',
        alpha=0.5 * kappa_minority,
        beta=0.5 * kappa_minority,
        shape=n_gene_families,
    )

    p1 = remaining_prob * prop_p1
    p2 = remaining_prob * (1 - prop_p1)

    # Rows are (p0, p1, p2) per gene family and sum to 1 by construction.
    proportions = pm.math.stack([p0, p1, p2], axis=1)
    pm.Multinomial(
        'obs',
        n=genes_per_family,
        p=proportions,
        observed=observed_counts,
        shape=(n_gene_families, 3),
    )
    # random_seed pins the sampler. Seeding NumPy's global RNG (as the data
    # simulation does) is not enough: recent PyMC versions do not take their
    # sampling seeds from it, so without this every run gives different draws.
    trace_iim_refined = pm.sample(
        2000,
        tune=2500,
        chains=4,
        target_accept=0.99,
        random_seed=42,
        idata_kwargs={'log_likelihood': True},
    )

# MODEL 3: Advanced IIM 3.0 "Correlated Fluctuation" Model
# Per-family logits follow a multivariate normal with a learned covariance
# (LKJ prior on the correlations); a softmax turns each row of logits into
# three topology proportions.
with pm.Model() as model_iim_correlated:
    mu_logits = pm.Normal("mu_logits", mu=0, sigma=1.5, shape=3)
    sd_dist = pm.HalfNormal.dist(sigma=2.5, shape=3)
    # compute_corr=True returns the Cholesky factor together with the
    # correlation matrix and the standard deviations.
    chol, corrs, stds = pm.LKJCholeskyCov(
        "chol", n=3, eta=2.0, sd_dist=sd_dist, compute_corr=True
    )
    logits = pm.MvNormal("logits", mu=mu_logits, chol=chol, shape=(n_gene_families, 3))
    # axis=1 normalises across the three topologies within each gene family.
    proportions = pm.Deterministic("proportions", pm.math.softmax(logits, axis=1))
    pm.Multinomial(
        'obs',
        n=genes_per_family,
        p=proportions,
        observed=observed_counts,
        shape=(n_gene_families, 3),
    )
    # random_seed pins the sampler. Seeding NumPy's global RNG (as the data
    # simulation does) is not enough: recent PyMC versions do not take their
    # sampling seeds from it, so without this every run gives different draws.
    trace_iim_correlated = pm.sample(
        2000,
        tune=2500,
        chains=4,
        target_accept=0.99,
        random_seed=42,
        idata_kwargs={'log_likelihood': True},
    )


# --- 4. The Final Model Showdown ---
# Print a banner, then rank the three fitted models against each other by WAIC.
banner_rule = "=" * 60
print("\n" + banner_rule)
print("           FINAL MODEL SHOWDOWN")
print(banner_rule)

# Display label -> fitted trace, in the order the models were defined.
comparison_data = dict(zip(
    ('Standard ILS', 'Refined IIM 2.0', 'Advanced IIM 3.0 (Correlated)'),
    (trace_ils, trace_iim_refined, trace_iim_correlated),
))
compare_df = az.compare(comparison_data, ic='waic')
print(compare_df)

# --- 5. Examine the Correlation Parameter ---
# Histograms are drawn instead of the default smooth KDE curves: the KDE
# rendering raised an error for this variable. (chol_corr is a correlation
# matrix, so its diagonal entries are constant at 1 and have no spread to
# estimate a density from.) The earlier default-KDE call is dropped because it
# aborted the cell before the summary below could run.
az.plot_posterior(trace_iim_correlated, var_names=["chol_corr"], kind='hist')
plt.show()

# --- 6. Inspect the Correlated Model's Parameters ---

# Print summary statistics (posterior means, spread and credible intervals)
# for the correlation entries of the correlated model's trace.
summary = az.summary(trace_iim_correlated, var_names=["chol_corr"])
print(summary)

In [None]:
import pymc as pm
import numpy as np
import arviz as az
import matplotlib.pyplot as plt
import xarray as xr

# Observed counts: one vector holding the tally for each of three categories,
# plus their grand total (used as the multinomial's number of trials).
observed_counts = np.asarray([11904, 2816, 2697])
total_counts = np.sum(observed_counts)

# Baseline model for the pooled counts, plus its posterior predictive draws.
# NOTE: this is a single flat Dirichlet(1, 1, 1) prior over the three
# proportions, not the hierarchical "Standard ILS" model used for the
# per-family data.
with pm.Model() as model_set:
    proportions = pm.Dirichlet('proportions', a=np.array([1, 1, 1]))
    multinom_obs = pm.Multinomial('obs', n=total_counts, p=proportions, observed=observed_counts)
    # random_seed makes both the posterior draws and the predictive draws
    # repeatable from run to run.
    trace_set = pm.sample(1000, tune=1000, chains=2, cores=1, random_seed=42)
    ppc_set = pm.sample_posterior_predictive(trace_set, var_names=["obs"], random_seed=42)


# Correlated-logit (IIM) model for the pooled counts, plus its posterior
# predictive draws. This rebinds model_iim_correlated / trace_iim_correlated,
# with wider priors than the per-family version (sigma=3.0, eta=4.0) and a
# single logit vector instead of one per gene family.
with pm.Model() as model_iim_correlated:
    mu_logits = pm.Normal("mu_logits", mu=0, sigma=3.0, shape=3)
    sd_dist = pm.HalfNormal.dist(sigma=3.0, shape=3)
    # compute_corr=True returns the Cholesky factor together with the
    # correlation matrix and the standard deviations.
    chol, corrs, stds = pm.LKJCholeskyCov(
        "chol", n=3, eta=4.0, sd_dist=sd_dist, compute_corr=True
    )
    logits = pm.MvNormal("logits", mu=mu_logits, chol=chol, shape=3)
    proportions_iim = pm.Deterministic("proportions", pm.math.softmax(logits))
    multinom_obs = pm.Multinomial('obs', n=total_counts, p=proportions_iim, observed=observed_counts)
    # random_seed makes both the posterior draws and the predictive draws
    # repeatable from run to run.
    trace_iim_correlated = pm.sample(1000, tune=1000, chains=2, cores=1, random_seed=42)
    ppc_iim = pm.sample_posterior_predictive(
        trace_iim_correlated, var_names=["obs"], random_seed=42
    )

# DEBUG: Print available keys to verify.
# pm.sample_posterior_predictive returns an InferenceData by default, so these
# keys are *group* names (e.g. "posterior_predictive"), not variable names.
print("Keys in ppc_set:", ppc_set.keys())
print("Keys in ppc_iim:", ppc_iim.keys())

# Grab the actual variable name that was sampled. It has to be read from the
# posterior_predictive group; the top-level keys are group names.
varname = list(ppc_set.posterior_predictive.data_vars)[0]  # Assuming only one variable returned


# Build ArviZ InferenceData manually
def build_ppc_idata(ppc_dict, varname):
    """Package posterior predictive draws so that ``az.plot_ppc`` can use them.

    Parameters
    ----------
    ppc_dict
        Either the InferenceData returned by ``pm.sample_posterior_predictive``
        (its ``posterior_predictive`` group is used) or a plain mapping from
        variable name to an array of draws.
    varname
        Name of the predicted variable inside the predictive draws.

    Returns
    -------
    az.InferenceData
        Contains a ``posterior_predictive`` group with the draws flattened
        into one chain (dims ``chain``, ``draw``, ``obs_dim``) and an
        ``observed_data`` group holding the module-level ``observed_counts``,
        which ``az.plot_ppc(..., observed=True)`` requires.
    """
    # InferenceData exposes the draws under .posterior_predictive; anything
    # else is treated as a mapping that already holds the draws directly.
    predictive = getattr(ppc_dict, "posterior_predictive", ppc_dict)
    n_obs = observed_counts.shape[0]
    # Merge all chains into a single pseudo-chain: (1, total_draws, n_obs).
    draws = np.asarray(predictive[varname]).reshape(1, -1, n_obs)
    category_coords = {"obs_dim": np.arange(n_obs)}
    return az.InferenceData(
        posterior_predictive=xr.Dataset(
            {varname: (["chain", "draw", "obs_dim"], draws)},
            coords={"chain": [0], "draw": np.arange(draws.shape[1]), **category_coords},
        ),
        observed_data=xr.Dataset(
            {varname: (["obs_dim"], observed_counts)},
            coords=category_coords,
        ),
    )


# Build ID for plotting
ppc_set_idata = build_ppc_idata(ppc_set, varname)
ppc_iim_idata = build_ppc_idata(ppc_iim, varname)

# Plot PPCs: one panel per model, side by side, each overlaying the observed
# data on the posterior predictive KDEs.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

ppc_panels = (
    (ppc_set_idata, "PPC: Standard ILS"),
    (ppc_iim_idata, "PPC: IIM 3.0 (Correlated)"),
)
for panel_ax, (panel_idata, panel_title) in zip(axes, ppc_panels):
    az.plot_ppc(panel_idata, ax=panel_ax, kind='kde', observed=True)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()