In [1]:
import os

import numpy as np
import polars as pl

import cmdstanpy
import arviz as az

import iqplot
import bebi103

import bokeh.io
import bokeh.plotting
from bokeh.models import Legend
bokeh.io.output_notebook()

In [None]:
# Load the activity data set (columns: location, genotype, activity).
# Every later cell reads `df`, so this line must run on a fresh kernel.
df = pl.read_csv("data/gandhi_et_al_night_six_activity.csv")


In [3]:
# Preview the first rows to confirm the columns and dtypes loaded as expected.
df.head()

location,genotype,activity
i64,str,f64
1,"""het""",18.398333
2,"""wt""",6.268333
3,"""het""",3.235
4,"""mut""",16.531667
5,"""het""",18.688333


## Exploratory Data Analysis (EDA)

The first few rows of the activity data are displayed above to give an overview; next we plot the full distribution to check for outliers.

### Visualize activity distribution by genotype
We will use iqplot and Bokeh to visualize the distribution of activity for each genotype and look for outliers.

In [4]:
# iqplot and bokeh.io are imported, and notebook output is enabled, in the
# first cell of the notebook, so none of that is repeated here.

# Hand iqplot a pandas copy of the polars frame.
df_pd = df.to_pandas()

# Jittered strip plot of activity, one category per genotype, so that
# individual points (including potential outliers) are visible.
p = iqplot.strip(
    df_pd,
    q='activity',
    cats='genotype',
    jitter=True,
    marker_kwargs={'size': 8},
    frame_width=600,
    frame_height=400,
)
bokeh.io.show(p)



## Student-t Distribution Modeling with Stan

Now we'll model the activity data using a Student-t distribution, which has heavier tails than a normal distribution and is more robust to outliers. We'll estimate the parameters ($\mu$, $\sigma$, $\nu$) for each genotype separately.

In [5]:
# List the genotype labels present in the data.
unique_genotypes = df["genotype"].unique().to_list()
print("Unique genotypes:", unique_genotypes)

# Per-genotype descriptive statistics of the activity column:
# count, mean, standard deviation, minimum and maximum.
summary_exprs = [
    pl.count("activity").alias("n"),
    pl.mean("activity").alias("mean_activity"),
    pl.std("activity").alias("std_activity"),
    pl.min("activity").alias("min_activity"),
    pl.max("activity").alias("max_activity"),
]
print("Data summary by genotype:")
genotype_summary = df.group_by("genotype").agg(summary_exprs)
print(genotype_summary)

Unique genotypes: ['het', 'mut', 'wt']
Data summary by genotype:
shape: (3, 6)
┌──────────┬─────┬───────────────┬──────────────┬──────────────┬──────────────┐
│ genotype ┆ n   ┆ mean_activity ┆ std_activity ┆ min_activity ┆ max_activity │
│ ---      ┆ --- ┆ ---           ┆ ---          ┆ ---          ┆ ---          │
│ str      ┆ u32 ┆ f64           ┆ f64          ┆ f64          ┆ f64          │
╞══════════╪═════╪═══════════════╪══════════════╪══════════════╪══════════════╡
│ mut      ┆ 22  ┆ 30.378939     ┆ 35.621781    ┆ 6.468333     ┆ 138.205      │
│ wt       ┆ 17  ┆ 10.105588     ┆ 4.30067      ┆ 4.07         ┆ 19.255       │
│ het      ┆ 34  ┆ 11.252451     ┆ 5.906493     ┆ 3.235        ┆ 33.961667    │
└──────────┴─────┴───────────────┴──────────────┴──────────────┴──────────────┘


## Prior predictive checks

In [17]:
def plot_prior_predictive_checks_student_t(df, n_prior_draws=100, seed=None):
    """Plot ECDFs of data sets simulated from priors of a Student-t model.

    For each prior draw of (mu, sigma, nu), a data set with `len(df)` points
    is simulated as `mu + sigma * t_nu` and its ECDF is overlaid on a single
    Bokeh figure, which is then displayed.

    Parameters
    ----------
    df : sized object
        Only `len(df)` is used; it sets the size of each simulated data set.
    n_prior_draws : int, default 100
        Number of prior draws, i.e. the number of ECDFs overlaid.
    seed : int or None, default None
        Seed forwarded to `np.random.default_rng`. `None` gives a fresh
        stream on every call; pass an integer for a reproducible figure.

    Returns
    -------
    None
        The figure is shown with `bokeh.io.show`.
    """
    n_samples = len(df)

    rng = np.random.default_rng(seed)

    # NOTE(review): these priors are meant to mirror the ones in the Stan
    # model, which is not visible from this notebook -- confirm they match.
    mu_prior_samples = rng.normal(15, 5, n_prior_draws)
    # A Normal(10, 5) draw is occasionally negative. Because the standard t
    # distribution is symmetric about zero, a negative sigma only mirrors the
    # simulated points about mu, so it is left untouched here.
    sigma_prior_samples = rng.normal(10, 5, n_prior_draws)
    # nu = 5 + |Normal(0, 2.5)|, so nu is always at least 5.
    nu_prior_samples = 5 + np.abs(rng.normal(0, 2.5, n_prior_draws))

    bokeh.io.output_notebook()
    p = bokeh.plotting.figure(width=700, height=400, title="Prior Predictive Checks (Student-t)")

    prior_draws = zip(mu_prior_samples, sigma_prior_samples, nu_prior_samples)
    for mu, sigma, nu in prior_draws:
        # Location-scale Student-t draw: shift and scale a standard t sample.
        y_sim = rng.standard_t(df=nu, size=n_samples) * sigma + mu

        # Faint lines so that the overlap of many ECDFs shows their density.
        p = iqplot.ecdf(pl.DataFrame({"y": y_sim}), q="y", line_kwargs={"alpha":0.1, "color":"blue"}, p=p)

    bokeh.io.show(p)


In [18]:
# Draw the prior predictive ECDFs; the function only uses the number of rows in df.
plot_prior_predictive_checks_student_t(df)

In [42]:
# Sort the genotype labels so that the fitting order, and every later loop
# over `genotypes`, is the same on every run; `unique()` alone does not
# guarantee an order.
genotypes = df["genotype"].unique().sort().to_list()
models = {}              # genotype -> ArviZ InferenceData of the fit
activity_data_dict = {}  # genotype -> NumPy array of measured activity

# The Stan program is the same for every genotype, so build the model object
# once instead of once per loop iteration.
sm = cmdstanpy.CmdStanModel(stan_file="stan_files/student_t_model.stan")

for genotype in genotypes:
    print(f"\nFitting model for genotype: {genotype}")

    genotype_data = df.filter(pl.col("genotype") == genotype)
    activity_data = genotype_data["activity"].to_numpy()

    stan_data = {
        'n': len(activity_data),
        'y': activity_data
    }

    with bebi103.stan.disable_logging():
        # 2000 post-warmup draws per chain.
        fit = sm.sample(data=stan_data, iter_sampling=2000)

    # Expose the Stan variable `y_rep` as the posterior predictive group.
    samples = az.from_cmdstanpy(fit, posterior_predictive="y_rep")

    models[genotype] = samples
    activity_data_dict[genotype] = activity_data

21:23:34 - cmdstanpy - INFO - compiling stan file /root/projects/wis-stats/submission/stan_files/student_t_model.stan to exe file /root/projects/wis-stats/submission/stan_files/student_t_model



Fitting model for genotype: wt


21:23:47 - cmdstanpy - INFO - compiled model executable: /root/projects/wis-stats/submission/stan_files/student_t_model


chain 1 |          | 00:00 Status

chain 2 |          | 00:00 Status

chain 3 |          | 00:00 Status

chain 4 |          | 00:00 Status

                                                                                                                                                                                                                                                                                                                                

Fitting model for genotype: het


chain 1 |          | 00:00 Status

chain 2 |          | 00:00 Status

chain 3 |          | 00:00 Status

chain 4 |          | 00:00 Status

                                                                                                                                                                                                                                                                                                                                

Fitting model for genotype: mut


chain 1 |          | 00:00 Status

chain 2 |          | 00:00 Status

chain 3 |          | 00:00 Status

chain 4 |          | 00:00 Status

                                                                                                                                                                                                                                                                                                                                


In [43]:
# Corner plot of the posterior samples of mu, sigma and nu for the 'het' fit.
het_corner = bebi103.viz.corner(
    models['het'],
    parameters=['mu', 'sigma', 'nu'],
    show_contours=True,
)
bokeh.io.show(het_corner)

In [32]:
# For each genotype, compare the posterior mean of mu with the plain sample
# mean of the measured activity.
for genotype in genotypes:
    samples = models[genotype]
    activity_data = activity_data_dict[genotype]

    # Pool all chains and draws of mu into one flat array before averaging.
    mu_samples = samples.posterior['mu'].values.flatten()
    mu_mean = mu_samples.mean()
    real_mean = activity_data.mean()

    print(
        f'Student-t model for genotype {genotype}:',
        f'Mean of μ samples: {mu_mean:.3f}',
        f'Mean of real data: {real_mean:.3f}',
        sep='\n',
    )

Student-t model for genotype het:
Mean of μ samples: 10.215
Mean of real data: 11.252
Student-t model for genotype mut:
Mean of μ samples: 17.649
Mean of real data: 30.379
Student-t model for genotype wt:
Mean of μ samples: 9.781
Mean of real data: 10.106


In [44]:
def plot_predictive_ecdf(samples, activity_data):
    """Show the posterior predictive ECDF of one fit with the data overlaid.

    Parameters
    ----------
    samples
        Object whose `posterior_predictive['y_rep']` has the dimensions
        `chain`, `draw` and `y_rep_dim_0`.
    activity_data
        Measured values, passed as `data` to `bebi103.viz.predictive_ecdf`.

    Returns
    -------
    None
        The figure is shown with `bokeh.io.show`.
    """
    # Merge (chain, draw) into a single "sample" dimension and make it the
    # leading axis, giving one simulated data set per row.
    stacked = samples.posterior_predictive['y_rep'].stack({"sample": ("chain", "draw")})
    y_rep = stacked.transpose("sample", "y_rep_dim_0")

    fig = bebi103.viz.predictive_ecdf(y_rep, data=activity_data)
    bokeh.io.show(fig)

In [45]:
# One posterior predictive ECDF per genotype, each labelled by a printed line.
for genotype in genotypes:
    print(f'Plotting predictive ECDF for genotype: {genotype}')
    plot_predictive_ecdf(models[genotype], activity_data_dict[genotype])

Plotting predictive ECDF for genotype: wt


Plotting predictive ECDF for genotype: het


Plotting predictive ECDF for genotype: mut


In [46]:
def plot_predictive_ecdf_diff(samples, activity_data):
    """Show the posterior predictive ECDF difference plot for one fit.

    Same inputs as the plain predictive ECDF, but drawn with `diff='ecdf'`.

    Parameters
    ----------
    samples
        Object whose `posterior_predictive['y_rep']` has the dimensions
        `chain`, `draw` and `y_rep_dim_0`.
    activity_data
        Measured values, passed as `data` to `bebi103.viz.predictive_ecdf`.

    Returns
    -------
    None
        The figure is shown with `bokeh.io.show`.
    """
    # Merge (chain, draw) into one "sample" axis, put it first, and drop the
    # xarray wrapper so a plain NumPy array is passed on.
    y_rep = (
        samples.posterior_predictive['y_rep']
        .stack({'sample': ('chain', 'draw')})
        .transpose('sample', 'y_rep_dim_0')
        .values
    )

    # No existing figure is passed, so a new one is created by the call.
    p = bebi103.viz.predictive_ecdf(y_rep, data=activity_data, diff='ecdf')

    bokeh.io.show(p)

In [47]:
# Sampler diagnostics (ESS, R-hat, divergences, tree depth, E-BFMI) for the 'wt' Student-t fit.
bebi103.stan.check_all_diagnostics(models['wt'])

Effective sample size looks reasonable for all parameters.

Rhat looks reasonable for all parameters.

0 of 8000 (0.0%) iterations ended with a divergence.

0 of 8000 (0.0%) iterations saturated the maximum tree depth of 10.

E-BFMI indicated no pathological behavior.


0

In [48]:
# Sampler diagnostics (ESS, R-hat, divergences, tree depth, E-BFMI) for the 'mut' Student-t fit.
bebi103.stan.check_all_diagnostics(models['mut'])

Effective sample size looks reasonable for all parameters.

Rhat looks reasonable for all parameters.

0 of 8000 (0.0%) iterations ended with a divergence.

0 of 8000 (0.0%) iterations saturated the maximum tree depth of 10.

E-BFMI indicated no pathological behavior.


0

In [49]:
# Sampler diagnostics (ESS, R-hat, divergences, tree depth, E-BFMI) for the 'het' Student-t fit.
bebi103.stan.check_all_diagnostics(models['het'])

Effective sample size looks reasonable for all parameters.

Rhat looks reasonable for all parameters.

0 of 8000 (0.0%) iterations ended with a divergence.

0 of 8000 (0.0%) iterations saturated the maximum tree depth of 10.

E-BFMI indicated no pathological behavior.


0

Diagnostics look good for all three models, with no sampling issues.

In [50]:
# One ECDF difference plot per genotype, each labelled by a printed line.
for genotype in genotypes:
    print(f'Plotting predictive ECDF difference for genotype: {genotype}')
    plot_predictive_ecdf_diff(models[genotype], activity_data_dict[genotype])

Plotting predictive ECDF difference for genotype: wt


Plotting predictive ECDF difference for genotype: het


Plotting predictive ECDF difference for genotype: mut


Looking at the ECDF difference plots, we can see that the data are consistent with the model for all three genotypes,<br>
with some difficulty in estimating the mean for the mutant (`mut`) genotype.<br>
Next, we check whether the Student-t model is still an improvement over a normal distribution for this genotype.

In [53]:
# Fit the normal model to the 'mut' data only, as a baseline for the
# Student-t fit. Only one genotype is fitted, so no loop is needed, and the
# notebook-wide `genotypes` list is deliberately NOT reassigned: cells that
# loop over all genotypes keep working if they are re-run afterwards.
genotype = 'mut'
print(f"\nFitting model for genotype: {genotype}")

genotype_data = df.filter(pl.col("genotype") == genotype)
# Kept under its original name because later cells may read it.
activity_data = genotype_data["activity"].to_numpy()

stan_data = {
    'n': len(activity_data),
    'y': activity_data
}

sm = cmdstanpy.CmdStanModel(stan_file="stan_files/normal_mut.stan")

with bebi103.stan.disable_logging():
    # 2000 post-warmup draws per chain, matching the Student-t fits.
    fit = sm.sample(data=stan_data, iter_sampling=2000)

# Expose the Stan variable `y_rep` as the posterior predictive group.
samples_normal_mut = az.from_cmdstanpy(fit, posterior_predictive="y_rep")



21:27:55 - cmdstanpy - INFO - compiling stan file /root/projects/wis-stats/submission/stan_files/normal_mut.stan to exe file /root/projects/wis-stats/submission/stan_files/normal_mut



Fitting model for genotype: mut


21:28:11 - cmdstanpy - INFO - compiled model executable: /root/projects/wis-stats/submission/stan_files/normal_mut


chain 1 |          | 00:00 Status

chain 2 |          | 00:00 Status

chain 3 |          | 00:00 Status

chain 4 |          | 00:00 Status

                                                                                                                                                                                                                                                                                                                                


In [57]:
# Sampler diagnostics (ESS, R-hat, divergences, tree depth, E-BFMI) for the normal fit to the 'mut' data.
bebi103.stan.check_all_diagnostics(samples_normal_mut)

Effective sample size looks reasonable for all parameters.

Rhat looks reasonable for all parameters.

0 of 8000 (0.0%) iterations ended with a divergence.

0 of 8000 (0.0%) iterations saturated the maximum tree depth of 10.

E-BFMI indicated no pathological behavior.


0

In [55]:
# ECDF difference plot for the normal fit to the 'mut' data.
# The shared helper is reused instead of repeating its body, and the
# measurements are read from `activity_data_dict` so the plot does not depend
# on whichever value the loop variable `activity_data` last held.
plot_predictive_ecdf_diff(samples_normal_mut, activity_data_dict['mut'])

In [56]:
# Compare the two fits to the 'mut' data; the resulting table ranks them by elpd_loo.
az.compare({"mut_student":models['mut'] ,"mut_normal": samples_normal_mut})



Unnamed: 0,rank,elpd_loo,p_loo,elpd_diff,weight,se,dse,warning,scale
mut_student,0,-93.362769,2.633606,0.0,1.0,9.299916,0.0,False,log
mut_normal,1,-116.166857,5.606538,22.804087,2.859935e-13,14.506578,7.184353,True,log


Just from looking at the ECDF difference plot, we can see that more of the real data points break out of the two-standard-deviation lines<br>
under the normal model than under the Student-t model.<br>
Model comparison shows that the Student-t model provides the better fit for this data, effectively accommodating outliers through its heavier tails.