In [None]:
import numpy as np
from cmdstanpy import CmdStanModel
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

## Priors [0-4 pts]
- Is it explained why particular priors for parameters were selected [1 pt]
- Have prior predictive checks been done for parameters (do parameters simulated from priors make sense) [1 pt]
- Have prior predictive checks been done for measurements (do measurements simulated from priors make sense) [1 pt]
- How prior parameters were selected [1 pt]

In [None]:
# Stan program used for the prior predictive check of model 2
# (see the "Priors" section above); it is sampled in the next cell.
model_2_ppc = CmdStanModel(stan_file='stan/model_2_ppc.stan')

In [None]:
# Draw from the prior predictive model with the fixed-parameter sampler and a
# fixed seed, then show the draws as a DataFrame.
# NOTE(review): the data key is 'drive_rating' here while the model 2 data uses
# 'driver_rating' -- confirm it matches the data block of model_2_ppc.stan.
model_2_ppc_sim = model_2_ppc.sample(
    data={'sigma': 0.8, 'drive_rating': 2.5},
    fixed_param=True,
    iter_warmup=1,
    seed=10062022,
)
df_ppc_sim = model_2_ppc_sim.draws_pd()
df_ppc_sim

## Posterior analysis (model 2) [0-4 pts]
- Were there any issues with the sampling? If there were, what ideas for mitigation were used? [1 pt]
- Are the samples from the posterior predictive distribution analyzed? [1 pt]
- Are the data consistent with the posterior predictive samples, and is this sufficiently commented (if they are not, is a justification provided)?
- Have the parameter marginal distributions been analyzed (histograms of individual parameters plus summaries; are they diffuse or concentrated; what can we say about the values)? [1 pt]

In [None]:
# Stan program for model 2; it is fitted to the race data further below.
model_2 = CmdStanModel(stan_file='stan/model_2.stan')

In [None]:
def _encode_column(frame, column):
    """Replace ``frame[column]`` in place with 1-based integer codes.

    Codes follow the order of first appearance of each value in the column.

    Returns
    -------
    tuple
        ``(unique_values, value_to_code, codes)`` where ``unique_values`` is the
        array from ``Series.unique()``, ``value_to_code`` is a dict mapping each
        original value to its code, and ``codes`` is the encoded column as a
        NumPy array.
    """
    uniques = frame[column].unique()
    value_to_code = dict(zip(uniques, range(1, len(uniques) + 1)))
    frame[column] = frame[column].map(value_to_code)
    return uniques, value_to_code, frame[column].values


df = pd.read_csv('data/processed_data/data.csv')

# Each categorical column becomes a 1-based integer index; the *_id_map dicts
# keep the original label -> index mapping for later lookups.
unique_drivers, driver_id_map, drivers = _encode_column(df, 'DriverId')

unique_team, team_id_map, teams = _encode_column(df, 'TeamId')

unique_engine, engine_id_map, engines = _encode_column(df, 'Engine')

unique_season, season_id_map, seasons = _encode_column(df, 'Season')

In [None]:
def standardize_group(group):
    """Return a copy of ``group`` whose 'Rating' column is z-scored.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single group (one season at the call site). Must contain a
        'Rating' column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``group`` in which
        'Rating' is replaced by ``(Rating - mean) / std``. ``std`` is the
        pandas default (sample standard deviation).

    Notes
    -----
    The input frame is left untouched: pandas advises against mutating the
    object passed to a groupby UDF, and in-place assignment on a group slice
    can raise SettingWithCopyWarning. A group with a single row or a constant
    rating has no usable standard deviation, so its result is NaN.
    """
    rating = group['Rating']
    return group.assign(Rating=(rating - rating.mean()) / rating.std())


# Z-score 'Rating' within each season (same formula as standardize_group).
# A column-wise transform is used instead of DataFrameGroupBy.apply: applying a
# UDF to groups that still contain the grouping column is deprecated since
# pandas 2.2, and once the grouping column is excluded 'Season' would vanish
# from the result. transform keeps every column and the original row order.
df = df.assign(
    Rating=df.groupby('Season', observed=True)['Rating'].transform(
        lambda rating: (rating - rating.mean()) / rating.std()
    )
)
ratings = df["Rating"].values
df['Position'] = df['Position'].astype(int)

In [None]:
# Keep only the columns the model uses, in a fixed order, and preview them.
order_col = [
    'DriverId',
    'Rating',
    'TeamId',
    'Engine',
    'Season',
    'Position',
]
df = df.loc[:, order_col]
df.head()

In [None]:
# Data for model 2: group counts come from the label -> index maps, and the
# per-row index arrays are the encoded columns built when the data was loaded.
model_2_data = dict(
    N=len(df),
    C=len(team_id_map),
    E=len(engine_id_map),
    D=len(driver_id_map),
    Y=len(season_id_map),
    driver_rating=ratings,
    engine=engines,
    constructor=teams,
    driver=drivers,
    year=seasons,
    # the model receives the finishing position shifted down by one
    position=df['Position'] - 1,
)

model_2_fit = model_2.sample(
    data=model_2_data,
    iter_warmup=1000,
    seed=25062025,
)

In [None]:
# Posterior predictive check: for a few drivers, compare the observed finishing
# positions with the positions simulated by model 2 ('y_hat').
drivers_names = ['hamilton', 'russell', 'tsunoda']
fig, axes = plt.subplots(1, len(drivers_names), figsize=(5 * len(drivers_names), 4), sharey=True)
# plt.subplots returns a bare Axes (not an array) when there is one panel.
axes = np.atleast_1d(axes)

# Bin edges at -0.5, 0.5, ..., 20.5 so each integer position gets its own bar.
n_bins = np.arange(22) - 0.5

# Loop-invariant: read the posterior predictive draws once.
# Assumes the last axis of y_hat runs over the rows of df -- TODO confirm
# against the generated quantities block of model_2.stan.
y_hat = model_2_fit.stan_variable('y_hat')

for d_i, d_name in enumerate(drivers_names):
    ax = axes[d_i]
    driver_id = driver_id_map[d_name]
    driver_rows = (df['DriverId'] == driver_id).to_numpy()
    results = df[driver_rows]
    # Positional row numbers (not index labels), so the lookup into y_hat stays
    # correct even if df's index is not 0..N-1.
    results_idx = np.flatnonzero(driver_rows)

    # df['Position'] is already the raw finishing position (the model is given
    # Position - 1 in model_2_data), so it is plotted without an offset.
    ax.hist(results['Position'].tolist(),
            bins=n_bins,
            rwidth=0.9,
            histtype='step',
            edgecolor='black',
            density=True,
            label='Observed')

    # y_hat is on the shifted scale passed to the model; +1 maps it back to
    # finishing positions.
    ax.hist(y_hat.T[results_idx].flatten() + 1,
            bins=n_bins,
            rwidth=0.9,
            color='cornflowerblue',
            edgecolor='royalblue',
            alpha=0.7,
            density=True,
            label='Simulated')

    ax.set_xticks(range(22))
    ax.set_xlim([0, 21])
    ax.set_yticks([])
    ax.set_title(d_name.upper() + '\nfinishing positions (2020–2024)', fontsize=11)
    ax.set_xlabel('Position')
    ax.legend(loc='upper right', fontsize=8)

fig.tight_layout()
plt.show()


## Model comparison [0-4 pts]
- Have models been compared using information criteria [1 pt]
- Have the results for WAIC been discussed (is there a clear winner, or is there an overlap, were there any warnings) [1 pt]
- Have the results for PSIS-LOO been discussed (is there a clear winner, or is there an overlap, were there any warnings) [1 pt]
- Was the model comparison discussed? Do the authors agree with the information criteria? Why, in your opinion, is one model better than another? [1 pt]