In [None]:
import numpy as np
from cmdstanpy import CmdStanModel
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

## Priors [0-4 pts]
- Is it explained why particular priors for parameters were selected [1 pt]
- Have prior predictive checks been done for parameters (do parameters simulated from the priors make sense) [1 pt]
- Have prior predictive checks been done for measurements (do measurements simulated from the priors make sense) [1 pt]
- How prior parameters were selected [1 pt]

In [None]:
# Stan program used for the prior predictive simulations in this section.
# NOTE(review): the variable is called "model_1_ppc" but it loads
# model_2_ppc.stan -- confirm the naming is intentional.
model_1_ppc = CmdStanModel(
    stan_file='stan/model_2_ppc.stan',
)

In [None]:
# One prior-only run (sigma = 0.8, driver rating = 2.5) so the raw draws can be
# inspected as a table before plotting them in the cells below.
model_1_ppc_sim = model_1_ppc.sample(
    data={'sigma': 0.8, 'drive_rating': 2.5},
    iter_warmup=1,
    fixed_param=True,
    seed=10062022,
)
df_ppc_sim = model_1_ppc_sim.draws_pd()
df_ppc_sim

## Prior predictive checks were prepared for the best, an average, and the weakest driver.

### The driver with the best results.

In [None]:
# Prior predictive check for the best driver: the rating passed to Stan is
# fixed at +2.5 (ratings are used on a standardised scale). One row of panels
# is drawn for each candidate prior scale in `sigmas`.
drive_rating = 2.5
sigmas = [0.8, 1.0, 1.2]

# Stan variables shown in columns 0-3. Both the panel titles and the bottom
# x-labels are generated from this single list, so a column can no longer be
# labelled with another column's name (previously the x-labels of columns 1-3
# were rotated relative to the plotted variables, and the 'year' draws were
# labelled 'beta_avg_drive').
param_names = ['engine', 'alpha_driver', 'year', 'constructor']
hist_kwargs = dict(color='cornflowerblue', edgecolor='royalblue', density=True)

fig, axes = plt.subplots(len(sigmas), 6, figsize=(8*4, 15))

for s_i, sigma_value in enumerate(sigmas):
    model_1_ppc_sim = model_1_ppc.sample(
        data={'sigma': sigma_value, 'drive_rating': drive_rating},
        iter_warmup=1,
        fixed_param=True,
        seed=10062022,
    )

    # Raw prior draws of each parameter. They are plotted unscaled so that the
    # histogram matches the "Normal(0, sigma)" stated in the title; the effect
    # of the driver rating is visible in the theta panel instead.
    for col, name in enumerate(param_names):
        ax = axes[s_i, col]
        ax.hist(model_1_ppc_sim.stan_variable(name).flatten(), bins=100, **hist_kwargs)
        ax.set_yticks([])
        ax.set_title(f'{name} Normal(0,{sigma_value})')

    # NOTE(review): this cell writes "- alpha_driver" while the other two
    # driver cells write "+ alpha_driver"; confirm the sign against the Stan file.
    ax_theta = axes[s_i, 4]
    ax_theta.hist(model_1_ppc_sim.stan_variable('theta').flatten(), bins=100, **hist_kwargs)
    ax_theta.set_yticks([])
    ax_theta.set_title('theta = engine + constructor - alpha_driver\n * drive_rating + year')

    # pos_min = 1, pos_max = 20: y_ppc is shifted by +1 before plotting and the
    # bin edges sit at k - 0.5 so every integer position gets its own bar.
    n_bins = np.arange(22) - 0.5
    ax_pos = axes[s_i, 5]
    ax_pos.hist(model_1_ppc_sim.stan_variable('y_ppc').flatten() + 1, bins=n_bins, rwidth=1, **hist_kwargs)
    ax_pos.set_xticks(range(22))
    ax_pos.set_xlim([0, 21])
    ax_pos.set_yticks([])
    ax_pos.set_title('Position')

# Label only the bottom row, one label per column, in plotting order.
for col, label in enumerate(param_names + ['theta', 'Position']):
    axes[-1, col].set_xlabel(label)

fig.tight_layout()
plt.show()

### An average driver from the middle of the field.

In [None]:
# Prior predictive check for an average driver: the rating passed to Stan is
# fixed at 0 (ratings are used on a standardised scale). One row of panels is
# drawn for each candidate prior scale in `sigmas`.
drive_rating = 0
sigmas = [0.8, 1.0, 1.2]

# Stan variables shown in columns 0-3. Both the panel titles and the bottom
# x-labels are generated from this single list, so a column can no longer be
# labelled with another column's name (previously the x-labels of columns 1-3
# were rotated relative to the plotted variables, and the 'year' draws were
# labelled 'beta_avg_drive').
param_names = ['engine', 'alpha_driver', 'year', 'constructor']
hist_kwargs = dict(color='cornflowerblue', edgecolor='royalblue', density=True)

fig, axes = plt.subplots(len(sigmas), 6, figsize=(8*4, 15))

for s_i, sigma_value in enumerate(sigmas):
    model_1_ppc_sim = model_1_ppc.sample(
        data={'sigma': sigma_value, 'drive_rating': drive_rating},
        iter_warmup=1,
        fixed_param=True,
        seed=10062022,
    )

    # Raw prior draws of each parameter. The earlier version multiplied
    # alpha_driver by 2.5 (although the rating here is 0) and year by 0.1,
    # which contradicted the "Normal(0, sigma)" stated in the titles.
    for col, name in enumerate(param_names):
        ax = axes[s_i, col]
        ax.hist(model_1_ppc_sim.stan_variable(name).flatten(), bins=100, **hist_kwargs)
        ax.set_yticks([])
        ax.set_title(f'{name} Normal(0,{sigma_value})')

    # NOTE(review): the best-driver cell writes "- alpha_driver" in this
    # formula; confirm the sign against the Stan file.
    ax_theta = axes[s_i, 4]
    ax_theta.hist(model_1_ppc_sim.stan_variable('theta').flatten(), bins=100, **hist_kwargs)
    ax_theta.set_yticks([])
    ax_theta.set_title('theta = engine + constructor + alpha_driver\n * drive_rating + year')

    # pos_min = 1, pos_max = 20: y_ppc is shifted by +1 before plotting and the
    # bin edges sit at k - 0.5 so every integer position gets its own bar.
    n_bins = np.arange(22) - 0.5
    ax_pos = axes[s_i, 5]
    ax_pos.hist(model_1_ppc_sim.stan_variable('y_ppc').flatten() + 1, bins=n_bins, rwidth=1, **hist_kwargs)
    ax_pos.set_xticks(range(22))
    ax_pos.set_xlim([0, 21])
    ax_pos.set_yticks([])
    ax_pos.set_title('Position')

# Label only the bottom row, one label per column, in plotting order.
for col, label in enumerate(param_names + ['theta', 'Position']):
    axes[-1, col].set_xlabel(label)

fig.tight_layout()
plt.show()

### The driver with the worst results.

In [None]:
# Prior predictive check for the worst driver: the rating passed to Stan is
# fixed at -2.5 (ratings are used on a standardised scale). One row of panels
# is drawn for each candidate prior scale in `sigmas`.
drive_rating = -2.5
sigmas = [0.8, 1.0, 1.2]

# Stan variables shown in columns 0-3. Both the panel titles and the bottom
# x-labels are generated from this single list, so a column can no longer be
# labelled with another column's name (previously the x-labels of columns 1-3
# were rotated relative to the plotted variables, and the 'year' draws were
# labelled 'beta_avg_drive').
param_names = ['engine', 'alpha_driver', 'year', 'constructor']
hist_kwargs = dict(color='cornflowerblue', edgecolor='royalblue', density=True)

fig, axes = plt.subplots(len(sigmas), 6, figsize=(8*4, 15))

for s_i, sigma_value in enumerate(sigmas):
    model_1_ppc_sim = model_1_ppc.sample(
        data={'sigma': sigma_value, 'drive_rating': drive_rating},
        iter_warmup=1,
        fixed_param=True,
        seed=10062022,
    )

    # Raw prior draws of each parameter, titled with the prior they come from.
    for col, name in enumerate(param_names):
        ax = axes[s_i, col]
        ax.hist(model_1_ppc_sim.stan_variable(name).flatten(), bins=100, **hist_kwargs)
        ax.set_yticks([])
        ax.set_title(f'{name} Normal(0,{sigma_value})')

    # NOTE(review): the best-driver cell writes "- alpha_driver" in this
    # formula; confirm the sign against the Stan file.
    ax_theta = axes[s_i, 4]
    ax_theta.hist(model_1_ppc_sim.stan_variable('theta').flatten(), bins=100, **hist_kwargs)
    ax_theta.set_yticks([])
    ax_theta.set_title('theta = engine + constructor + alpha_driver\n * drive_rating + year')

    # pos_min = 1, pos_max = 20: y_ppc is shifted by +1 before plotting and the
    # bin edges sit at k - 0.5 so every integer position gets its own bar.
    n_bins = np.arange(22) - 0.5
    ax_pos = axes[s_i, 5]
    ax_pos.hist(model_1_ppc_sim.stan_variable('y_ppc').flatten() + 1, bins=n_bins, rwidth=1, **hist_kwargs)
    ax_pos.set_xticks(range(22))
    ax_pos.set_xlim([0, 21])
    ax_pos.set_yticks([])
    ax_pos.set_title('Position')

# Label only the bottom row, one label per column, in plotting order.
for col, label in enumerate(param_names + ['theta', 'Position']):
    axes[-1, col].set_xlabel(label)

fig.tight_layout()
plt.show()

We choose sigma 1.0 for our model

## Posterior analysis (model 2) [0-4 pts]
- were there any issues with the sampling? if there were what kind of ideas for mitigation were used [1 pt]
- are the samples from posterior predictive distribution analyzed [1 pt]
- are the data consistent with posterior predictive samples and is it sufficiently commented (if they are not then is the justification provided) [1 pt]
- have parameter marginal distributions been analyzed (histograms of individual parameters plus summaries, are they diffuse or concentrated, what can we say about values) [1 pt]

In [None]:
# Stan program that is fitted to the race data in the cells below.
# NOTE(review): the variable is called "model_1" but it loads model_2.stan --
# confirm the naming is intentional.
model_1 = CmdStanModel(
    stan_file='stan/model_2.stan',
)

In [None]:
def _one_based_ids(values):
    """Map each value to a 1-based integer id, numbered in the order given."""
    return {value: idx for idx, value in enumerate(values, start=1)}


# Load the processed results and drop incomplete rows. The index is renumbered
# 0..N-1 afterwards so that index labels coincide with row positions; a later
# cell uses the frame's index to pick columns out of the Stan output, which
# would be misaligned (or out of range) if dropna() left gaps in the index.
df = pd.read_csv('data/processed_data/data.csv')
df = df.dropna().reset_index(drop=True)

# Each categorical column is replaced by integer ids 1..K in order of first
# appearance; the *_id_map dicts keep the original value -> id correspondence.
unique_drivers = df['DriverId'].unique()
driver_id_map = _one_based_ids(unique_drivers)
df['DriverId'] = df['DriverId'].map(driver_id_map)
drivers = df['DriverId'].values

unique_team = df['TeamId'].unique()
team_id_map = _one_based_ids(unique_team)
df['TeamId'] = df['TeamId'].map(team_id_map)
teams = df['TeamId'].values

unique_engine = df['Engine'].unique()
engine_id_map = _one_based_ids(unique_engine)
df['Engine'] = df['Engine'].map(engine_id_map)
engines = df['Engine'].values

unique_season = df['Season'].unique()
season_id_map = _one_based_ids(unique_season)
df['Season'] = df['Season'].map(season_id_map)
seasons = df['Season'].values

In [None]:
def _zscore(values):
    """Centre ``values`` on their mean and scale by their sample standard deviation."""
    return (values - values.mean()) / values.std()


def standardize_group(group):
    """Return a copy of ``group`` whose 'Rating' column is z-scored within the group.

    The input frame is left untouched (the previous version assigned into the
    frame it was handed).
    """
    return group.assign(Rating=_zscore(group['Rating']))


# Standardise ratings season by season. A column-wise transform is used rather
# than DataFrame-level groupby.apply: apply operating on the grouping column is
# deprecated in recent pandas and is slated to drop 'Season' from the result,
# while transform always returns values aligned with the original rows.
df = df.assign(Rating=df.groupby('Season', observed=True)['Rating'].transform(_zscore))
ratings = df["Rating"].values
df['Position'] = df['Position'].astype(int)

In [None]:
# Keep only the modelling columns, in a fixed order, and preview the result.
order_col = [
    'DriverId',
    'Rating',
    'TeamId',
    'Engine',
    'Season',
    'Position',
]
df = df.loc[:, order_col]
df.head()

In [None]:
# Data passed to the Stan program: the number of rows, the number of distinct
# constructors / engines / drivers / seasons (sizes of the id maps), the
# per-row predictors, and the finishing position shifted down by one.
model_1_data = dict(
    N=len(df),
    C=len(team_id_map),
    E=len(engine_id_map),
    D=len(driver_id_map),
    Y=len(season_id_map),
    driver_rating=ratings,
    engine=engines,
    constructor=teams,
    driver=drivers,
    year=seasons,
    position=df['Position'] - 1,
)

model_1_fit = model_1.sample(
    data=model_1_data,
    seed=25062025,
    iter_warmup=1000,
)

In [None]:
# Posterior predictive check per driver: observed finishing positions against
# positions simulated by the fitted model (Stan variable 'y_hat').

# pos_min = 1, pos_max = 20; bin edges at k - 0.5 give one bar per position.
n_bins = np.arange(22) - 0.5

drivers_names = ['bottas']
# Integer DriverId (after the 1-based re-encoding) for every plotted driver.
# NOTE(review): id 1 is kept from the previous hard-coded filter and is assumed
# to be 'bottas'; confirm against driver_id_map before adding more drivers.
driver_plot_ids = {'bottas': 1}

# assumes y_hat is (draws, N) with one column per row of df, in df's row order
# -- TODO confirm against the Stan file.
y_hat = model_1_fit.stan_variable('y_hat')

for d_name in drivers_names:
    # Positional boolean mask: independent of the frame's index labels, so the
    # selection stays aligned with y_hat even if the index is not 0..N-1.
    driver_mask = (df['DriverId'] == driver_plot_ids[d_name]).to_numpy()

    # df['Position'] is already 1-based (only the copy sent to Stan is shifted
    # by -1), so no offset is applied here; the earlier "+1" moved the observed
    # histogram one place to the right and pushed position 20 out of the bins.
    observed = df.loc[driver_mask, 'Position'].to_numpy()
    # The model was fitted on position - 1, so simulated values are shifted back.
    simulated = y_hat[:, driver_mask].flatten() + 1

    # A fresh figure per driver; previously tight_layout() was called on a
    # stale `fig` left over from an earlier cell.
    fig, ax = plt.subplots()
    ax.hist(observed, bins=n_bins, rwidth=1, histtype='step', edgecolor='black', density=True, label='Observed')
    ax.hist(simulated, bins=n_bins, rwidth=1, color='cornflowerblue', edgecolor='royalblue', density=True, label='Simulated')
    ax.set_xticks(range(22))
    ax.set_xlim([0, 21])
    ax.set_yticks([])
    ax.set_title(d_name.upper() + ' finishing positions in years 2020-2024')
    ax.legend()
    ax.set_xlabel('position')
    fig.tight_layout()

plt.show()

## Model comparison [0-4 pts]
- Have models been compared using information criteria [1 pt]
- Have the results for WAIC been discussed (is there a clear winner, or is there an overlap, were there any warnings) [1 pt]
- Have the results for PSIS-LOO been discussed (is there a clear winner, or is there an overlap, were there any warnings) [1 pt]
- Was the model comparison discussed? Do the authors agree with the information criteria? Why, in your opinion, is one model better than another [1 pt]