In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm

In [128]:
df = pd.read_parquet('../../data/curated/adjusted_probabilities_2022')
season_df = pd.read_parquet('../../data/curated/clean_stats_13-22').query('season == 2022')

In [129]:
df.head()

Unnamed: 0,player_id,match_id,player_first_name,player_last_name,player_team,new_0,new_1,new_2,new_3,exp_votes
0,11904,16117,Tom,Liberatore,Western Bulldogs,0.996153,0.002482,0.001176,0.00019,0.005403
1,11945,16117,Steven,May,Melbourne,0.992642,0.005093,0.00196,0.000305,0.009927
2,11972,16117,Max,Gawn,Melbourne,0.934318,0.031216,0.029109,0.005358,0.105507
3,12015,16117,Tom,McDonald,Melbourne,0.992527,0.004218,0.002834,0.000421,0.011148
4,12034,16117,Adam,Tomlinson,Melbourne,0.995796,0.002834,0.001181,0.000189,0.005762


In [130]:
players = list(df.player_id.unique())

In [131]:
match_ids = list(df.match_id.unique())

In [132]:
num_simulations = 200

votes_3 = []
votes_2 = []
votes_1 = []

# simulate 10,000 seasons 
for _ in tqdm(range(num_simulations), desc="Simulations"):
    
    # simulate every game in the season, choose a 3-2-1 vote
    for match in match_ids:
        sub_df = df.query('match_id == @match')
        match_players = sub_df.player_id.tolist()

        match_prob_3 = sub_df.new_3.tolist()
        match_prob_2 = sub_df.new_2.tolist()
        match_prob_1 = sub_df.new_1.tolist()

        vote_3 = np.random.choice(match_players, 1, p=match_prob_3)[0]
        vote_2 = vote_3  # Initialize with a value that requires re-selection
        
        # ensure same player is not chosen as vote 3 and 2
        while vote_2 == vote_3:
            vote_2 = np.random.choice(match_players, 1, p=match_prob_2)[0]

        vote_1 = vote_3  
        while vote_1 == vote_3 or vote_1 == vote_2:
            vote_1 = np.random.choice(match_players, 1, p=match_prob_1)[0]

        votes_3.append(vote_3)
        votes_2.append(vote_2)
        votes_1.append(vote_1)


Simulations: 100%|██████████| 200/200 [00:20<00:00,  9.60it/s]


In [133]:
player_votes = {}

In [134]:
for player in tqdm(players):
    
    votes = 3*votes_3.count(player) + 2*votes_2.count(player) + 1*votes_2.count(player)
    
    player_votes[player] = votes / num_simulations

100%|██████████| 677/677 [00:00<00:00, 874.52it/s]


In [135]:
votes_df = pd.DataFrame(player_votes.items(), columns=['player_id', 'predicted_votes'])

In [136]:
votes_df.sort_values('predicted_votes', ascending=False)[:10]

Unnamed: 0,player_id,predicted_votes
21,12411,26.895
373,12329,24.855
25,12437,21.54
232,12061,20.85
56,12269,20.01
337,12596,18.42
195,12418,17.355
408,12685,17.07
144,12022,16.44
10,12277,16.155


In [137]:
player_info = df[['player_id', 'player_first_name', 'player_last_name', 'player_team']].drop_duplicates('player_id')

In [138]:
total_df = pd.merge(player_info, votes_df, on='player_id').sort_values('predicted_votes', ascending=False)

In [139]:
actual_votes = season_df.groupby(['player_first_name', 'player_last_name'])['brownlow_votes'].sum()

In [140]:
final_df = pd.merge(total_df, actual_votes, on=['player_first_name', 'player_last_name'])

In [141]:
def get_name(row):
    
    return f"{row['player_first_name'][0]}. {row['player_last_name']}"

In [142]:
final_df['player'] = final_df.apply(get_name, axis=1)

In [143]:
final_df['error'] = final_df['predicted_votes'] - final_df['brownlow_votes']

In [144]:
cols = [
    'player', 'player_team', 'predicted_votes', 'brownlow_votes', 'error'
]
final_df = final_df[cols]

In [145]:
final_df[:10]

Unnamed: 0,player,player_team,predicted_votes,brownlow_votes,error
0,C. Oliver,Melbourne,26.895,25.0,1.895
1,T. Miller,Gold Coast,24.855,27.0,-2.145
2,C. Petracca,Melbourne,21.54,24.0,-2.46
3,L. Neale,Brisbane Lions,20.85,28.0,-7.15
4,P. Cripps,Carlton,20.01,29.0,-8.99
5,A. Brayshaw,Fremantle,18.42,25.0,-6.58
6,C. Mills,Sydney,17.355,21.0,-3.645
7,S. Walsh,Carlton,17.07,14.0,3.07
8,J. Cameron,Geelong,16.44,19.0,-2.56
9,M. Bontempelli,Western Bulldogs,16.155,10.0,6.155
