In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm

In [2]:
df = pd.read_parquet('../../data/curated/adjusted_probabilities_2022.parquet')
season_df = pd.read_parquet('../../data/curated/clean_stats_13-22.parquet').query('season == 2022')

In [3]:
df.head()

Unnamed: 0,player_id,match_id,player_first_name,player_last_name,player_team,new_0,new_1,new_2,new_3,exp_votes
0,11904,16117,Tom,Liberatore,Western Bulldogs,0.998848,0.000859,0.000269,2.5e-05,0.001471
1,11945,16117,Steven,May,Melbourne,0.995647,0.003407,0.000831,0.000115,0.005415
2,11972,16117,Max,Gawn,Melbourne,0.946869,0.027525,0.021477,0.004129,0.082865
3,12015,16117,Tom,McDonald,Melbourne,0.998376,0.001224,0.000359,4.2e-05,0.002066
4,12034,16117,Adam,Tomlinson,Melbourne,0.999494,0.000411,8.7e-05,8e-06,0.000608


In [4]:
players = list(df.player_id.unique())

In [5]:
match_ids = list(df.match_id.unique())

In [6]:
num_simulations = 10_000

votes_3 = []
votes_2 = []
votes_1 = []

# simulate 10,000 seasons 
for _ in tqdm(range(num_simulations), desc="Simulations"):
    
    # simulate every game in the season, choose a 3-2-1 vote
    for match in match_ids:
        sub_df = df.query('match_id == @match')
        match_players = sub_df.player_id.tolist()

        match_prob_3 = sub_df.new_3.tolist()
        match_prob_2 = sub_df.new_2.tolist()
        match_prob_1 = sub_df.new_1.tolist()

        vote_3 = np.random.choice(match_players, 1, p=match_prob_3)[0]
        vote_2 = vote_3  # Initialize with a value that requires re-selection
        
        # ensure same player is not chosen as vote 3 and 2
        while vote_2 == vote_3:
            vote_2 = np.random.choice(match_players, 1, p=match_prob_2)[0]

        vote_1 = vote_3  
        while vote_1 == vote_3 or vote_1 == vote_2:
            vote_1 = np.random.choice(match_players, 1, p=match_prob_1)[0]

        votes_3.append(vote_3)
        votes_2.append(vote_2)
        votes_1.append(vote_1)


Simulations:   0%|          | 0/10000 [00:00<?, ?it/s]

Simulations: 100%|██████████| 10000/10000 [17:15<00:00,  9.66it/s]


In [7]:
player_votes = {}

In [8]:
for player in tqdm(players):
    
    votes = 3*votes_3.count(player) + 2*votes_2.count(player) + 1*votes_2.count(player)
    
    player_votes[player] = votes / num_simulations

100%|██████████| 677/677 [00:46<00:00, 14.46it/s]


In [9]:
votes_df = pd.DataFrame(player_votes.items(), columns=['player_id', 'predicted_votes'])

In [10]:
votes_df.sort_values('predicted_votes', ascending=False)[:10]

Unnamed: 0,player_id,predicted_votes
21,12411,28.0323
373,12329,25.9869
232,12061,23.2338
25,12437,21.7239
56,12269,21.5412
408,12685,19.2183
337,12596,19.1724
195,12418,18.9243
471,12171,17.6607
144,12022,15.8229


In [11]:
player_info = df[['player_id', 'player_first_name', 'player_last_name', 'player_team']].drop_duplicates('player_id')

In [12]:
total_df = pd.merge(player_info, votes_df, on='player_id').sort_values('predicted_votes', ascending=False)

In [13]:
actual_votes = season_df.groupby(['player_first_name', 'player_last_name'])['brownlow_votes'].sum()

In [14]:
final_df = pd.merge(total_df, actual_votes, on=['player_first_name', 'player_last_name'])

In [15]:
def get_name(row):
    
    return f"{row['player_first_name'][0]}. {row['player_last_name']}"

final_df['player'] = final_df.apply(get_name, axis=1)
final_df['error'] = final_df['predicted_votes'] - final_df['brownlow_votes']

cols = [
    'player', 'player_team', 'predicted_votes', 'brownlow_votes', 'error'
]
final_df = final_df[cols]

In [16]:
final_df[:10]

Unnamed: 0,player,player_team,predicted_votes,brownlow_votes,error
0,C. Oliver,Melbourne,28.0323,25.0,3.0323
1,T. Miller,Gold Coast,25.9869,27.0,-1.0131
2,L. Neale,Brisbane Lions,23.2338,28.0,-4.7662
3,C. Petracca,Melbourne,21.7239,24.0,-2.2761
4,P. Cripps,Carlton,21.5412,29.0,-7.4588
5,S. Walsh,Carlton,19.2183,14.0,5.2183
6,A. Brayshaw,Fremantle,19.1724,25.0,-5.8276
7,C. Mills,Sydney,18.9243,21.0,-2.0757
8,R. Laird,Adelaide,17.6607,10.0,7.6607
9,J. Cameron,Geelong,15.8229,19.0,-3.1771


In [17]:
html_table = final_df[:10].to_html(index=False)
print(html_table)

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th>player</th>
      <th>player_team</th>
      <th>predicted_votes</th>
      <th>brownlow_votes</th>
      <th>error</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>C. Oliver</td>
      <td>Melbourne</td>
      <td>28.0323</td>
      <td>25.0</td>
      <td>3.0323</td>
    </tr>
    <tr>
      <td>T. Miller</td>
      <td>Gold Coast</td>
      <td>25.9869</td>
      <td>27.0</td>
      <td>-1.0131</td>
    </tr>
    <tr>
      <td>L. Neale</td>
      <td>Brisbane Lions</td>
      <td>23.2338</td>
      <td>28.0</td>
      <td>-4.7662</td>
    </tr>
    <tr>
      <td>C. Petracca</td>
      <td>Melbourne</td>
      <td>21.7239</td>
      <td>24.0</td>
      <td>-2.2761</td>
    </tr>
    <tr>
      <td>P. Cripps</td>
      <td>Carlton</td>
      <td>21.5412</td>
      <td>29.0</td>
      <td>-7.4588</td>
    </tr>
    <tr>
      <td>S. Walsh</td>
      <td>Carlton</td>
      <td>19.2183</t