# Preprocessing 3

In this notebook we engineer new features that represent statistics as a proportion of the entire games stats,
this allows us to account for the changing nature of the game that promotes fast ball movement, high possession games.
While also controlling for statistics from the 2020 season in which games were played with shorter quarter lengths.

Realistically when it comes to brownlow votes, it does not matter what the absolute stats are, only the relative performance to other players within the match.

We merge the freshly processed coaches votes, while also including a feature that represents a player's previous polling performance. 

In [204]:
import pandas as pd
import numpy as np

In [205]:
df = pd.read_parquet('../../data/raw/cleaned_stats_12-22_fixed_bv.parquet')

In [206]:
# drop rows where time on ground is zero
# i.e. a sub that was not utilised
df = df.query('time_on_ground_percentage > 0')

In [207]:
df

Unnamed: 0,match_id,match_home_team,match_away_team,match_date,match_round,match_home_team_goals,match_home_team_behinds,match_home_team_score,match_away_team_goals,match_away_team_behinds,...,intercept_marks,marks_on_lead,pressure_acts,rating_points,ruck_contests,score_launches,shots_at_goal,spoils,player_position,season
0,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,0.0,1.0,15.0,11.6,0.0,2.0,1.0,1.0,R,2012
1,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,0.0,2.0,12.0,9.8,0.0,1.0,2.0,1.0,CHF,2012
2,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,3.0,0.0,15.0,12.4,0.0,1.0,0.0,3.0,WR,2012
3,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,1.0,0.0,23.0,15.9,0.0,2.0,1.0,0.0,RR,2012
4,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,8.0,0.0,7.0,16.0,0.0,2.0,0.0,3.0,CHB,2012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9103,16346,St Kilda,Sydney,2022-08-21,23,11,8,74,13,10,...,0.0,1.0,8.0,5.6,0.0,1.0,3.0,0.0,INT,2022
9104,16346,St Kilda,Sydney,2022-08-21,23,11,8,74,13,10,...,1.0,0.0,21.0,6.1,0.0,1.0,1.0,0.0,INT,2022
9105,16346,St Kilda,Sydney,2022-08-21,23,11,8,74,13,10,...,0.0,0.0,23.0,5.8,0.0,0.0,2.0,1.0,FPR,2022
9106,16346,St Kilda,Sydney,2022-08-21,23,11,8,74,13,10,...,0.0,3.0,13.0,2.3,0.0,2.0,2.0,0.0,WR,2022


In [208]:
df.columns

Index(['match_id', 'match_home_team', 'match_away_team', 'match_date',
       'match_round', 'match_home_team_goals', 'match_home_team_behinds',
       'match_home_team_score', 'match_away_team_goals',
       'match_away_team_behinds', 'match_away_team_score', 'match_margin',
       'match_winner', 'player_id', 'player_first_name', 'player_last_name',
       'player_team', 'kicks', 'marks', 'handballs', 'disposals',
       'effective_disposals', 'disposal_efficiency_percentage', 'goals',
       'behinds', 'hitouts', 'tackles', 'rebounds', 'inside_fifties',
       'clearances', 'clangers', 'free_kicks_for', 'free_kicks_against',
       'brownlow_votes', 'contested_possessions', 'uncontested_possessions',
       'contested_marks', 'marks_inside_fifty', 'one_percenters', 'bounces',
       'goal_assists', 'time_on_ground_percentage', 'afl_fantasy_score',
       'centre_clearances', 'stoppage_clearances', 'score_involvements',
       'metres_gained', 'turnovers', 'intercepts', 'tackles_insi

In [209]:
SC = pd.read_parquet('../../data/curated/cleaned_supercoach_12-22.parquet')

In [210]:
SC

Unnamed: 0,match_id,player_id,SC
0,13960.0,12030.0,103.0
1,13960.0,12026.0,84.0
2,13960.0,12021.0,87.0
3,13960.0,11647.0,126.0
4,13960.0,10973.0,104.0
...,...,...,...
93738,16346.0,12469.0,51.0
93739,16346.0,12820.0,44.0
93740,16346.0,12343.0,52.0
93741,16346.0,12419.0,37.0


In [211]:
df = pd.merge(df, SC, on=['player_id', 'match_id'], how='left')

In [212]:
# SC scores merged successfully
df[df.isna().any(axis=1)]

Unnamed: 0,match_id,match_home_team,match_away_team,match_date,match_round,match_home_team_goals,match_home_team_behinds,match_home_team_score,match_away_team_goals,match_away_team_behinds,...,marks_on_lead,pressure_acts,rating_points,ruck_contests,score_launches,shots_at_goal,spoils,player_position,season,SC


In [213]:
counted_stats = [
    'kicks', 'marks', 'handballs', 'disposals',
    'effective_disposals', 'goals', 'afl_fantasy_score', 'SC',
    'behinds', 'hitouts', 'tackles', 'rebounds', 'inside_fifties',
    'clearances', 'clangers', 'free_kicks_for', 'free_kicks_against',
    'contested_possessions', 'uncontested_possessions',
    'contested_marks', 'marks_inside_fifty', 'one_percenters', 'bounces',
    'goal_assists',
    'centre_clearances', 'stoppage_clearances', 'score_involvements',
    'metres_gained', 'turnovers', 'intercepts', 'tackles_inside_fifty',
    'contest_def_losses', 'contest_def_one_on_ones',
    'contest_off_one_on_ones', 'contest_off_wins', 'def_half_pressure_acts',
    'effective_kicks', 'f50_ground_ball_gets', 'ground_ball_gets',
    'hitouts_to_advantage', 'intercept_marks',
    'marks_on_lead', 'score_launches', 'shots_at_goal', 'spoils'
]

In [214]:
counted_stat_totals = df.groupby('match_id')[counted_stats].sum().add_suffix('_total')

In [215]:
# sum all instances of a stat occuring for all players in each match
counted_stat_totals

Unnamed: 0_level_0,kicks_total,marks_total,handballs_total,disposals_total,effective_disposals_total,goals_total,afl_fantasy_score_total,SC_total,behinds_total,hitouts_total,...,def_half_pressure_acts_total,effective_kicks_total,f50_ground_ball_gets_total,ground_ball_gets_total,hitouts_to_advantage_total,intercept_marks_total,marks_on_lead_total,score_launches_total,shots_at_goal_total,spoils_total
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13960,434,203,321,755,535.0,19,3253.0,3276.0,18,80,...,243.0,281.0,38.0,204.0,15.0,44.0,11.0,37.0,46.0,64.0
13961,397,161,298,695,505.0,30,2968.0,3297.0,22,64,...,251.0,269.0,29.0,204.0,13.0,18.0,12.0,54.0,57.0,75.0
13962,403,164,286,689,516.0,36,3091.0,3300.0,32,73,...,272.0,282.0,30.0,195.0,23.0,23.0,3.0,70.0,75.0,50.0
13963,424,172,259,683,488.0,28,3060.0,3301.0,24,80,...,244.0,284.0,26.0,196.0,22.0,29.0,6.0,55.0,57.0,81.0
13964,442,192,326,768,561.0,29,3236.0,3301.0,23,63,...,229.0,294.0,34.0,212.0,17.0,36.0,6.0,60.0,60.0,75.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16342,411,181,285,696,535.0,32,2897.0,3299.0,18,59,...,261.0,303.0,35.0,173.0,16.0,20.0,17.0,55.0,59.0,59.0
16343,403,145,285,688,468.0,23,2934.0,3298.0,18,82,...,307.0,241.0,36.0,191.0,22.0,36.0,14.0,51.0,53.0,78.0
16344,476,227,269,745,550.0,22,3303.0,3300.0,15,79,...,303.0,328.0,36.0,193.0,23.0,29.0,16.0,41.0,47.0,70.0
16345,425,162,253,678,473.0,21,2930.0,3301.0,21,66,...,285.0,273.0,33.0,188.0,27.0,40.0,11.0,44.0,47.0,83.0


In [216]:
df = pd.merge(df, counted_stat_totals, on='match_id', how='inner')

In [217]:
for stat in counted_stats:
    df[stat+'_proportion'] = df[stat] / df[stat+'_total']

In [218]:
df.head()

Unnamed: 0,match_id,match_home_team,match_away_team,match_date,match_round,match_home_team_goals,match_home_team_behinds,match_home_team_score,match_away_team_goals,match_away_team_behinds,...,def_half_pressure_acts_proportion,effective_kicks_proportion,f50_ground_ball_gets_proportion,ground_ball_gets_proportion,hitouts_to_advantage_proportion,intercept_marks_proportion,marks_on_lead_proportion,score_launches_proportion,shots_at_goal_proportion,spoils_proportion
0,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,0.024691,0.032028,0.0,0.034314,0.0,0.0,0.090909,0.054054,0.021739,0.015625
1,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,0.004115,0.02847,0.0,0.02451,0.066667,0.0,0.181818,0.027027,0.043478,0.015625
2,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,0.037037,0.035587,0.026316,0.019608,0.0,0.068182,0.0,0.027027,0.0,0.046875
3,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,0.016461,0.021352,0.131579,0.039216,0.0,0.022727,0.0,0.054054,0.021739,0.0
4,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,0.020576,0.042705,0.0,0.014706,0.0,0.181818,0.0,0.054054,0.0,0.046875


In [219]:
df.loc[df['bounces_proportion'].isna()]

Unnamed: 0,match_id,match_home_team,match_away_team,match_date,match_round,match_home_team_goals,match_home_team_behinds,match_home_team_score,match_away_team_goals,match_away_team_behinds,...,def_half_pressure_acts_proportion,effective_kicks_proportion,f50_ground_ball_gets_proportion,ground_ball_gets_proportion,hitouts_to_advantage_proportion,intercept_marks_proportion,marks_on_lead_proportion,score_launches_proportion,shots_at_goal_proportion,spoils_proportion
69784,15653,Port Adelaide,Adelaide,2020-06-13,2,17,8,110,5,5,...,0.012500,0.013158,0.057143,0.027778,0.000000,0.000000,0.090909,0.057143,0.057143,0.000000
69785,15653,Port Adelaide,Adelaide,2020-06-13,2,17,8,110,5,5,...,0.004167,0.030702,0.028571,0.016667,0.045455,0.076923,0.000000,0.000000,0.085714,0.051724
69786,15653,Port Adelaide,Adelaide,2020-06-13,2,17,8,110,5,5,...,0.016667,0.030702,0.000000,0.022222,0.000000,0.000000,0.000000,0.000000,0.057143,0.000000
69787,15653,Port Adelaide,Adelaide,2020-06-13,2,17,8,110,5,5,...,0.004167,0.039474,0.085714,0.022222,0.000000,0.000000,0.090909,0.028571,0.028571,0.000000
69788,15653,Port Adelaide,Adelaide,2020-06-13,2,17,8,110,5,5,...,0.004167,0.017544,0.028571,0.027778,0.000000,0.038462,0.000000,0.028571,0.000000,0.051724
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79408,15952,Port Adelaide,Western Bulldogs,2021-05-15,9,12,5,77,15,6,...,0.014925,0.010753,0.028571,0.015306,0.000000,0.000000,0.000000,0.000000,0.043478,0.000000
79409,15952,Port Adelaide,Western Bulldogs,2021-05-15,9,12,5,77,15,6,...,0.002985,0.010753,0.028571,0.020408,0.000000,0.000000,0.000000,0.000000,0.021739,0.000000
79410,15952,Port Adelaide,Western Bulldogs,2021-05-15,9,12,5,77,15,6,...,0.014925,0.014337,0.000000,0.025510,0.000000,0.040000,0.000000,0.026316,0.000000,0.013699
79411,15952,Port Adelaide,Western Bulldogs,2021-05-15,9,12,5,77,15,6,...,0.017910,0.014337,0.000000,0.000000,0.266667,0.000000,0.000000,0.026316,0.021739,0.013699


In [220]:
# find stats where none occur in a match and hence player proportion is n/a
s = df.isna().any()
na_counts = list(s[s].index)
na_counts

['bounces_proportion',
 'contest_def_losses_proportion',
 'contest_off_wins_proportion',
 'marks_on_lead_proportion',
 'score_launches_proportion']

In [221]:
df.fillna(0, inplace=True)

# Coaches Votes

In [222]:
c_votes = pd.read_parquet('../../data/raw/coaches_votes.parquet')

In [223]:
df = pd.merge(df, c_votes, on=['match_id', 'player_id'], how='inner')

In [224]:
# coaches votes merged successfully
df[df.isna().any(axis=1)]

Unnamed: 0,match_id,match_home_team,match_away_team,match_date,match_round,match_home_team_goals,match_home_team_behinds,match_home_team_score,match_away_team_goals,match_away_team_behinds,...,effective_kicks_proportion,f50_ground_ball_gets_proportion,ground_ball_gets_proportion,hitouts_to_advantage_proportion,intercept_marks_proportion,marks_on_lead_proportion,score_launches_proportion,shots_at_goal_proportion,spoils_proportion,coaches_votes


In [225]:
# loss/win by x amount has same value for all players in the match
df.match_margin.unique()

array([ 63,  44,  22,  41,  69,   2,   4,  49,  91,  25,  13, 108,  18,
        21, 129,  92,  60,  59,  29,  17,  81,  56,  30,  24,   5,  65,
        10,  36,   1,   8,  42,  34,  38,  37,  19,  67,  43,   7,  35,
        66,  50,  27,  58,  61,  40,  12, 101,  20,  62,  48,  28,  26,
        16, 115,  54,   6,  97,  95,  46,   3,  78,  32, 126,  84,  94,
        33,  23,  47,  71, 162,  14,  53,  31, 119,  72,  52,  11, 120,
        76,  98,  70,  96,  82,  64,  45, 128,  15,   0,  68,  79, 148,
        39,  55,   9, 135,  83, 100,  90,  77,  75,  86, 122, 113,  74,
       121,  93,  99, 145, 111,  87, 110,  85, 105, 103,  57,  73,  89,
       138, 112,  88, 102,  80,  51, 104, 109, 133])

In [226]:
# lets engineer a feature that captures how much a team won/lost by
def winning_margin(row):
    if row['match_winner'] == row['player_team']:
        return row['match_margin']
    # note in the event of a draw match_winner == "Draw" => return -1*0 = 0
    else:
        return -1*row['match_margin']

In [227]:
df['winning_margin'] = df.apply(winning_margin, axis=1)

In [228]:
df.head()

Unnamed: 0,match_id,match_home_team,match_away_team,match_date,match_round,match_home_team_goals,match_home_team_behinds,match_home_team_score,match_away_team_goals,match_away_team_behinds,...,f50_ground_ball_gets_proportion,ground_ball_gets_proportion,hitouts_to_advantage_proportion,intercept_marks_proportion,marks_on_lead_proportion,score_launches_proportion,shots_at_goal_proportion,spoils_proportion,coaches_votes,winning_margin
0,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,0.0,0.034314,0.0,0.0,0.090909,0.054054,0.021739,0.015625,0.0,-63
1,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,0.0,0.02451,0.066667,0.0,0.181818,0.027027,0.043478,0.015625,0.0,63
2,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,0.026316,0.019608,0.0,0.068182,0.0,0.027027,0.0,0.046875,2.0,-63
3,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,0.131579,0.039216,0.0,0.022727,0.0,0.054054,0.021739,0.0,0.0,63
4,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,0.0,0.014706,0.0,0.181818,0.0,0.054054,0.0,0.046875,7.0,63


# Previous Polling Performance

In [229]:
yearly_votes = pd.DataFrame(df.groupby(['season', 'player_id'])['brownlow_votes'].agg(['sum',
                                                                                      'count']))

In [230]:
yearly_votes.reset_index(inplace=True)
# adjust season by 1 so that the df's merge as intended
yearly_votes['season'] = yearly_votes['season'] + 1
yearly_votes['average_votes_prev'] = yearly_votes['sum'] / yearly_votes['count']
yearly_votes = yearly_votes.rename(columns={'brownlow_votes': 'prev_total_bv'})
yearly_votes.sort_values('average_votes_prev')

Unnamed: 0,season,player_id,sum,count,average_votes_prev
7303,2023,13026,0.0,1,0.000000
5924,2021,12820,0.0,7,0.000000
5925,2021,12821,0.0,2,0.000000
3457,2018,11721,0.0,8,0.000000
3456,2018,11720,0.0,22,0.000000
...,...,...,...,...,...
4765,2020,11844,33.0,20,1.650000
2828,2017,11706,35.0,21,1.666667
2278,2016,11844,31.0,18,1.722222
5874,2021,12769,9.0,5,1.800000


In [231]:
# we now have a feature that contains the average amount of votes a player polled per game for the previous year
new_df = pd.merge(df, yearly_votes, on=['season', 'player_id'], how='left')

In [232]:
# to confirm, Dangerfield won the brownlow in 2016 with 35 votes across 22 games => 1.59 vote average
# one of those matches would be the case where the stats were not recorded, therefore in this dataset 35/21 = 1.67

new_df.query('season == 2017 & player_last_name == "Dangerfield"')[['player_last_name', 'average_votes_prev']].head(1)

Unnamed: 0,player_last_name,average_votes_prev
43397,Dangerfield,1.666667


In [233]:
new_df.sort_values(['match_date', 'match_id'], inplace=True)

In [234]:
new_df.reset_index(drop=True, inplace=True)

# Dummy Variables

In [235]:
# adds feature that equals 1 if a player obtains 30+ disposals and 2+ goals
new_df['30_and_2'] = ((new_df['disposals'] >= 30) & (new_df['goals'] >= 2)).astype(int)

# adds feature that equals 1 if a player manages 5 or more goals
new_df['high_goal_scorer'] = (new_df['goals'] >= 5).astype(int)

# if winning_margin does not predict well, include dummy variable whether the team won or not
# in the event of a draw, will return 0
new_df['won_match'] = (new_df['player_team'] == new_df['match_winner']).astype(int)

## Captains

In [236]:
captains = pd.read_parquet('../../data/curated/season_captains_12-22.parquet')

In [237]:
captains

Unnamed: 0,season,player_id,player_team,player,match_round
0,2012,10942,Sydney,Adam Goodes,1
1,2012,10942,Sydney,Adam Goodes,2
2,2012,10942,Sydney,Adam Goodes,3
3,2012,10942,Sydney,Adam Goodes,4
4,2012,10942,Sydney,Adam Goodes,5
...,...,...,...,...,...
5557,2021,11671,Richmond,Trent Cotchin,19
5558,2021,11671,Richmond,Trent Cotchin,20
5559,2021,11671,Richmond,Trent Cotchin,21
5560,2021,11671,Richmond,Trent Cotchin,22


In [238]:
new_df = pd.merge(new_df, captains, on=['season', 'player_id', 'match_round', 'player_team'],
         how='left')

In [239]:
new_df['is_captain'] = new_df['player'].notna().astype(int)

In [240]:
new_df

Unnamed: 0,match_id,match_home_team,match_away_team,match_date,match_round,match_home_team_goals,match_home_team_behinds,match_home_team_score,match_away_team_goals,match_away_team_behinds,...,coaches_votes,winning_margin,sum,count,average_votes_prev,30_and_2,high_goal_scorer,won_match,player,is_captain
0,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,0.0,-63,,,,0,0,0,,0
1,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,0.0,63,,,,0,0,1,Adam Goodes,1
2,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,2.0,-63,,,,0,0,0,,0
3,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,0.0,63,,,,0,0,1,,0
4,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,7.0,63,,,,0,0,1,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93738,16346,St Kilda,Sydney,2022-08-21,23,11,8,74,13,10,...,0.0,14,0.0,7.0,0.000000,0,0,1,,0
93739,16346,St Kilda,Sydney,2022-08-21,23,11,8,74,13,10,...,0.0,14,2.0,17.0,0.117647,0,0,1,,0
93740,16346,St Kilda,Sydney,2022-08-21,23,11,8,74,13,10,...,0.0,-14,,,,0,0,0,,0
93741,16346,St Kilda,Sydney,2022-08-21,23,11,8,74,13,10,...,0.0,-14,,,,0,0,0,,0


# Player Position

Convert player positions to their general position

i.e. 

left back pocket, right back pocket, left half back, right half back => defender

full back, centre half back => key defender

likewise for forwards

ruck remains the same

left wing,  right wing => wing

rover, ruck rover, centre => rover

In [241]:
list(new_df.player_position.unique())

['R',
 'CHF',
 'WR',
 'RR',
 'CHB',
 'BPL',
 'FPL',
 'HFFL',
 'WL',
 'FPR',
 'FB',
 'C',
 'HBFR',
 'INT',
 'HFFR',
 'BPR',
 'RK',
 'FF',
 'HBFL',
 'SUB']

In [242]:
convert_pos = {
    'CHB': 'key_defender',
    'RR': 'rover',
    'HFFL': 'forward',
    'HBFR': 'defender',
    'HBFL': 'defender',
    'C': 'rover',
    'FB': 'key_defender',
    'CHF': 'key_forward',
    'WL': 'wing',
    'INT': 'INT',
    'FPL': 'forward',
    'R': 'rover',
    'WR': 'wing',
    'RK': 'ruck',
    'BPR': 'defender',
    'FF': 'key_forward',
    'SUB': 'sub',
    'HFFR': 'forward',
    'FPR': 'forward',
    'BPL': 'defender'
}
new_df.replace({'player_position': convert_pos}, inplace=True)

In [243]:
new_df = pd.get_dummies(new_df, columns=['player_position'], dtype=int)

In [244]:
new_df

Unnamed: 0,match_id,match_home_team,match_away_team,match_date,match_round,match_home_team_goals,match_home_team_behinds,match_home_team_score,match_away_team_goals,match_away_team_behinds,...,is_captain,player_position_INT,player_position_defender,player_position_forward,player_position_key_defender,player_position_key_forward,player_position_rover,player_position_ruck,player_position_sub,player_position_wing
0,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,0,0,0,0,0,0,1,0,0,0
1,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,1,0,0,0,0,1,0,0,0,0
2,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,0,0,0,0,0,0,0,0,0,1
3,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,0,0,0,0,0,0,1,0,0,0
4,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93738,16346,St Kilda,Sydney,2022-08-21,23,11,8,74,13,10,...,0,1,0,0,0,0,0,0,0,0
93739,16346,St Kilda,Sydney,2022-08-21,23,11,8,74,13,10,...,0,1,0,0,0,0,0,0,0,0
93740,16346,St Kilda,Sydney,2022-08-21,23,11,8,74,13,10,...,0,0,0,1,0,0,0,0,0,0
93741,16346,St Kilda,Sydney,2022-08-21,23,11,8,74,13,10,...,0,0,0,0,0,0,0,0,0,1


In [245]:
new_df.columns[:50]

Index(['match_id', 'match_home_team', 'match_away_team', 'match_date',
       'match_round', 'match_home_team_goals', 'match_home_team_behinds',
       'match_home_team_score', 'match_away_team_goals',
       'match_away_team_behinds', 'match_away_team_score', 'match_margin',
       'match_winner', 'player_id', 'player_first_name', 'player_last_name',
       'player_team', 'kicks', 'marks', 'handballs', 'disposals',
       'effective_disposals', 'disposal_efficiency_percentage', 'goals',
       'behinds', 'hitouts', 'tackles', 'rebounds', 'inside_fifties',
       'clearances', 'clangers', 'free_kicks_for', 'free_kicks_against',
       'brownlow_votes', 'contested_possessions', 'uncontested_possessions',
       'contested_marks', 'marks_inside_fifty', 'one_percenters', 'bounces',
       'goal_assists', 'time_on_ground_percentage', 'afl_fantasy_score',
       'centre_clearances', 'stoppage_clearances', 'score_involvements',
       'metres_gained', 'turnovers', 'intercepts', 'tackles_insi

In [246]:
new_df.columns[50:100]

Index(['contest_def_losses', 'contest_def_one_on_ones',
       'contest_off_one_on_ones', 'contest_off_wins', 'def_half_pressure_acts',
       'effective_kicks', 'f50_ground_ball_gets', 'ground_ball_gets',
       'hitouts_to_advantage', 'hitout_win_percentage', 'intercept_marks',
       'marks_on_lead', 'pressure_acts', 'rating_points', 'ruck_contests',
       'score_launches', 'shots_at_goal', 'spoils', 'season', 'SC',
       'kicks_total', 'marks_total', 'handballs_total', 'disposals_total',
       'effective_disposals_total', 'goals_total', 'afl_fantasy_score_total',
       'SC_total', 'behinds_total', 'hitouts_total', 'tackles_total',
       'rebounds_total', 'inside_fifties_total', 'clearances_total',
       'clangers_total', 'free_kicks_for_total', 'free_kicks_against_total',
       'contested_possessions_total', 'uncontested_possessions_total',
       'contested_marks_total', 'marks_inside_fifty_total',
       'one_percenters_total', 'bounces_total', 'goal_assists_total',
      

In [247]:
new_df.columns[100:150]

Index(['tackles_inside_fifty_total', 'contest_def_losses_total',
       'contest_def_one_on_ones_total', 'contest_off_one_on_ones_total',
       'contest_off_wins_total', 'def_half_pressure_acts_total',
       'effective_kicks_total', 'f50_ground_ball_gets_total',
       'ground_ball_gets_total', 'hitouts_to_advantage_total',
       'intercept_marks_total', 'marks_on_lead_total', 'score_launches_total',
       'shots_at_goal_total', 'spoils_total', 'kicks_proportion',
       'marks_proportion', 'handballs_proportion', 'disposals_proportion',
       'effective_disposals_proportion', 'goals_proportion',
       'afl_fantasy_score_proportion', 'SC_proportion', 'behinds_proportion',
       'hitouts_proportion', 'tackles_proportion', 'rebounds_proportion',
       'inside_fifties_proportion', 'clearances_proportion',
       'clangers_proportion', 'free_kicks_for_proportion',
       'free_kicks_against_proportion', 'contested_possessions_proportion',
       'uncontested_possessions_proportion'

In [248]:
new_df.columns[150:]

Index(['def_half_pressure_acts_proportion', 'effective_kicks_proportion',
       'f50_ground_ball_gets_proportion', 'ground_ball_gets_proportion',
       'hitouts_to_advantage_proportion', 'intercept_marks_proportion',
       'marks_on_lead_proportion', 'score_launches_proportion',
       'shots_at_goal_proportion', 'spoils_proportion', 'coaches_votes',
       'winning_margin', 'sum', 'count', 'average_votes_prev', '30_and_2',
       'high_goal_scorer', 'won_match', 'player', 'is_captain',
       'player_position_INT', 'player_position_defender',
       'player_position_forward', 'player_position_key_defender',
       'player_position_key_forward', 'player_position_rover',
       'player_position_ruck', 'player_position_sub', 'player_position_wing'],
      dtype='object')

In [251]:
# reorder columns of dataframe
reorder_cols = [
    # team/match information
    'match_id', 'match_home_team', 'match_away_team', 'match_date',
    'match_round', 'season', 'match_home_team_goals', 'match_home_team_behinds',
    'match_home_team_score', 'match_away_team_goals',
    'match_away_team_behinds', 'match_away_team_score', 

    'match_margin', 'match_winner', 

    # player information
    'player_id', 'player_first_name', 'player_last_name', 'player_team', 'is_captain',

    # player position
    'player_position_defender', 'player_position_rover',
    'player_position_key_defender', 'player_position_key_forward',
    'player_position_forward', 'player_position_INT',
    'player_position_ruck', 'player_position_sub', 'player_position_wing',

    # player stats
    'kicks', 'marks', 'handballs', 'disposals',
    'effective_disposals', 'disposal_efficiency_percentage', 'goals',
    'behinds', 'hitouts', 'tackles', 'rebounds', 'inside_fifties',
    'clearances', 'clangers', 'free_kicks_for', 'free_kicks_against',
    'contested_possessions', 'uncontested_possessions',
    'contested_marks', 'marks_inside_fifty', 'one_percenters', 'bounces',
    'goal_assists', 'time_on_ground_percentage', 'afl_fantasy_score', 'SC',
    'centre_clearances', 'stoppage_clearances', 'score_involvements',
    'metres_gained', 'turnovers', 'intercepts', 'tackles_inside_fifty',
    'contest_def_losses', 'contest_def_one_on_ones',
    'contest_off_one_on_ones', 'contest_off_wins', 'def_half_pressure_acts',
    'effective_kicks', 'f50_ground_ball_gets', 'ground_ball_gets',
    'hitouts_to_advantage', 'hitout_win_percentage', 'intercept_marks',
    'marks_on_lead', 'pressure_acts',  'ruck_contests',
    'score_launches', 'shots_at_goal', 'spoils', 

    # engineered stats
    'rating_points',
    'winning_margin', 'won_match',
    '30_and_2', 'high_goal_scorer', 

    # player stat proportions

    'kicks_proportion', 'marks_proportion', 'handballs_proportion', 'disposals_proportion',
    'effective_disposals_proportion', 'goals_proportion',
    'afl_fantasy_score_proportion', 'SC_proportion', 'behinds_proportion',
    'hitouts_proportion', 'tackles_proportion', 'rebounds_proportion',
    'inside_fifties_proportion', 'clearances_proportion',
    'clangers_proportion', 'free_kicks_for_proportion',
    'free_kicks_against_proportion', 'contested_possessions_proportion',
    'uncontested_possessions_proportion', 'contested_marks_proportion',
    'marks_inside_fifty_proportion', 'one_percenters_proportion',
    'bounces_proportion', 'goal_assists_proportion',
    'centre_clearances_proportion', 'stoppage_clearances_proportion',
    'score_involvements_proportion', 'metres_gained_proportion',
    'turnovers_proportion', 'intercepts_proportion',
    'tackles_inside_fifty_proportion', 'contest_def_losses_proportion',
    'contest_def_one_on_ones_proportion',
    'contest_off_one_on_ones_proportion', 'contest_off_wins_proportion',
    'def_half_pressure_acts_proportion', 'effective_kicks_proportion',
    'f50_ground_ball_gets_proportion', 'ground_ball_gets_proportion',
    'hitouts_to_advantage_proportion', 'intercept_marks_proportion',
    'marks_on_lead_proportion', 'score_launches_proportion',
    'shots_at_goal_proportion', 'spoils_proportion',

    # team match totals
    'kicks_total', 'marks_total', 'handballs_total', 'disposals_total',
    'effective_disposals_total', 'goals_total', 'afl_fantasy_score_total',
    'SC_total', 'behinds_total', 'hitouts_total', 'tackles_total',
    'rebounds_total', 'inside_fifties_total', 'clearances_total',
    'clangers_total', 'free_kicks_for_total', 'free_kicks_against_total',
    'contested_possessions_total', 'uncontested_possessions_total',
    'contested_marks_total', 'marks_inside_fifty_total',
    'one_percenters_total', 'bounces_total', 'goal_assists_total',
    'centre_clearances_total', 'stoppage_clearances_total',
    'score_involvements_total', 'metres_gained_total', 'turnovers_total',
    'intercepts_total',
    'tackles_inside_fifty_total', 'contest_def_losses_total',
    'contest_def_one_on_ones_total', 'contest_off_one_on_ones_total',
    'contest_off_wins_total', 'def_half_pressure_acts_total',
    'effective_kicks_total', 'f50_ground_ball_gets_total',
    'ground_ball_gets_total', 'hitouts_to_advantage_total',
    'intercept_marks_total', 'marks_on_lead_total', 'score_launches_total',
    'shots_at_goal_total', 'spoils_total', 

    # player votes
    'coaches_votes', 'average_votes_prev', 'brownlow_votes'
]

In [252]:
new_df = new_df[reorder_cols]

In [254]:
# do not have previous years brownlow numbers for 2012, therefore we drop the season from the data
new_df = new_df.query('season > 2012')

In [255]:
new_df.isna().sum().sort_values(ascending=False)[:5]

average_votes_prev                 8782
match_id                              0
f50_ground_ball_gets_proportion       0
intercepts_proportion                 0
tackles_inside_fifty_proportion       0
dtype: int64

In [256]:
# n/a values for players that did not play previous season (first year player, injured for entire season etc.)
new_df.loc[new_df.average_votes_prev.isna(), 'average_votes_prev'] = 0

In [257]:
new_df.season.unique()

array([2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022])

In [258]:
new_df.to_parquet('../../data/curated/clean_stats_13-22.parquet')