# Preprocessing 3

In this notebook we engineer new features that represent statistics as a proportion of the entire games stats,
this allows us to account for the changing nature of the game that promotes fast ball movement, high possession games.
While also controlling for statistics from the 2020 season in which games were played with shorter quarter lengths.

Realistically when it comes to brownlow votes, it does not matter what the absolute stats are, only the relative performance to other players within the match.

We merge the freshly processed coaches votes, while also including a feature that represents a player's previous polling performance. 

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_parquet('../../data/raw/cleaned_stats_12-22_fixed_bv.parquet')

In [3]:
df.query('season == 2022 & match_round == 21 & player_last_name == "Daicos"')['rebounds']

7891    0
7909    7
Name: rebounds, dtype: int64

In [4]:
# drop rows where time on ground is zero
# i.e. a sub that was not utilised
df = df.query('time_on_ground_percentage > 0')

In [5]:
df

Unnamed: 0,match_id,match_home_team,match_away_team,match_date,match_round,match_home_team_goals,match_home_team_behinds,match_home_team_score,match_away_team_goals,match_away_team_behinds,...,intercept_marks,marks_on_lead,pressure_acts,rating_points,ruck_contests,score_launches,shots_at_goal,spoils,player_position,season
0,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,0.0,1.0,15.0,11.6,0.0,2.0,1.0,1.0,R,2012
1,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,0.0,2.0,12.0,9.8,0.0,1.0,2.0,1.0,CHF,2012
2,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,3.0,0.0,15.0,12.4,0.0,1.0,0.0,3.0,WR,2012
3,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,1.0,0.0,23.0,15.9,0.0,2.0,1.0,0.0,RR,2012
4,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,8.0,0.0,7.0,16.0,0.0,2.0,0.0,3.0,CHB,2012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9103,16346,St Kilda,Sydney,2022-08-21,23,11,8,74,13,10,...,0.0,1.0,8.0,5.6,0.0,1.0,3.0,0.0,INT,2022
9104,16346,St Kilda,Sydney,2022-08-21,23,11,8,74,13,10,...,1.0,0.0,21.0,6.1,0.0,1.0,1.0,0.0,INT,2022
9105,16346,St Kilda,Sydney,2022-08-21,23,11,8,74,13,10,...,0.0,0.0,23.0,5.8,0.0,0.0,2.0,1.0,FPR,2022
9106,16346,St Kilda,Sydney,2022-08-21,23,11,8,74,13,10,...,0.0,3.0,13.0,2.3,0.0,2.0,2.0,0.0,WR,2022


In [5]:
df.columns

Index(['match_id', 'match_home_team', 'match_away_team', 'match_date',
       'match_round', 'season', 'match_home_team_goals',
       'match_home_team_behinds', 'match_home_team_score',
       'match_away_team_goals', 'match_away_team_behinds',
       'match_away_team_score', 'match_margin', 'match_winner', 'player_id',
       'player_first_name', 'player_last_name', 'player_team',
       'player_position', 'kicks', 'marks', 'handballs', 'disposals',
       'effective_disposals', 'disposal_efficiency_percentage', 'goals',
       'behinds', 'hitouts', 'tackles', 'rebounds', 'inside_fifties',
       'clearances', 'clangers', 'free_kicks_for', 'free_kicks_against',
       'brownlow_votes', 'contested_possessions', 'uncontested_possessions',
       'contested_marks', 'marks_inside_fifty', 'one_percenters', 'bounces',
       'goal_assists', 'time_on_ground_percentage', 'afl_fantasy_score',
       'centre_clearances', 'stoppage_clearances', 'score_involvements',
       'metres_gained', 'tur

In [7]:
SC = pd.read_csv('../../data/raw/supercoach_12-22.csv', index_col=0)[['match_id', 'player_id', 'SC']]

In [8]:
SC

Unnamed: 0,match_id,player_id,SC
0,13966,11260.0,113.0
1,13966,11135.0,38.0
2,13966,11927.0,48.0
3,13966,11327.0,95.0
4,13966,12048.0,46.0
...,...,...,...
94222,16346,12009.0,88.0
94223,16346,12634.0,24.0
94224,16346,12419.0,37.0
94225,16346,12516.0,121.0


In [9]:
df = pd.merge(df, SC, on=['player_id', 'match_id'], how='left')

In [10]:
# SC scores merged successfully
df[df.isna().any(axis=1)]

Unnamed: 0,match_id,match_home_team,match_away_team,match_date,match_round,season,match_home_team_goals,match_home_team_behinds,match_home_team_score,match_away_team_goals,...,hitout_win_percentage,intercept_marks,marks_on_lead,pressure_acts,rating_points,ruck_contests,score_launches,shots_at_goal,spoils,SC


In [11]:
counted_stats = [
    'kicks', 'marks', 'handballs', 'disposals',
    'effective_disposals', 'goals', 'afl_fantasy_score', 'SC',
    'behinds', 'hitouts', 'tackles', 'rebounds', 'inside_fifties',
    'clearances', 'clangers', 'free_kicks_for', 'free_kicks_against',
    'contested_possessions', 'uncontested_possessions',
    'contested_marks', 'marks_inside_fifty', 'one_percenters', 'bounces',
    'goal_assists',
    'centre_clearances', 'stoppage_clearances', 'score_involvements',
    'metres_gained', 'turnovers', 'intercepts', 'tackles_inside_fifty',
    'contest_def_losses', 'contest_def_one_on_ones',
    'contest_off_one_on_ones', 'contest_off_wins', 'def_half_pressure_acts',
    'effective_kicks', 'f50_ground_ball_gets', 'ground_ball_gets',
    'hitouts_to_advantage', 'hitout_win_percentage', 'intercept_marks',
    'marks_on_lead', 'score_launches', 'shots_at_goal', 'spoils'
]

In [12]:
counted_stat_totals = df.groupby('match_id')[counted_stats].sum().add_suffix('_total')

In [13]:
# sum all instances of a stat occuring for all players in each match
counted_stat_totals

Unnamed: 0_level_0,kicks_total,marks_total,handballs_total,disposals_total,effective_disposals_total,goals_total,afl_fantasy_score_total,SC_total,behinds_total,hitouts_total,...,effective_kicks_total,f50_ground_ball_gets_total,ground_ball_gets_total,hitouts_to_advantage_total,hitout_win_percentage_total,intercept_marks_total,marks_on_lead_total,score_launches_total,shots_at_goal_total,spoils_total
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13960,434,203,321,755,535,19,3253,3276.0,18,80,...,281,38,204,15,153,44,11,37,46,64
13961,397,161,298,695,505,30,2968,3297.0,22,64,...,269,29,204,13,290,18,12,54,57,75
13962,403,164,286,689,516,36,3091,3300.0,32,73,...,282,30,195,23,268,23,3,70,75,50
13963,424,172,259,683,488,28,3060,3301.0,24,80,...,284,26,196,22,170,29,6,55,57,81
13964,442,192,326,768,561,29,3236,3301.0,23,63,...,294,34,212,17,187,36,6,60,60,75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16342,411,181,285,696,535,32,2897,3299.0,18,59,...,303,35,173,16,160,20,17,55,59,59
16343,403,145,285,688,468,23,2934,3298.0,18,82,...,241,36,191,22,272,36,14,51,53,78
16344,476,227,269,745,550,22,3303,3300.0,15,79,...,328,36,193,23,247,29,16,41,47,70
16345,425,162,253,678,473,21,2930,3301.0,21,66,...,273,33,188,27,143,40,11,44,47,83


In [14]:
# remove stat totals where stat did not take place in game
# prevents divide by 0 in future cell
s = counted_stat_totals.eq(0).any()
drop_stat_count = list(s[s].index)

In [15]:
drop_stat_count

['bounces_total',
 'contest_def_losses_total',
 'contest_off_wins_total',
 'marks_on_lead_total',
 'score_launches_total']

In [16]:
counted_stat_totals.drop(columns=drop_stat_count, inplace=True)

In [17]:
new_counted_stats = list(set(counted_stats) - set(
    ['bounces',
    'contest_def_losses',
    'contest_off_wins',
    'marks_on_lead',
    'score_launches']
 ))

In [18]:
# 46 - 5 = 41
len(new_counted_stats)

41

In [19]:
df = pd.merge(df, counted_stat_totals, on='match_id', how='inner')

In [20]:
df.head()

Unnamed: 0,match_id,match_home_team,match_away_team,match_date,match_round,season,match_home_team_goals,match_home_team_behinds,match_home_team_score,match_away_team_goals,...,contest_off_one_on_ones_total,def_half_pressure_acts_total,effective_kicks_total,f50_ground_ball_gets_total,ground_ball_gets_total,hitouts_to_advantage_total,hitout_win_percentage_total,intercept_marks_total,shots_at_goal_total,spoils_total
0,13960,Greater Western Sydney,Sydney,2012-03-24,1,2012,5,7,37,14,...,19,243,281,38,204,15,153,44,46,64
1,13960,Greater Western Sydney,Sydney,2012-03-24,1,2012,5,7,37,14,...,19,243,281,38,204,15,153,44,46,64
2,13960,Greater Western Sydney,Sydney,2012-03-24,1,2012,5,7,37,14,...,19,243,281,38,204,15,153,44,46,64
3,13960,Greater Western Sydney,Sydney,2012-03-24,1,2012,5,7,37,14,...,19,243,281,38,204,15,153,44,46,64
4,13960,Greater Western Sydney,Sydney,2012-03-24,1,2012,5,7,37,14,...,19,243,281,38,204,15,153,44,46,64


In [58]:
c_votes = pd.read_parquet('../../data/raw/coaches_votes.parquet')

In [59]:
df = pd.merge(df, c_votes, on=['match_id', 'player_id'], how='inner')

In [60]:
# coaches votes merged successfully
df[df.isna().any(axis=1)]

Unnamed: 0,match_id,match_home_team,match_away_team,match_date,match_round,season,match_home_team_goals,match_home_team_behinds,match_home_team_score,match_away_team_goals,...,score_involvements_proportion,contest_off_one_on_ones_proportion,effective_disposals_proportion,rebounds_proportion,goal_assists_proportion,intercepts_proportion,clangers_proportion,effective_kicks_proportion,free_kicks_against_proportion,coaches_votes


In [61]:
df.columns

Index(['match_id', 'match_home_team', 'match_away_team', 'match_date',
       'match_round', 'season', 'match_home_team_goals',
       'match_home_team_behinds', 'match_home_team_score',
       'match_away_team_goals',
       ...
       'score_involvements_proportion', 'contest_off_one_on_ones_proportion',
       'effective_disposals_proportion', 'rebounds_proportion',
       'goal_assists_proportion', 'intercepts_proportion',
       'clangers_proportion', 'effective_kicks_proportion',
       'free_kicks_against_proportion', 'coaches_votes'],
      dtype='object', length=157)

In [62]:
# loss/win by x amount has same value for all players in the match
df.match_margin.unique()

array([ 63,  44,  22,  41,  69,   2,   4,  49,  91,  25,  13, 108,  18,
        21, 129,  92,  60,  59,  29,  17,  81,  56,  30,  24,   5,  65,
        10,  36,   1,   8,  42,  34,  38,  37,  19,  67,  43,   7,  35,
        66,  50,  27,  58,  61,  40,  12, 101,  20,  62,  48,  28,  26,
        16, 115,  54,   6,  97,  95,  46,   3,  78,  32, 126,  84,  94,
        33,  23,  47,  71, 162,  14,  53,  31, 119,  72,  52,  11, 120,
        76,  98,  70,  96,  82,  64,  45, 128,  15,   0,  68,  79, 148,
        39,  55,   9, 135,  83, 100,  90,  77,  75,  86, 122, 113,  74,
       121,  93,  99, 145, 111,  87, 110,  85, 105, 103,  57,  73,  89,
       138, 112,  88, 102,  80,  51, 104, 109, 133])

In [63]:
# lets engineer a feature that captures how much a team won/lost by
def winning_margin(row):
    if row['match_winner'] == row['player_team']:
        return row['match_margin']
    # note in the event of a draw match_winner == "Draw" => return -1*0 = 0
    else:
        return -1*row['match_margin']

In [64]:
df['winning_margin'] = df.apply(winning_margin, axis=1)

In [65]:
df.head()

Unnamed: 0,match_id,match_home_team,match_away_team,match_date,match_round,season,match_home_team_goals,match_home_team_behinds,match_home_team_score,match_away_team_goals,...,score_involvements_proportion,contest_off_one_on_ones_proportion,effective_disposals_proportion,rebounds_proportion,goal_assists_proportion,intercepts_proportion,clangers_proportion,effective_kicks_proportion,free_kicks_against_proportion,coaches_votes
0,13960,Greater Western Sydney,Sydney,2012-03-24,1,2012,5,7,37,14,...,0.028777,0.0,0.037383,0.027027,0.0,0.0,0.026549,0.032028,0.05,0.0
1,13960,Greater Western Sydney,Sydney,2012-03-24,1,2012,5,7,37,14,...,0.035971,0.157895,0.028037,0.040541,0.083333,0.015267,0.026549,0.02847,0.0,0.0
2,13960,Greater Western Sydney,Sydney,2012-03-24,1,2012,5,7,37,14,...,0.007194,0.0,0.028037,0.067568,0.0,0.053435,0.053097,0.035587,0.025,2.0
3,13960,Greater Western Sydney,Sydney,2012-03-24,1,2012,5,7,37,14,...,0.057554,0.0,0.028037,0.0,0.0,0.007634,0.00885,0.021352,0.025,0.0
4,13960,Greater Western Sydney,Sydney,2012-03-24,1,2012,5,7,37,14,...,0.021583,0.0,0.028037,0.013514,0.0,0.099237,0.0,0.042705,0.0,7.0


In [66]:
for stat in new_counted_stats:
    df[stat+'_proportion'] = df[stat] / df[stat+'_total']

In [67]:
df.head()

Unnamed: 0,match_id,match_home_team,match_away_team,match_date,match_round,season,match_home_team_goals,match_home_team_behinds,match_home_team_score,match_away_team_goals,...,score_involvements_proportion,contest_off_one_on_ones_proportion,effective_disposals_proportion,rebounds_proportion,goal_assists_proportion,intercepts_proportion,clangers_proportion,effective_kicks_proportion,free_kicks_against_proportion,coaches_votes
0,13960,Greater Western Sydney,Sydney,2012-03-24,1,2012,5,7,37,14,...,0.028777,0.0,0.037383,0.027027,0.0,0.0,0.026549,0.032028,0.05,0.0
1,13960,Greater Western Sydney,Sydney,2012-03-24,1,2012,5,7,37,14,...,0.035971,0.157895,0.028037,0.040541,0.083333,0.015267,0.026549,0.02847,0.0,0.0
2,13960,Greater Western Sydney,Sydney,2012-03-24,1,2012,5,7,37,14,...,0.007194,0.0,0.028037,0.067568,0.0,0.053435,0.053097,0.035587,0.025,2.0
3,13960,Greater Western Sydney,Sydney,2012-03-24,1,2012,5,7,37,14,...,0.057554,0.0,0.028037,0.0,0.0,0.007634,0.00885,0.021352,0.025,0.0
4,13960,Greater Western Sydney,Sydney,2012-03-24,1,2012,5,7,37,14,...,0.021583,0.0,0.028037,0.013514,0.0,0.099237,0.0,0.042705,0.0,7.0


In [68]:
df.head()

Unnamed: 0,match_id,match_home_team,match_away_team,match_date,match_round,season,match_home_team_goals,match_home_team_behinds,match_home_team_score,match_away_team_goals,...,score_involvements_proportion,contest_off_one_on_ones_proportion,effective_disposals_proportion,rebounds_proportion,goal_assists_proportion,intercepts_proportion,clangers_proportion,effective_kicks_proportion,free_kicks_against_proportion,coaches_votes
0,13960,Greater Western Sydney,Sydney,2012-03-24,1,2012,5,7,37,14,...,0.028777,0.0,0.037383,0.027027,0.0,0.0,0.026549,0.032028,0.05,0.0
1,13960,Greater Western Sydney,Sydney,2012-03-24,1,2012,5,7,37,14,...,0.035971,0.157895,0.028037,0.040541,0.083333,0.015267,0.026549,0.02847,0.0,0.0
2,13960,Greater Western Sydney,Sydney,2012-03-24,1,2012,5,7,37,14,...,0.007194,0.0,0.028037,0.067568,0.0,0.053435,0.053097,0.035587,0.025,2.0
3,13960,Greater Western Sydney,Sydney,2012-03-24,1,2012,5,7,37,14,...,0.057554,0.0,0.028037,0.0,0.0,0.007634,0.00885,0.021352,0.025,0.0
4,13960,Greater Western Sydney,Sydney,2012-03-24,1,2012,5,7,37,14,...,0.021583,0.0,0.028037,0.013514,0.0,0.099237,0.0,0.042705,0.0,7.0


In [69]:
yearly_votes = pd.DataFrame(df.groupby(['season', 'player_id'])['brownlow_votes'].agg(['sum',
                                                                                      'count']))

In [70]:
yearly_votes.reset_index(inplace=True)
# adjust season by 1 so that the df's merge as intended
yearly_votes['season'] = yearly_votes['season'] + 1
yearly_votes['average_votes_prev'] = yearly_votes['sum'] / yearly_votes['count']
yearly_votes = yearly_votes.rename(columns={'brownlow_votes': 'prev_total_bv'})
yearly_votes.sort_values('average_votes_prev')

Unnamed: 0,season,player_id,sum,count,average_votes_prev
7303,2023,13026,0.0,1,0.000000
5924,2021,12820,0.0,7,0.000000
5925,2021,12821,0.0,2,0.000000
3457,2018,11721,0.0,8,0.000000
3456,2018,11720,0.0,22,0.000000
...,...,...,...,...,...
4765,2020,11844,33.0,20,1.650000
2828,2017,11706,35.0,21,1.666667
2278,2016,11844,31.0,18,1.722222
5874,2021,12769,9.0,5,1.800000


In [89]:
# we now have a feature that contains the average amount of votes a player polled per game
# for the previous year
new_df = pd.merge(df, yearly_votes, on=['season', 'player_id'], how='left')

In [90]:
# to confirm, Dangerfield won the brownlow in 2016 with 35 votes across 22 games => 1.59 vote average
# one of those matches would be the case where the stats were not recorded, therefore in this dataset 35/21 = 1.67

new_df.query('season == 2017 & player_last_name == "Dangerfield"')[['player_last_name', 'average_votes_prev']].head(1)

Unnamed: 0,player_last_name,average_votes_prev
43397,Dangerfield,1.666667


In [91]:
new_df.sort_values(['match_date', 'match_id'], inplace=True)

In [92]:
new_df.reset_index(drop=True, inplace=True)

In [93]:
# if winning_margin does not predict well, include dummy variable whether the team won or not
# in the event of a draw, will return 0
new_df['won_match'] = (new_df['player_team'] == new_df['match_winner']).astype(int)

In [94]:
# adds feature that equals 1 if a player obtains 30+ disposals and 2+ goals
new_df['30_and_2'] = ((new_df['disposals'] >= 30) & (new_df['goals'] >= 2)).astype(int)

# adds feature that equals 1 if a player manages 5 or more goals
new_df['high_goal_scorer'] = (new_df['goals'] >= 5).astype(int)

# Captains

In [95]:
captains = pd.read_csv('../../data/curated/player_is_captain.csv', index_col=0)

In [96]:
captains

Unnamed: 0,season,player_id,player_team,player,match_round
0,2012,10942,Sydney,Adam Goodes,1
1,2012,10942,Sydney,Adam Goodes,2
2,2012,10942,Sydney,Adam Goodes,3
3,2012,10942,Sydney,Adam Goodes,4
4,2012,10942,Sydney,Adam Goodes,5
...,...,...,...,...,...
5557,2021,11671,Richmond,Trent Cotchin,19
5558,2021,11671,Richmond,Trent Cotchin,20
5559,2021,11671,Richmond,Trent Cotchin,21
5560,2021,11671,Richmond,Trent Cotchin,22


In [97]:
new_df = pd.merge(new_df, captains, on=['season', 'player_id', 'match_round', 'player_team'],
         how='left')

In [98]:
new_df

Unnamed: 0,match_id,match_home_team,match_away_team,match_date,match_round,season,match_home_team_goals,match_home_team_behinds,match_home_team_score,match_away_team_goals,...,effective_kicks_proportion,free_kicks_against_proportion,coaches_votes,sum,count,average_votes_prev,won_match,30_and_2,high_goal_scorer,player
0,13960,Greater Western Sydney,Sydney,2012-03-24,1,2012,5,7,37,14,...,0.032028,0.050000,0.0,,,,0,0,0,
1,13960,Greater Western Sydney,Sydney,2012-03-24,1,2012,5,7,37,14,...,0.028470,0.000000,0.0,,,,1,0,0,Adam Goodes
2,13960,Greater Western Sydney,Sydney,2012-03-24,1,2012,5,7,37,14,...,0.035587,0.025000,2.0,,,,0,0,0,
3,13960,Greater Western Sydney,Sydney,2012-03-24,1,2012,5,7,37,14,...,0.021352,0.025000,0.0,,,,1,0,0,
4,13960,Greater Western Sydney,Sydney,2012-03-24,1,2012,5,7,37,14,...,0.042705,0.000000,7.0,,,,1,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93738,16346,St Kilda,Sydney,2022-08-21,23,2022,11,8,74,13,...,0.020339,0.000000,0.0,0.0,7.0,0.000000,1,0,0,
93739,16346,St Kilda,Sydney,2022-08-21,23,2022,11,8,74,13,...,0.027119,0.000000,0.0,2.0,17.0,0.117647,1,0,0,
93740,16346,St Kilda,Sydney,2022-08-21,23,2022,11,8,74,13,...,0.010169,0.025641,0.0,,,,0,0,0,
93741,16346,St Kilda,Sydney,2022-08-21,23,2022,11,8,74,13,...,0.023729,0.025641,0.0,,,,0,0,0,


In [99]:
new_df['is_captain'] = new_df['player'].notna().astype(int)

# Player Position

Convert player positions to their general position

i.e. 

left back pocket, right back pocket, left half back, right half back => defender

full back, centre half back => key defender

likewise for forwards

ruck remains the same

left wing,  right wing => wing

rover, ruck rover, centre => rover

In [100]:
new_df.query('player_position == "RK"')[['player_last_name', 'player_first_name', 'player_team']]

Unnamed: 0,player_last_name,player_first_name,player_team
16,Mumford,Shane,Sydney
30,Giles,Jonathan,Greater Western Sydney
59,Maric,Ivan,Richmond
68,Kreuzer,Matthew,Carlton
90,Jolly,Darren,Collingwood
...,...,...,...
93630,English,Tim,Western Bulldogs
93670,Pittonet,Marc,Carlton
93683,Cameron,Darcy,Collingwood
93701,Hickey,Tom,Sydney


In [101]:
list(new_df.player_position.unique())

['R',
 'CHF',
 'WR',
 'RR',
 'CHB',
 'BPL',
 'FPL',
 'HFFL',
 'WL',
 'FPR',
 'FB',
 'C',
 'HBFR',
 'INT',
 'HFFR',
 'BPR',
 'RK',
 'FF',
 'HBFL',
 'SUB']

In [102]:
convert_pos = {
    'CHB': 'key_defender',
    'RR': 'rover',
    'HFFL': 'forward',
    'HBFR': 'defender',
    'HBFL': 'defender',
    'C': 'rover',
    'FB': 'key_defender',
    'CHF': 'key_forward',
    'WL': 'wing',
    'INT': 'INT',
    'FPL': 'forward',
    'R': 'rover',
    'WR': 'wing',
    'RK': 'ruck',
    'BPR': 'defender',
    'FF': 'key_forward',
    'SUB': 'sub',
    'HFFR': 'forward',
    'FPR': 'forward',
    'BPL': 'defender'
}
new_df.replace({'player_position': convert_pos}, inplace=True)

In [103]:
new_df = pd.get_dummies(new_df, columns=['player_position'], dtype=int)

In [104]:
new_df

Unnamed: 0,match_id,match_home_team,match_away_team,match_date,match_round,season,match_home_team_goals,match_home_team_behinds,match_home_team_score,match_away_team_goals,...,is_captain,player_position_defender,player_position_rover,player_position_key_defender,player_position_key_forward,player_position_forward,player_position_INT,player_position_ruck,player_position_sub,player_position_wing
0,13960,Greater Western Sydney,Sydney,2012-03-24,1,2012,5,7,37,14,...,0,0,1,0,0,0,0,0,0,0
1,13960,Greater Western Sydney,Sydney,2012-03-24,1,2012,5,7,37,14,...,1,0,0,0,1,0,0,0,0,0
2,13960,Greater Western Sydney,Sydney,2012-03-24,1,2012,5,7,37,14,...,0,0,0,0,0,0,0,0,0,1
3,13960,Greater Western Sydney,Sydney,2012-03-24,1,2012,5,7,37,14,...,0,0,1,0,0,0,0,0,0,0
4,13960,Greater Western Sydney,Sydney,2012-03-24,1,2012,5,7,37,14,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93738,16346,St Kilda,Sydney,2022-08-21,23,2022,11,8,74,13,...,0,0,0,0,0,0,1,0,0,0
93739,16346,St Kilda,Sydney,2022-08-21,23,2022,11,8,74,13,...,0,0,0,0,0,0,1,0,0,0
93740,16346,St Kilda,Sydney,2022-08-21,23,2022,11,8,74,13,...,0,0,0,0,0,1,0,0,0,0
93741,16346,St Kilda,Sydney,2022-08-21,23,2022,11,8,74,13,...,0,0,0,0,0,0,0,0,0,1


In [106]:
new_df.columns[150:]

Index(['goal_assists_proportion', 'intercepts_proportion',
       'clangers_proportion', 'effective_kicks_proportion',
       'free_kicks_against_proportion', 'coaches_votes', 'sum', 'count',
       'average_votes_prev', 'won_match', '30_and_2', 'high_goal_scorer',
       'player', 'is_captain', 'player_position_defender',
       'player_position_rover', 'player_position_key_defender',
       'player_position_key_forward', 'player_position_forward',
       'player_position_INT', 'player_position_ruck', 'player_position_sub',
       'player_position_wing'],
      dtype='object')

In [107]:
# reorder columns of dataframe
reorder_cols = [
    # team/match information
    'match_id', 'match_home_team', 'match_away_team', 'match_date',
    'match_round', 'match_home_team_goals', 'match_home_team_behinds',
    'match_home_team_score', 'match_away_team_goals',
    'match_away_team_behinds', 'match_away_team_score', 'match_margin',
    'match_winner',  'season',

    # player information
    'player_id', 'player_first_name', 'player_last_name', 'player_team', 'is_captain',
    
    # player position
    'player_position_defender', 'player_position_rover',
    'player_position_key_defender', 'player_position_key_forward',
    'player_position_forward', 'player_position_INT',
    'player_position_ruck', 'player_position_sub', 'player_position_wing',

    # player stats
    'kicks', 'marks', 'handballs', 'disposals',
    'effective_disposals', 'disposal_efficiency_percentage', 'goals',
    'behinds', 'hitouts', 'tackles', 'rebounds', 'inside_fifties',
    'clearances', 'clangers', 'free_kicks_for', 'free_kicks_against',
    'contested_possessions', 'uncontested_possessions',
    'contested_marks', 'marks_inside_fifty', 'one_percenters', 'bounces',
    'goal_assists', 'time_on_ground_percentage', 'afl_fantasy_score', 'SC',
    'centre_clearances', 'stoppage_clearances', 'score_involvements',
    'metres_gained', 'turnovers', 'intercepts', 'tackles_inside_fifty',
    'contest_def_losses', 'contest_def_one_on_ones',
    'contest_off_one_on_ones', 'contest_off_wins', 'def_half_pressure_acts',
    'effective_kicks', 'f50_ground_ball_gets', 'ground_ball_gets',
    'hitouts_to_advantage', 'hitout_win_percentage', 'intercept_marks',
    'marks_on_lead', 'pressure_acts',  'ruck_contests',
    'score_launches', 'shots_at_goal', 'spoils', 
    
    # engineered stats
    'rating_points',
    'winning_margin', 'won_match',
    '30_and_2', 'high_goal_scorer', 

    # player stat proportions
    'kicks_proportion', 'marks_proportion', 'handballs_proportion',
    'disposals_proportion', 'effective_disposals_proportion',
    'goals_proportion', 'afl_fantasy_score_proportion', 'SC_proportion', 'behinds_proportion', 'hitouts_proportion',
    'tackles_proportion', 'rebounds_proportion',
    'inside_fifties_proportion', 'clearances_proportion',
    'clangers_proportion', 'free_kicks_for_proportion',
    'free_kicks_against_proportion', 'contested_possessions_proportion',
    'uncontested_possessions_proportion', 'contested_marks_proportion',
    'marks_inside_fifty_proportion', 'one_percenters_proportion',
    'goal_assists_proportion',
    'centre_clearances_proportion', 'stoppage_clearances_proportion',
    'score_involvements_proportion', 'metres_gained_proportion',
    'turnovers_proportion', 'intercepts_proportion',
    'tackles_inside_fifty_proportion',
    'contest_def_one_on_ones_proportion',
    'contest_off_one_on_ones_proportion',
    'def_half_pressure_acts_proportion', 'effective_kicks_proportion',
    'f50_ground_ball_gets_proportion', 'ground_ball_gets_proportion',
    'hitouts_to_advantage_proportion', 'hitout_win_percentage_proportion',
    'intercept_marks_proportion',
    'shots_at_goal_proportion',
    'spoils_proportion',

    # team match totals
    'kicks_total', 'marks_total', 'handballs_total',
    'disposals_total', 'effective_disposals_total', 'goals_total', 'afl_fantasy_score_total', 'SC_total',
    'behinds_total', 'hitouts_total', 'tackles_total', 'rebounds_total',
    'inside_fifties_total', 'clearances_total', 'clangers_total',
    'free_kicks_for_total', 'free_kicks_against_total',
    'contested_possessions_total', 'uncontested_possessions_total',
    'contested_marks_total', 'marks_inside_fifty_total',
    'one_percenters_total', 'goal_assists_total',
    'centre_clearances_total', 'stoppage_clearances_total',
    'score_involvements_total', 'metres_gained_total', 'turnovers_total',
    'intercepts_total', 'tackles_inside_fifty_total',
    'contest_def_one_on_ones_total',
    'contest_off_one_on_ones_total',
    'def_half_pressure_acts_total', 'effective_kicks_total',
    'f50_ground_ball_gets_total', 'ground_ball_gets_total',
    'hitouts_to_advantage_total', 'hitout_win_percentage_total',
    'intercept_marks_total',
    'shots_at_goal_total', 'spoils_total',
    # player votes
    'coaches_votes', 'average_votes_prev', 'brownlow_votes'
]

In [108]:
new_df = new_df[reorder_cols]

In [109]:
# do not have previous years brownlow numbers for 2012, therefore we drop the season from the data
new_df = new_df.query('season > 2012')

In [110]:
new_df.isna().sum().sort_values(ascending=False)[:5]

average_votes_prev                   8782
match_id                                0
def_half_pressure_acts_proportion       0
stoppage_clearances_proportion          0
score_involvements_proportion           0
dtype: int64

In [111]:
# n/a values for players that did not play previous season (first year player, injured for entire season etc.)
new_df.loc[new_df.average_votes_prev.isna(), 'average_votes_prev'] = 0

In [112]:
new_df.season.unique()

array([2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022])

In [113]:
new_df.to_parquet('../../data/curated/clean_stats_13-22.parquet')