# Preprocessing

In this notebook we filter out finals games since we are only interested in modelling the Brownlow Medal
We remove columns that are not relevant to the overall analysis, and columns that contain far too many N/A values

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('../../data/landing/player_stats_12-22_fry.csv')

  data = pd.read_csv('../../data/landing/player_stats_12-22_fry.csv')


In [3]:
data

Unnamed: 0.1,Unnamed: 0,venue_name,match_id,match_home_team,match_away_team,match_date,match_local_time,match_attendance,match_round,match_home_team_goals,...,marks_on_lead,pressure_acts,rating_points,ruck_contests,score_launches,shots_at_goal,spoils,subbed,player_position,date
0,1,ANZ Stadium,13960,Greater Western Sydney,Sydney,2012-03-24,19:20:00,38203,1,5,...,1.0,15.0,11.6,0.0,2.0,1.0,1.0,Not Subbed,R,2012-03-24
1,2,ANZ Stadium,13960,Greater Western Sydney,Sydney,2012-03-24,19:20:00,38203,1,5,...,2.0,12.0,9.8,0.0,1.0,2.0,1.0,Not Subbed,CHF,2012-03-24
2,3,ANZ Stadium,13960,Greater Western Sydney,Sydney,2012-03-24,19:20:00,38203,1,5,...,0.0,15.0,12.4,0.0,1.0,0.0,3.0,Not Subbed,WR,2012-03-24
3,4,ANZ Stadium,13960,Greater Western Sydney,Sydney,2012-03-24,19:20:00,38203,1,5,...,0.0,23.0,15.9,0.0,2.0,1.0,0.0,Not Subbed,RR,2012-03-24
4,5,ANZ Stadium,13960,Greater Western Sydney,Sydney,2012-03-24,19:20:00,38203,1,5,...,0.0,7.0,16.0,0.0,2.0,0.0,3.0,Not Subbed,CHB,2012-03-24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98987,98988,MCG,16355,Geelong,Sydney,2022-09-24,14:30:00,0,Grand Final,20,...,0.0,22.0,29.9,0.0,1.0,3.0,0.0,,INT,2022-09-24
98988,98989,MCG,16355,Geelong,Sydney,2022-09-24,14:30:00,0,Grand Final,20,...,1.0,19.0,20.0,0.0,1.0,2.0,1.0,,HFFL,2022-09-24
98989,98990,MCG,16355,Geelong,Sydney,2022-09-24,14:30:00,0,Grand Final,20,...,1.0,16.0,5.8,0.0,0.0,0.0,3.0,,INT,2022-09-24
98990,98991,MCG,16355,Geelong,Sydney,2022-09-24,14:30:00,0,Grand Final,20,...,0.0,4.0,2.4,0.0,0.0,0.0,2.0,,SUB,2022-09-24


In [4]:
# filter out any finals games
# round numbers appear as strings in some games, and int in others,
# therfore we account for both:
round_numbers = [i for i in range(1, 24)] + [str(j) for j in range(1,24)]
data = data.query('match_round.isin(@round_numbers)')

In [5]:
data.columns

Index(['Unnamed: 0', 'venue_name', 'match_id', 'match_home_team',
       'match_away_team', 'match_date', 'match_local_time', 'match_attendance',
       'match_round', 'match_home_team_goals', 'match_home_team_behinds',
       'match_home_team_score', 'match_away_team_goals',
       'match_away_team_behinds', 'match_away_team_score', 'match_margin',
       'match_winner', 'match_weather_temp_c', 'match_weather_type',
       'player_id', 'player_first_name', 'player_last_name',
       'player_height_cm', 'player_weight_kg', 'player_is_retired',
       'player_team', 'guernsey_number', 'kicks', 'marks', 'handballs',
       'disposals', 'effective_disposals', 'disposal_efficiency_percentage',
       'goals', 'behinds', 'hitouts', 'tackles', 'rebounds', 'inside_fifties',
       'clearances', 'clangers', 'free_kicks_for', 'free_kicks_against',
       'brownlow_votes', 'contested_possessions', 'uncontested_possessions',
       'contested_marks', 'marks_inside_fifty', 'one_percenters', 'bounc

In [9]:
data.isna().sum().sort_values(ascending=False)[:25]

player_weight_kg           25238
player_height_cm           25238
supercoach_score           24948
subbed                     24552
player_is_retired           3800
contest_def_losses           484
hitouts_to_advantage         484
contest_def_one_on_ones      484
contest_off_one_on_ones      484
contest_off_wins             484
effective_kicks              484
f50_ground_ball_gets         484
ground_ball_gets             484
def_half_pressure_acts       484
hitout_win_percentage        484
ruck_contests                484
intercept_marks              484
score_launches               484
spoils                       484
pressure_acts                484
marks_on_lead                484
afl_fantasy_score            395
shots_at_goal                264
tackles_inside_fifty         264
intercepts                   264
dtype: int64

In [10]:
drop_list = [
    # features irrelevant to model:
    'Unnamed: 0', 'match_local_time', 'venue_name', 'match_attendance',
    'match_weather_temp_c', 'match_weather_type',
    'player_height_cm', 'player_weight_kg', 'player_is_retired','guernsey_number',
    'subbed', 'date',
    # too many n/a values, supercoach scores will be sourced from another dataset:
    'supercoach_score'
]

In [11]:
# can get effective disposals from disposals and effective percentage as neither contains n/a
data['effective_disposals'] = round(data['disposal_efficiency_percentage'] / 100 * data['disposals'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['effective_disposals'] = round(data['disposal_efficiency_percentage'] / 100 * data['disposals'])


In [12]:
df = data.drop(columns=drop_list)

In [13]:
df

Unnamed: 0,match_id,match_home_team,match_away_team,match_date,match_round,match_home_team_goals,match_home_team_behinds,match_home_team_score,match_away_team_goals,match_away_team_behinds,...,hitout_win_percentage,intercept_marks,marks_on_lead,pressure_acts,rating_points,ruck_contests,score_launches,shots_at_goal,spoils,player_position
0,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,0.0,0.0,1.0,15.0,11.6,0.0,2.0,1.0,1.0,R
1,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,0.0,0.0,2.0,12.0,9.8,0.0,1.0,2.0,1.0,CHF
2,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,0.0,3.0,0.0,15.0,12.4,0.0,1.0,0.0,3.0,WR
3,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,0.0,1.0,0.0,23.0,15.9,0.0,2.0,1.0,0.0,RR
4,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,0.0,8.0,0.0,7.0,16.0,0.0,2.0,0.0,3.0,CHB
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98573,16346,St Kilda,Sydney,2022-08-21,23,11,8,74,13,10,...,0.0,0.0,1.0,8.0,5.6,0.0,1.0,3.0,0.0,INT
98574,16346,St Kilda,Sydney,2022-08-21,23,11,8,74,13,10,...,0.0,1.0,0.0,21.0,6.1,0.0,1.0,1.0,0.0,INT
98575,16346,St Kilda,Sydney,2022-08-21,23,11,8,74,13,10,...,0.0,0.0,0.0,23.0,5.8,0.0,0.0,2.0,1.0,FPR
98576,16346,St Kilda,Sydney,2022-08-21,23,11,8,74,13,10,...,0.0,0.0,3.0,13.0,2.3,0.0,2.0,2.0,0.0,WR


In [16]:
df.to_csv('../../data/raw/stats_12-22.csv', index=False)