## Preprocessing Footytables Data

In this notebook we preprocess the data collected from footy tables as it contains the brownlow votes from 2022.

The aim is to derive the columns of interest and merge it with our original dataframe so that we have the brownlow votes information for all seasons.

Unfortunately there is no nice key to merge the dataframes on and when merging via the first and last name of players, some the result was not as expected. This was because of the represenation of some names that included, two words like "De Goey" or "De Koning", names with apostrophes like "O'hallaron" and names with an initial like "Josh P. Kennedy". There is also the case where names such as "Thomas" were shortened to "Tom"

So the names had to transformed to the same format so that they could be merged as intended.

In [1]:
import pandas as pd

In [56]:
# load afl_tables dataset
df = pd.read_csv('../../data/landing/player_stats_22_tables.csv', index_col=0)

In [57]:
df.columns

Index(['Season', 'Round', 'Date', 'Local.start.time', 'Venue', 'Attendance',
       'Home.team', 'HQ1G', 'HQ1B', 'HQ2G', 'HQ2B', 'HQ3G', 'HQ3B', 'HQ4G',
       'HQ4B', 'Home.score', 'Away.team', 'AQ1G', 'AQ1B', 'AQ2G', 'AQ2B',
       'AQ3G', 'AQ3B', 'AQ4G', 'AQ4B', 'Away.score', 'First.name', 'Surname',
       'ID', 'Jumper.No.', 'Playing.for', 'Kicks', 'Marks', 'Handballs',
       'Goals', 'Behinds', 'Hit.Outs', 'Tackles', 'Rebounds', 'Inside.50s',
       'Clearances', 'Clangers', 'Frees.For', 'Frees.Against',
       'Brownlow.Votes', 'Contested.Possessions', 'Uncontested.Possessions',
       'Contested.Marks', 'Marks.Inside.50', 'One.Percenters', 'Bounces',
       'Goal.Assists', 'Time.on.Ground..', 'Substitute', 'Umpire.1',
       'Umpire.2', 'Umpire.3', 'Umpire.4', 'group_id'],
      dtype='object')

In [58]:
df

Unnamed: 0,Season,Round,Date,Local.start.time,Venue,Attendance,Home.team,HQ1G,HQ1B,HQ2G,...,One.Percenters,Bounces,Goal.Assists,Time.on.Ground..,Substitute,Umpire.1,Umpire.2,Umpire.3,Umpire.4,group_id
1,2022,1,2022-03-16,1910,M.C.G.,58002,Melbourne,4,5,6,...,2,1,0,55,,John Howorth,Rob Findlay,Jacob Mollison,,
2,2022,1,2022-03-16,1910,M.C.G.,58002,Melbourne,4,5,6,...,2,0,0,73,,John Howorth,Rob Findlay,Jacob Mollison,,
3,2022,1,2022-03-16,1910,M.C.G.,58002,Melbourne,4,5,6,...,1,0,0,83,,John Howorth,Rob Findlay,Jacob Mollison,,
4,2022,1,2022-03-16,1910,M.C.G.,58002,Melbourne,4,5,6,...,2,0,0,86,,John Howorth,Rob Findlay,Jacob Mollison,,
5,2022,1,2022-03-16,1910,M.C.G.,58002,Melbourne,4,5,6,...,0,0,1,81,,John Howorth,Rob Findlay,Jacob Mollison,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9518,2022,GF,2022-09-24,1430,M.C.G.,100024,Geelong,6,5,9,...,6,1,0,90,,Simon Meredith,Matt Stevic,Brendan Hosking,,
9519,2022,GF,2022-09-24,1430,M.C.G.,100024,Geelong,6,5,9,...,1,0,0,38,,Simon Meredith,Matt Stevic,Brendan Hosking,,
9520,2022,GF,2022-09-24,1430,M.C.G.,100024,Geelong,6,5,9,...,4,0,1,72,,Simon Meredith,Matt Stevic,Brendan Hosking,,
9521,2022,GF,2022-09-24,1430,M.C.G.,100024,Geelong,6,5,9,...,0,3,1,78,,Simon Meredith,Matt Stevic,Brendan Hosking,,


In [59]:
rounds = [str(i) for i in range(1,24)]
df = df.query('Round.isin(@rounds)')

In [60]:
keep_col = [
    'Home.team', 'Away.team', 'Round', 'First.name', 'Surname', 'Playing.for', 'Brownlow.Votes'
]

In [61]:
df = df[keep_col]

In [62]:
for col in keep_col:
    print(f'"{col}": "{col.lower().replace(".","_")}",')

"Home.team": "home_team",
"Away.team": "away_team",
"Round": "round",
"First.name": "first_name",
"Surname": "surname",
"Playing.for": "playing_for",
"Brownlow.Votes": "brownlow_votes",


In [63]:
df.rename(columns={
    "Home.team": "home_team",
    "Away.team": "away_team",
    "Round": "round_number",
    "First.name": "first_name",
    "Surname": "surname",
    "Playing.for": "player_team",
    "Brownlow.Votes": "brownlow_votes"
}, inplace=True)

In [64]:
df = df.astype({
    'round_number':'int'
})

In [65]:
df

Unnamed: 0,home_team,away_team,round_number,first_name,surname,player_team,brownlow_votes
1,Melbourne,Western Bulldogs,1,Toby,Bedford,Melbourne,0
2,Melbourne,Western Bulldogs,1,Jake,Bowey,Melbourne,0
3,Melbourne,Western Bulldogs,1,Angus,Brayshaw,Melbourne,0
4,Melbourne,Western Bulldogs,1,Ben,Brown,Melbourne,0
5,Melbourne,Western Bulldogs,1,Bayley,Fritsch,Melbourne,0
...,...,...,...,...,...,...,...
9104,St Kilda,Sydney,23,Sam,Reid,Sydney,0
9105,St Kilda,Sydney,23,James,Rowbottom,Sydney,0
9106,St Kilda,Sydney,23,Dylan,Stephens,Sydney,0
9107,St Kilda,Sydney,23,Chad,Warner,Sydney,0


In [66]:
# realistically we only need to merge rows where a player polled, and all remaining will be na
# which can be filled with .fillna(0)
polled = df.query('brownlow_votes > 0')

In [67]:
polled.head()

Unnamed: 0,home_team,away_team,round_number,first_name,surname,player_team,brownlow_votes
15,Melbourne,Western Bulldogs,1,Clayton,Oliver,Melbourne,1
16,Melbourne,Western Bulldogs,1,Christian,Petracca,Melbourne,3
36,Melbourne,Western Bulldogs,1,Jack,Macrae,Western Bulldogs,2
47,Carlton,Richmond,1,Adam,Cerra,Carlton,3
48,Carlton,Richmond,1,Patrick,Cripps,Carlton,1


In [68]:
# 3 players * 9 games * 22 rounds (exclude bye round)
len(polled)

594

In [70]:
players = pd.read_parquet('../../data/curated/player_information_12-22.parquet').query('season == 2022')

In [71]:
players

Unnamed: 0,season,player_id,player_team,no_teams,player_first_name,player_last_name,process_name
85008,2022,11904,Western Bulldogs,1,Tom,Liberatore,t liberatore
85009,2022,11945,Melbourne,2,Steven,May,s may
85010,2022,11972,Melbourne,1,Max,Gawn,m gawn
85011,2022,12015,Melbourne,1,Tom,McDonald,t mcdonald
85012,2022,12034,Melbourne,2,Adam,Tomlinson,a tomlinson
...,...,...,...,...,...,...,...
93832,2022,12939,North Melbourne,1,Charlie,Comben,c comben
93839,2022,13024,North Melbourne,1,Josh,Goater,j goater
93889,2022,11731,Essendon,1,Michael,Hurley,m hurley
94022,2022,13025,Hawthorn,1,Ned,Long,n long


In [77]:
# process_name and player_team together is a unique identifier
len(players.drop_duplicates(['process_name', 'player_team']))

683

In [78]:
def transform_name(row):
    
    """
    function to transform player name such that we take the first name initial and last name
    in the case of the 4 players where this is not unique (when paired with player team) we take the full first name
    puncuation is removed from player's names as well

    e.g. 
    1. Sam De Koning => s de koning
    2. Jaeger O'Meara => j omeara
    3. Jeremy Cameron => j cameron
    """

    first = row['first_name']
    lst = row['surname'].replace("'", "").split('-')


    if '-' in row['surname']:
        return_name = f'{first[0]} {lst[0][0]}-{lst[1]}'
        return return_name.lower()
    
    else:
        return_name = f'{first[0]} {lst[0]}'

        return return_name.lower()

In [79]:
polled['process_name'] = polled.apply(transform_name, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  polled['process_name'] = polled.apply(transform_name, axis=1)


In [81]:
new_df = pd.merge(polled, players, 
                  
        left_on=['process_name', 'player_team'],
        right_on=['process_name', 'player_team'], 
        how='left')

# 1 name represented incorrectly

len(new_df[new_df.isna().any(axis=1)])

1

In [85]:
new_df[new_df.isna().any(axis=1)]

Unnamed: 0,home_team,away_team,round_number,first_name,surname,player_team,brownlow_votes,process_name,season,player_id,no_teams,player_first_name,player_last_name
358,West Coast,Essendon,15,Willie,Rioli,West Coast,1,w rioli,,,,,


In [86]:
polled.query('surname.str.contains("Rioli")')

Unnamed: 0,home_team,away_team,round_number,first_name,surname,player_team,brownlow_votes,process_name
5489,West Coast,Essendon,15,Willie,Rioli,West Coast,1,w rioli


In [87]:
players.query('player_last_name.str.contains("Rioli")')

Unnamed: 0,season,player_id,player_team,no_teams,player_first_name,player_last_name,process_name
85069,2022,12409,Richmond,1,Daniel,Rioli,d rioli
85401,2022,12613,West Coast,1,Junior,Rioli,j rioli
87534,2022,12935,Richmond,1,Maurice,Rioli,m rioli


In [92]:
polled.loc[5489, ['first_name', 'process_name']] = 'Junior', 'j rioli'

In [139]:
# remerge datasets after altering player name
new_df = pd.merge(polled, players, 
                  
        left_on=['process_name', 'player_team'],
        right_on=['process_name', 'player_team'], 
        how='left')


len(new_df[new_df.isna().any(axis=1)])

0

In [140]:
# top 20 looks correct as compared to aftables website
new_df.groupby(['player_id', 'player_first_name', 'player_last_name'])['brownlow_votes'].sum().reset_index().sort_values('brownlow_votes', ascending=False)[:20]

Unnamed: 0,player_id,player_first_name,player_last_name,brownlow_votes
71,12269,Patrick,Cripps,29
42,12061,Lachie,Neale,28
83,12329,Touk,Miller,27
150,12596,Andrew,Brayshaw,25
99,12411,Clayton,Oliver,25
109,12437,Christian,Petracca,24
102,12418,Callum,Mills,21
36,12022,Jeremy,Cameron,19
27,11921,Dion,Prestia,19
68,12249,Zach,Merrett,17


In [141]:
new_df

Unnamed: 0,home_team,away_team,round_number,first_name,surname,player_team,brownlow_votes,process_name,season,player_id,no_teams,player_first_name,player_last_name
0,Melbourne,Western Bulldogs,1,Clayton,Oliver,Melbourne,1,c oliver,2022,12411,1,Clayton,Oliver
1,Melbourne,Western Bulldogs,1,Christian,Petracca,Melbourne,3,c petracca,2022,12437,1,Christian,Petracca
2,Melbourne,Western Bulldogs,1,Jack,Macrae,Western Bulldogs,2,j macrae,2022,12172,1,Jack,Macrae
3,Carlton,Richmond,1,Adam,Cerra,Carlton,3,a cerra,2022,12610,2,Adam,Cerra
4,Carlton,Richmond,1,Patrick,Cripps,Carlton,1,p cripps,2022,12269,1,Patrick,Cripps
...,...,...,...,...,...,...,...,...,...,...,...,...,...
589,Carlton,Collingwood,23,Patrick,Cripps,Carlton,3,p cripps,2022,12269,1,Patrick,Cripps
590,Carlton,Collingwood,23,Josh,Daicos,Collingwood,1,j daicos,2022,12582,1,Josh,Daicos
591,St Kilda,Sydney,23,Dan,Hannebery,St Kilda,3,d hannebery,2022,11787,2,Dan,Hannebery
592,St Kilda,Sydney,23,Ben,Long,St Kilda,2,b long,2022,12529,1,Ben,Long


In [142]:
new_df = new_df[['player_id', 'player_first_name', 'player_last_name', 'player_team', 'round_number', 'brownlow_votes']]

In [143]:
# will be used with 2023 data to get past polling perforamnce
new_df.to_parquet('../../data/curated/brownlow_votes_22.parquet')

In [147]:
new_df.drop(columns=['player_first_name', 'player_last_name', 'player_team'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df.drop(columns=['player_first_name', 'player_last_name', 'player_team'],inplace=True)


In [148]:
full_df = pd.read_csv('../../data/raw/stats_12-22_no_na.csv')

In [149]:
df_2022 = full_df.query('season == 2022').drop(columns='brownlow_votes')

In [150]:
df_2022 = pd.merge(df_2022, new_df,
         left_on=['player_id', 'match_round'],
         right_on=['player_id', 'round_number'],
         how='left').drop(columns='round_number')

In [151]:
# total players - players polled = n/a
# 22*9*46 - 22*9*3 = 8514 as expected
df_2022.isna().sum().sort_values(ascending=False)[:5]

brownlow_votes             8514
score_involvements            0
contest_def_one_on_ones       0
contest_def_losses            0
tackles_inside_fifty          0
dtype: int64

In [153]:
df_2022.fillna(0, inplace=True)

In [154]:
full_df = full_df.query('season < 2022')
full_df

Unnamed: 0,match_id,match_home_team,match_away_team,match_date,match_round,match_home_team_goals,match_home_team_behinds,match_home_team_score,match_away_team_goals,match_away_team_behinds,...,intercept_marks,marks_on_lead,pressure_acts,rating_points,ruck_contests,score_launches,shots_at_goal,spoils,player_position,season
0,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,0.0,1.0,15.0,11.6,0.0,2.0,1.0,1.0,R,2012
1,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,0.0,2.0,12.0,9.8,0.0,1.0,2.0,1.0,CHF,2012
2,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,3.0,0.0,15.0,12.4,0.0,1.0,0.0,3.0,WR,2012
3,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,1.0,0.0,23.0,15.9,0.0,2.0,1.0,0.0,RR,2012
4,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,8.0,0.0,7.0,16.0,0.0,2.0,0.0,3.0,CHB,2012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85003,16105,Adelaide,North Melbourne,2021-08-22,23,13,20,98,8,6,...,0.0,0.0,8.0,6.8,18.0,2.0,3.0,1.0,CHF,2021
85004,16105,Adelaide,North Melbourne,2021-08-22,23,13,20,98,8,6,...,0.0,1.0,15.0,3.2,0.0,0.0,1.0,0.0,FPL,2021
85005,16105,Adelaide,North Melbourne,2021-08-22,23,13,20,98,8,6,...,0.0,0.0,6.0,9.5,0.0,2.0,0.0,0.0,WL,2021
85006,16105,Adelaide,North Melbourne,2021-08-22,23,13,20,98,8,6,...,0.0,0.0,9.0,4.5,0.0,0.0,3.0,0.0,INT,2021


In [155]:
final_df = pd.concat([full_df, df_2022])

In [156]:
final_df

Unnamed: 0,match_id,match_home_team,match_away_team,match_date,match_round,match_home_team_goals,match_home_team_behinds,match_home_team_score,match_away_team_goals,match_away_team_behinds,...,intercept_marks,marks_on_lead,pressure_acts,rating_points,ruck_contests,score_launches,shots_at_goal,spoils,player_position,season
0,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,0.0,1.0,15.0,11.6,0.0,2.0,1.0,1.0,R,2012
1,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,0.0,2.0,12.0,9.8,0.0,1.0,2.0,1.0,CHF,2012
2,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,3.0,0.0,15.0,12.4,0.0,1.0,0.0,3.0,WR,2012
3,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,1.0,0.0,23.0,15.9,0.0,2.0,1.0,0.0,RR,2012
4,13960,Greater Western Sydney,Sydney,2012-03-24,1,5,7,37,14,16,...,8.0,0.0,7.0,16.0,0.0,2.0,0.0,3.0,CHB,2012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9103,16346,St Kilda,Sydney,2022-08-21,23,11,8,74,13,10,...,0.0,1.0,8.0,5.6,0.0,1.0,3.0,0.0,INT,2022
9104,16346,St Kilda,Sydney,2022-08-21,23,11,8,74,13,10,...,1.0,0.0,21.0,6.1,0.0,1.0,1.0,0.0,INT,2022
9105,16346,St Kilda,Sydney,2022-08-21,23,11,8,74,13,10,...,0.0,0.0,23.0,5.8,0.0,0.0,2.0,1.0,FPR,2022
9106,16346,St Kilda,Sydney,2022-08-21,23,11,8,74,13,10,...,0.0,3.0,13.0,2.3,0.0,2.0,2.0,0.0,WR,2022


In [157]:
# all columns merged succesfully and columns are same between both dataframes
final_df.isna().sum()

match_id           0
match_home_team    0
match_away_team    0
match_date         0
match_round        0
                  ..
score_launches     0
shots_at_goal      0
spoils             0
player_position    0
season             0
Length: 70, dtype: int64

In [158]:
final_df.to_parquet('../../data/raw/cleaned_stats_12-22_fixed_bv.parquet')