# Preprocessing Coaches Votes

In this notebook we clean up the data containing the coaches' votes from each game.
This includes mapping each game and player to their respective `match_id` and `player_id`, so that the votes can be joined
onto a future dataframe using those IDs as the primary key.

In [1]:
import pandas as pd

In [2]:
# Load the raw coaches-votes CSV from the landing folder.
votes_csv_path = '../../data/landing/coaches_votes_12-22.csv'
data = pd.read_csv(votes_csv_path)

In [3]:
# Preview the raw frame exactly as loaded (the bare last expression is rendered as the cell output).
data

Unnamed: 0.1,Unnamed: 0,Season,Round,Home.Team,Away.Team,Player.Name,Coaches.Votes
0,1.1...1,2012,1,GWS Giants,Sydney Swans,Josh Kennedy (SYD),10.0
1,1.2...2,2012,1,GWS Giants,Sydney Swans,Ted Richards (SYD),7.0
2,1.3...3,2012,1,GWS Giants,Sydney Swans,Kieren Jack (SYD),6.0
3,1.4...4,2012,1,GWS Giants,Sydney Swans,Craig Bird (SYD),3.0
4,1.5...5,2012,1,GWS Giants,Sydney Swans,Chad Cornes (GWS),2.0
...,...,...,...,...,...,...,...
14456,32.2...14457,2022,27,Geelong Cats,Sydney Swans,Patrick Dangerfield (GEEL),12.0
14457,32.3...14458,2022,27,Geelong Cats,Sydney Swans,Tom Hawkins (GEEL),9.0
14458,32.4...14459,2022,27,Geelong Cats,Sydney Swans,Tyson Stengle (GEEL),6.0
14459,32.5...14460,2022,27,Geelong Cats,Sydney Swans,Mark Blicavs (GEEL),1.5


In [4]:
# Remove the 'Unnamed: 0' column that was read in from the CSV.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
data = data.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
# First step: tidy the column names.
# Print one "'Old.Name': 'old_name'," line per column, i.e. the original name next to
# its lower-case form with dots replaced by underscores.
for original_name in data.columns:
    snake_name = original_name.lower().replace('.', '_')
    print(f"'{original_name}': '{snake_name}',")

'Season': 'season',
'Round': 'round',
'Home.Team': 'home_team',
'Away.Team': 'away_team',
'Player.Name': 'player_name',
'Coaches.Votes': 'coaches_votes',


In [6]:
# Rename to snake_case; the round and team columns additionally get a "match_"
# prefix for future df merges.
column_renames = {
    'Season': 'season',
    'Round': 'match_round',
    'Home.Team': 'match_home_team',
    'Away.Team': 'match_away_team',
    'Player.Name': 'player_name',
    'Coaches.Votes': 'coaches_votes',
}
data = data.rename(columns=column_renames)

In [7]:
# also filter out finals games
# Only rounds 1-23 are kept; every higher round number is dropped.
# NOTE(review): this assumes all seasons in the file have 23 regular rounds --
# confirm for any shortened season, whose finals could fall inside this range.
match_round = list(range(1, 24))
# .copy() makes the filtered frame independent of the original, so that adding
# columns to `data` later does not raise SettingWithCopyWarning.
data = data.query('match_round.isin(@match_round)').copy()

In [8]:
# much nicer
# Show the first rows to check the renamed columns.
data.head()

Unnamed: 0,season,match_round,match_home_team,match_away_team,player_name,coaches_votes
0,2012,1,GWS Giants,Sydney Swans,Josh Kennedy (SYD),10.0
1,2012,1,GWS Giants,Sydney Swans,Ted Richards (SYD),7.0
2,2012,1,GWS Giants,Sydney Swans,Kieren Jack (SYD),6.0
3,2012,1,GWS Giants,Sydney Swans,Craig Bird (SYD),3.0
4,2012,1,GWS Giants,Sydney Swans,Chad Cornes (GWS),2.0


In [9]:
# Load the cleaned stats table; it carries the match_id / player_id columns
# that the votes are mapped onto below.
stats_parquet_path = '../../data/raw/cleaned_stats_12-22_fixed_bv'
match_id = pd.read_parquet(stats_parquet_path)

In [10]:
# Keep only the identifiers and the fields needed to line rows up with the
# coaches-votes data (teams, round, season, player name parts).
# .copy() detaches the subset from the full stats frame, so adding a column to
# `match_id` later does not raise SettingWithCopyWarning.
match_id = match_id[['match_id', 'match_home_team', 'match_away_team',
                     'match_round', 'season', 'match_date',
                     'player_id', 'player_first_name', 'player_last_name']].copy()

In [11]:
# Inspect the first rows of the trimmed stats frame.
match_id.head()

Unnamed: 0,match_id,match_home_team,match_away_team,match_round,season,match_date,player_id,player_first_name,player_last_name
0,13960,Greater Western Sydney,Sydney,1,2012,2012-03-24,10822,James,McDonald
1,13960,Greater Western Sydney,Sydney,1,2012,2012-03-24,10942,Adam,Goodes
2,13960,Greater Western Sydney,Sydney,1,2012,2012-03-24,10973,Chad,Cornes
3,13960,Greater Western Sydney,Sydney,1,2012,2012-03-24,10988,Jude,Bolton
4,13960,Greater Western Sydney,Sydney,1,2012,2012-03-24,11183,Ted,Richards


In [12]:
# Compare team spellings between the two sources (home-team column only).
old_teams = data['match_home_team'].unique()      # spellings in the coaches-votes data
new_teams = match_id['match_home_team'].unique()  # spellings in the stats data
# Names that appear in the votes data but not in the stats data.
votes_only_teams = set(old_teams) - set(new_teams)
list(votes_only_teams)

['West Coast Eagles',
 'GWS Giants',
 'Geelong Cats',
 'Gold Coast Suns',
 'Sydney Swans',
 'Adelaide Crows']

In [13]:
# The reverse direction: team names found in the stats data but not in the votes data.
list(set(new_teams) - set(old_teams))

['Adelaide',
 'Greater Western Sydney',
 'Gold Coast',
 'West Coast',
 'Geelong',
 'Sydney']

In [14]:
# player_name has the form "Josh Kennedy (SYD)". Split it into two new columns.
# assign() builds a new frame instead of writing into `data`, which was produced
# by a row filter; this avoids SettingWithCopyWarning and keeps the cell re-runnable.
data = data.assign(
    # player name without the team: everything before the first " ("
    player=data['player_name'].str.extract(r"(.*?) \(", expand=False),
    # player team: the text inside the first pair of parentheses
    player_team=data['player_name'].str.extract(r"\((.*?)\)", expand=False),
)

In [15]:
# List the distinct team codes extracted from the player_name column.
data.player_team.unique()

array(['SYD', 'GWS', 'CARL', 'RICH', 'COLL', 'HAW', 'BL', 'MELB', 'ADEL',
       'GCFC', 'FRE', 'GEEL', 'ESS', 'NMFC', 'WCE', 'WB', 'PORT', 'STK'],
      dtype=object)

In [16]:
# Team names as spelled in the stats data (target spelling for the mapping below).
new_teams

<StringArray>
['Greater Western Sydney',               'Richmond',               'Hawthorn',
              'Melbourne',             'Gold Coast',        'North Melbourne',
              'Fremantle',       'Western Bulldogs',          'Port Adelaide',
         'Brisbane Lions',               'Essendon',                 'Sydney',
             'West Coast',               'Adelaide',            'Collingwood',
               'St Kilda',                'Geelong',                'Carlton']
Length: 18, dtype: string

In [17]:
# Team names as written in the coaches-votes match columns -> spelling used by
# the stats data. Only the clubs whose spelling differs are listed.
full_name_fixes = {
    'GWS Giants': 'Greater Western Sydney',
    'Sydney Swans': 'Sydney',
    'West Coast Eagles': 'West Coast',
    'Gold Coast Suns': 'Gold Coast',
    'Geelong Cats': 'Geelong',
    'Adelaide Crows': 'Adelaide',
}

# Shortened team names (the codes in brackets after a player's name) -> long team names.
team_code_names = {
    'SYD': 'Sydney',
    'GWS': 'Greater Western Sydney',
    'CARL': 'Carlton',
    'RICH': 'Richmond',
    'COLL': 'Collingwood',
    'HAW': 'Hawthorn',
    'BL': 'Brisbane Lions',
    'MELB': 'Melbourne',
    'ADEL': 'Adelaide',
    'GCFC': 'Gold Coast',
    'FRE': 'Fremantle',
    'GEEL': 'Geelong',
    'ESS': 'Essendon',
    'NMFC': 'North Melbourne',
    'WCE': 'West Coast',
    'WB': 'Western Bulldogs',
    'PORT': 'Port Adelaide',
    'STK': 'St Kilda',
}

# One combined lookup: full-name fixes first, then the team codes.
team_dict = {**full_name_fixes, **team_code_names}

In [18]:
# Rewrite the team spellings in the three team columns using team_dict.
# The result is reassigned rather than applied with inplace=True: `data` was
# produced by a row filter, and in-place edits on such a frame can raise
# SettingWithCopyWarning.
team_columns = ['match_home_team', 'match_away_team', 'player_team']
data = data.replace({column: team_dict for column in team_columns})

In [19]:
# Check the first rows after the team names were replaced.
data.head()

Unnamed: 0,season,match_round,match_home_team,match_away_team,player_name,coaches_votes,player,player_team
0,2012,1,Greater Western Sydney,Sydney,Josh Kennedy (SYD),10.0,Josh Kennedy,Sydney
1,2012,1,Greater Western Sydney,Sydney,Ted Richards (SYD),7.0,Ted Richards,Sydney
2,2012,1,Greater Western Sydney,Sydney,Kieren Jack (SYD),6.0,Kieren Jack,Sydney
3,2012,1,Greater Western Sydney,Sydney,Craig Bird (SYD),3.0,Craig Bird,Sydney
4,2012,1,Greater Western Sydney,Sydney,Chad Cornes (GWS),2.0,Chad Cornes,Greater Western Sydney


In [20]:
# Build a "First Last" name column to match the `player` column of the votes data.
# assign() returns a new frame instead of writing into `match_id`, which is a
# column subset of the stats table; this avoids SettingWithCopyWarning.
match_id = match_id.assign(
    player=match_id['player_first_name'] + ' ' + match_id['player_last_name']
)

In [21]:
# Confirm the combined `player` column was added.
match_id.head()

Unnamed: 0,match_id,match_home_team,match_away_team,match_round,season,match_date,player_id,player_first_name,player_last_name,player
0,13960,Greater Western Sydney,Sydney,1,2012,2012-03-24,10822,James,McDonald,James McDonald
1,13960,Greater Western Sydney,Sydney,1,2012,2012-03-24,10942,Adam,Goodes,Adam Goodes
2,13960,Greater Western Sydney,Sydney,1,2012,2012-03-24,10973,Chad,Cornes,Chad Cornes
3,13960,Greater Western Sydney,Sydney,1,2012,2012-03-24,10988,Jude,Bolton,Jude Bolton
4,13960,Greater Western Sydney,Sydney,1,2012,2012-03-24,11183,Ted,Richards,Ted Richards


In [22]:
# Attach the coaches votes to every player row of the stats data. The join is
# keyed on the match (home team, away team, round, season) plus the player name;
# how='left' keeps every stats row, leaving NaN where a player received no votes.
# NOTE(review): player_team is not part of the key, so two players with the same
# name in the same match would both receive the votes -- confirm this cannot occur.
merge_keys = ['match_home_team', 'match_away_team', 'match_round', 'season', 'player']
df = pd.merge(match_id, data, how='left', on=merge_keys)

# A left join silently discards vote rows that find no partner (for example a
# name spelled differently in the two sources). Count them so that any loss of
# votes is visible instead of going unnoticed.
vote_match_check = data.merge(
    match_id[merge_keys].drop_duplicates(), how='left', on=merge_keys, indicator=True
)
unmatched_votes = vote_match_check[vote_match_check['_merge'] == 'left_only']
print(f"{len(unmatched_votes)} of {len(data)} coaches-vote rows matched no player row")

In [23]:
# Display the merged frame; the vote columns are NaN for rows that had no match in the votes data.
df

Unnamed: 0,match_id,match_home_team,match_away_team,match_round,season,match_date,player_id,player_first_name,player_last_name,player,player_name,coaches_votes,player_team
0,13960,Greater Western Sydney,Sydney,1,2012,2012-03-24,10822,James,McDonald,James McDonald,,,
1,13960,Greater Western Sydney,Sydney,1,2012,2012-03-24,10942,Adam,Goodes,Adam Goodes,,,
2,13960,Greater Western Sydney,Sydney,1,2012,2012-03-24,10973,Chad,Cornes,Chad Cornes,Chad Cornes (GWS),2.0,Greater Western Sydney
3,13960,Greater Western Sydney,Sydney,1,2012,2012-03-24,10988,Jude,Bolton,Jude Bolton,,,
4,13960,Greater Western Sydney,Sydney,1,2012,2012-03-24,11183,Ted,Richards,Ted Richards,Ted Richards (SYD),7.0,Sydney
...,...,...,...,...,...,...,...,...,...,...,...,...,...
94111,16346,St Kilda,Sydney,23,2022,2022-08-21,12863,Logan,McDonald,Logan McDonald,,,
94112,16346,St Kilda,Sydney,23,2022,2022-08-21,12864,Errol,Gulden,Errol Gulden,,,
94113,16346,St Kilda,Sydney,23,2022,2022-08-21,12945,Mitch,Owens,Mitch Owens,,,
94114,16346,St Kilda,Sydney,23,2022,2022-08-21,12947,Nasiah,Wanganeen-Milera,Nasiah Wanganeen-Milera,,,


In [24]:
# Reduce to the two ids plus the votes.
cols_keep = ['match_id', 'player_id', 'coaches_votes']
# Rows without a vote entry are NaN after the left join and count as 0 votes.
# Only coaches_votes is filled: a blanket fillna(0) would also turn a missing
# match_id / player_id into the id 0 and hide the problem.
final_df = df[cols_keep].fillna({'coaches_votes': 0})

In [25]:
# Display the final (match_id, player_id, coaches_votes) table before it is written out.
final_df

Unnamed: 0,match_id,player_id,coaches_votes
0,13960,10822,0.0
1,13960,10942,0.0
2,13960,10973,2.0
3,13960,10988,0.0
4,13960,11183,7.0
...,...,...,...
94111,16346,12863,0.0
94112,16346,12864,0.0
94113,16346,12945,0.0
94114,16346,12947,0.0


In [26]:
# Save the per-player, per-match votes table as parquet in the raw data folder.
votes_output_path = '../../data/raw/coaches_votes.parquet'
final_df.to_parquet(votes_output_path)