# Preprocessing Footywire

In this notebook we clean the dataset sourced from footywire.com

Footywire represents player names differently from the fryzigg dataset (e.g. Thomas Stewart vs. Tom Stewart), so the two cannot be joined on the raw names. We therefore derive a normalised name key that lets the player ids be joined to the match performance data.

In [3]:
import pandas as pd

In [2]:
# raw SuperCoach (SC) scores per player and match, sourced from footywire
SC = pd.read_parquet('../../data/raw/supercoach_12-22.parquet')

In [3]:
# number of whitespace-separated words in each player's name
SC['name_length'] = SC['Player'].map(lambda full_name: len(full_name.split()))

In [4]:
# one row per player, longest names first
(
    SC.drop_duplicates(subset='Player')
    .sort_values(by='name_length', ascending=False)
    .head(15)
)

Unnamed: 0,Player,Team,SC,match_id,season,match_round,name_length
60427,Tom De Koning,Carlton,58.0,15387.0,2018.0,22.0,3
23,Josh P. Kennedy,Sydney,137.0,13960.0,2012.0,1.0,3
202,Nathan Van Berlo,Adelaide,117.0,13964.0,2012.0,1.0,3
26311,Jordan De Goey,Collingwood,11.0,14584.0,2015.0,1.0,3
78384,Sam De Koning,Geelong,27.0,15919.0,2021.0,5.0,3
2903,Jay Van Berlo,Fremantle,20.0,14025.0,2012.0,8.0,3
34913,Callum Ah Chee,Gold Coast,58.0,14789.0,2016.0,1.0,3
19001,Dylan Van Unen,Essendon,40.0,14409.0,2014.0,4.0,3
227,Matthew De Boer,Fremantle,97.0,13966.0,2012.0,1.0,3
375,Nick Dal Santo,St Kilda,148.0,13968.0,2012.0,1.0,3


In [5]:
# only player to have a middle initial

# regex=False matches a literal full stop, so no backslash escape is needed
# (a "\." inside a normal, non-raw string is an invalid escape sequence)
has_full_stop = SC['Player'].str.contains('.', regex=False)

print(len(SC.loc[has_full_stop, 'Player'].unique()))
SC[has_full_stop].head()


1


Unnamed: 0,Player,Team,SC,match_id,season,match_round,name_length
23,Josh P. Kennedy,Sydney,137.0,13960.0,2012.0,1.0,3
484,Josh P. Kennedy,Sydney,139.0,13971.0,2012.0,2.0,3
904,Josh P. Kennedy,Sydney,113.0,13980.0,2012.0,3.0,3
1497,Josh P. Kennedy,Sydney,107.0,13994.0,2012.0,4.0,3
1871,Josh P. Kennedy,Sydney,173.0,14002.0,2012.0,5.0,3


In [6]:
# b ebert and l young are legitimate cases of 2 players on same team with the same initial surname combo

# m frederick name changed from Minairo to Michael in the dataset but it is the same person
# j mcinerney: there was 1 instance where the I in mcinerney was capitalised, hence counted as 2 different names

# for case of l young on WB and b ebert on PA: "initial surname" would be
# ambiguous within the team, so these players keep their full name
_FULL_NAME_PLAYERS = frozenset({'Lewis Young', 'Lachie Young', 'Brad Ebert', 'Brett Ebert'})


def transform_name(row):
    """Build the lower-case name key used to join a row to the player table.

    The key is the first initial followed by every remaining word of the
    name, with apostrophes removed:

    * ``Tom Stewart``     -> ``t stewart``
    * ``Jordan De Goey``  -> ``j de goey``
    * ``Josh P. Kennedy`` -> ``j kennedy`` (abbreviated middle initials are dropped)
    * names in ``_FULL_NAME_PLAYERS`` keep their full name, e.g. ``lewis young``

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide the key ``'Player'`` holding the player's name as a string.
        Other keys such as ``'Team'`` and ``'name_length'`` are not needed; the
        word count is taken from the name itself.

    Returns
    -------
    str or None
        The lower-cased key. A single-word name is returned lower-cased as is.
        ``None`` is returned only when the name contains no words at all.
    """
    player_name = row['Player'].replace("'", "")

    if player_name in _FULL_NAME_PLAYERS:
        return player_name.lower()

    words = player_name.split()
    if not words:
        # nothing to build a key from
        return None
    if len(words) == 1:
        # no separate first name to abbreviate
        return words[0].lower()

    first, *middle, last = words

    # for unique case of Josh P. Kennedy: an abbreviated middle initial is not
    # part of the surname, so it is left out of the key
    if '.' in player_name:
        middle = [word for word in middle if not word.endswith('.')]

    # cases like Jordan De Goey -> j de goey; works for any number of surname words
    return ' '.join([first[0], *middle, last]).lower()

In [7]:
# derive the join key for every row; axis=1 passes each row to transform_name
SC['process_name'] = SC.apply(transform_name, axis=1)

In [8]:
# spot-check one of the players that keeps his full name
SC.loc[SC['Player'].str.contains("Lewis Young")]

Unnamed: 0,Player,Team,SC,match_id,season,match_round,name_length,process_name
49796,Lewis Young,Western Bulldogs,67.0,15136.0,2017.0,17.0,2,lewis young
49998,Lewis Young,Western Bulldogs,79.0,15141.0,2017.0,18.0,2,lewis young
50532,Lewis Young,Western Bulldogs,48.0,15153.0,2017.0,19.0,2,lewis young
50813,Lewis Young,Western Bulldogs,52.0,15159.0,2017.0,20.0,2,lewis young
51057,Lewis Young,Western Bulldogs,23.0,15165.0,2017.0,21.0,2,lewis young
51498,Lewis Young,Western Bulldogs,46.0,15175.0,2017.0,22.0,2,lewis young
51874,Lewis Young,Western Bulldogs,23.0,15183.0,2017.0,23.0,2,lewis young
54668,Lewis Young,Western Bulldogs,69.0,15256.0,2018.0,7.0,2,lewis young
55240,Lewis Young,Western Bulldogs,62.0,15269.0,2018.0,8.0,2,lewis young
61686,Lewis Young,Western Bulldogs,35.0,15424.0,2019.0,2.0,2,lewis young


In [9]:
# curated player lookup (player_id, team, season and the matching process_name key)
players = pd.read_parquet('../../data/curated/player_information_12-22.parquet')

In [10]:
# attach player ids by matching processed name, team and season;
# a left join keeps SC rows that find no match so they can be fixed by hand
merge = SC.merge(
    players,
    how='left',
    left_on=['process_name', 'Team', 'season'],
    right_on=['process_name', 'player_team', 'season'],
)

In [11]:
# one example row for every player with missing values after the merge
merge.loc[merge.isna().any(axis='columns')].drop_duplicates(subset='Player')

Unnamed: 0,Player,Team,SC,match_id,season,match_round,name_length,process_name,player_id,player_team,no_teams,player_first_name,player_last_name
17598,Jay K-Harris,Melbourne,24.0,14381.0,2014.0,1.0,2,j k-harris,,,,,
17727,Angus Litherland,Hawthorn,27.0,14384.0,2014.0,2.0,2,a litherland,,,,,
48085,Josh D-Cardillo,Fremantle,33.0,15108.0,2017.0,14.0,2,j d-cardillo,,,,,
52447,Willie Rioli,West Coast,31.0,15216.0,2018.0,2.0,2,w rioli,,,,,
66696,Ian Hill,Greater Western Sydney,60.0,15549.0,2019.0,17.0,2,i hill,,,,,
76003,James Jordan,Melbourne,78.0,15877.0,2021.0,1.0,2,j jordan,,,,,


Names to Fix:

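J K-Harris: footywire abbreviates the surname with a hyphen, while the other dataframe stores it in full as "Kennedy Harris" (no hyphen), so the processed names did not match.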

A Litherland: his surname changed from Litherland to Dewar when he returned to the AFL, and the other dataset applies the new name to his earlier records as well.

J D-Cardillo is written as J Deluca in other dataset

Willie and Ian go by Junior and Bobby respectively in the other dataset.

J Jordon: written as Jordan in this dataset for roughly his first eight career games.

In [12]:
# a left merge returns more rows than SC only if a name/team/season key matches
# several player records, so an unchanged row count means no double ups
# => the method of processing names to then be joined via name, season, team was successful
assert len(merge) == len(SC), 'merge duplicated rows: a name/team/season key matched more than one player'
merge

Unnamed: 0,Player,Team,SC,match_id,season,match_round,name_length,process_name,player_id,player_team,no_teams,player_first_name,player_last_name
0,Adam Kennedy,Greater Western Sydney,103.0,13960.0,2012.0,1.0,2,a kennedy,12030.0,Greater Western Sydney,1.0,Adam,Kennedy
1,Toby Greene,Greater Western Sydney,84.0,13960.0,2012.0,1.0,2,t greene,12026.0,Greater Western Sydney,1.0,Toby,Greene
2,Tomas Bugg,Greater Western Sydney,87.0,13960.0,2012.0,1.0,2,t bugg,12021.0,Greater Western Sydney,2.0,Tomas,Bugg
3,Rhys Palmer,Greater Western Sydney,126.0,13960.0,2012.0,1.0,2,r palmer,11647.0,Greater Western Sydney,2.0,Rhys,Palmer
4,Chad Cornes,Greater Western Sydney,104.0,13960.0,2012.0,1.0,2,c cornes,10973.0,Greater Western Sydney,1.0,Chad,Cornes
...,...,...,...,...,...,...,...,...,...,...,...,...,...
93738,Ryan Clarke,Sydney,51.0,16346.0,2022.0,23.0,2,r clarke,12469.0,Sydney,2.0,Ryan,Clarke
93739,Sam Wicks,Sydney,44.0,16346.0,2022.0,23.0,2,s wicks,12820.0,Sydney,1.0,Sam,Wicks
93740,Patrick McCartin,Sydney,52.0,16346.0,2022.0,23.0,2,p mccartin,12343.0,Sydney,2.0,Paddy,McCartin
93741,Tom Papley,Sydney,37.0,16346.0,2022.0,23.0,2,t papley,12419.0,Sydney,1.0,Tom,Papley


# J K-Harris

In [13]:
# how the player table records this player
players.loc[players['process_name'].str.contains("kennedy harris")]

Unnamed: 0,season,player_id,player_team,no_teams,player_first_name,player_last_name,process_name
17599,2014,12259,Melbourne,1,Jay,Kennedy Harris,j kennedy harris
25949,2015,12259,Melbourne,1,Jay,Kennedy Harris,j kennedy harris
44949,2017,12259,Melbourne,1,Jay,Kennedy Harris,j kennedy harris
57835,2018,12259,Melbourne,1,Jay,Kennedy Harris,j kennedy harris
64656,2019,12259,Melbourne,1,Jay,Kennedy Harris,j kennedy harris


In [14]:
# the corresponding rows in the merged frame
merge.loc[merge['process_name'].str.contains("k-harris")]

Unnamed: 0,Player,Team,SC,match_id,season,match_round,name_length,process_name,player_id,player_team,no_teams,player_first_name,player_last_name
17598,Jay K-Harris,Melbourne,24.0,14381.0,2014.0,1.0,2,j k-harris,,,,,
17971,Jay K-Harris,Melbourne,26.0,14390.0,2014.0,2.0,2,j k-harris,,,,,
18346,Jay K-Harris,Melbourne,12.0,14398.0,2014.0,3.0,2,j k-harris,,,,,
18517,Jay K-Harris,Melbourne,49.0,14402.0,2014.0,4.0,2,j k-harris,,,,,
19109,Jay K-Harris,Melbourne,59.0,14416.0,2014.0,5.0,2,j k-harris,,,,,
19456,Jay K-Harris,Melbourne,71.0,14424.0,2014.0,6.0,2,j k-harris,,,,,
19780,Jay K-Harris,Melbourne,112.0,14431.0,2014.0,7.0,2,j k-harris,,,,,
20167,Jay K-Harris,Melbourne,64.0,14440.0,2014.0,8.0,2,j k-harris,,,,,
20415,Jay K-Harris,Melbourne,25.0,14445.0,2014.0,9.0,2,j k-harris,,,,,
20867,Jay K-Harris,Melbourne,74.0,14457.0,2014.0,11.0,2,j k-harris,,,,,


In [15]:
# match the exact processed name, and only rows the merge left without an id,
# so no other player's id can be overwritten
needs_id = merge['process_name'].eq('j k-harris') & merge['player_id'].isna()
merge.loc[needs_id, 'player_id'] = 12259

# A Litherland

In [16]:
# how the player table records this player
players.loc[players['process_name'].str.contains("dewar")]

Unnamed: 0,season,player_id,player_team,no_teams,player_first_name,player_last_name,process_name
17731,2014,12261,Hawthorn,2,Angus,Dewar,a dewar
28330,2015,12261,Hawthorn,2,Angus,Dewar,a dewar
34796,2016,12261,Hawthorn,2,Angus,Dewar,a dewar
85708,2022,12261,West Coast,2,Angus,Dewar,a dewar


In [17]:
# the corresponding rows in the merged frame
merge.loc[merge['process_name'].str.contains("litherland")]

Unnamed: 0,Player,Team,SC,match_id,season,match_round,name_length,process_name,player_id,player_team,no_teams,player_first_name,player_last_name
17727,Angus Litherland,Hawthorn,27.0,14384.0,2014.0,2.0,2,a litherland,,,,,
18061,Angus Litherland,Hawthorn,9.0,14392.0,2014.0,3.0,2,a litherland,,,,,
19687,Angus Litherland,Hawthorn,42.0,14429.0,2014.0,7.0,2,a litherland,,,,,
21084,Angus Litherland,Hawthorn,115.0,14462.0,2014.0,11.0,2,a litherland,,,,,
21220,Angus Litherland,Hawthorn,59.0,14465.0,2014.0,12.0,2,a litherland,,,,,
21600,Angus Litherland,Hawthorn,39.0,14473.0,2014.0,13.0,2,a litherland,,,,,
22058,Angus Litherland,Hawthorn,79.0,14484.0,2014.0,14.0,2,a litherland,,,,,
22414,Angus Litherland,Hawthorn,75.0,14492.0,2014.0,15.0,2,a litherland,,,,,
22787,Angus Litherland,Hawthorn,57.0,14500.0,2014.0,16.0,2,a litherland,,,,,
23182,Angus Litherland,Hawthorn,48.0,14509.0,2014.0,17.0,2,a litherland,,,,,


In [18]:
# match the exact processed name, and only rows the merge left without an id,
# so no other player's id can be overwritten
needs_id = merge['process_name'].eq('a litherland') & merge['player_id'].isna()
merge.loc[needs_id, 'player_id'] = 12261

# J D-Cardillo

In [19]:
# how the player table records this player
players.loc[players['process_name'].str.contains("deluca")]

Unnamed: 0,season,player_id,player_team,no_teams,player_first_name,player_last_name,process_name
48091,2017,12556,Fremantle,2,Josh,Deluca,j deluca
66912,2019,12556,Carlton,2,Josh,Deluca,j deluca


In [20]:
# the corresponding rows in the merged frame
merge.loc[merge['process_name'].str.contains("cardillo")]

Unnamed: 0,Player,Team,SC,match_id,season,match_round,name_length,process_name,player_id,player_team,no_teams,player_first_name,player_last_name
48085,Josh D-Cardillo,Fremantle,33.0,15108.0,2017.0,14.0,2,j d-cardillo,,,,,
49606,Josh D-Cardillo,Fremantle,40.0,15143.0,2017.0,18.0,2,j d-cardillo,,,,,
50626,Josh D-Cardillo,Fremantle,67.0,15166.0,2017.0,21.0,2,j d-cardillo,,,,,
51317,Josh D-Cardillo,Fremantle,50.0,15182.0,2017.0,22.0,2,j d-cardillo,,,,,


In [21]:
# match the exact processed name, and only rows the merge left without an id,
# so no other player's id can be overwritten
needs_id = merge['process_name'].eq('j d-cardillo') & merge['player_id'].isna()
merge.loc[needs_id, 'player_id'] = 12556

# W Rioli, I Hill

In [22]:
# how the player table records this player
players.loc[players['process_name'].str.contains("j rioli")]

Unnamed: 0,season,player_id,player_team,no_teams,player_first_name,player_last_name,process_name
52447,2018,12613,West Coast,1,Junior,Rioli,j rioli
63661,2019,12613,West Coast,1,Junior,Rioli,j rioli
85401,2022,12613,West Coast,1,Junior,Rioli,j rioli


In [23]:
# the corresponding rows in the merged frame
merge.loc[merge['process_name'].str.contains("w rioli")]

Unnamed: 0,Player,Team,SC,match_id,season,match_round,name_length,process_name,player_id,player_team,no_teams,player_first_name,player_last_name
52447,Willie Rioli,West Coast,31.0,15216.0,2018.0,2.0,2,w rioli,,,,,
52902,Willie Rioli,West Coast,40.0,15227.0,2018.0,3.0,2,w rioli,,,,,
53173,Willie Rioli,West Coast,48.0,15233.0,2018.0,4.0,2,w rioli,,,,,
53447,Willie Rioli,West Coast,78.0,15239.0,2018.0,5.0,2,w rioli,,,,,
54115,Willie Rioli,West Coast,46.0,15254.0,2018.0,6.0,2,w rioli,,,,,
54264,Willie Rioli,West Coast,109.0,15258.0,2018.0,7.0,2,w rioli,,,,,
54601,Willie Rioli,West Coast,69.0,15265.0,2018.0,8.0,2,w rioli,,,,,
55281,Willie Rioli,West Coast,64.0,15281.0,2018.0,9.0,2,w rioli,,,,,
55563,Willie Rioli,West Coast,72.0,15287.0,2018.0,10.0,2,w rioli,,,,,
55899,Willie Rioli,West Coast,46.0,15295.0,2018.0,11.0,2,w rioli,,,,,


In [24]:
# match the exact processed name, and only rows the merge left without an id,
# so no other player's id can be overwritten
needs_id = merge['process_name'].eq('w rioli') & merge['player_id'].isna()
merge.loc[needs_id, 'player_id'] = 12613

In [25]:
# how the player table records players with this key (more than one "b hill" exists)
players.loc[players['process_name'].str.contains("b hill")]

Unnamed: 0,season,player_id,player_team,no_teams,player_first_name,player_last_name,process_name
1363,2012,12066,Hawthorn,3,Bradley,Hill,b hill
9016,2013,12066,Hawthorn,3,Bradley,Hill,b hill
17543,2014,12066,Hawthorn,3,Bradley,Hill,b hill
26258,2015,12066,Hawthorn,3,Bradley,Hill,b hill
36019,2016,12066,Hawthorn,3,Bradley,Hill,b hill
43410,2017,12066,Fremantle,3,Bradley,Hill,b hill
51898,2018,12066,Fremantle,3,Bradley,Hill,b hill
60826,2019,12066,Fremantle,3,Bradley,Hill,b hill
66703,2019,12744,Greater Western Sydney,1,Bobby,Hill,b hill
69446,2020,12066,St Kilda,3,Bradley,Hill,b hill


In [26]:
# the corresponding rows in the merged frame
merge.loc[merge['process_name'].str.contains("i hill")]

Unnamed: 0,Player,Team,SC,match_id,season,match_round,name_length,process_name,player_id,player_team,no_teams,player_first_name,player_last_name
66696,Ian Hill,Greater Western Sydney,60.0,15549.0,2019.0,17.0,2,i hill,,,,,
66941,Ian Hill,Greater Western Sydney,60.0,15555.0,2019.0,18.0,2,i hill,,,,,
67448,Ian Hill,Greater Western Sydney,59.0,15566.0,2019.0,19.0,2,i hill,,,,,
67688,Ian Hill,Greater Western Sydney,53.0,15572.0,2019.0,20.0,2,i hill,,,,,
68001,Ian Hill,Greater Western Sydney,38.0,15579.0,2019.0,21.0,2,i hill,,,,,
68702,Ian Hill,Greater Western Sydney,40.0,15595.0,2019.0,22.0,2,i hill,,,,,
68900,Ian Hill,Greater Western Sydney,85.0,15601.0,2019.0,23.0,2,i hill,,,,,
71608,Ian Hill,Greater Western Sydney,5.0,15753.0,2020.0,7.0,2,i hill,,,,,
74095,Ian Hill,Greater Western Sydney,49.0,15824.0,2020.0,13.0,2,i hill,,,,,
74352,Ian Hill,Greater Western Sydney,84.0,15830.0,2020.0,14.0,2,i hill,,,,,


In [27]:
# match the exact processed name, and only rows the merge left without an id,
# so no other player's id can be overwritten
needs_id = merge['process_name'].eq('i hill') & merge['player_id'].isna()
merge.loc[needs_id, 'player_id'] = 12744

# J Jordon

In [28]:
# how the player table records this player
players.loc[players['process_name'].str.contains("jord")]

Unnamed: 0,season,player_id,player_team,no_teams,player_first_name,player_last_name,process_name
76036,2021,12853,Melbourne,1,James,Jordon,j jordon
85051,2022,12853,Melbourne,1,James,Jordon,j jordon


In [29]:
# the corresponding rows in the merged frame (both spellings)
merge.loc[merge['process_name'].str.contains("jord")]

Unnamed: 0,Player,Team,SC,match_id,season,match_round,name_length,process_name,player_id,player_team,no_teams,player_first_name,player_last_name
76003,James Jordan,Melbourne,78.0,15877.0,2021.0,1.0,2,j jordan,,,,,
76512,James Jordan,Melbourne,82.0,15888.0,2021.0,2.0,2,j jordan,,,,,
77057,James Jordan,Melbourne,49.0,15900.0,2021.0,3.0,2,j jordan,,,,,
77451,James Jordan,Melbourne,24.0,15909.0,2021.0,4.0,2,j jordan,,,,,
77865,James Jordan,Melbourne,76.0,15918.0,2021.0,5.0,2,j jordan,,,,,
78118,James Jordan,Melbourne,82.0,15924.0,2021.0,6.0,2,j jordan,,,,,
78675,James Jordan,Melbourne,91.0,15935.0,2021.0,7.0,2,j jordan,,,,,
78877,James Jordan,Melbourne,85.0,15942.0,2021.0,8.0,2,j jordan,,,,,
79463,James Jordon,Melbourne,96.0,15954.0,2021.0,9.0,2,j jordon,12853.0,Melbourne,1.0,James,Jordon
79759,James Jordon,Melbourne,47.0,15959.0,2021.0,10.0,2,j jordon,12853.0,Melbourne,1.0,James,Jordon


In [30]:
# only the "j jordan" spelling failed to merge; a substring match on "jord"
# would also hit any other processed name containing it and re-write rows that
# already have an id, so match the exact name and only rows still missing an id
needs_id = merge['process_name'].eq('j jordan') & merge['player_id'].isna()
merge.loc[needs_id, 'player_id'] = 12853

In [31]:
# player_id should have no gaps left; the other player columns of the manually
# fixed rows stay empty until the frame is merged with players again
merge.isna().sum()

Player                 0
Team                   0
SC                     0
match_id               0
season                 0
match_round            0
name_length            0
process_name           0
player_id              0
player_team          162
no_teams             162
player_first_name    162
player_last_name     162
dtype: int64

In [32]:
# keep only the keys and the score; player details are re-attached by id afterwards
merge = merge.loc[:, ['match_id', 'season', 'player_id', 'SC']]

In [33]:
# now that the ids have been fixed, merge again on (player_id, season) to fill in
# the player information that the name-based merge could not attach
final_df = pd.merge(merge, players, on=['player_id', 'season'], how='left')

In [34]:
final_df

Unnamed: 0,match_id,season,player_id,SC,player_team,no_teams,player_first_name,player_last_name,process_name
0,13960.0,2012.0,12030.0,103.0,Greater Western Sydney,1,Adam,Kennedy,a kennedy
1,13960.0,2012.0,12026.0,84.0,Greater Western Sydney,1,Toby,Greene,t greene
2,13960.0,2012.0,12021.0,87.0,Greater Western Sydney,2,Tomas,Bugg,t bugg
3,13960.0,2012.0,11647.0,126.0,Greater Western Sydney,2,Rhys,Palmer,r palmer
4,13960.0,2012.0,10973.0,104.0,Greater Western Sydney,1,Chad,Cornes,c cornes
...,...,...,...,...,...,...,...,...,...
93738,16346.0,2022.0,12469.0,51.0,Sydney,2,Ryan,Clarke,r clarke
93739,16346.0,2022.0,12820.0,44.0,Sydney,1,Sam,Wicks,s wicks
93740,16346.0,2022.0,12343.0,52.0,Sydney,2,Paddy,McCartin,p mccartin
93741,16346.0,2022.0,12419.0,37.0,Sydney,1,Tom,Papley,t papley


In [35]:
# expect all zeros: every row should now carry complete player information
final_df.isna().sum()

match_id             0
season               0
player_id            0
SC                   0
player_team          0
no_teams             0
player_first_name    0
player_last_name     0
process_name         0
dtype: int64

In [5]:
# the curated output only needs the score and the two ids used for joining
final_df = final_df.loc[:, ['match_id', 'player_id', 'SC']]

In [6]:
# persist the cleaned SuperCoach scores keyed by match_id and player_id
final_df.to_parquet('../../data/curated/cleaned_supercoach_12-22.parquet')