# Get Captain Data

In this notebook we access the AFL season summary on wikipedia which contains each club's leadership such as Coach, Captains, Vice-Captains etc. We extract each of the captains for each team in each season.

In the event a club has multiple captains, the names appear to be listed by alphabetical order of last name.Therefore each are treated equally not necessarily taking the first name as the primary captain.

The idea behind using captain data would be that umpires would tend to communicate with captains more often, whether it be the coin toss, or asking for clarification on contenious umpire decisions and hence would be noticed more and this may lead to polling more often.

Since we cannot differentiate which is the captain taking coin tosses, we include them all.

In [1]:
import pandas as pd

from tqdm import tqdm

In [2]:
seasons = [i for i in range(2012, 2024)]

In [3]:
seasons_df = {}
# set of columns that appear on every season's page
table_columns = {'Club', 'Coach', 'Captain(s)'}

for season in tqdm(seasons):
    url = f'https://en.wikipedia.org/wiki/{season}_AFL_season'

    df = pd.read_html(url)

    for i in range(len(df)):

        curr_df = df[i]
        # of the list of tables on the page
        # find the table that contains the requried headings
        if table_columns <= set(curr_df.columns):
            idx = i
            
    table = df[idx]

    table['season'] = season

    seasons_df[season] = table

100%|██████████| 12/12 [00:30<00:00,  2.56s/it]


In [4]:
combined_df = pd.concat(seasons_df.values()).reset_index()

In [5]:
combined_df = combined_df[['Club', 'Captain(s)', 'season']]

In [6]:
combined_df

Unnamed: 0,Club,Captain(s),season
0,Adelaide[81],Nathan van Berlo,2012
1,Brisbane Lions,Jonathan Brown,2012
2,Carlton,Chris Judd,2012
3,Collingwood,Nick Maxwell,2012
4,Essendon,Jobe Watson,2012
...,...,...,...
215,Richmond,"Dylan Grimes, Toby Nankervis",2023
216,St Kilda,Jack Steele,2023
217,Sydney,"Callum Mills, Luke Parker, Dane Rampe",2023
218,West Coast,Luke Shuey,2023


In [7]:
# save downloaded data so we do not need to download everytime
combined_df.loc[combined_df['season'] < 2023].to_csv('../../data/landing/is_player_captain_12-22.csv')
combined_df.loc[combined_df['season'] == 2023].to_csv('../../data/landing/is_player_captain_23.csv')

In [8]:
combined_df = pd.read_csv('../../data/landing/is_player_captain_12-22.csv')

In [9]:
sorted(combined_df.Club.unique())

['Adelaide',
 'Adelaide[50]',
 'Adelaide[81]',
 'Adelaide[93]',
 'Brisbane Lions',
 'Brisbane Lions[51]',
 'Brisbane Lions[94]',
 'Carlton',
 'Carlton[52]',
 'Carlton[95]',
 'Collingwood',
 'Collingwood[53]',
 'Collingwood[96]',
 'Essendon',
 'Essendon[54]',
 'Essendon[97]',
 'Fremantle',
 'Fremantle[55]',
 'Fremantle[98]',
 'Geelong',
 'Geelong[101]',
 'Geelong[58]',
 'Gold Coast',
 'Gold Coast[56]',
 'Gold Coast[99]',
 'Greater Western Sydney',
 'Greater Western Sydney[100]',
 'Greater Western Sydney[57]',
 'Hawthorn',
 'Melbourne',
 'Melbourne[102]',
 'Melbourne[59]',
 'North Melbourne',
 'North Melbourne[103]',
 'North Melbourne[60]',
 'Port Adelaide',
 'Port Adelaide[104]',
 'Port Adelaide[61]',
 'Richmond',
 'Richmond[105]',
 'Richmond[62]',
 'St Kilda',
 'St Kilda[106]',
 'St Kilda[63]',
 'Sydney',
 'Sydney[107]',
 'Sydney[64]',
 'West Coast',
 'West Coast[108]',
 'West Coast[65]',
 'Western Bulldogs',
 'Western Bulldogs[109]',
 'Western Bulldogs[66]']

In [10]:
# remove wikipedia references from Club names
combined_df['team'] = combined_df['Club'].str.extract(r'^(.+?)(?:\[\d+\])?$')

In [11]:
# tags removed successfully
combined_df.team.unique()

array(['Adelaide', 'Brisbane Lions', 'Carlton', 'Collingwood', 'Essendon',
       'Fremantle', 'Gold Coast', 'Greater Western Sydney', 'Geelong',
       'Hawthorn', 'Melbourne', 'North Melbourne', 'Port Adelaide',
       'Richmond', 'St Kilda', 'Sydney', 'West Coast', 'Western Bulldogs'],
      dtype=object)

In [12]:
combined_df['Captain(s)'].unique()

array(['Nathan van Berlo', 'Jonathan Brown', 'Chris Judd', 'Nick Maxwell',
       'Jobe Watson', 'Matthew Pavlich', 'Gary Ablett',
       'Phil Davis, Luke Power, Callan Ward', 'Joel Selwood',
       'Luke Hodge', 'Jack Grimes, Jack Trengove', 'Andrew Swallow',
       'Domenic Cassisi', 'Chris Newman', 'Nick Riewoldt',
       'Adam Goodes, Jarrad McVeigh', 'Darren Glass', 'Matthew Boyd',
       'Jed Adcock, Jonathan Brown', 'Marc Murphy',
       'Phil Davis, Callan Ward', 'Travis Boak', 'Trent Cotchin',
       'Kieren Jack, Jarrad McVeigh', 'Jed Adcock', 'Scott Pendlebury',
       'Phil Davis  Callan Ward', 'Jack Grimes  Nathan Jones',
       'Kieren Jack  Jarrad McVeigh', 'Ryan Griffen',
       'Taylor Walker[119]', 'Tom Rockliff[121]', 'Nathan Jones',
       'Shannon Hurn[137]', 'Robert Murphy[139]', 'Taylor Walker',
       'Tom Rockliff', 'Brendon Goddard', 'David Mundy', 'Shannon Hurn',
       'Robert Murphy', 'Dayne Beams', 'Dyson Heppell', 'Nat Fyfe',
       'Tom Lynch Steven May

In [13]:
combined_df.loc[combined_df['Captain(s)'].str.contains("\(")]

Unnamed: 0.1,Unnamed: 0,Club,Captain(s),season,team
109,109,Brisbane Lions,Dayne Beams (until 23 May),2018,Brisbane Lions
110,110,Brisbane Lions,Dayne Zorko (from 23 May),2018,Brisbane Lions
116,116,Gold Coast,Tom Lynch (until 2 August),2018,Gold Coast


In [14]:
# remove dates information where captain was changed mid season

pattern = r'^(.*?)(?:\s\(([^)]+)\))?$'
extracted = combined_df['Captain(s)'].str.extract(pattern)
extracted.columns = ['name', 'dates']
combined_df = pd.concat([combined_df, extracted], axis=1)[['season', 'team', 'name', 'dates']]

In [15]:
combined_df.name.unique()

array(['Nathan van Berlo', 'Jonathan Brown', 'Chris Judd', 'Nick Maxwell',
       'Jobe Watson', 'Matthew Pavlich', 'Gary Ablett',
       'Phil Davis, Luke Power, Callan Ward', 'Joel Selwood',
       'Luke Hodge', 'Jack Grimes, Jack Trengove', 'Andrew Swallow',
       'Domenic Cassisi', 'Chris Newman', 'Nick Riewoldt',
       'Adam Goodes, Jarrad McVeigh', 'Darren Glass', 'Matthew Boyd',
       'Jed Adcock, Jonathan Brown', 'Marc Murphy',
       'Phil Davis, Callan Ward', 'Travis Boak', 'Trent Cotchin',
       'Kieren Jack, Jarrad McVeigh', 'Jed Adcock', 'Scott Pendlebury',
       'Phil Davis  Callan Ward', 'Jack Grimes  Nathan Jones',
       'Kieren Jack  Jarrad McVeigh', 'Ryan Griffen',
       'Taylor Walker[119]', 'Tom Rockliff[121]', 'Nathan Jones',
       'Shannon Hurn[137]', 'Robert Murphy[139]', 'Taylor Walker',
       'Tom Rockliff', 'Brendon Goddard', 'David Mundy', 'Shannon Hurn',
       'Robert Murphy', 'Dayne Beams', 'Dyson Heppell', 'Nat Fyfe',
       'Tom Lynch Steven May

In [16]:
combined_df.query('name.str.contains("\[")')

Unnamed: 0,season,team,name,dates
54,2015,Adelaide,Taylor Walker[119],
55,2015,Brisbane Lions,Tom Rockliff[121],
70,2015,West Coast,Shannon Hurn[137],
71,2015,Western Bulldogs,Robert Murphy[139],


In [17]:
# remove wikipedia reference tags from captain names
combined_df['name'] = combined_df['name'].str.extract(r'^(.+?)(?:\[\d+\])?$')

In [18]:
combined_df.name.unique()

array(['Nathan van Berlo', 'Jonathan Brown', 'Chris Judd', 'Nick Maxwell',
       'Jobe Watson', 'Matthew Pavlich', 'Gary Ablett',
       'Phil Davis, Luke Power, Callan Ward', 'Joel Selwood',
       'Luke Hodge', 'Jack Grimes, Jack Trengove', 'Andrew Swallow',
       'Domenic Cassisi', 'Chris Newman', 'Nick Riewoldt',
       'Adam Goodes, Jarrad McVeigh', 'Darren Glass', 'Matthew Boyd',
       'Jed Adcock, Jonathan Brown', 'Marc Murphy',
       'Phil Davis, Callan Ward', 'Travis Boak', 'Trent Cotchin',
       'Kieren Jack, Jarrad McVeigh', 'Jed Adcock', 'Scott Pendlebury',
       'Phil Davis  Callan Ward', 'Jack Grimes  Nathan Jones',
       'Kieren Jack  Jarrad McVeigh', 'Ryan Griffen', 'Taylor Walker',
       'Tom Rockliff', 'Nathan Jones', 'Shannon Hurn', 'Robert Murphy',
       'Brendon Goddard', 'David Mundy', 'Dayne Beams', 'Dyson Heppell',
       'Nat Fyfe', 'Tom Lynch Steven May', 'Phil Davis Callan Ward',
       'Jarryd Roughead', 'Nathan Jones Jack Viney', 'Jack Ziebell',
  

# Seasons 2014 - 2021

where teams have multiple captains, they are not split by a comma in these seasons

In [19]:
combined_df.query('season == 2021')

Unnamed: 0,season,team,name,dates
166,2021,Adelaide,Rory Sloane,
167,2021,Brisbane Lions,Dayne Zorko,
168,2021,Carlton,Patrick Cripps Sam Docherty,
169,2021,Collingwood,Scott Pendlebury,
170,2021,Essendon,Dyson Heppell,
171,2021,Fremantle,Nathan Fyfe,
172,2021,Geelong,Joel Selwood,
173,2021,Gold Coast,David Swallow Jarrod Witts,
174,2021,Greater Western Sydney,Stephen Coniglio,
175,2021,Hawthorn,Ben McEvoy,


In [20]:
def insert_comma_after_second_word(s):
    """Inserts a comma after every second word in the string."""
    # Split string into words
    words = s.split()
    # Insert commas after every second word
    result = []
    for i, word in enumerate(words):
        # Check if we're on a second word
        if (i + 1) % 2 == 0:  
            result.append(word + ',')
        else:
            result.append(word)
    # return string up to second last character, removing the final comma
    return ' '.join(result)[:-1]

test_string = "Patrick Cripps Sam Docherty"
print(insert_comma_after_second_word(test_string))  # Expected: "Patrick Cripps, Sam Docherty"


Patrick Cripps, Sam Docherty


In [21]:
'Patrick Cripps, Sam Docherty'.split(',')

['Patrick Cripps', ' Sam Docherty']

In [23]:
condition = combined_df.season.isin([i for i in range(2014, 2022)])

In [24]:
combined_df.loc[condition, 'commas'] = combined_df.loc[condition, 'name'].apply(insert_comma_after_second_word)

In [25]:
combined_df.loc[condition]

Unnamed: 0,season,team,name,dates,commas
36,2014,Adelaide,Nathan van Berlo,,"Nathan van, Berl"
37,2014,Brisbane Lions,Jed Adcock,,Jed Adcock
38,2014,Carlton,Marc Murphy,,Marc Murphy
39,2014,Collingwood,Scott Pendlebury,,Scott Pendlebury
40,2014,Essendon,Jobe Watson,,Jobe Watson
...,...,...,...,...,...
179,2021,Richmond,Trent Cotchin,,Trent Cotchin
180,2021,St Kilda,Jarryn Geary Jack Steele,,"Jarryn Geary, Jack Steele"
181,2021,Sydney,Josh Kennedy Luke Parker Dane Rampe,,"Josh Kennedy, Luke Parker, Dane Rampe"
182,2021,West Coast,Luke Shuey,,Luke Shuey


In [26]:
# correct the only case where a captain has 3 names 
# and therefore the comma inserts in the wrong place
combined_df.loc[(combined_df['season'] == 2014) & (combined_df['team'] == "Adelaide"), 'commas'] = combined_df.loc[(combined_df['season'] == 2014) & (combined_df['team'] == "Adelaide"), 'name']

In [27]:
combined_df.loc[~condition, 'commas'] = combined_df.loc[~condition, 'name']

In [28]:
# multiple captains are split by commas
combined_df['captain_list'] = combined_df['commas'].str.split(', ').apply(lambda x: x if isinstance(x, list) else [x])

In [29]:
combined_df.loc[condition]

Unnamed: 0,season,team,name,dates,commas,captain_list
36,2014,Adelaide,Nathan van Berlo,,Nathan van Berlo,[Nathan van Berlo]
37,2014,Brisbane Lions,Jed Adcock,,Jed Adcock,[Jed Adcock]
38,2014,Carlton,Marc Murphy,,Marc Murphy,[Marc Murphy]
39,2014,Collingwood,Scott Pendlebury,,Scott Pendlebury,[Scott Pendlebury]
40,2014,Essendon,Jobe Watson,,Jobe Watson,[Jobe Watson]
...,...,...,...,...,...,...
179,2021,Richmond,Trent Cotchin,,Trent Cotchin,[Trent Cotchin]
180,2021,St Kilda,Jarryn Geary Jack Steele,,"Jarryn Geary, Jack Steele","[Jarryn Geary, Jack Steele]"
181,2021,Sydney,Josh Kennedy Luke Parker Dane Rampe,,"Josh Kennedy, Luke Parker, Dane Rampe","[Josh Kennedy, Luke Parker, Dane Rampe]"
182,2021,West Coast,Luke Shuey,,Luke Shuey,[Luke Shuey]


In [30]:
combined_df['no_captains'] = combined_df['captain_list'].apply(len)

In [31]:
expanded = combined_df['captain_list'].apply(lambda x: pd.Series(x)).fillna('na')

expanded.columns = [f'name{i+1}' for i in range(expanded.shape[1])]


In [32]:
expanded

Unnamed: 0,name1,name2,name3
0,Nathan van Berlo,na,na
1,Jonathan Brown,na,na
2,Chris Judd,na,na
3,Nick Maxwell,na,na
4,Jobe Watson,na,na
...,...,...,...
197,Dylan Grimes,Toby Nankervis,na
198,Jack Steele,na,na
199,Callum Mills,Luke Parker,Dane Rampe
200,Luke Shuey,na,na


In [33]:
combined_df = combined_df.join(expanded)

In [34]:
combined_df.sort_values('no_captains', ascending=False)[:20]

Unnamed: 0,season,team,name,dates,commas,captain_list,no_captains,name1,name2,name3
192,2022,Greater Western Sydney,"Stephen Coniglio, Toby Greene, Josh Kelly",,"Stephen Coniglio, Toby Greene, Josh Kelly","[Stephen Coniglio, Toby Greene, Josh Kelly]",3,Stephen Coniglio,Toby Greene,Josh Kelly
199,2022,Sydney,"Callum Mills, Luke Parker, Dane Rampe",,"Callum Mills, Luke Parker, Dane Rampe","[Callum Mills, Luke Parker, Dane Rampe]",3,Callum Mills,Luke Parker,Dane Rampe
163,2020,Sydney,Josh Kennedy Luke Parker Dane Rampe,,"Josh Kennedy, Luke Parker, Dane Rampe","[Josh Kennedy, Luke Parker, Dane Rampe]",3,Josh Kennedy,Luke Parker,Dane Rampe
145,2019,Sydney,Josh Kennedy Luke Parker Dane Rampe,,"Josh Kennedy, Luke Parker, Dane Rampe","[Josh Kennedy, Luke Parker, Dane Rampe]",3,Josh Kennedy,Luke Parker,Dane Rampe
7,2012,Greater Western Sydney,"Phil Davis, Luke Power, Callan Ward",,"Phil Davis, Luke Power, Callan Ward","[Phil Davis, Luke Power, Callan Ward]",3,Phil Davis,Luke Power,Callan Ward
181,2021,Sydney,Josh Kennedy Luke Parker Dane Rampe,,"Josh Kennedy, Luke Parker, Dane Rampe","[Josh Kennedy, Luke Parker, Dane Rampe]",3,Josh Kennedy,Luke Parker,Dane Rampe
98,2017,Greater Western Sydney,Phil Davis Callan Ward,,"Phil Davis, Callan Ward","[Phil Davis, Callan Ward]",2,Phil Davis,Callan Ward,na
140,2019,Melbourne,Nathan Jones Jack Viney,,"Nathan Jones, Jack Viney","[Nathan Jones, Jack Viney]",2,Nathan Jones,Jack Viney,na
80,2016,Greater Western Sydney,Phil Davis Callan Ward,,"Phil Davis, Callan Ward","[Phil Davis, Callan Ward]",2,Phil Davis,Callan Ward,na
138,2019,Greater Western Sydney,Phil Davis Callan Ward,,"Phil Davis, Callan Ward","[Phil Davis, Callan Ward]",2,Phil Davis,Callan Ward,na


In [35]:
combined_df.loc[~combined_df.dates.isna()]

Unnamed: 0,season,team,name,dates,commas,captain_list,no_captains,name1,name2,name3
109,2018,Brisbane Lions,Dayne Beams,until 23 May,Dayne Beams,[Dayne Beams],1,Dayne Beams,na,na
110,2018,Brisbane Lions,Dayne Zorko,from 23 May,Dayne Zorko,[Dayne Zorko],1,Dayne Zorko,na,na
116,2018,Gold Coast,Tom Lynch,until 2 August,Tom Lynch,[Tom Lynch],1,Tom Lynch,na,na


# Mid Season Change

Brisbane 2018: Dayne Beams up to and including round 9, Dayne Zorko from round 10

Gold Coast 2018: Tom Lynch is captain up to and including round 19

In [36]:
# set min and max round that a player was captain
combined_df['min_round'] = 1
combined_df['max_round'] = 23

In [37]:
# adjust 3 players whose captain status changed mid-season
combined_df.loc[109, 'max_round'] = 9

combined_df.loc[110, 'min_round'] = 10

combined_df.loc[116, 'max_round'] = 19

In [38]:
combined_df = combined_df[['season', 'team', 'name1', 'name2', 'name3', 'min_round', 'max_round']]

In [39]:
melted_df = combined_df.melt(id_vars=['season', 'team', 'min_round', 'max_round'],
                 value_vars=['name1', 'name2', 'name3'],
                 value_name='name')

In [40]:
melted_df.drop(columns=['variable'], inplace=True)

melted_df.sort_values(['season', 'team', 'name'], inplace=True)

melted_df.reset_index(drop=True, inplace=True)

In [41]:
melted_df

Unnamed: 0,season,team,min_round,max_round,name
0,2012,Adelaide,1,23,Nathan van Berlo
1,2012,Adelaide,1,23,na
2,2012,Adelaide,1,23,na
3,2012,Brisbane Lions,1,23,Jonathan Brown
4,2012,Brisbane Lions,1,23,na
...,...,...,...,...,...
601,2022,West Coast,1,23,na
602,2022,West Coast,1,23,na
603,2022,Western Bulldogs,1,23,Marcus Bontempelli
604,2022,Western Bulldogs,1,23,na


In [42]:
melted_df = melted_df.query('name != "na"')

In [43]:
for col in range(1, 24):
    melted_df[col] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  melted_df[col] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  melted_df[col] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  melted_df[col] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

In [44]:
melted_df

Unnamed: 0,season,team,min_round,max_round,name,1,2,3,4,5,...,14,15,16,17,18,19,20,21,22,23
0,2012,Adelaide,1,23,Nathan van Berlo,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2012,Brisbane Lions,1,23,Jonathan Brown,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,2012,Carlton,1,23,Chris Judd,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,2012,Collingwood,1,23,Nick Maxwell,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,2012,Essendon,1,23,Jobe Watson,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
597,2022,Sydney,1,23,Callum Mills,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
598,2022,Sydney,1,23,Dane Rampe,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
599,2022,Sydney,1,23,Luke Parker,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
600,2022,West Coast,1,23,Luke Shuey,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
for idx, row in melted_df.iterrows():
    for col in range(row['min_round'], row['max_round'] + 1):

        melted_df.loc[idx, col] = col

In [46]:
melted_df.loc[melted_df['name'] == "Tom Lynch"]

Unnamed: 0,season,team,min_round,max_round,name,1,2,3,4,5,...,14,15,16,17,18,19,20,21,22,23
292,2017,Gold Coast,1,23,Tom Lynch,1,2,3,4,5,...,14,15,16,17,18,19,20,21,22,23
349,2018,Gold Coast,1,19,Tom Lynch,1,2,3,4,5,...,14,15,16,17,18,19,0,0,0,0


In [47]:
new_df = melted_df.melt(id_vars=['season', 'team', 'name'],
        value_vars=[i for i in range(1, 24)],
        value_name='match_round').sort_values(['name', 'season'])

In [48]:
# success
new_df.query('match_round == 0')

Unnamed: 0,season,team,name,variable,match_round
2314,2018,Brisbane Lions,Dayne Beams,10,0
2557,2018,Brisbane Lions,Dayne Beams,11,0
2800,2018,Brisbane Lions,Dayne Beams,12,0
3043,2018,Brisbane Lions,Dayne Beams,13,0
3286,2018,Brisbane Lions,Dayne Beams,14,0
3529,2018,Brisbane Lions,Dayne Beams,15,0
3772,2018,Brisbane Lions,Dayne Beams,16,0
4015,2018,Brisbane Lions,Dayne Beams,17,0
4258,2018,Brisbane Lions,Dayne Beams,18,0
4501,2018,Brisbane Lions,Dayne Beams,19,0


In [49]:
new_df = new_df.query('match_round != 0').reset_index(drop=True)

In [50]:
new_df

Unnamed: 0,season,team,name,variable,match_round
0,2012,Sydney,Adam Goodes,1,1
1,2012,Sydney,Adam Goodes,2,2
2,2012,Sydney,Adam Goodes,3,3
3,2012,Sydney,Adam Goodes,4,4
4,2012,Sydney,Adam Goodes,5,5
...,...,...,...,...,...
5557,2021,Richmond,Trent Cotchin,19,19
5558,2021,Richmond,Trent Cotchin,20,20
5559,2021,Richmond,Trent Cotchin,21,21
5560,2021,Richmond,Trent Cotchin,22,22


In [51]:
# read into player details to merge player_id
players = pd.read_parquet('../../data/curated/player_information_12-22.parquet')

In [52]:
players

Unnamed: 0,season,player_id,player_team,no_teams,player_first_name,player_last_name,process_name
0,2012,10822,Greater Western Sydney,1,James,McDonald,j mcdonald
1,2012,10942,Sydney,1,Adam,Goodes,a goodes
2,2012,10973,Greater Western Sydney,1,Chad,Cornes,c cornes
3,2012,10988,Sydney,1,Jude,Bolton,j bolton
4,2012,11183,Sydney,1,Ted,Richards,t richards
...,...,...,...,...,...,...,...
93832,2022,12939,North Melbourne,1,Charlie,Comben,c comben
93839,2022,13024,North Melbourne,1,Josh,Goater,j goater
93889,2022,11731,Essendon,1,Michael,Hurley,m hurley
94022,2022,13025,Hawthorn,1,Ned,Long,n long


In [53]:
players['player'] = players['player_first_name'] + ' ' + players['player_last_name']

In [54]:
set(new_df['name']) - set(players['player'])

{'Josh Kennedy', 'Nathan Fyfe'}

In [55]:
players.query('player.str.contains("Kennedy") & player_team == "Sydney"')

Unnamed: 0,season,player_id,player_team,no_teams,player_first_name,player_last_name,process_name,player
17,2012,11677,Sydney,1,Josh P.,Kennedy,j kennedy,Josh P. Kennedy
8816,2013,11677,Sydney,1,Josh P.,Kennedy,j kennedy,Josh P. Kennedy
17308,2014,11677,Sydney,1,Josh P.,Kennedy,j kennedy,Josh P. Kennedy
25976,2015,11677,Sydney,1,Josh P.,Kennedy,j kennedy,Josh P. Kennedy
35169,2016,11677,Sydney,1,Josh P.,Kennedy,j kennedy,Josh P. Kennedy
43174,2017,11677,Sydney,1,Josh P.,Kennedy,j kennedy,Josh P. Kennedy
52105,2018,11677,Sydney,1,Josh P.,Kennedy,j kennedy,Josh P. Kennedy
60635,2019,11677,Sydney,1,Josh P.,Kennedy,j kennedy,Josh P. Kennedy
69301,2020,11677,Sydney,1,Josh P.,Kennedy,j kennedy,Josh P. Kennedy
76131,2021,11677,Sydney,1,Josh P.,Kennedy,j kennedy,Josh P. Kennedy


In [56]:
players.query('player.str.contains("Fyfe")')

Unnamed: 0,season,player_id,player_team,no_teams,player_first_name,player_last_name,process_name,player
300,2012,11844,Fremantle,1,Nat,Fyfe,n fyfe,Nat Fyfe
8702,2013,11844,Fremantle,1,Nat,Fyfe,n fyfe,Nat Fyfe
17277,2014,11844,Fremantle,1,Nat,Fyfe,n fyfe,Nat Fyfe
26210,2015,11844,Fremantle,1,Nat,Fyfe,n fyfe,Nat Fyfe
34649,2016,11844,Fremantle,1,Nat,Fyfe,n fyfe,Nat Fyfe
43403,2017,11844,Fremantle,1,Nat,Fyfe,n fyfe,Nat Fyfe
51890,2018,11844,Fremantle,1,Nat,Fyfe,n fyfe,Nat Fyfe
60816,2019,11844,Fremantle,1,Nat,Fyfe,n fyfe,Nat Fyfe
69261,2020,11844,Fremantle,1,Nat,Fyfe,n fyfe,Nat Fyfe
75996,2021,11844,Fremantle,1,Nat,Fyfe,n fyfe,Nat Fyfe


In [57]:
new_df.loc[new_df.name.str.contains("Ken"), 'name'] = 'Josh P. Kennedy'

In [58]:
new_df.loc[new_df.name.str.contains("Fyfe"), 'name'] = 'Nat Fyfe'

In [59]:
set(new_df['name']) - set(players['player'])

set()

In [60]:
set(new_df['team']) - set(players['player_team'])

set()

In [61]:
out_df = pd.merge(left=new_df, right=players, 
         left_on=['season', 'name', 'team'],
         right_on=['season', 'player', 'player_team'],
         how='left')[
             ['season', 'player_id', 'player_team', 'player', 'match_round'
              ]]

In [62]:
out_df.player_id.unique()

array([10942., 11497., 11688., 11829., 11272., 11683., 12418., 11186.,
       11215., 12148., 11032., 11409., 11923., 11734., 12082., 11230.,
       11887., 11903., 11794., 11170., 11712., 12377., 11808., 12152.,
       11730., 11312., 12184., 11399., 11634., 11348., 11289., 11552.,
       11037., 12262., 11677., 11570., 11199., 11958., 10858., 11813.,
       11472., 12277., 11280., 11041., 11972., 11844., 11524., 11396.,
          nan, 11352., 11153., 12155., 12269., 11833., 11084., 11799.,
       11403., 12175., 11506., 11484., 12023., 11945., 11724., 12026.,
       12379., 12007., 11953., 11792., 12329., 11591., 11671.])

In [63]:
# n/a values appear in 2014 and 2019
out_df.loc[out_df.player_id.isna()].drop_duplicates('season')

Unnamed: 0,season,player_id,player_team,player,match_round
3680,2014,,,,1
4347,2019,,,,1


In [64]:
# Adelaide captain did not play in 2014
out_df.loc[out_df.season == 2014].drop_duplicates('player_id').sort_values('player_team')

Unnamed: 0,season,player_id,player_team,player,match_round
2001,2014,11348.0,Brisbane Lions,Jed Adcock,1
2967,2014,11472.0,Carlton,Marc Murphy,1
4416,2014,11506.0,Collingwood,Scott Pendlebury,1
2070,2014,11289.0,Essendon,Jobe Watson,1
3243,2014,11041.0,Fremantle,Matthew Pavlich,1
2162,2014,11552.0,Geelong,Joel Selwood,1
1173,2014,11170.0,Gold Coast,Gary Ablett,1
299,2014,11683.0,Greater Western Sydney,Callan Ward,1
4025,2014,11833.0,Greater Western Sydney,Phil Davis,1
2691,2014,11199.0,Hawthorn,Luke Hodge,1


In [65]:
# Second Carlton captain did not play in 2019
out_df.loc[out_df.season == 2019].drop_duplicates('player_id').sort_values('player_team')

Unnamed: 0,season,player_id,player_team,player,match_round
4945,2019,11724.0,Adelaide,Taylor Walker,1
4232,2019,11799.0,Adelaide,Rory Sloane,1
805,2019,12082.0,Brisbane Lions,Dayne Zorko,1
3887,2019,12269.0,Carlton,Patrick Cripps,1
4531,2019,11506.0,Collingwood,Scott Pendlebury,1
989,2019,11903.0,Essendon,Dyson Heppell,1
3404,2019,11844.0,Fremantle,Nat Fyfe,1
2277,2019,11552.0,Geelong,Joel Selwood,1
690,2019,11923.0,Gold Coast,David Swallow,1
1725,2019,12184.0,Gold Coast,Jarrod Witts,1


In [66]:
out_df.dropna(inplace=True)

In [67]:
out_df['player_id'] = out_df['player_id'].astype(int)

In [68]:
out_df.query('season == 2022').drop_duplicates('player_id')

Unnamed: 0,season,player_id,player_team,player,match_round
161,2022,11688,Hawthorn,Ben McEvoy,1
437,2022,12418,Sydney,Callum Mills,1
575,2022,12148,Sydney,Dane Rampe,1
874,2022,12082,Brisbane Lions,Dayne Zorko,1
920,2022,11887,Richmond,Dylan Grimes,1
1058,2022,11903,Essendon,Dyson Heppell,1
1334,2022,12377,St Kilda,Jack Steele,1
1587,2022,11730,North Melbourne,Jack Ziebell,1
1794,2022,12184,Gold Coast,Jarrod Witts,1
2346,2022,11552,Geelong,Joel Selwood,1


In [70]:
out_df.to_parquet('../../data/curated/season_captains_12-22.parquet')