# Player Information

We use this notebook to build a dataframe of player names, ids and teams for each season, and to add a column that transforms the player's name in such a way that it can be merged with dataframes from different sources.

This transformation is useful when we merge the historic supercoach data: the player_ids do not match between the sources, and player names are also represented differently, e.g. Thomas Stewart vs. Tom Stewart.

In [1]:
import pandas as pd

In [2]:
# Load the raw per-match player statistics; the path is relative to this notebook's directory.
df = pd.read_csv('../../data/raw/stats_12-22_no_na.csv')

In [3]:
# List the columns available in the raw file before selecting the subset used below.
df.columns

Index(['match_id', 'match_home_team', 'match_away_team', 'match_date',
       'match_round', 'match_home_team_goals', 'match_home_team_behinds',
       'match_home_team_score', 'match_away_team_goals',
       'match_away_team_behinds', 'match_away_team_score', 'match_margin',
       'match_winner', 'player_id', 'player_first_name', 'player_last_name',
       'player_team', 'kicks', 'marks', 'handballs', 'disposals',
       'effective_disposals', 'disposal_efficiency_percentage', 'goals',
       'behinds', 'hitouts', 'tackles', 'rebounds', 'inside_fifties',
       'clearances', 'clangers', 'free_kicks_for', 'free_kicks_against',
       'brownlow_votes', 'contested_possessions', 'uncontested_possessions',
       'contested_marks', 'marks_inside_fifty', 'one_percenters', 'bounces',
       'goal_assists', 'time_on_ground_percentage', 'afl_fantasy_score',
       'centre_clearances', 'stoppage_clearances', 'score_involvements',
       'metres_gained', 'turnovers', 'intercepts', 'tackles_insi

In [4]:
# Keep only the columns needed to identify a player and their team within a season.
df = df.loc[:, [
    'match_id',
    'season',
    'player_id',
    'player_first_name',
    'player_last_name',
    'player_team',
]]

In [5]:
# Reduce the per-match rows to one row per (player, season). The default
# keep='first' retains the first occurrence in file order, so player_team is
# the team recorded on that first row.
# Reassign instead of using inplace=True: it has no performance benefit and
# mutating in place makes the cell harder to reason about on re-runs.
df = df.drop_duplicates(subset=['player_id', 'season'])

In [6]:
# Spot-check a single player (id 11706) to confirm there is one row per season
# and that a change of team between seasons is visible in player_team.
df.query('player_id == 11706')

Unnamed: 0,match_id,season,player_id,player_first_name,player_last_name,player_team
195,13964,2012,11706,Patrick,Dangerfield,Adelaide
8644,14167,2013,11706,Patrick,Dangerfield,Adelaide
17442,14378,2014,11706,Patrick,Dangerfield,Adelaide
26151,14587,2015,11706,Patrick,Dangerfield,Adelaide
34777,14795,2016,11706,Patrick,Dangerfield,Geelong
43397,15002,2017,11706,Patrick,Dangerfield,Geelong
52499,15218,2018,11706,Patrick,Dangerfield,Geelong
60507,15409,2019,11706,Patrick,Dangerfield,Geelong
69349,15619,2020,11706,Patrick,Dangerfield,Geelong
76042,15878,2021,11706,Patrick,Dangerfield,Geelong


In [7]:
def get_multi_team(row):
    """Count the distinct teams recorded for the player in ``row``.

    Intended for ``df.apply(get_multi_team, axis=1)``.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``'player_id'`` (e.g. a DataFrame row).

    Returns
    -------
    int
        Number of unique ``player_team`` values among the rows of the
        module-level ``df`` whose ``player_id`` equals the row's id.

    Notes
    -----
    Reads the global ``df`` rather than taking the frame as an argument, and
    scans the whole frame on every call.
    """
    # the local must be called player_id: the query string refers to it as @player_id
    player_id = row['player_id']

    rows_for_player = df.query('player_id == @player_id')
    distinct_teams = rows_for_player['player_team'].unique()

    return len(distinct_teams)

In [8]:
# Number of distinct teams each player appears with across all seasons.
# Computed in one grouped pass: the previous row-wise apply re-queried the whole
# frame for every row, which is quadratic in the number of rows.
# dropna=False keeps parity with len(Series.unique()), which counts NaN as a value.
# NOTE(review): rows with a missing player_id would get NaN here rather than 0;
# the input file name suggests NAs were already removed - confirm.
df['no_teams'] = df.groupby('player_id')['player_team'].transform('nunique', dropna=False)

In [9]:
# Sort ascending by team count so the players with the most teams appear at the
# bottom of the (truncated) display.
df.sort_values('no_teams')

Unnamed: 0,match_id,season,player_id,player_first_name,player_last_name,player_team,no_teams
0,13960,2012,10822,James,McDonald,Greater Western Sydney,1
55087,15276,2018,12640,Jordan,Ridley,Essendon,1
55086,15276,2018,12639,Dylan,Clarke,Essendon,1
55018,15275,2018,12031,Tim,Mohr,Greater Western Sydney,1
54999,15274,2018,12638,Jacob,Heron,Gold Coast,1
...,...,...,...,...,...,...,...
69265,15617,2020,12035,Jacob,Townsend,Essendon,4
40,13960,2012,12035,Jacob,Townsend,Greater Western Sydney,4
17762,14385,2014,12035,Jacob,Townsend,Greater Western Sydney,4
34698,14793,2016,12009,Tom,Hickey,St Kilda,4


In [10]:
def transform_name(row):
    """Build a lower-case, source-independent name key for a player row.

    Rules, applied in order:

    * For four hard-coded player ids the key is the full name,
      ``"<first name> <last name>"``, lower-cased and otherwise unmodified.
    * Otherwise apostrophes are removed from the last name (no other
      punctuation is touched) and the key is ``"<first initial> <last name>"``.
    * If the last name contains a hyphen, the part before the first hyphen is
      reduced to its initial and only the part between the first and second
      hyphen is kept after it.

    Examples
    --------
    1. Lewis Young (id 12567) => lewis young
    2. Sam De Koning          => s de koning
    3. Jaeger O'Meara         => j omeara
    4. Jeremy Cameron         => j cameron
    5. a last name "Ugle-Hagan" with first name "Jamarra" => j u-hagan

    Parameters
    ----------
    row : mapping-like
        Must provide ``'player_id'``, ``'player_first_name'`` and
        ``'player_last_name'``.

    Returns
    -------
    str
        The lower-cased name key.
    """
    # Players whose "initial + surname" key collides with a team-mate in the same
    # season, so the full first name is used for them instead:
    #   12567: Lewis Young, WB
    #   12717: Lachie Young, WB
    #   11644: Brad Ebert, PA
    #   11323: Brad Ebert, PA
    # NOTE(review): both Ebert ids are labelled "Brad" in the original notes; one
    # is presumably a different first name - confirm against the data.
    # None of these names contain punctuation, so nothing is stripped for them.
    needs_full_name = row['player_id'] in {12567, 12717, 11644, 11323}

    given = row['player_first_name']
    surname = row['player_last_name']

    if needs_full_name:
        return f'{given.lower()} {surname.lower()}'

    # drop apostrophes (O'Meara -> OMeara), then split on hyphens
    surname_parts = surname.replace("'", "").split('-')

    if len(surname_parts) > 1:
        # hyphenated surname: initial of the first part, then the second part
        key = f'{given[0]} {surname_parts[0][0]}-{surname_parts[1]}'
    else:
        key = f'{given[0]} {surname_parts[0]}'

    return key.lower()

In [11]:
# Add the normalised name key, computed row by row with transform_name.
df['process_name'] = df.apply(transform_name, axis=1)

In [12]:
# (season, process_name, player_team) has to identify exactly one player for
# process_name to be usable as a merge key. Fail loudly if the data ever
# introduces a clash; the remedy is to add the clashing player_ids to the
# full-name set inside transform_name.
unique_key = ['season', 'process_name', 'player_team']
assert not df.duplicated(subset=unique_key).any(), (
    'process_name is not unique within season and team'
)

# visual confirmation: every group should contain exactly one row
df.groupby(unique_key).count().sort_values('no_teams')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,match_id,player_id,player_first_name,player_last_name,no_teams
season,process_name,player_team,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2012,a black,North Melbourne,1,1,1,1,1
2019,j atley,Port Adelaide,1,1,1,1,1
2019,j anderson,North Melbourne,1,1,1,1,1
2019,j allison,Brisbane Lions,1,1,1,1,1
2019,j aish,Collingwood,1,1,1,1,1
...,...,...,...,...,...,...,...
2015,m duffy,Fremantle,1,1,1,1,1
2015,m dick,Carlton,1,1,1,1,1
2015,m de boer,Fremantle,1,1,1,1,1
2015,m crouch,Adelaide,1,1,1,1,1


In [13]:
# Columns, in export order, for the player lookup table.
out_df = df.loc[:, [
    'season',
    'player_id',
    'player_team',
    'no_teams',
    'player_first_name',
    'player_last_name',
    'process_name',
]]

In [14]:
# Preview the frame before it is written to disk.
out_df

Unnamed: 0,season,player_id,player_team,no_teams,player_first_name,player_last_name,process_name
0,2012,10822,Greater Western Sydney,1,James,McDonald,j mcdonald
1,2012,10942,Sydney,1,Adam,Goodes,a goodes
2,2012,10973,Greater Western Sydney,1,Chad,Cornes,c cornes
3,2012,10988,Sydney,1,Jude,Bolton,j bolton
4,2012,11183,Sydney,1,Ted,Richards,t richards
...,...,...,...,...,...,...,...
93832,2022,12939,North Melbourne,1,Charlie,Comben,c comben
93839,2022,13024,North Melbourne,1,Josh,Goater,j goater
93889,2022,11731,Essendon,1,Michael,Hurley,m hurley
94022,2022,13025,Hawthorn,1,Ned,Long,n long


In [15]:
# Persist the player lookup table for use by other notebooks.
# NOTE(review): the DataFrame index is written too (to_csv defaults to index=True),
# and here it is the row label left over from the raw match-level file. Consider
# index=False if nothing downstream reads that first unnamed column - confirm first.
out_df.to_csv('../../data/curated/player_information_12-22.csv')