#

# NBA API Player ID Helper

In [21]:
import pandas as pd

In [22]:
# Turn on pandas' copy-on-write mode so that working with views and copies of
# dataframes cannot cause unintended side effects. Background:
# https://pandas.pydata.org/pandas-docs/stable/user_guide/copy_on_write.html
pd.set_option('mode.copy_on_write', True)

In [23]:
# Show every column when a dataframe is displayed (None removes the limit).
# The option is addressed by its canonical, lowercase name
# 'display.max_columns' rather than a differently-cased spelling, so the
# lookup does not depend on pandas' shorthand option matching.
pd.set_option('display.max_columns', None)

Load in the cleaned season_counting_stats.csv to extract the NBA API IDs
associated with each player and create a dataframe consisting of two columns:
'PLAYER_ID' and 'NAME'. We drop duplicates because a given player could have
played for more than one season or could have played only one season but for
multiple teams, resulting in multiple identical rows.

In [24]:
# Read the season-level counting stats, keep only the ID and name columns,
# and collapse repeated (PLAYER_ID, NAME) rows into one row each.
player_id_name = (
    pd.read_csv('../counting_stats/season_counting_stats.csv')
    .loc[:, ['PLAYER_ID', 'NAME']]
    .drop_duplicates()
)

We create a new dataframe identifying NBA players with identical names. 
For example, there are two entries of 'Patrick Ewing' (father and son).
Technically, the younger Patrick Ewing is Patrick Ewing Jr., but the data entry
omitted the 'Jr.' for some reason.

In [25]:

# keep=False marks every row whose NAME occurs more than once, not only the
# second and later occurrences, so all players sharing a name are retained.
is_shared_name = player_id_name.duplicated(subset='NAME', keep=False)
nonunique_player_names = player_id_name.loc[is_shared_name]

# Display the shared names next to each other (the stored frame is not reordered).
nonunique_player_names.sort_values(by='NAME')

Unnamed: 0,PLAYER_ID,NAME
19037,78209,Bill Smith
19031,78207,Bill Smith
13004,76610,Bob Duffy
13001,76609,Bob Duffy
15305,77193,Bobby Jones
...,...,...
25697,203502,Tony Mitchell
21920,201041,Walker Russell
18371,78048,Walker Russell
15357,77203,Willie Jones


As we can see, some players share identical names. Therefore, one thing that might help when matching players to their IDs is to also include the season information. It is presumably less likely that two NBA players with the same name are in the league during the same season. Let's try this and see whether it eliminates the non-uniqueness.

In [26]:
# Read the season-level counting stats again, this time keeping the season
# start year next to the player ID and name. A player can have played for
# multiple teams in one season, which yields identical rows, so duplicates
# are dropped.
season_key_columns = ['PLAYER_ID', 'SEASON_START', 'NAME']
player_id_season_name = (
    pd.read_csv('../counting_stats/season_counting_stats.csv')
    .loc[:, season_key_columns]
    .drop_duplicates()
)

# keep=False flags every row whose (SEASON_START, NAME) pair appears more
# than once, i.e. distinct player IDs sharing a name within the same season.
shares_name_and_season = player_id_season_name.duplicated(
    subset=['SEASON_START', 'NAME'], keep=False
)
nonunique_player_id_season_pairs = player_id_season_name.loc[shares_name_and_season]

# Display the clashes grouped by name and ordered by season.
nonunique_player_id_season_pairs.sort_values(by=['NAME', 'SEASON_START'])

Unnamed: 0,PLAYER_ID,SEASON_START,NAME
2115,279,1984,Charles Jones
15249,77178,1984,Charles Jones
2118,279,1985,Charles Jones
15250,77178,1985,Charles Jones
2120,279,1987,Charles Jones
15251,77178,1987,Charles Jones
2121,279,1988,Charles Jones
15252,77178,1988,Charles Jones
2256,293,1989,Charles Smith
18900,78179,1989,Charles Smith


Next, we output these dataframes to csv files. To remove clutter, the index column will not be written. 

In [27]:
# Save the de-duplicated (PLAYER_ID, NAME) pairs; the index is left out of the file.
player_id_name.to_csv(path_or_buf='player_id_name.csv', index=False)

In [28]:
# Save the rows whose NAME is shared by more than one entry; the index is left out of the file.
nonunique_player_names.to_csv(path_or_buf='nonunique_player_names.csv', index=False)

In [29]:
# Save the de-duplicated (PLAYER_ID, SEASON_START, NAME) rows; the index is left out of the file.
player_id_season_name.to_csv(path_or_buf='player_id_season_name.csv', index=False)

In [30]:
# Save the rows whose (SEASON_START, NAME) pair is shared by more than one entry; the index is left out of the file.
nonunique_player_id_season_pairs.to_csv(
    path_or_buf='nonunique_player_id_season_pairs.csv',
    index=False,
)