# Cleaning Advanced Stats

In this notebook, we:

- input the file `data_collection/Advanced.csv`,
- clean it in preparation for merging with the other relevant data,
- output a cleaned CSV, `advanced_stats.csv`.

In [1]:
import pandas as pd
from unidecode import unidecode

In [2]:
# Show every column when a dataframe is displayed (None removes the limit).
# The exact registered option name is used: pandas also accepts partial or
# differently-cased patterns, but only as a convenience regex match that can
# become ambiguous if similarly named options are added later.
pd.set_option('display.max_columns', None)

In [3]:
# Read the raw advanced-stats table into a pandas dataframe.
advanced_stats = pd.read_csv(
    filepath_or_buffer='../data_collection/Advanced.csv',
)

In [4]:
# Keep only rows whose 'lg' value is 'NBA'. Every other row (e.g. the BAA and
# ABA leagues) is removed by dropping its index label.
is_other_league = advanced_stats['lg'] != 'NBA'
other_league_index = advanced_stats.loc[is_other_league].index
advanced_stats = advanced_stats.drop(index=other_league_index)

In [5]:
# Rename the 'season' column to 'SEASON_START' and switch to the convention
# that SEASON_START is the year in which the corresponding NBA season started,
# e.g. the 2022-2023 season should have 2022 as SEASON_START. Previously,
# the 'season' column listed the year that the season ended, e.g. the
# 2022-2023 season was listed as 2023.

def subtractOne(n):
    """Return ``n - 1`` (used to turn a season-end year into a season-start year)."""
    return n - 1

# The raw 'season' column only exists before this cell has run once, so
# guarding on it keeps an accidental re-run of the cell from shifting
# SEASON_START by a second year.
if 'season' in advanced_stats.columns:
    advanced_stats = advanced_stats.rename(columns={'season': 'SEASON_START'})
    advanced_stats['SEASON_START'] = advanced_stats['SEASON_START'].apply(subtractOne)
elif 'SEASON_START' not in advanced_stats.columns:
    # Neither the raw nor the converted column is present: fail loudly here
    # rather than later inside the ID lookup.
    raise KeyError("advanced_stats has neither a 'season' nor a 'SEASON_START' column")

Next, we create a new column 'PLAYER_ID' (not to be confused with 'player_id')
which holds the NBA API player ID.


In [6]:
# Lookup tables used by the PLAYER_ID matching below: one lists name/season
# pairs that cannot be matched unambiguously, the other lists the
# name/season pairs together with their IDs.
nonunique_player_id_season_pairs = pd.read_csv(
    '../../PlayerIdentification/nonunique_player_id_season_pairs.csv'
)
player_id_season = pd.read_csv(
    '../../PlayerIdentification/player_id_season.csv'
)

In [None]:
def id_lookup(season_start, name):
    """Look up the ID for a (season, player name) pair.

    The name is transliterated to ASCII with ``unidecode`` before matching
    against the 'NAME' columns of the lookup tables.

    Returns
    -------
    - ``'player season pair not unique'`` if the pair appears in
      ``nonunique_player_id_season_pairs``;
    - otherwise the value in the first column of the first matching row of
      ``player_id_season`` (presumably the NBA API player ID -- confirm the
      column order of player_id_season.csv);
    - otherwise ``'no name season pair found'``.
    """
    ascii_name = unidecode(name)

    # Ambiguous pairs are reported first so they are never auto-matched.
    is_ambiguous = (
        (nonunique_player_id_season_pairs['SEASON_START'] == season_start)
        & (nonunique_player_id_season_pairs['NAME'] == ascii_name)
    )
    if is_ambiguous.any():
        return 'player season pair not unique'

    is_match = (
        (player_id_season['SEASON_START'] == season_start)
        & (player_id_season['NAME'] == ascii_name)
    )
    if is_match.any():
        return player_id_season[is_match].iat[0, 0]

    return 'no name season pair found'

In [8]:
# Run the lookup once per row; unresolved rows receive one of id_lookup's
# sentinel strings instead of an ID.
advanced_stats['PLAYER_ID'] = advanced_stats.apply(
    lambda row: id_lookup(row['SEASON_START'], row['player']),
    axis=1,
)

Moving player ID column to front of dataframe. 

In [9]:
# Current column order with 'PLAYER_ID' pulled out and re-inserted at the front.
columns = advanced_stats.columns.tolist()
columns.insert(0, columns.pop(columns.index('PLAYER_ID')))

In [10]:
# Re-select the columns in the order built in the previous cell.
advanced_stats = advanced_stats.loc[:, columns]

Fixing PLAYER_IDs listed as 'no name season pair found' by manually mapping 
the correct ID to these players. 

In [11]:
# Manual overrides for rows the automatic lookup could not resolve.
# Keys are player names exactly as they appear in advanced_stats['player']
# (not ASCII-transliterated); values are the IDs written into 'PLAYER_ID'
# for rows still marked 'no name season pair found'.
id_map = {'A.J. Green' : 1631260,
 'A.J. Hammons' : 1627773,
 'A.J. Lawson': 1630639,
 'A.J. Price' : 201985,
 'A.W. Holt' : 77046,
 'Al Attles' : 76070,
 'Alfred McGuire' : 77536,
 'Andrew White' : 1628510,
 'Art Williams' : 78539,
 'B.J. Johnson' : 1629168,
 'Barry Clemens' : 76403,
 'Bevo Nordmann' : 77727,
 'Bill Gabor' : 76768,
 'Bill Hosket' : 77061,
 'Billy Kenville' : 77247,
 'Billy Ray Bates' : 76121,
 'Bird Averitt' : 76076,
 'Blackie Towery' : 78363,
 'Zach Norvell' : 1629668,
 'Zeke Zawoluk' : 78639,
 'Bob Schafer' : 78072,
 'Bones McKinney' : 77546,
 'Xavier Tillman Sr.' : 1630214,
 'World B. Free' : 76753,
 "Boniface N'Dong" : 101238,
 "Boo Ellis" : 76658,
 "Brian Bowen" : 1628968,
 "Whitey Bell" : 76143,
 "Whitey Martin" : 77479,
 "William Smith" : 78207,
 "Wayne Engelstad" : 76671,
 "Wang Zhizhi" : 1917,
 "Wah Wah Jones" : 77199,
 "Walt Budko" : 76298,
 "Bubba Wilson" : 78587,
 "Bubbles Hawkins" : 76975,
 "Bucky Bockhorn" : 76191,
 "Vince Hunter" : 1626205,
 "Vítor Luiz Faverani" : 203543,
 "Wade Baldwin" : 1627735,
 "Vince Edwards" : 1629053,
 "Tommy Kron" : 77313,
 "Tommie Green" : 76879,
 "Bud Acton" : 76010,
 "Bucky McConnell" : 77514,
 "Tom Sanders" : 78060,
 "Bud Ogden" : 77747,
 "Bud Olsen" : 77765,
 "Tiny Archibald" : 76054,
 "Tom Patterson" : 77813,
 "Ticky Burden" : 76303,
 "Tex Ritter" : 77968,
 "Ted McClain" : 77510,
 "Bud Stallworth" : 78239,
 "Butch van Breda Kolff" : 78401,
 "C.J. Kupec" : 77325,
 "Steve Smith" : 120,
 "C.J. Miles" : 101139,
 "Steve Bardo" : 76097,
 "Swede Halbrook" : 76919,
 "Tal Skinner" : 78168,
 "Stanislav Medvedenko" : 2098,
 "Cameron Reynolds" : 1629244,
 "Charles Davis" : 76518,
 "Slick Leonard" : 77371,
 "Sonny Dove" : 76592,
 "Sonny Hertzberg" : 77011,
 "Charles Johnson" : 77133,
 "Sam Williams" : 78569,
 "Skippy Whitaker" : 78506,
 "Russ Lee" : 77365,
 "Sam Smith" : 78205,
 "Charles Pittman" : 77864,
 "Ronald Murray" : 2436,
 "Ronnie Valentine" : 78396,
 "Roy Hamilton" : 76928,
 "Ron Grandison" : 97,
 "Charley Shipp" : 78653,
 "Charlie Lowery" : 77416,
 "Roger Mason" : 2427,
 "Robert Williams" : 1629057,
 "Cheese Johnson" : 77159,
 "Chick Halbert" : 76918,
 "Richie Niemiera" : 77717,
 "Ricky Marsh" : 77465,
 "Rob Lock" : 77398,
 "Rob Rose" : 78016,
 "Robert Hahn" : 76914,
 "Rich Eichhorst" : 76651,
 "Rich Manning" : 316,
 "Chick Reiser" : 77938,
 "Chico Vaughn" : 78409,
 "Reggie Bullock" : 203493,
 "Chink Crossin" : 76479,
 "Chips Sobek" : 78214,
 "Red Owens" : 77783,
 "Chubby Cox": 76463,
 "Chuck Share" : 78125,
 "Red Kerr" : 77248,
 "Red Morrison" : 77648,
 "Clarence Weatherspoon" : 221,
 "Ralph O'Brien" : 77743,
 "Red Davis" : 76524,
 "Cliff Robinson" : 77986,
 "Ralph Johnson" : 77139,
 "Corky Devlin" : 76561,
 "Pickles Kennedy" : 77245,
 "R.J. Hunter" : 1626154,
 "Cotton Nash" : 77687,
 "Pete Verhoeven" : 78414,
 "Curly Armstrong" : 76060,
 "Pep Saul" : 78063,
 "Perry Jones" : 203103,
 "Pete Darcey" : 76506,
 "D.J. Carton" : 1630618,
 "D.J. Mbenga" : 2788,
 "Paul Hoffman" : 77036,
 "Pearl Washington" : 78465,
 "D.J. Stephens" : 203474,
 "Normie Glick" : 76823,
 "P.J. Hairston" : 203798,
 "D.J. Strawberry" : 201199,
 "D.J. White" : 201591,
 "Monk Meineke" : 77573,
 "Norm Richardson" : 2369,
 "Dan Anderson" : 76036,
 "Mo Layton" : 77350,
 "Mo Mahoney" : 77444,
 "Moe Barr" : 76113,
 "Danny Schayes": 7,
 "Mike Todorovich" : 78346,
 "Mitch Creek" : 1628249,
 "Dave Britton" : 76261,
 "Dave Greenwood" : 76881,
 "Mike Kearns" : 77230,
 "Mike Sweetney" : 2552,
 "DeWayne Scales" : 78068,
 "Mickey Davis" : 76522,
 "Mike Davis" : 76530,
 "Derrick Walton" : 1628476,
 "Michael Wilson" : 78581,
 "Dick Atha" : 76069,
 "Melvin Turpin" : 78386,
 "Michael Frazier" : 1626187,
 "Michael Phelps" : 77852,
 "Dick Garrett" : 76788,
 "Med Park" : 77799,
 "Melvin Frazier" : 1628982,
 "Dike Eddleman" : 76636,
 "McCoy McLemore" : 77548,
 "McKinley Singleton" : 78162,
 "Don Ackerman" : 76008,
 "Duck Williams" : 78545,
 "Maurice King" : 77272,
 "Maurice Martin" : 77476,
 "McCoy Ingram" : 77095,
 "Dwight Davis" : 76521,
 "Marcus Morris" : 202694,
 "Matt Williams" : 1628475,
 "Matthew Hurt" : 1630562,
 "Earnie Killum" : 77257,
 "Easy Parham" : 77797,
 "Mamadou N'Diaye" : 2055,
 "Manny Leaks" : 77351,
 "Ed Sherod" : 78136,
 "Eddie Lee Wilkins" : 78534,
 "Luther Rackley" : 77894,
 "M.J. Walker" : 1630640,
 "Makhtar N'Diaye" : 1823,
 "Edmund Lawrence" : 77348,
 "Elmer Gainer" : 76769,
 "Em Bryant" : 76289,
 "Luke Jackson" : 77103,
 "Fat Lever" : 77376,
 "Larry Hennessy" : 76998,
 "Logan Vander Velden" : 78403,
 "Louie Dampier" : 76499,
 "Eugene Jeter" : 200817,
 "Fatty Taylor" : 78302,
 "Frankie Brian" : 76250,
 "LaBradford Smith" : 78194,
 "LaRue Martin" : 77475,
 "Fred Scolari" : 78094,
 "Kleggie Hermsen" : 77007,
 "Freddie Boyd" : 76222,
 "Kevin Knox" : 1628995,
 "Kiwane Lemorris Garris" : 1619,
 "Freddie Lewis" : 77379,
 "Gar Heard" : 76985,
 "Kenny Sears" : 78106,
 "Kenton Edelin" : 76637,
 "Kenyon Martin Jr." : 1630231,
 "Geoff Crompton" : 76474,
 "K.J. McDaniels" : 203909,
 "George Bon Salle" : 76203,
 "Glen Rice Jr." : 203318,
 "Goo Kennedy" : 77243,
 "Ha Seung-Jin" : 2775,
 "Johnny O'Bryant" : 203948,
 "Hamady N'Diaye" : 202380,
 "Harry Giles" : 1628385,
 "John Warren" : 78461,
 "Johnny Austin" : 76073,
 "Hawkeye Whitney" : 78519,
 "Hook Dillon" : 76571,
 "Hoot Gibson" : 76811,
 "John Butler" : 1631219,
 "John Logan" : 77403,
 "John Oldham" : 77758,
 "Hot Rod Hundley" : 77082,
 "Joe Hassett" : 76966,
 "Hutch Jones" : 77203,
 "Ibo Kutluay" : 2825,
 "Ike Borsavage" : 76209,
 "Isaac Austin" : 1134,
 "Jo Jo White" : 78510,
 "Isaac Fontaine" : 1829,
 "Isaac Walthour" : 78448,
 "J.J. Anderson" : 76043,
 "Jeff Taylor" : 203106,
 "Jermaine Samuels" : 1631257,
 "Jo Jo English" : 76676,
 "J.J. Hickson" : 201581,
 "Jeff Dowtin" : 1630288,
 "J.J. O'Brien" : 1626266,
 "J.J. Redick" : 200755,
 "J.R. Smith" : 2747,
 "James Ray" : 77920,
 "Jan van Breda Kolff" : 78400,
 "Jeenathan Williams" : 1631466,
 "Charles Jones" : 1869
 }

In [12]:
# Apply the manual overrides, touching only rows the lookup left unresolved.
# Names in id_map are distinct, so the unresolved mask can be taken once up
# front: a row fixed for one name can never match another name.
unresolved = advanced_stats['PLAYER_ID'] == 'no name season pair found'
for player_name, nba_id in id_map.items():
    name_matches = advanced_stats['player'] == player_name
    advanced_stats.loc[name_matches & unresolved, 'PLAYER_ID'] = nba_id

Drop Bobby Watson and Mal McMullen because their names have no associated ID. 

In [13]:
# Remove every row for the two players that have no associated ID.
players_without_id = ['Bobby Watson', 'Mal McMullen']
advanced_stats = advanced_stats[~advanced_stats['player'].isin(players_without_id)]

Check that all PLAYER_IDs listed as 'no name season pair found' have been
fixed. 

In [14]:
# Should display an empty dataframe once every unresolved row has been fixed.
advanced_stats.loc[advanced_stats['PLAYER_ID'] == 'no name season pair found']

Unnamed: 0,PLAYER_ID,seas_id,SEASON_START,player_id,player,birth_year,pos,age,experience,lg,tm,g,mp,per,ts_percent,x3p_ar,f_tr,orb_percent,drb_percent,trb_percent,ast_percent,stl_percent,blk_percent,tov_percent,usg_percent,ows,dws,ws,ws_48,obpm,dbpm,bpm,vorp


Fixing PLAYER_IDs listed as 'player season pair not unique' manually. 

In [15]:
# Disambiguate rows named "Charles Jones" by the dataset's own 'player_id'
# and write the corresponding PLAYER_ID.
name_matches = advanced_stats['player'] == "Charles Jones"
for source_id, nba_id in ((2109, 279), (2163, 77178)):
    advanced_stats.loc[name_matches & (advanced_stats['player_id'] == source_id), 'PLAYER_ID'] = nba_id

In [16]:
# Disambiguate rows named "George Johnson" by the dataset's own 'player_id'
# and write the corresponding PLAYER_ID.
name_matches = advanced_stats['player'] == "George Johnson"
for source_id, nba_id in ((1804, 77148), (1278, 77147), (1448, 77149)):
    advanced_stats.loc[name_matches & (advanced_stats['player_id'] == source_id), 'PLAYER_ID'] = nba_id

In [17]:
# Disambiguate rows named "Charles Smith" by the dataset's own 'player_id'
# and write the corresponding PLAYER_ID.
name_matches = advanced_stats['player'] == "Charles Smith"
for source_id, nba_id in ((2412, 293), (2489, 78179)):
    advanced_stats.loc[name_matches & (advanced_stats['player_id'] == source_id), 'PLAYER_ID'] = nba_id

In [18]:
# Disambiguate rows named "Eddie Johnson" by the dataset's own 'player_id'
# and write the corresponding PLAYER_ID.
name_matches = advanced_stats['player'] == "Eddie Johnson"
for source_id, nba_id in ((1993, 698), (1747, 77144)):
    advanced_stats.loc[name_matches & (advanced_stats['player_id'] == source_id), 'PLAYER_ID'] = nba_id

In [19]:
# Disambiguate rows named "Chris Johnson" by the dataset's own 'player_id'
# and write the corresponding PLAYER_ID.
name_matches = advanced_stats['player'] == "Chris Johnson"
for source_id, nba_id in ((4075, 203187), (3921, 202419)):
    advanced_stats.loc[name_matches & (advanced_stats['player_id'] == source_id), 'PLAYER_ID'] = nba_id

In [20]:
# Disambiguate rows named "Marcus Williams" by the dataset's own 'player_id'
# and write the corresponding PLAYER_ID.
name_matches = advanced_stats['player'] == "Marcus Williams"
for source_id, nba_id in ((3691, 200766), (3777, 201173)):
    advanced_stats.loc[name_matches & (advanced_stats['player_id'] == source_id), 'PLAYER_ID'] = nba_id

In [21]:
# Disambiguate rows named "Michael Smith" by the dataset's own 'player_id'
# and write the corresponding PLAYER_ID.
name_matches = advanced_stats['player'] == "Michael Smith"
for source_id, nba_id in ((2879, 63), (2527, 78197)):
    advanced_stats.loc[name_matches & (advanced_stats['player_id'] == source_id), 'PLAYER_ID'] = nba_id


In [22]:
# Disambiguate rows named "Tony Mitchell" by the dataset's own 'player_id'
# and write the corresponding PLAYER_ID.
name_matches = advanced_stats['player'] == "Tony Mitchell"
for source_id, nba_id in ((4210, 203502), (4211, 203183)):
    advanced_stats.loc[name_matches & (advanced_stats['player_id'] == source_id), 'PLAYER_ID'] = nba_id

Check that all PLAYER_IDs listed as 'player season pair not unique' have been 
fixed. 

In [23]:
# Should display an empty dataframe once every ambiguous row has been fixed.
advanced_stats.loc[advanced_stats['PLAYER_ID'] == 'player season pair not unique']

Unnamed: 0,PLAYER_ID,seas_id,SEASON_START,player_id,player,birth_year,pos,age,experience,lg,tm,g,mp,per,ts_percent,x3p_ar,f_tr,orb_percent,drb_percent,trb_percent,ast_percent,stl_percent,blk_percent,tov_percent,usg_percent,ows,dws,ws,ws_48,obpm,dbpm,bpm,vorp


Cleaning up columns. 

In [24]:
# Drop the columns that are no longer needed now that PLAYER_ID is resolved.
# list.remove raises ValueError if an expected column is missing.
columns = advanced_stats.columns.tolist()
for unwanted in ('seas_id', 'player_id', 'birth_year'):
    columns.remove(unwanted)
advanced_stats = advanced_stats.loc[:, columns]

In [25]:
# Write the cleaned table next to this notebook, without the dataframe index.
advanced_stats.to_csv(
    path_or_buf='advanced_stats.csv',
    index=False,
)