## Data Preparation


In [127]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import os
from scipy.stats import pointbiserialr
from scipy.stats import pearsonr

#include utils directory
import sys
sys.path.append('..')

from utils.files import *
DATA_PATH = os.path.join('..', 'data')

from utils.metrics import *

In [128]:
def getTeams():
    """Return the team records of the main and competition datasets as one frame.

    Reads the ``DATA_TEAMS`` CSV from ``DATA_PATH`` and from its
    ``competition`` sub-directory (in that order) and stacks the two tables
    under a fresh 0..n-1 index.
    """
    csv_paths = (
        os.path.join(DATA_PATH, DATA_TEAMS),
        os.path.join(DATA_PATH, 'competition', DATA_TEAMS),
    )
    return pd.concat([pd.read_csv(path) for path in csv_paths], ignore_index=True)

def getPlayersTeams():
    """Return the player-team records of the main and competition datasets as one frame.

    Reads the ``DATA_PLAYERS_TEAMS`` CSV from ``DATA_PATH`` and from its
    ``competition`` sub-directory (in that order) and stacks the two tables
    under a fresh 0..n-1 index.
    """
    csv_paths = (
        os.path.join(DATA_PATH, DATA_PLAYERS_TEAMS),
        os.path.join(DATA_PATH, 'competition', DATA_PLAYERS_TEAMS),
    )
    return pd.concat([pd.read_csv(path) for path in csv_paths], ignore_index=True)

def getCoaches():
    """Return the coach records of the main and competition datasets as one frame.

    Reads the ``DATA_COACHES`` CSV from ``DATA_PATH`` and from its
    ``competition`` sub-directory (in that order) and stacks the two tables
    under a fresh 0..n-1 index.
    """
    csv_paths = (
        os.path.join(DATA_PATH, DATA_COACHES),
        os.path.join(DATA_PATH, 'competition', DATA_COACHES),
    )
    return pd.concat([pd.read_csv(path) for path in csv_paths], ignore_index=True)

### Players


In [129]:
# Players
# Build one row per team-season carrying the mean PER / EFF of the players
# that team fielded in the PREVIOUS season.

# Players with fewer minutes than this in a season are left out of the averages.
MIN_MINUTES_PLAYED = 40
# Team-seasons without previous-season data are filled with this quantile of
# the observed values (a low-end default instead of the mean).
MISSING_FILL_QUANTILE = 0.20

players_teams_df = getPlayersTeams()
pt_df = preparePlayersTeamsDf(players_teams_df)

# Key columns only; getPer / getEFF add the PER and EFF columns to this frame.
new_pt_df = pt_df[['playerID', 'year', 'tmID']].copy()

teams_df = getTeams()
teams_df = prepareTeamsDf(teams_df)

getPer(new_pt_df, pt_df, teams_df) #defined in metrics.py
getEFF(new_pt_df, pt_df) #defined in metrics.py

display(new_pt_df)

# drop players with less than MIN_MINUTES_PLAYED minutes played
print(new_pt_df.shape)
# Align on the index explicitly so the mask always matches new_pt_df row for row.
minutes_played = pt_df['minutes'].reindex(new_pt_df.index)
new_pt_df = new_pt_df[minutes_played >= MIN_MINUTES_PLAYED]
print(new_pt_df.shape)

# get the average stats for players the previous year
merged_df = teams_df[['year', 'tmID', 'playoff', 'confID']].copy()
for index, row in merged_df.iterrows():
    # Previous-season roster of this team; selected once and reused for both metrics.
    prev_roster = new_pt_df[(new_pt_df['year'] == row['year'] - 1) & (new_pt_df['tmID'] == row['tmID'])]
    # An empty roster yields NaN, which is filled below.
    merged_df.loc[index, 'per'] = prev_roster['PER'].mean()
    merged_df.loc[index, 'eff'] = prev_roster['EFF'].mean()

# replace missing per / eff with the MISSING_FILL_QUANTILE quantile of the column
for metric in ('per', 'eff'):
    merged_df[metric] = merged_df[metric].fillna(merged_df[metric].quantile(MISSING_FILL_QUANTILE))

  pt_df.at[index, 'factor'] = (2 / 3) - (0.5 * (lg_asts / lg_fgm)) / (2 * (lg_fgm / lg_ftMade))
  pt_df.at[index, 'vop'] = lg_points / (lg_fga - lg_oRebounds + lg_turnovers + 0.44 * lg_ftAttempted)
  pt_df.at[index, 'drb'] = (lg_rebounds - lg_oRebounds) / lg_rebounds


Unnamed: 0,playerID,year,tmID,PER,EFF
0,abrossv01w,2,MIN,24.671154,0.379433
1,abrossv01w,3,MIN,17.144159,0.277019
2,abrossv01w,4,MIN,23.437776,0.368177
3,abrossv01w,5,MIN,11.492671,0.304348
4,abrossv01w,6,MIN,18.800674,0.332046
...,...,...,...,...,...
2019,wrighmo01w,11,MIN,,
2020,wrighta01w,11,SEA,,
2021,youngso01w,11,SAS,,
2022,youngta01w,11,CHI,,


(2024, 5)
(1690, 5)


In [130]:
# Load the players table (the DATA_PLAYERS CSV under DATA_PATH).
players_df = pd.read_csv(
    os.path.join(DATA_PATH, DATA_PLAYERS)
)

### Awards

In [131]:
# Count, for every team-season, the awards won by that team's players and
# coaches in all earlier seasons, and store it as the `awardCount` feature.
coaches_df = getCoaches()
awards_df = pd.read_csv(os.path.join(DATA_PATH, DATA_AWARDS))

# Remove irrelevant awards
is_kim_perrot = awards_df['award'].str.startswith('Kim Perrot', na=False)
awards_df = awards_df[~is_kim_perrot].copy()

# Attach a team to every award: the recipient's team in the award year.
# The recipient is looked up as a player first and as a coach second; when
# neither lookup matches, the award is reported and tagged with 'None'.
for index, row in awards_df.iterrows():
    player_teams = players_teams_df.loc[
        (players_teams_df['playerID'] == row['playerID']) & (players_teams_df['year'] == row['year']),
        'tmID',
    ]
    if not player_teams.empty:
        team_id = player_teams.iloc[0]
    else:
        # Award ids are stored in the `playerID` column even for coaches.
        coach_teams = coaches_df.loc[
            (coaches_df['coachID'] == row['playerID']) & (coaches_df['year'] == row['year']),
            'tmID',
        ]
        if not coach_teams.empty:
            team_id = coach_teams.iloc[0]
        else:
            print(f'No team found for player {row["playerID"]} in year {row["year"]}')
            team_id = 'None'
    awards_df.loc[index, 'teamID'] = team_id

# Get the amount of awards in a team: every award up to and including the
# previous season counts (cumulative), so the current season is never used.
for index, row in teams_df[['year', 'tmID']].iterrows():
    earlier_awards = (awards_df['teamID'] == row['tmID']) & (awards_df['year'] <= row['year'] - 1)
    teams_df.loc[index, 'awardCount'] = int(earlier_awards.sum())

# Index-aligned copy: merged_df was derived from teams_df and shares its index.
merged_df['awardCount'] = teams_df['awardCount']

No team found for player coopecy01w in year 7
No team found for player boltoru01w in year 7
No team found for player weathte01w in year 7


### Teams

In [132]:
# get team conversion rate: the share of last season's roster entries that
# are no longer on the same team this season (0 = nobody left, 1 = everyone left)
teams_df['yearPrev'] = teams_df['year'] - 1
# Start from NaN so the column always exists and re-running the cell cannot
# keep stale values; teams without a previous-season roster stay NaN.
teams_df['teamChange'] = float('nan')

for index, row in teams_df[['year', 'tmID', 'yearPrev']].iterrows():
    on_team = players_teams_df['tmID'] == row['tmID']
    prevPlayers = players_teams_df.loc[on_team & (players_teams_df['year'] == row['yearPrev']), 'playerID']
    if prevPlayers.empty:
        continue

    # Current roster is selected once per team-season instead of once per player.
    currPlayers = players_teams_df.loc[on_team & (players_teams_df['year'] == row['year']), 'playerID']
    departed = ~prevPlayers.isin(currPlayers)
    teams_df.loc[index, 'teamChange'] = departed.sum() / len(prevPlayers)

# Index-aligned copy: merged_df and teams_df are expected to share their index.
merged_df['teamChange'] = teams_df['teamChange']

# replace missing with avg
merged_df['teamChange'] = merged_df['teamChange'].fillna(merged_df['teamChange'].mean())

In [133]:
players_df = pd.read_csv(os.path.join(DATA_PATH, DATA_PLAYERS))

#get average eff per college
getEFF(players_teams_df, players_teams_df) #defined in metrics.py

# College of every player-season row, looked up through the player's bioID
# (first matching players_df row wins). Players that are missing from
# players_df, or that have no college recorded, get NaN.
college_by_player = players_df.drop_duplicates(subset='bioID').set_index('bioID')['college']
row_college = players_teams_df['playerID'].map(college_by_player)

# One row per college: mean EFF of its player-seasons and how many there are.
# The mean skips NaN EFF values, so a single player-season without stats can
# no longer turn the whole college average into NaN.
eff_by_college = players_teams_df['EFF'].groupby(row_college)
college_df = (
    pd.DataFrame({'eff': eff_by_college.mean(), 'count': eff_by_college.size()})
    .rename_axis('college')
    .reset_index()
)

# Only rookie rows (the first season a player appears in players_teams_df)
# receive their college's average; every other row stays NaN.
first_season = players_teams_df.groupby('playerID')['year'].transform('min')
is_rookie_row = players_teams_df['year'] == first_season
college_eff = college_df.set_index('college')['eff']
players_teams_df['collegeEff'] = row_college.map(college_eff).where(is_rookie_row)

print(players_teams_df[['collegeEff']].head())
print(players_teams_df.loc[(players_teams_df['year'] == 10) & (players_teams_df['playerID'] == 'mccouan01w')]['collegeEff'])

# Per team-season mean of the rookies' collegeEff. The column is called
# `effSum` but holds a mean; it is currently not copied into merged_df.
for index, row in teams_df[['year', 'tmID']].iterrows():
    on_roster = (players_teams_df['year'] == row['year']) & (players_teams_df['tmID'] == row['tmID'])
    teams_df.loc[index, 'effSum'] = players_teams_df.loc[on_roster, 'collegeEff'].mean()
#merged_df['effSum'] = teams_df['effSum']
print(merged_df.shape)
merged_df.head()


   collegeEff
0         NaN
1         NaN
2         NaN
3         NaN
4         NaN
1023   NaN
Name: collegeEff, dtype: float64
(154, 8)


Unnamed: 0,year,tmID,playoff,confID,per,eff,awardCount,teamChange
0,9,ATL,N,EA,13.426494,0.296374,0.0,0.469574
1,10,ATL,Y,EA,11.398591,0.331618,0.0,0.642857
2,1,CHA,N,EA,13.426494,0.296374,0.0,0.469574
3,2,CHA,Y,EA,12.23754,0.306456,0.0,0.615385
4,3,CHA,Y,EA,17.75526,0.312389,0.0,0.166667


### Coach Ratings

To measure coach efficiency, we use a simple win-loss ratio:

\begin{aligned}
& \text{Coach Rating} = \frac{\text{Wins}}{\text{Wins} + \text{Losses}}
\end{aligned}


In [134]:
# Attach to every team-season the mean coachWLRatio of the coaches that team
# had in the PREVIOUS season.
coaches_df = getCoaches()
coaches_df = prepareCoachesDf(coaches_df)
# One row per (team, season): average over all coaches the team had that season.
coaches_df = coaches_df.groupby(['tmID', 'year'])['coachWLRatio'].mean().reset_index()

# Shift the ratings forward one season so that joining on `year` pairs each
# team-season with the rating earned in the season before it.
coaches_df['year'] = coaches_df['year'] + 1

print(merged_df.shape)
# Left join keeps every team-season; `validate` makes pandas raise instead of
# silently duplicating rows if a (year, tmID) key were ever repeated on the right.
merged_df = pd.merge(merged_df, coaches_df, on=['year', 'tmID'], how='left', validate='many_to_one')
print(merged_df.shape)

# Team-seasons without a previous-season coach record get the overall mean rating.
merged_df['coachWLRatio'] = merged_df['coachWLRatio'].fillna(merged_df['coachWLRatio'].mean())

(154, 8)
(154, 8)
(154, 9)


In [135]:
# Encode the target as an integer: 1 where playoff is 'Y', 0 for anything else.
merged_df['playoff'] = (merged_df['playoff'] == 'Y') * 1

# The team id is dropped from the final table; rows that still contain a
# missing value are removed as well.
merged_df = merged_df.drop(columns=['tmID']).dropna(axis='index')

print(merged_df.shape)
merged_df.head()

(154, 8)


Unnamed: 0,year,playoff,confID,per,eff,awardCount,teamChange,coachWLRatio
0,9,0,EA,13.426494,0.296374,0.0,0.469574,1.218513
1,10,1,EA,11.398591,0.331618,0.0,0.642857,0.133333
2,1,0,EA,13.426494,0.296374,0.0,0.469574,1.218513
3,2,1,EA,12.23754,0.306456,0.0,0.615385,0.333333
4,3,1,EA,17.75526,0.312389,0.0,0.166667,1.222222


In [136]:
# Write the final feature table to the DATA_MERGED CSV under DATA_PATH,
# leaving the DataFrame index out of the file.
merged_output_path = os.path.join(DATA_PATH, DATA_MERGED)
merged_df.to_csv(merged_output_path, index=False)

### References

<a id="ref1"></a> [1] Maroun, E. (2012, March 7). Understanding advanced statistics: player efficiency rating. Hardwood Paroxysm. https://web.archive.org/web/20170910105350/https://hardwoodparoxysm.com/2012/03/07/understanding-advanced-statistics-player-efficiency-rating/

<a id="ref2"></a> [2] Calculating PER | Basketball-Reference.com. (n.d.). Basketball-Reference.com. https://www.basketball-reference.com/about/per.html