In [None]:
import pandas as pd
import matplotlib.pyplot as plt

teams = pd.read_csv('data/original/teams.csv')
players = pd.read_csv('data/original/players.csv')
coaches = pd.read_csv('data/original/coaches.csv')
players_teams = pd.read_csv('data/original/players_teams.csv')
series_post = pd.read_csv('data/original/series_post.csv')
awards_players = pd.read_csv('data/original/awards_players.csv')
teams_post = pd.read_csv('data/original/teams_post.csv')

In [None]:
# Function to get columns with only one value
def get_empty_columns(data):
    for column in data.columns:
        if data[column].nunique() == 1:
            print("Column with " + str(data[column].nunique()) + " unique values: " + column)

## Cleaning teams.csv

In [47]:
get_empty_columns(teams)

Column with 1 unique values: lgID
Column with 1 unique values: seeded
Column with 1 unique values: tmORB
Column with 1 unique values: tmDRB
Column with 1 unique values: tmTRB
Column with 1 unique values: opptmORB
Column with 1 unique values: opptmDRB
Column with 1 unique values: opptmTRB


In [48]:
# Drop the columns that are not needed

teams = teams.drop(columns=['lgID', 'divID', 'seeded', 'arena', 'name', 'tmORB', 'tmDRB', 'tmTRB', 'opptmORB', 'opptmDRB', 'opptmTRB', 'attend'])


In [49]:
# Add the post series winrate column to the teams.csv
""""
teams['post_winrate'] = None

for index, row in teams_post.iterrows():
    team_id = row['tmID']
    year = row['year']
    winrate = int((row['W'] / (row['W'] + row['L'])) * 100)
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'post_winrate'] = winrate
"""

'"\nteams[\'post_winrate\'] = None\n\nfor index, row in teams_post.iterrows():\n    team_id = row[\'tmID\']\n    year = row[\'year\']\n    winrate = int((row[\'W\'] / (row[\'W\'] + row[\'L\'])) * 100)\n    teams.loc[(teams[\'tmID\'] == team_id) & (teams[\'year\'] == year), \'post_winrate\'] = winrate\n'

In [50]:

def calculate_team_year_stats(team_id, year):
    team = teams[teams['tmID'] == team_id]
    
    # Select player ids for the team for that year
    team_players = players_teams[(players_teams['tmID'] == team_id) & (players_teams['year'] == year)]
    team_players_bio = players[(players['bioID'].isin(team_players['playerID']))]
    
    # Add up all the stats in a team_players and add each to a different column in the team dataframe
    
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_average_height'] = team_players_bio['height'].mean()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_average_weight'] = team_players_bio['weight'].mean()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_GP'] = team_players['GP'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_GS'] = team_players['GS'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_points'] = team_players['points'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_oRebounds'] = team_players['oRebounds'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_dRebounds'] = team_players['dRebounds'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_rebounds'] = team_players['rebounds'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_assists'] = team_players['assists'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_steals'] = team_players['steals'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_blocks'] = team_players['blocks'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_turnovers'] = team_players['turnovers'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_PF'] = team_players['PF'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_fgAttempted'] = team_players['fgAttempted'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_fgMade'] = team_players['fgMade'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_ftAttempted'] = team_players['ftAttempted'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_ftMade'] = team_players['ftMade'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_threeAttempted'] = team_players['threeAttempted'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_threeMade'] = team_players['threeMade'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_dq'] = team_players['dq'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_PostGP'] = team_players['PostGP'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_PostGS'] = team_players['PostGS'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_PostMinutes'] = team_players['PostMinutes'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_PostPoints'] = team_players['PostPoints'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_PostoRebounds'] = team_players['PostoRebounds'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_PostdRebounds'] = team_players['PostdRebounds'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_PostRebounds'] = team_players['PostRebounds'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_PostAssists'] = team_players['PostAssists'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_PostSteals'] = team_players['PostSteals'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_PostBlocks'] = team_players['PostBlocks'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_PostTurnovers'] = team_players['PostTurnovers'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_PostPF'] = team_players['PostPF'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_PostfgAttempted'] = team_players['PostfgAttempted'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_PostfgMade'] = team_players['PostfgMade'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_PostftAttempted'] = team_players['PostftAttempted'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_PostftMade'] = team_players['PostftMade'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_PostthreeAttempted'] = team_players['PostthreeAttempted'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_PostthreeMade'] = team_players['PostthreeMade'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_PostDQ'] = team_players['PostDQ'].sum()
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'player_total_awards'] = team_players['TotalAwards'].sum()
    

    

## Cleaning players.csv

In [51]:
get_empty_columns(players)

Column with 1 unique values: firstseason
Column with 1 unique values: lastseason


In [52]:
# Drop the columns that are not needed

players = players.drop(columns=['firstseason', 'lastseason', 'college', 'collegeOther', 'deathDate'])


In [53]:
def convert_height_to_cm(height):
    height = int(height)
    return int(height * 2.54)

def convert_weight_to_kg(weight):
    weight = int(weight)
    return int(weight * 0.453592)

players['height'] = players['height'].apply(convert_height_to_cm)
players['weight'] = players['weight'].apply(convert_weight_to_kg)

In [54]:
# check if any ids from players are not in players_teams and drop them from players if their weight is 0

players_ids = players['bioID'].unique()
players_teams_ids = players_teams['playerID'].unique()

for player_id in players_ids:
    if player_id not in players_teams_ids:
        # Drop only if weight and height are 0 and birthDate is 0000-00-00
        if players.loc[players['bioID'] == player_id, 'weight'].values[0] == 0 and players.loc[players['bioID'] == player_id, 'height'].values[0] == 0 and players.loc[players['bioID'] == player_id, 'birthDate'].values[0] == '0000-00-00':
            players = players[players.bioID != player_id]
    

In [55]:
# Put the Outliers to 0

# Put the height values with <150 cm to 0

players.loc[players['height'] < 150, 'height'] = 0

# Put the weight values with <50 kg to 0

players.loc[players['weight'] < 50, 'weight'] = 0

# Fill the outliers with the average value by position

# Get the average height and weight by position

dict_pos_height = {}
dict_pos_weight = {}
for index, row in players.iterrows():
    if '-' in row['pos']:
        pos = row['pos'].split('-')
        pos1 = pos[0]
        pos2 = pos[1]
        if pos1 not in dict_pos_height:
            dict_pos_height[pos1] = []
            dict_pos_weight[pos1] = []
        if pos2 not in dict_pos_height:
            dict_pos_height[pos2] = []
            dict_pos_weight[pos2] = []
        dict_pos_height[pos1].append(row['height'])
        dict_pos_height[pos2].append(row['height'])
        dict_pos_weight[pos1].append(row['weight'])
        dict_pos_weight[pos2].append(row['weight'])
    else:
        if row['pos'] not in dict_pos_height:
            dict_pos_height[row['pos']] = []
            dict_pos_weight[row['pos']] = []
        dict_pos_height[row['pos']].append(row['height'])
        dict_pos_weight[row['pos']].append(row['weight'])

average_height_by_position = {}

for key in dict_pos_height:
    average_height_by_position[key] = int(sum(dict_pos_height[key]) / len(dict_pos_height[key]))

average_weight_by_position = {}

for key in dict_pos_weight:
    average_weight_by_position[key] = int(sum(dict_pos_weight[key]) / len(dict_pos_weight[key]))


print(average_height_by_position)
print(average_weight_by_position)


# Fill the outliers with the average value by position

for index, row in players.iterrows():
    if row['height'] == 0:
        if '-' in row['pos']:
            pos = row['pos'].split('-')
            pos1 = pos[0]
            pos2 = pos[1]
            players.at[index, 'height'] = int((average_height_by_position[pos1] + average_height_by_position[pos2]) / 2)
        else:
            players.at[index, 'height'] = average_height_by_position[row['pos']]
    if row['weight'] == 0:
        if '-' in row['pos']:
            pos = row['pos'].split('-')
            pos1 = pos[0]
            pos2 = pos[1]
            players.at[index, 'weight'] = int((average_weight_by_position[pos1] + average_weight_by_position[pos2]) / 2)
        else:
            players.at[index, 'weight'] = average_weight_by_position[row['pos']]


{'C': 191, 'F': 185, 'G': 174}
{'C': 81, 'F': 74, 'G': 65}


## Remove irrelevant awards 

In [56]:
# Remove sportsmanship awards from awards_players

awards_players = awards_players[awards_players['award'] != 'Kim Perrot Sportsmanship']
awards_players = awards_players[awards_players['award'] != 'Kim Perrot Sportsmanship Award']

###

## Add the awards columns to the players_teams and coaches 

In [57]:
players_teams['TotalAwards'] = 0
coaches['TotalAwards'] = 0


for index, row in awards_players.iterrows():
    player_id = row['playerID']
    award_year = row['year']

    if player_id in players_teams['playerID'].values:
        players_teams.loc[(players_teams['playerID'] == player_id) & (players_teams['year'] == award_year),'TotalAwards'] += 1

    if player_id in coaches['coachID'].values:
        coaches.loc[(coaches['coachID'] == player_id) & (coaches['year'] == award_year),'TotalAwards'] += 1


## Add the winrate to the coaches

In [58]:
# Add the winrate to the coaches

coaches['winrate'] = None

for index, row in coaches.iterrows():
    coach_id = row['coachID']
    year = row['year']
    winrate = int((row['won'] / (row['won'] + row['lost'])) * 100)
    coaches.loc[(coaches['coachID'] == coach_id) & (coaches['year'] == year), 'winrate'] = winrate


## Add coach experience to the teams

In [59]:
def calculate_coach_experience_for_team(coaches, team_id, year):
    team_coaches = coaches[(coaches['tmID'] == team_id) & (coaches['year'] == year)]
    total_games = team_coaches['won'].sum() + team_coaches['lost'].sum()
    
    total_coach_experience = 0
    
    for _, coach in team_coaches.iterrows():
        coach_history = coaches[(coaches['coachID'] == coach['coachID']) & (coaches['year'] < year)]
        coach_history = coach_history.sort_values(by='year', ascending=False).head(year)

        weights = list(range(year, 0, -1)) 
        weighted_winrate = sum(coach_history['winrate'] * weights[:len(coach_history)])
        total_awards = coach_history['TotalAwards'].sum()
        coach_experience = weighted_winrate + total_awards
        
        coach_games = coach['won'] + coach['lost']
        coach_weight = coach_games / total_games if total_games > 0 else 0
        total_coach_experience += coach_experience * coach_weight
    
    return total_coach_experience

teams['coach_experience'] = teams.apply(
    lambda row: calculate_coach_experience_for_team(coaches, row['tmID'], row['year']), axis=1
)



## Cleaning coaches.csv

In [60]:
get_empty_columns(coaches)

Column with 1 unique values: lgID


In [61]:
# Drop the columns that are not needed

coaches = coaches.drop(columns=['lgID'])

## Cleaning players_teams.csv

In [62]:
# Drop the columns that are not needed

players_teams = players_teams.drop(columns=['lgID'])

In [63]:
rookie_players = players_teams.groupby('playerID')['year'].min().reset_index()

rookie_players.columns = ['playerID', 'rookie_year']

# Dont use the rookie year = 1
rookie_players = rookie_players[rookie_players['rookie_year'] != 1]

players_teams_with_rookies = pd.merge(players_teams, rookie_players, on='playerID', how='left')


rookie_stats = players_teams_with_rookies[players_teams_with_rookies['year'] == players_teams_with_rookies['rookie_year']]


excluded_columns = ['playerID', 'year', 'stint', 'tmID', 'rookie_year']
rookie_avg_stats = rookie_stats.drop(columns=excluded_columns).mean().astype(int)

average_rookie = pd.DataFrame([rookie_avg_stats])

average_rookie['playerID'] = "average_rookie"
average_rookie['year'] = 0
average_rookie['stint'] = 0

average_rookie = average_rookie[['playerID', 'year', 'stint'] + list(rookie_avg_stats.index)]

players_teams= pd.concat([players_teams, average_rookie], ignore_index=True)



In [64]:
# apply to all teams
for index, row in teams.iterrows():
    calculate_team_year_stats(row['tmID'], row['year'])

In [65]:
teams.to_csv('data/clean/cleaned_teams.csv', index=False)
players.to_csv('data/clean/cleaned_players.csv', index=False)
coaches.to_csv('data/clean/cleaned_coaches.csv', index=False)
players_teams.to_csv('data/clean/cleaned_players_teams.csv', index=False)
awards_players.to_csv('data/clean/cleaned_awards_players.csv', index=False)