In [247]:
# Add the post series winrate column to the teams.csv
# NOTE(review): everything below is disabled by wrapping it in a string literal.
# Because that string is the last expression of the cell, it is echoed as the
# cell's output. The opening delimiter has four quotes, so the string itself
# starts with a literal `"` character.
# `teams_post` is not defined in any cell visible here - presumably it was a
# post-season table loaded elsewhere; confirm before re-enabling this code.
""""
teams['post_winrate'] = None

for index, row in teams_post.iterrows():
    team_id = row['tmID']
    year = row['year']
    winrate = int((row['W'] / (row['W'] + row['L'])) * 100)
    teams.loc[(teams['tmID'] == team_id) & (teams['year'] == year), 'post_winrate'] = winrate
"""

'"\nteams[\'post_winrate\'] = None\n\nfor index, row in teams_post.iterrows():\n    team_id = row[\'tmID\']\n    year = row[\'year\']\n    winrate = int((row[\'W\'] / (row[\'W\'] + row[\'L\'])) * 100)\n    teams.loc[(teams[\'tmID\'] == team_id) & (teams[\'year\'] == year), \'post_winrate\'] = winrate\n'

In [248]:
import pandas as pd

# Load the cleaned input tables (read in the order listed, one frame per file).
(
    coaches,
    players_teams,
    awards_players,
    teams,
    players,
) = map(
    pd.read_csv,
    (
        'data/clean/cleaned_coaches.csv',
        'data/clean/cleaned_players_teams.csv',
        'data/clean/cleaned_awards_players.csv',
        'data/clean/cleaned_teams.csv',
        'data/clean/cleaned_players.csv',
    ),
)

### Feature Engineering: Coach Experience & Awards

#### Coach Winrate

In [249]:
# Add the winrate to the coaches
#
# `winrate` is the integer win percentage (0-100, truncated toward zero) of
# each coach row, computed column-wise from that row's own `won` / `lost`.
#
# Why not the row-by-row loop with a (coachID, year) mask:
#   * a coach with several rows in the same year had every one of those rows
#     overwritten with the value of the last row visited;
#   * a row with no games (won + lost == 0) made int(nan) raise;
#   * initialising the column with None left it with object dtype.
games_coached = coaches['won'] + coaches['lost']

# Rows without any games get a NaN denominator here, so the division yields
# NaN instead of a 0/0 warning; those rows are then reported as 0.
coaches['winrate'] = (
    (coaches['won'] / games_coached.where(games_coached > 0)) * 100
).fillna(0).astype(int)

#### Total Awards for each Coach and Player

In [250]:
# Count the awards per (playerID, year) once, instead of rescanning the
# players/coaches frames for every single award row.
award_counts = (
    awards_players.groupby(['playerID', 'year'])
    .size()
    .reset_index(name='TotalAwards')
)


def _awards_per_row(frame, id_column):
    """Return, for every row of `frame`, the number of awards in `awards_players`
    whose `playerID` equals the row's `id_column` and whose `year` equals the
    row's `year` (0 when there is none).

    The result is a positional NumPy array aligned with `frame`'s rows.
    """
    # A left merge keeps the rows of `frame` in order, and `award_counts` has
    # exactly one row per key, so no row of `frame` is duplicated.
    matched = frame[[id_column, 'year']].merge(
        award_counts.rename(columns={'playerID': id_column}),
        on=[id_column, 'year'],
        how='left',
    )
    return matched['TotalAwards'].fillna(0).astype(int).to_numpy()


# awards_players stores the recipient in `playerID`; the same column is matched
# against `coachID` to pick up the awards that belong to coaches.
players_teams['TotalAwards'] = _awards_per_row(players_teams, 'playerID')
coaches['TotalAwards'] = _awards_per_row(coaches, 'coachID')


#### Coach Experience

In [251]:
def calculate_coach_experience_for_team(coaches, team_id, year):
    """Score the prior experience of the coaching staff of one team-season.

    For every coach row of `team_id` in `year`, the coach's rows from earlier
    years are ranked most-recent-first and their `winrate` values are weighted
    with `year, year - 1, ...`; the coach's `TotalAwards` over those rows are
    added on top. Each coach's score is then weighted by the share of the
    team's games (`won + lost`) that this coach row accounts for.

    Parameters:
        coaches: DataFrame with `coachID`, `tmID`, `year`, `won`, `lost`,
            `winrate` and `TotalAwards` columns.
        team_id: value matched against `tmID`.
        year: season matched against `year` (must be usable by `range`).

    Returns:
        The games-weighted sum of the coaches' scores; 0 when the team has no
        coach rows for that year.
    """
    staff = coaches[(coaches['tmID'] == team_id) & (coaches['year'] == year)]
    games_by_staff = staff['won'].sum() + staff['lost'].sum()

    experience = 0

    for _, stint in staff.iterrows():
        same_coach_before = (coaches['coachID'] == stint['coachID']) & (coaches['year'] < year)
        past = coaches[same_coach_before].sort_values(by='year', ascending=False).head(year)

        # Most recent past row gets weight `year`, the next one `year - 1`, ...
        recency_weights = list(range(year, 0, -1))
        weighted_winrate = sum(past['winrate'] * recency_weights[:len(past)])
        score = weighted_winrate + past['TotalAwards'].sum()

        # Share of the team's games coached in this row (0 if no games at all).
        if games_by_staff > 0:
            share = (stint['won'] + stint['lost']) / games_by_staff
        else:
            share = 0
        experience += score * share

    return experience

def _coach_experience_for_row(team_row):
    """Coach experience for the team-season described by one row of `teams`."""
    return calculate_coach_experience_for_team(coaches, team_row['tmID'], team_row['year'])


teams['coach_experience'] = teams.apply(_coach_experience_for_row, axis=1)



### Feature Engineering: Team Yearly Stats


#### Teams Stats Calculation based on the Players Stats

In [252]:

# Per-player columns of `players_teams` that are summed over a roster and
# stored on the team row as `player_total_<column>` (in this order).
_PLAYER_SUM_COLUMNS = [
    'GP', 'GS', 'points', 'oRebounds', 'dRebounds', 'rebounds', 'assists',
    'steals', 'blocks', 'turnovers', 'PF', 'fgAttempted', 'fgMade',
    'ftAttempted', 'ftMade', 'threeAttempted', 'threeMade', 'dq',
    'PostGP', 'PostGS', 'PostMinutes', 'PostPoints', 'PostoRebounds',
    'PostdRebounds', 'PostRebounds', 'PostAssists', 'PostSteals', 'PostBlocks',
    'PostTurnovers', 'PostPF', 'PostfgAttempted', 'PostfgMade',
    'PostftAttempted', 'PostftMade', 'PostthreeAttempted', 'PostthreeMade',
    'PostDQ',
]


def calculate_team_year_stats(team_id, year):
    """Aggregate the roster of one team-season onto its row(s) in `teams`.

    Reads the module-level `players_teams` and `players` frames and writes to
    the module-level `teams` frame in place:

    * `player_average_height` / `player_average_weight`: mean `height` and
      `weight` of the `players` rows whose `bioID` is on the roster;
    * `player_total_<stat>`: sum of each column in `_PLAYER_SUM_COLUMNS`;
    * `player_total_awards`: sum of the roster's `TotalAwards`.

    Parameters:
        team_id: value matched against `tmID`.
        year: value matched against `year`.
    """
    # Rows of `teams` to update - computed once, reused for every column.
    team_rows = (teams['tmID'] == team_id) & (teams['year'] == year)

    # Select player ids for the team for that year
    team_players = players_teams[(players_teams['tmID'] == team_id) & (players_teams['year'] == year)]
    team_players_bio = players[players['bioID'].isin(team_players['playerID'])]

    teams.loc[team_rows, 'player_average_height'] = team_players_bio['height'].mean()
    teams.loc[team_rows, 'player_average_weight'] = team_players_bio['weight'].mean()

    for stat in _PLAYER_SUM_COLUMNS:
        teams.loc[team_rows, f'player_total_{stat}'] = team_players[stat].sum()

    # The awards total keeps its historical column name rather than
    # `player_total_TotalAwards`.
    teams.loc[team_rows, 'player_total_awards'] = team_players['TotalAwards'].sum()
    

# apply to all teams
# The (team, year) pairs are materialised up front because each call adds
# columns to `teams` while we are walking over it.
team_seasons = list(zip(teams['tmID'], teams['year']))
for team_id, season in team_seasons:
    calculate_team_year_stats(team_id, season)

### Feature Engineering: Team Performance Metrics

In [253]:
def add_engineered_features(teams):
    """Add season-level ratio features to `teams` and return the same frame.

    The columns are written onto the frame that is passed in (no copy is made):

    * `winrate` = won / GP, plus home / away / conference win rates computed
      as wins / (wins + losses);
    * offensive (`o_*`) and allowed (`d_*`) field-goal, free-throw and
      three-point percentages (made / attempted);
    * offensive and defensive rebound shares of total rebounds for both sides;
    * `pts_diff` = points scored minus points allowed.

    Parameters:
        teams: DataFrame holding the raw team totals used above.

    Returns:
        The `teams` frame that was passed in, with the new columns added.
    """
    def win_share(wins, losses):
        # wins / (wins + losses) for one pair of columns
        return teams[wins] / (teams[wins] + teams[losses])

    # Season performance
    teams['winrate'] = teams['won'] / teams['GP']
    teams['home_winrate'] = win_share('homeW', 'homeL')
    teams['away_winrate'] = win_share('awayW', 'awayL')
    teams['conf_winrate'] = win_share('confW', 'confL')

    # (new column, numerator column, denominator column)
    ratio_features = (
        # Offensive statistics
        ('o_fg_pct', 'o_fgm', 'o_fga'),
        ('o_ft_pct', 'o_ftm', 'o_fta'),
        ('o_3p_pct', 'o_3pm', 'o_3pa'),
        ('o_oreb_pct', 'o_oreb', 'o_reb'),
        ('o_dreb_pct', 'o_dreb', 'o_reb'),
        # Defensive statistics (values allowed to opponents)
        ('d_fg_pct', 'd_fgm', 'd_fga'),
        ('d_ft_pct', 'd_ftm', 'd_fta'),
        ('d_3p_pct', 'd_3pm', 'd_3pa'),
        ('d_oreb_pct', 'd_oreb', 'd_reb'),
        ('d_dreb_pct', 'd_dreb', 'd_reb'),
    )
    for feature, numerator, denominator in ratio_features:
        teams[feature] = teams[numerator] / teams[denominator]

    # Points difference
    teams['pts_diff'] = teams['o_pts'] - teams['d_pts']

    return teams

# add_engineered_features writes the new columns onto the frame it receives and
# returns that same frame, so this rebinding keeps `teams` pointing at it.
teams = add_engineered_features(teams)

### Feature Engineering: Rookie Players Stats

In [254]:
# First year in which each player appears in players_teams.
rookie_players = (
    players_teams.groupby('playerID')['year']
    .min()
    .rename('rookie_year')
    .reset_index()
)

# Same rows as players_teams, plus each player's `rookie_year`.
players_teams_with_rookies = players_teams.merge(rookie_players, on='playerID', how='left')

def calculate_average_rookie_stats(year, players_teams_with_rookies, players_teams):
    """Append a synthetic "average rookie" row for `year` to `players_teams`.

    The rookie pool is:
      * for year 2: every row of year 1;
      * otherwise: rows from years before `year`, excluding year 1, in which
        the player is in their `rookie_year`.

    Every column except `playerID`, `year`, `stint`, `tmID` and `rookie_year`
    is averaged over the pool and truncated to an integer. The resulting row
    gets `playerID = "average_rookie_<year>"`, `year = year`, `stint = 0`, and
    no `tmID` (so that column is left missing after the concat).

    Rows whose `playerID` starts with "average_rookie_" are themselves
    synthetic. They are kept out of the rookie pool, and a pre-existing row for
    the same `year` is replaced rather than duplicated. This makes the function
    safe to call on frames that already went through it (for example data
    reloaded from a previously saved file).

    Parameters:
        year: season the synthetic row is built for.
        players_teams_with_rookies: player-season rows with a `rookie_year` column.
        players_teams: frame the synthetic row is appended to (not modified).

    Returns:
        A new frame with a fresh index: `players_teams` plus the synthetic row,
        or `players_teams` without it when the rookie pool is empty.
    """
    synthetic_prefix = "average_rookie_"
    synthetic_id = f"{synthetic_prefix}{year}"

    # Only real players may contribute to the averages.
    is_synthetic = (
        players_teams_with_rookies['playerID'].astype(str).str.startswith(synthetic_prefix)
    )
    candidates = players_teams_with_rookies[~is_synthetic]

    # For the year 2, the rookies are all the players in the year 1
    if year == 2:
        rookie_stats = candidates[candidates['year'] == 1]
    else:
        # For the other years, the rookies are the players that are in their
        # rookie year excluding the year 1
        rookie_stats = candidates[
            (candidates['year'] < year)
            & (candidates['year'] != 1)
            & (candidates['year'] == candidates['rookie_year'])
        ]

    # Replace, rather than duplicate, a row left behind by an earlier call.
    players_teams = players_teams[players_teams['playerID'].astype(str) != synthetic_id]

    # Averaging an empty pool gives NaN, which cannot be cast to int.
    if rookie_stats.empty:
        return players_teams.reset_index(drop=True)

    excluded_columns = ['playerID', 'year', 'stint', 'tmID', 'rookie_year']
    rookie_avg_stats = rookie_stats.drop(columns=excluded_columns).mean().astype(int)

    average_rookie = pd.DataFrame([rookie_avg_stats])
    average_rookie['playerID'] = synthetic_id
    average_rookie['year'] = year
    average_rookie['stint'] = 0

    average_rookie = average_rookie[['playerID', 'year', 'stint'] + list(rookie_avg_stats.index)]

    return pd.concat([players_teams, average_rookie], ignore_index=True)

def process_rookie_stats(players_teams, players_teams_with_rookies, first_year=2, last_year=10):
    """Append one "average rookie" row per season to `players_teams`.

    Calls `calculate_average_rookie_stats` for every year from `first_year`
    to `last_year` (both inclusive), feeding each result into the next call.

    Parameters:
        players_teams: frame the synthetic rows are appended to.
        players_teams_with_rookies: player-season rows with a `rookie_year` column.
        first_year: first season to build a row for (default 2).
        last_year: last season to build a row for (default 10).

    Returns:
        The frame returned by the last `calculate_average_rookie_stats` call,
        or `players_teams` unchanged when the year range is empty.
    """
    for year in range(first_year, last_year + 1):
        players_teams = calculate_average_rookie_stats(year, players_teams_with_rookies, players_teams)
    return players_teams


# Rebind players_teams to the frame that also holds one synthetic
# "average_rookie_<year>" row for each of the years 2-10.
players_teams = process_rookie_stats(players_teams, players_teams_with_rookies)


# Get the rookies in a year
def get_rookies_in_year(year):
    """Return the rows of `players_teams_with_rookies` that belong to `year`
    and in which the player is in their rookie year."""
    frame = players_teams_with_rookies
    in_requested_year = frame['year'] == year
    is_debut_season = frame['year'] == frame['rookie_year']
    return frame[in_requested_year & is_debut_season]

# Sanity check: number of player rows of year 10 that are in their rookie year.
print(len(get_rookies_in_year(10)))


25


### Feature Engineering: Rookie Team Stats

In [255]:
# Build one synthetic "average rookie team" row: the mean of every franchise's
# first recorded season, appended to `teams` with year 0.
#
# A synthetic row left over from an earlier run (it is written to the saved
# teams file) is removed first. Otherwise it would count as the debut season of
# its own franchise, be averaged into the new row, and be appended again.
teams = teams[teams['tmID'] != "average_rookie_team"].reset_index(drop=True)

# First recorded season of every franchise.
rookie_teams = teams.groupby('franchID')['year'].min().reset_index()
rookie_teams.columns = ['franchID', 'rookie_year']

print(rookie_teams)
rookie_team_stats = pd.merge(teams, rookie_teams, on='franchID', how='left')
rookie_stats = rookie_team_stats[rookie_team_stats['year'] == rookie_team_stats['rookie_year']]

# Identifiers and outcome columns are not averaged.
excluded_columns = ['tmID', 'franchID', 'confID', 'year', 'rank', 'playoff','rookie_year']
rookie_stats_cleaned = rookie_stats.drop(columns=excluded_columns)

average_rookie_team_stats = rookie_stats_cleaned.mean(numeric_only=True).astype(float)

average_rookie_team = pd.DataFrame([average_rookie_team_stats])
average_rookie_team['year'] = 0  
average_rookie_team['tmID'] = "average_rookie_team"
average_rookie_team['franchID'] = "average_rookie_franch"

# Identifier columns first, then the averaged stats in their original order
# (non-numeric columns were dropped by the mean and are skipped here).
valid_columns = ['tmID', 'franchID', 'year'] + [
    col for col in rookie_stats_cleaned.columns if col in average_rookie_team.columns
]

average_rookie_team = average_rookie_team[valid_columns]

teams = pd.concat([teams, average_rookie_team], ignore_index=True)

   franchID  rookie_year
0       ATL            9
1       CHA            1
2       CHI            7
3       CLE            1
4       CON            1
5       DET            1
6       HOU            1
7       IND            1
8       LAS            1
9       MIA            1
10      MIN            1
11      NYL            1
12      PHO            1
13      POR            1
14      SAC            1
15      SAS            1
16      SEA            1
17      WAS            1


### Save the Data


In [256]:
# Save the data
# NOTE(review): these are the same paths the notebook reads its inputs from,
# so the engineered columns and the synthetic "average_rookie_*" rows are
# written back over the source files, and a second run starts from already
# augmented data. Consider writing to a separate location - confirm which
# files downstream notebooks expect before changing the paths.
teams.to_csv('data/clean/cleaned_teams.csv', index=False)
coaches.to_csv('data/clean/cleaned_coaches.csv', index=False)
players_teams.to_csv('data/clean/cleaned_players_teams.csv', index=False)