# Feature Engineering

## 1. Import Libraries and Load Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Load every cleaned table. Paths are relative, so the notebook must be run
# from a directory whose parent contains data/basketballPlayoffs_cleaned/.
# NOTE(review): only players_teams_cleaned is used in the cells visible here;
# the other tables may be needed elsewhere - confirm before removing any.
awards_players_cleaned = pd.read_csv('../data/basketballPlayoffs_cleaned/awards_players_cleaned.csv')
coaches_cleaned = pd.read_csv('../data/basketballPlayoffs_cleaned/coaches_cleaned.csv')
players_cleaned = pd.read_csv('../data/basketballPlayoffs_cleaned/players_cleaned.csv')
players_teams_cleaned = pd.read_csv('../data/basketballPlayoffs_cleaned/players_teams_cleaned.csv')
series_post_cleaned = pd.read_csv('../data/basketballPlayoffs_cleaned/series_post_cleaned.csv')
teams_cleaned = pd.read_csv('../data/basketballPlayoffs_cleaned/teams_cleaned.csv')
teams_post_cleaned = pd.read_csv('../data/basketballPlayoffs_cleaned/teams_post_cleaned.csv')

## 2. Overall Calculation
### 2.1 Stamina Overall Calculation

The `calculate_stamina` function computes each player's overall stamina rating from games played and minutes played (regular season plus `Post*` columns), relative to the mean across all rows.

In [109]:
def calculate_stamina(df):
    """Rate each row's stamina on a 1-10 scale.

    Games (``GP`` + ``PostGP``) and minutes (``minutes`` + ``PostMinutes``)
    are each divided by their mean per row of ``df`` and blended with weights
    0.2 and 0.8. The blend is multiplied by 5, so a row with exactly average
    games and minutes scores 5.0, then clipped to [1, 10] and rounded to one
    decimal.

    Args:
        df: DataFrame with columns ``GP``, ``PostGP``, ``minutes`` and
            ``PostMinutes``.

    Returns:
        A Series aligned with ``df`` holding the stamina rating.
    """
    row_count = len(df)

    # Per-row averages over the whole table (column sums divided by row count).
    avg_games = (df['GP'].sum() + df['PostGP'].sum()) / row_count
    avg_minutes = (df['minutes'].sum() + df['PostMinutes'].sum()) / row_count

    games_played = df['GP'] + df['PostGP']
    minutes_played = df['minutes'] + df['PostMinutes']

    games_term = 0.2 * games_played / avg_games
    minutes_term = 0.8 * minutes_played / avg_minutes

    rating = np.clip((games_term + minutes_term) * 5, 1, 10)
    return rating.round(1)

# Store the stamina rating as a new column; the means inside
# calculate_stamina are taken over every row of the table.
players_teams_cleaned['overallSTAMINA'] = calculate_stamina(players_teams_cleaned)


### 2.2 Defense Overall Calculation

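The `calculate_overall_defense` function computes each player's overall defensive rating from per-game defensive rebounds, steals, and blocks, with small penalties for turnovers, fouls (`PF`), and disqualifications (`dq`), each measured relative to the league-wide per-game rate.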

In [110]:
def calculate_overall_defense(df):
    """Rate each row's defense on a 1-10 scale.

    Every statistic is turned into a per-game rate (regular + ``Post*``
    totals divided by ``GP`` + ``PostGP``) and divided by the league-wide
    per-game rate (column totals divided by total games). The ratios are
    blended as::

        +0.45 dRebounds  +0.20 steals  +0.20 blocks
        -0.05 turnovers  -0.05 PF      -0.05 dq

    The blend is multiplied by 5, clipped to [1, 10] and rounded to one
    decimal.

    If nobody in ``df`` recorded a given statistic (league rate of 0), that
    term contributes 0 instead of dividing by zero and turning every rating
    into NaN. Rows with zero games played still yield NaN, because their
    per-game rates are undefined.

    Args:
        df: DataFrame with ``GP``, ``PostGP`` and the regular/``Post*``
            column pairs for dRebounds, steals, blocks, turnovers, PF and dq.

    Returns:
        A Series aligned with ``df`` holding the defense rating.
    """
    total_games = df['GP'].sum() + df['PostGP'].sum()
    games_played = df['GP'] + df['PostGP']

    def relative_rate(regular_col, post_col):
        """Return the per-game rate divided by the league per-game rate."""
        player_rate = (df[regular_col] + df[post_col]) / games_played
        if total_games > 0:
            league_rate = (df[regular_col].sum() + df[post_col].sum()) / total_games
        else:
            league_rate = 1
        if league_rate == 0:
            # Nobody recorded this stat: no player differs from the league,
            # so the term is 0. NaN (zero-game rows) is kept as NaN.
            return player_rate.where(player_rate.isna(), 0.0)
        return player_rate / league_rate

    overall_defense = (
        (0.45 * relative_rate('dRebounds', 'PostdRebounds')) +
        (0.2 * relative_rate('steals', 'PostSteals')) +
        (0.2 * relative_rate('blocks', 'PostBlocks')) -
        (0.05 * relative_rate('turnovers', 'PostTurnovers')) -
        (0.05 * relative_rate('PF', 'PostPF')) -
        (0.05 * relative_rate('dq', 'PostDQ'))
    )

    overall_defense = np.clip(overall_defense * 5, 1, 10)
    return overall_defense.round(1)

# Store the defense rating as a new column; league rates inside
# calculate_overall_defense are taken over every row of the table.
players_teams_cleaned['overallDEFENSE'] = calculate_overall_defense(players_teams_cleaned)

### 2.3 Offense Overall Calculation

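The `calculate_overall_offense` function computes each player's overall offensive rating from per-game points, assists, field goals made, free throws made, three-pointers made, and offensive rebounds, each measured relative to the league-wide per-game rate.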

In [111]:
def calculate_overall_offense(df):
    """Rate each row's offense on a 1-10 scale.

    Every statistic is turned into a per-game rate (regular + ``Post*``
    totals divided by ``GP`` + ``PostGP``) and divided by the league-wide
    per-game rate (column totals divided by total games). The ratios are
    blended as::

        0.40 points  + 0.25 assists   + 0.10 fgMade
        + 0.10 ftMade + 0.10 threeMade + 0.05 oRebounds

    The blend is multiplied by 5, clipped to [1, 10] and rounded to one
    decimal.

    If nobody in ``df`` recorded a given statistic (league rate of 0), that
    term contributes 0 instead of dividing by zero and turning every rating
    into NaN. Rows with zero games played still yield NaN, because their
    per-game rates are undefined.

    Args:
        df: DataFrame with ``GP``, ``PostGP`` and the regular/``Post*``
            column pairs for points, assists, fgMade, ftMade, threeMade and
            oRebounds.

    Returns:
        A Series aligned with ``df`` holding the offense rating.
    """
    total_games = df['GP'].sum() + df['PostGP'].sum()
    games_played = df['GP'] + df['PostGP']

    def relative_rate(regular_col, post_col):
        """Return the per-game rate divided by the league per-game rate."""
        player_rate = (df[regular_col] + df[post_col]) / games_played
        if total_games > 0:
            league_rate = (df[regular_col].sum() + df[post_col].sum()) / total_games
        else:
            league_rate = 1
        if league_rate == 0:
            # Nobody recorded this stat: no player differs from the league,
            # so the term is 0. NaN (zero-game rows) is kept as NaN.
            return player_rate.where(player_rate.isna(), 0.0)
        return player_rate / league_rate

    overall_offense = (
        (0.4 * relative_rate('points', 'PostPoints')) +
        (0.25 * relative_rate('assists', 'PostAssists')) +
        (0.1 * relative_rate('fgMade', 'PostfgMade')) +
        (0.1 * relative_rate('ftMade', 'PostftMade')) +
        (0.1 * relative_rate('threeMade', 'PostthreeMade')) +
        (0.05 * relative_rate('oRebounds', 'PostoRebounds'))
    )

    overall_offense = np.clip(overall_offense * 5, 1, 10)
    return overall_offense.round(1)

# Store the offense rating as a new column; league rates inside
# calculate_overall_offense are taken over every row of the table.
players_teams_cleaned['overallOFFENSE'] = calculate_overall_offense(players_teams_cleaned)

### 2.3 Overall Combined Calculation

The `calculate_overall_combined` function computes a player's overall rating as a weighted average of their stamina (20%), defense (40%), and offense (40%) scores.

In [131]:
def calculate_overall_combined(df):
    """Blend the three component ratings into a single overall rating.

    The result is ``0.2 * overallSTAMINA + 0.4 * overallDEFENSE +
    0.4 * overallOFFENSE``, clipped to [1, 10] and rounded to one decimal.

    Args:
        df: DataFrame with columns ``overallSTAMINA``, ``overallDEFENSE``
            and ``overallOFFENSE``.

    Returns:
        A Series aligned with ``df`` holding the combined rating.
    """
    stamina_part = 0.2 * df['overallSTAMINA']
    defense_part = 0.4 * df['overallDEFENSE']
    offense_part = 0.4 * df['overallOFFENSE']

    blended = stamina_part + defense_part + offense_part
    return np.clip(blended, 1, 10).round(1)

# Requires the overallSTAMINA / overallDEFENSE / overallOFFENSE columns to
# exist already, so the three rating cells must have been run first.
players_teams_cleaned['OVERALL'] = calculate_overall_combined(players_teams_cleaned)

### 2.4 Overall Efficiency Rating

The `calculate_efficiency_rating` function measures how efficiently players perform in terms of shooting. 

In [None]:
# # Efficiency Rating
# def calculate_efficiency_rating(df):
#     # Calculate field goal, free throw, and three-point percentages
#     df['FG_percentage'] = df['fgMade'] / df['fgAttempted']
#     df['FT_percentage'] = df['ftMade'] / df['ftAttempted']
#     df['3P_percentage'] = df['threeMade'] / df['threeAttempted']
    
#     # Calculate Efficiency Rating as a weighted average
#     df['EfficiencyRating'] = (
#         0.5 * df['FG_percentage'].fillna(0) + 
#         0.3 * df['FT_percentage'].fillna(0) + 
#         0.2 * df['3P_percentage'].fillna(0)
#     ) * 10  # Scaling to a 1-10 range
#     df['EfficiencyRating'] = df['EfficiencyRating'].round(1)
    
#     # Clip to ensure values remain within the desired range
#     df['EfficiencyRating'] = np.clip(df['EfficiencyRating'], 1, 10)
#     return df

# # Apply efficiency rating to the dataframe
# players_teams_cleaned = calculate_efficiency_rating(players_teams_cleaned)

### 2.5 Overall Impact Score

The `calculate_impact_score` function combines Overall Defense and Overall Offense scores to gauge each player’s all-around contribution.

In [None]:
# # Impact Score
# def calculate_impact_score(df):
#     df['ImpactScore'] = (df['overallOFFENSE'] + df['overallDEFENSE']) / 2
#     df['ImpactScore'] = np.clip(df['ImpactScore'], 1, 10)
#     return df

# # Apply impact score to the dataframe
# players_teams_cleaned = calculate_impact_score(players_teams_cleaned)

### 2.6 Overall Experience Factor

The `calculate_experience` function represents the experience level of each player based on the number of seasons played.

In [None]:
# def calculate_experience(df):
#     # Group by playerID and calculate the number of unique years (seasons played) for each player
#     experience_df = df.groupby('playerID').agg(
#         seasons_played=('year', 'nunique'),
#         total_games=('GP', 'sum')
#     ).reset_index()

#     # Normalize experience factor based on seasons and games played
#     max_seasons = experience_df['seasons_played'].max()
#     max_games = experience_df['total_games'].max()
    
#     experience_df['ExperienceFactor'] = (
#         (0.5 * (experience_df['seasons_played'] / max_seasons)) +
#         (0.5 * (experience_df['total_games'] / max_games))
#     ) * 10  # Scale to a 1-10 range
    
#     # Merge back with original data
#     df = df.merge(experience_df[['playerID', 'ExperienceFactor']], on='playerID', how='left')
#     df['ExperienceFactor'] = df['ExperienceFactor'].round(1)  # Optional rounding
    
#     return df

# # Apply the experience calculation to the DataFrame
# players_teams_cleaned = calculate_experience(players_teams_cleaned)

# 3. Export Updated DataFrame with overalls

In [132]:
# Overwrites the same CSV this notebook loaded players_teams_cleaned from, so
# the rating columns are already present the next time the notebook is run.
players_teams_cleaned.to_csv('../data/basketballPlayoffs_cleaned/players_teams_cleaned.csv', index=False)

# 4. OVERALL FINAL

In [149]:
# Mean OVERALL per player across all of that player's rows, rounded to one
# decimal and exposed as the OVERALL_ALL_TIME column.
players_overall_avg = (
    players_teams_cleaned.groupby('playerID')['OVERALL']
    .mean()
    .round(1)
    .reset_index()
    .rename(columns={'OVERALL': 'OVERALL_ALL_TIME'})
)

# Persist the table, then reload it so the in-memory frame matches what is
# stored on disk.
players_overall_avg.to_csv('../data/basketballPlayoffs_cleaned/players_overall_all_time.csv', index=False)
players_overall_avg = pd.read_csv('../data/basketballPlayoffs_cleaned/players_overall_all_time.csv')

# 5. ROOKIES

In [165]:
# A player's rookie season is taken to be the earliest year in which the
# player appears in this table.
# NOTE(review): players who were already active before the first season in
# the data are also flagged as rookies in that season - confirm this is the
# intended definition.
rookie_year = players_teams_cleaned.groupby('playerID')['year'].min()

# Vectorized flag: look up each row's first year through the playerID index
# and compare it with the row's own year (1 = rookie season, 0 = otherwise).
# Rows whose playerID has no entry compare unequal and get 0.
players_teams_cleaned['is_rookie'] = (
    players_teams_cleaned['year']
    == players_teams_cleaned['playerID'].map(rookie_year)
).astype('int64')

# Overwrites the players_teams CSV again, now including the is_rookie column.
players_teams_cleaned.to_csv('../data/basketballPlayoffs_cleaned/players_teams_cleaned.csv', index=False)

rookie_players = players_teams_cleaned[players_teams_cleaned['is_rookie'] == 1]

# Single number: mean OVERALL over all rookie-season rows, to one decimal.
rookie_overall_avg = rookie_players['OVERALL'].mean().round(1)

rookie_overall_avg_df = pd.DataFrame({'rookie_overall_avg': [rookie_overall_avg]})

rookie_overall_avg_df.to_csv('../data/basketballPlayoffs_cleaned/rookie_overall_avg.csv', index=False)