Practice Pulling Data

In [None]:
# Imports
from pathlib import Path

import numpy as np
import pandas as pd

# Path to the cleaned starting-lineup data
lineup_path = '../data/Cleaned_Starting_Lineups.csv'

# Columns that hold the five players of each lineup
player_columns = ['Player1', 'Player2', 'Player3', 'Player4', 'Player5']

# Read lineup data + show some stats
lineups_df = pd.read_csv(lineup_path)
print(lineups_df.head())
print("Number of lineups:", len(lineups_df))

# Flatten the five player columns once and reuse the result for both the
# count and the listing, instead of recomputing the same array twice.
unique_players = pd.unique(lineups_df[player_columns].values.ravel())
print("Unique players:", unique_players.size)
print("Unique list of players:", unique_players.tolist())

In [None]:
# Location of the advanced per-player stats file
player_path = '../data/Advanced_Player_Stats.csv'

# Load the player table, then preview it
players_df = pd.read_csv(player_path)
print(players_df.head())

# Distinct values of the 'Player' column
print("Player Names:", pd.unique(players_df['Player']).tolist())

# First rows of just the name and TS% columns
print("Player + TS%:", players_df.loc[:, ['Player', 'TS%']].head())

Work on Creating the "Calculated Stats"

In [5]:
# Use the first lineup row as a worked example for the "Calculated Stats"
test_row = lineups_df.iloc[0]

# Names stored in Player1..Player5 of that row
players_in_lineup = [test_row[f'Player{slot}'] for slot in range(1, 6)]

# Rows of the player table whose 'Player' value is one of those names
player_stats = players_df.loc[players_df['Player'].isin(players_in_lineup)]

# One list per stat column, in the row order of player_stats
ast_list, to_list, usg_list, ws_list, dws_list, dbpm_list = (
    player_stats[stat_column].tolist()
    for stat_column in ('AST%', 'TOV%', 'USG%', 'WS', 'DWS', 'DBPM')
)

In [6]:
# Calculated stat -> Ball Movement Index (BMI): the lineup's assist rate relative to its turnover rate.
# BMI = mean(AST%) / mean(TOV%)
# A higher BMI means more assists per turnover (good for the team); a lower BMI means more turnovers per assist.
def calculate_bmi(ast, tov):
    """Return the Ball Movement Index for one lineup.

    Args:
        ast: Per-player AST% values.
        tov: Per-player TOV% values.

    Returns:
        mean(ast) / mean(tov). Returns 0 when either input is empty/None or
        when the mean of ``tov`` is exactly zero.
    """
    if ast and tov:
        mean_ast = np.mean(ast)
        mean_tov = np.mean(tov)
        # Avoid dividing by a zero turnover rate
        return 0 if mean_tov == 0 else mean_ast / mean_tov
    return 0

# Score the sample lineup: ast_list / to_list are the AST% and TOV% lists built in the previous cell
# (assumes the notebook is run top to bottom).
bmi = calculate_bmi(ast_list, to_list)
print("Ball Movement Index (BMI):", bmi)

# Calculated stat -> Usage Balance Score (USG): how evenly usage is spread across the players in the lineup.
# USG = 1 / (1 + std(USG% of all players in the lineup))
# The score lies in (0, 1]: 1 means every player has the same USG%; it falls toward 0 as usage becomes more uneven.
def calculate_usage(usg):
    """Return the usage balance score for one lineup.

    Args:
        usg: Per-player USG% values.

    Returns:
        1 / (1 + np.std(usg)). Returns 0 when ``usg`` is empty/None.
        Identical usage values (standard deviation 0) give the maximum
        score of 1.
    """
    if not usg:
        return 0
    usg_std_dev = np.std(usg)
    # 1 + std can never be zero, so no special case is needed. The previous
    # zero-std branch returned 0, which gave a perfectly balanced lineup the
    # worst possible score instead of the best.
    return 1 / (1 + usg_std_dev)

# Score the sample lineup: usg_list is the USG% list built in the previous cell
# (assumes the notebook is run top to bottom).
usage = calculate_usage(usg_list)
print("Usage Rate (USG):", usage)

# Calculated stat -> Dominance Penalty (DP): penalizes an offense that leans too heavily on one player.
# DP = highest USG% - second-highest USG%
# A low DP means usage at the top is balanced; a high DP means the offense relies on one player too much.
def calculate_dominance_penalty(usg):
    """Return the gap between the two largest usage values in a lineup.

    Args:
        usg: Per-player USG% values.

    Returns:
        Largest value minus second-largest value. Returns 0 when ``usg`` is
        empty/None or holds fewer than two values.
    """
    if usg and len(usg) >= 2:
        # Only the top two entries of the descending order are needed
        leader, runner_up = sorted(usg, reverse=True)[:2]
        return leader - runner_up
    return 0

# Score the sample lineup: usg_list is the USG% list built in the previous cell
# (assumes the notebook is run top to bottom).
dominance_penalty = calculate_dominance_penalty(usg_list)
print("Dominance Penalty (DP):", dominance_penalty)

# Calculated stat -> Pair Synergy Score (PSS): the lineup's average win shares.
# PSS = mean(WS of the players in the lineup)
# A higher PSS is read here as the players working well together; a lower PSS as the opposite.
def calculate_pair_synergy(ws):
    """Return the mean win shares of a lineup.

    Args:
        ws: Per-player WS values.

    Returns:
        np.mean(ws). Returns 0 when ``ws`` is empty/None.
    """
    return np.mean(ws) if ws else 0

# Score the sample lineup: ws_list is the WS list built in the previous cell
# (assumes the notebook is run top to bottom).
pair_synergy_score = calculate_pair_synergy(ws_list)
print("Pair Synergy Score (PSS):", pair_synergy_score)

# Defensive stat -> Overall Defensive Rating (DR): the lineup's average defensive win shares.
# DR = mean(DWS of the players in the lineup)
# A higher DR is read here as a better defensive lineup; a lower DR as a weaker one.
def calculate_defensive_rating(dws):
    """Return the mean defensive win shares of a lineup.

    Args:
        dws: Per-player DWS values.

    Returns:
        np.mean(dws). Returns 0 when ``dws`` is empty/None.
    """
    if dws:
        return np.mean(dws)
    return 0

# Score the sample lineup: dws_list is the DWS list built in the previous cell
# (assumes the notebook is run top to bottom).
defensive_rating = calculate_defensive_rating(dws_list)
print("Defensive Rating (DR):", defensive_rating)

Ball Movement Index (BMI): 1.0633284241531664
Usage Rate (USG): 0.23115942925723335
Dominance Penalty (DP): 2.3999999999999986
Pair Synergy Score (PSS): 3.4200000000000004
Defensive Rating (DR): 1.3800000000000001


Now creating the final formula and setting up a loop to process all of the lineup data

In [None]:
# Score every lineup: look up its players' stats, compute each metric, and
# collect one record per lineup.
# NOTE: the loop rebinds module-level names (ast_list, bmi, usage, ...) that
# the single-lineup cells above also assign.

all_lineup_stats = []
for index, lineup_row in lineups_df.iterrows():
    # Get the players in the current lineup
    lineup_id = lineup_row['LineupID']
    players_in_lineup_names = [lineup_row[f'Player{slot}'] for slot in range(1, 6)]
    current_lineup_player_stats = players_df[
        players_df['Player'].isin(players_in_lineup_names)
    ].dropna(subset=['Player'])

    # Ensure we have stats for every player in the lineup. Comparing names
    # (not only counting rows) prevents a duplicated row for one player from
    # hiding another player who is missing from players_df.
    missing_players = set(players_in_lineup_names) - set(current_lineup_player_stats['Player'])
    if missing_players or len(current_lineup_player_stats) < 5:
        print(f"Lineup {lineup_id} does not have enough player stats.")
        continue

    # Get the stats that we want to calculate
    ast_list = current_lineup_player_stats['AST%'].tolist()
    to_list = current_lineup_player_stats['TOV%'].tolist()
    usg_list = current_lineup_player_stats['USG%'].tolist()
    ws_list = current_lineup_player_stats['WS'].tolist()
    dws_list = current_lineup_player_stats['DWS'].tolist()

    # Calculate the various metrics for the lineup
    bmi = calculate_bmi(ast_list, to_list)
    usage = calculate_usage(usg_list)
    dominance_penalty = calculate_dominance_penalty(usg_list)
    pair_synergy_score = calculate_pair_synergy(ws_list)
    defensive_rating = calculate_defensive_rating(dws_list)

    # Create a dictionary to store the scores for the current lineup,
    # carrying over the lineup's record columns unchanged
    lineup_scores_dict = {
        'LineupID': lineup_id,
        'Ball_Movement_Index': bmi,
        'Usage_Balance_Score': usage,
        'Dominance_Penalty': dominance_penalty,
        'Pair_Synergy_Score': pair_synergy_score,
        'Defensive_Rating': defensive_rating,
        'Games': lineup_row['Games'],
        'Wins': lineup_row['Wins'],
        'Losses': lineup_row['Losses'],
        'Win_Pct': lineup_row['Win_Pct']
    }
    # Append the lineup scores to the list
    all_lineup_stats.append(lineup_scores_dict)

# Convert the list of dictionaries to a DataFrame (built once, after the loop)
lineup_chemistry_stats_df = pd.DataFrame(all_lineup_stats)
print(lineup_chemistry_stats_df.head())

  LineupID  Ball_Movement_Index  Usage_Balance_Score  Dominance_Penalty  \
0       L1             1.063328             0.231159                2.4   
1       L2             1.166432             0.342368                0.4   
2       L3             0.875556             0.226547                3.8   
3       L4             0.987106             0.216806                2.4   
4       L5             1.145221             0.236608                2.0   

   Pair_Synergy_Score  Defensive_Rating  Games  Wins  Losses  Win_Pct  
0                3.42              1.38     17    12       5    0.706  
1                3.56              1.34      7     5       2    0.714  
2                3.16              1.28      3     1       2    0.333  
3                3.32              1.38      3     2       1    0.667  
4                3.34              1.28      2     1       1    0.500  


In [None]:
# Normalize the scores + calculate the final score
cols_to_normalize_positive = ['Ball_Movement_Index', 'Usage_Balance_Score', 'Pair_Synergy_Score', 'Defensive_Rating']
cols_to_normalize_negative = ['Dominance_Penalty']


def _min_max_normalize(values, invert=False):
    """Min-max scale a Series to the range 0..1.

    Args:
        values: Series of raw scores.
        invert: When True, return ``1 - scaled`` so that the smallest raw
            value maps to 1 (used for penalty columns).

    Returns:
        A Series on the same index. When every value is identical
        (min == max) the result is a constant 0.5 to avoid dividing by zero.
    """
    min_val = values.min()
    max_val = values.max()
    if min_val == max_val:
        return pd.Series(0.5, index=values.index)
    scaled = (values - min_val) / (max_val - min_val)
    return 1 - scaled if invert else scaled


# Higher-is-better metrics: plain min-max scaling
for col in cols_to_normalize_positive:
    lineup_chemistry_stats_df[f'{col} Normalized'] = _min_max_normalize(lineup_chemistry_stats_df[col])

# Penalty metrics: inverted scaling, so a smaller penalty scores closer to 1
for col in cols_to_normalize_negative:
    lineup_chemistry_stats_df[f'{col} Normalized'] = _min_max_normalize(lineup_chemistry_stats_df[col], invert=True)

# Define the weights for each normalized score (they sum to 1.0)
weight_bmi = 0.20 # Ball Movement Index
weight_ubs = 0.20 # Usage Balance Score
weight_dp = 0.20  # Dominance Penalty (normalized column is already inverted, so it is added like the others)
weight_pss = 0.20 # Pair Synergy Score
weight_dr = 0.20 # Defensive Rating

# Calculate the final score as the weighted sum of the normalized columns
lineup_chemistry_stats_df['Lineup_Chemistry_Score'] = (
    weight_bmi * lineup_chemistry_stats_df['Ball_Movement_Index Normalized'] +
    weight_ubs * lineup_chemistry_stats_df['Usage_Balance_Score Normalized'] +
    weight_dp * lineup_chemistry_stats_df['Dominance_Penalty Normalized'] +
    weight_pss * lineup_chemistry_stats_df['Pair_Synergy_Score Normalized'] +
    weight_dr * lineup_chemistry_stats_df['Defensive_Rating Normalized']
)

# Display final DataFrame with scores, best chemistry score first
print("\nLineup Chemistry Scores with Final Score:")
print(lineup_chemistry_stats_df[['LineupID', 'Games', 'Wins', 'Losses', 'Win_Pct', 
                                   'Ball_Movement_Index', 'Usage_Balance_Score', 
                                   'Dominance_Penalty', 'Pair_Synergy_Score', 'Defensive_Rating', 
                                   'Lineup_Chemistry_Score']].sort_values(by='Lineup_Chemistry_Score', ascending=False))


Lineup Chemistry Scores with Final Score:
  LineupID  Games  Wins  Losses  Win_Pct  Ball_Movement_Index  \
1       L2      7     5       2    0.714             1.166432   
0       L1     17    12       5    0.706             1.063328   
4       L5      2     1       1    0.500             1.145221   
3       L4      3     2       1    0.667             0.987106   
7       L8      1     0       1    0.000             0.930013   
2       L3      3     1       2    0.333             0.875556   
6       L7      1     1       0    1.000             0.814761   
5       L6      1     0       1    0.000             0.976341   

   Usage_Balance_Score  Dominance_Penalty  Pair_Synergy_Score  \
1             0.342368                0.4                3.56   
0             0.231159                2.4                3.42   
4             0.236608                2.0                3.34   
3             0.216806                2.4                3.32   
7             0.280763                2.4     

In [10]:
# Save as a csv file
output_path = Path('../output/Lineup_Chemistry_Scores.csv')
# to_csv does not create missing folders, so make sure '../output' exists
# first; otherwise a fresh checkout fails on this cell.
output_path.parent.mkdir(parents=True, exist_ok=True)
lineup_chemistry_stats_df.to_csv(output_path, index=False)
print("\nLineup chemistry scores saved to 'Lineup_Chemistry_Scores.csv'.")


Lineup chemistry scores saved to 'Lineup_Chemistry_Scores.csv'.
