# Player Neural Network
We are going on an approach to deduce a "worth" or overall quantitative value to a player during a regular season in order to later make a prediction based on.

## Utils

In [1]:
def convert_int_season_to_str(season):
    if isinstance(season, int):
        return f"{season}-{season%2000 +1 :02d}" 
    return season

In [2]:
def getMatchupByTeamBySeason(scores,matchup,season=False):
    """
    Function to get the matchup of a team in a season
    :param team_tag: team tag
    :optional param season: season to filter the data by season 
    :return: matchup of the team in the season
    """
    teams=scores.filter(items=['SEASON_YEAR','TEAM_ABBREVIATION','GAME_ID','MATCHUP','WL'])
    teams.loc[:,'WL'] = teams['WL'].replace({'W': 1, 'L': 0}).infer_objects(copy=False)
    if season is not False:
        teams=teams[teams['SEASON_YEAR']==convert_int_season_to_str(season)]
    mathcup_tag=matchup[0]+" vs. "+matchup[1]
    matchup_inverse_tag=matchup[1]+" vs. "+matchup[0]
    teams['MATCHUP_STANDARD'] = teams['MATCHUP'].str.replace("@", "vs.")
    teams=pd.concat([teams[teams['MATCHUP_STANDARD'] ==  mathcup_tag],teams[teams['MATCHUP_STANDARD'] ==  matchup_inverse_tag]],ignore_index=True)
    teams['IS_HOME'] = teams['MATCHUP'].str.contains('@')
    return teams.filter(items=['SEASON_YEAR','TEAM_ABBREVIATION','GAME_ID','IS_HOME','MATCHUP_STANDARD','WL'])

In [3]:
def aggregate_matchup_data(df, matchup):
    team1_abbr, team2_abbr = matchup

    # Handle wildcards: build valid team list
    if team1_abbr == "*":
        teams1 = df['TEAM_ABBREVIATION'].unique()
    else:
        teams1 = [team1_abbr]

    if team2_abbr == "*":
        teams2 = df['TEAM_ABBREVIATION'].unique()
    else:
        teams2 = [team2_abbr]

    # Filter for games where team is in either team1 or team2
    filtered = df[
        df['TEAM_ABBREVIATION'].isin(set(teams1) | set(teams2))
    ]

    # Build dynamic mask for MATCHUP_STANDARD column
    mask = filtered['MATCHUP_STANDARD'].apply(
        lambda x: any(f"{a} vs. {b}" in x or f"{b} vs. {a}" in x for a in teams1 for b in teams2 if a != b or team1_abbr == "*" or team2_abbr == "*")
    )
    filtered = filtered[mask]

    # Label as HOME or AWAY
    filtered = filtered.copy()
    filtered['HOME_OR_AWAY'] = filtered['IS_HOME'].map({True: 'HOME', False: 'AWAY'})

    # Pivot to get HOME and AWAY stats side by side
    pivoted = filtered.pivot(index='GAME_ID', columns='HOME_OR_AWAY')

    # Flatten multi-index columns
    pivoted.columns = ['_'.join(col).strip() for col in pivoted.columns.values]
    pivoted = pivoted.reset_index()

    # Select desired columns
    result = pivoted[[
        'GAME_ID',
        'SEASON_YEAR_HOME',
        'TEAM_ABBREVIATION_HOME', 'TEAM_ABBREVIATION_AWAY',
        'WL_HOME', 'WL_AWAY',
        'MATCHUP_STANDARD_HOME'
    ]].rename(columns={
        'SEASON_YEAR_HOME': 'SEASON_YEAR',
        'TEAM_ABBREVIATION_HOME': 'HOME_TEAM',
        'TEAM_ABBREVIATION_AWAY': 'AWAY_TEAM',
        'WL_HOME': 'HOME_WL',
        'WL_AWAY': 'AWAY_WL',
        'MATCHUP_STANDARD_HOME': 'MATCHUP_STANDARD'
    })

    return result


In [4]:
def convert_min_to_float(min_str):
    if isinstance(min_str, str) and ':' in min_str:
        mins, secs = map(int, min_str.split(':'))
        return mins + secs / 60
    return 0.0  # handle empty or malformed entries

In [5]:
def convert_int_season_to_str(season):
    if isinstance(season, int):
        return f"{season}-{season%2000 +1 :02d}" 
    return season

In [6]:
# make me a function that will print the number of unique values in each column of the dataframe
def print_unique_values(df):
    for column in df.columns:
        print(f"{column}: {len(df[column].unique())} unique {column}")


# Imports 

In [7]:
# to install xgboost on the notebook environment
%pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import itertools 
from typing import Dict,Tuple
from IPython.display import display
import sklearn
import xgboost as xgb

pd.set_option('future.no_silent_downcasting', True)

In [9]:
NUM_GAMES=82
teams=['DAL','MIL','ATL','DEN','HOU','IND','OKC','CHI','ORL','BOS','DET','NYK'
,'CHA','LAL','SAC','MIA','LAC','GSW','POR','MIN','WAS','BKN','MEM','SAS'
,'PHX','NOP','UTA','TOR','PHI','CLE']
all_possible_matchups=list(itertools.combinations(teams, 2))
regular_games_total=pd.read_csv("./datasets/NBA_DATA_2010_2024/regular_season_totals_2010_2024.csv",delimiter=',',header=0)
regular_season_all_parts=pd.concat([
        pd.read_csv("./datasets/NBA_DATA_2010_2024/regular_season_box_scores_2010_2024_part_1.csv",delimiter=',',header=0),
        pd.read_csv("./datasets/NBA_DATA_2010_2024/regular_season_box_scores_2010_2024_part_2.csv",delimiter=',',header=0),
        pd.read_csv("./datasets/NBA_DATA_2010_2024/regular_season_box_scores_2010_2024_part_3.csv",delimiter=',',header=0)])

# NBA facts
Regular season each team makes 82 games.
The Best 8 teams of each conference (WEST & EAST), makes to the playoffs.
The goal with this model is to predict the probability of the winning a game between a specific matchup.
## Important Game Features 
- Home/ Away Game 
- Players List 
## Important Player Features
- season (season_year)
- time played (MIN)
- Field Goal Made (fieldGoalsMade)
- Field Goal Percentage (fieldGoalsPercentage)
- Field Goal Made 3PT (treePointersMade)
- Field Goal Percentage 3PT (threePointersPercentage)
- Free throw made (freeThrowsMade)
- Free Throw (percentagefreeThrowsPercentage)
- assists
- rebounds 
- steals 
- turnovers
- foulsPersonal
- blocks 
- points 
- plusMinusPoints

In [10]:
def getMatchAndPlayerStats(game,player,season=None,teamname=None):
    """
    Function to get the average points of a team in a season
    :param teamname: team name :List[str]
    :param season: season: str or int or None (for all seasons) 
    :return: average points of the team 
    """
    season=convert_int_season_to_str(season)
    playerScores = player[player['minutes'].notna()].copy()
    playerScores['minutesParsed'] = playerScores['minutes'].apply(convert_min_to_float)
    game.loc[:,'WL'] = game['WL'].replace({'W': 1, 'L': 0}).infer_objects(copy=False)
    gamePlayer=game.merge(playerScores, how='inner', left_on=['GAME_ID','TEAM_ABBREVIATION'], right_on=['gameId','teamTricode'])
    # add a collumn to count the number of games played by each player
    aggregation= gamePlayer.groupby(['personName','teamTricode','season_year']).agg(
        {
            'WL': 'sum',
            'plusMinusPoints':'mean',
            'minutesParsed': 'mean',
            'points': 'mean',
            'fieldGoalsPercentage': 'mean',
            'threePointersPercentage': 'mean',
            'reboundsTotal': 'mean',
            'foulsPersonal': 'mean',
            'turnovers': 'mean',
            'fieldGoalsMade': 'mean',
            'fieldGoalsAttempted': 'mean',
            'steals':'mean',
        }
    ).reset_index()
    aggregation['gamesPlayed'] = gamePlayer.groupby(['personName','teamTricode','season_year'])['gameId'].count().reset_index(drop=True)

    if season is not None:
        aggregation = aggregation[aggregation['season_year'] == season]
    if teamname is not None:
        aggregation = aggregation[aggregation['teamTricode'].isin(teamname)]
    aggregation['winPercentage'] = aggregation['WL'] / aggregation['gamesPlayed'] 
    return aggregation.reset_index(drop=True)

# MSE & R2

First it is important to notice and define what the MSE (Mean Squared Error) and R2 coeficcient, and in what contexts tthe values might mean.
## MSE
The Mean squared error (MSE) represents the error of the estimator or predictive model created based on the given set of observations in the sample. It measures the average squared difference between the 

In [111]:
display(getMatchAndPlayerStats(regular_games_total,regular_season_all_parts))


Unnamed: 0,personName,teamTricode,season_year,WL,plusMinusPoints,minutesParsed,points,fieldGoalsPercentage,threePointersPercentage,reboundsTotal,foulsPersonal,turnovers,fieldGoalsMade,fieldGoalsAttempted,steals,gamesPlayed,winPercentage
0,A.J. Lawson,DAL,2022-23,4,-2.928571,7.607143,3.857143,0.333357,0.321357,1.428571,0.714286,0.214286,1.500000,3.071429,0.142857,14,0.285714
1,A.J. Lawson,DAL,2023-24,27,0.428571,7.402778,3.238095,0.381024,0.117357,1.190476,0.523810,0.333333,1.285714,2.880952,0.238095,42,0.642857
2,A.J. Lawson,MIN,2022-23,1,-5.000000,1.766667,2.000000,1.000000,0.000000,1.000000,1.000000,0.000000,1.000000,1.000000,0.000000,1,1.0
3,AJ Green,MIL,2022-23,27,-0.742857,9.855714,4.400000,0.260543,0.249543,1.285714,0.885714,0.257143,1.514286,3.571429,0.171429,35,0.771429
4,AJ Green,MIL,2023-24,35,0.892857,10.969345,4.500000,0.359804,0.319875,1.142857,0.875000,0.214286,1.482143,3.500000,0.160714,56,0.625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8193,Zoran Dragic,MIA,2014-15,4,-1.300000,6.178333,2.200000,0.052900,0.042900,0.500000,0.500000,0.500000,0.900000,2.200000,0.200000,10,0.4
8194,Zoran Dragic,PHX,2014-15,2,-0.333333,2.238889,1.000000,0.138833,0.000000,0.500000,0.166667,0.000000,0.333333,1.333333,0.000000,6,0.333333
8195,Zydrunas Ilgauskas,MIA,2010-11,51,2.944444,15.907870,5.000000,0.430847,0.000000,3.986111,2.569444,0.722222,2.250000,4.430556,0.319444,72,0.708333
8196,Zylan Cheatham,NOP,2019-20,2,-7.000000,12.762500,3.000000,0.583250,0.000000,2.250000,2.500000,1.000000,1.500000,2.250000,0.250000,4,0.5


# XGBoostRegressor with Random Forest Regression For each season

In [112]:
import sklearn.ensemble
from xgboost import XGBRegressor

errors_r2=[]
errors_mse=[]

gameP=getMatchAndPlayerStats(regular_games_total,regular_season_all_parts)
# gameP["weighted"] = gameP[features].dot(weights)
X =gameP.drop(['personName','teamTricode','season_year',"plusMinusPoints",'winPercentage'],axis=1)
y = gameP['plusMinusPoints']
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.2, random_state=42)
# applies standardization to the data same 
scaler = sklearn.preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

xgb = XGBRegressor(n_estimators=100)
xgb.fit(X_train, y_train)

# Evaluate
for model, name in zip([ xgb], ["XGBoost"]):
    preds = model.predict(X_test)
    r2_error=sklearn.metrics.r2_score(y_test, preds)
    mean_sq=sklearn.metrics.mean_squared_error(y_test, preds)
    errors_r2.append(r2_error)
    errors_mse.append(mean_sq)
    print(f"{name} R2: {r2_error:.4f}, MSE: {mean_sq:.4f}")
    
# justificar com o MSE tendo em conta os valores absolutos no qual a feature varia
# analisar e comparar com outras features
# justificar com os desvio do valor absoluto porque o R2 não tem em conta os valores absolutos 

XGBoost R2: 0.4304, MSE: 5.3947


# Main Model Trainning features
- ⁠combinacoes de todos os matchups (ir buscar gameIds✅, vitoria/derrota✅ e estatisticas)
- ⁠saber casa/fora
- ⁠winningPercentage each player per matchup
- ⁠impacto do jogador (output Model1)
- ⁠Output TeamTag (percentage vitoria, derrota)

In [164]:
current_xgb = xgb # year_model.get(season_year)
players=dict()
	
players[('LeBron','LAL')]= {
    'WL': 68,
    'minutesParsed': 40.0,
    'points': 17.1,
    'fieldGoalsPercentage': 48.1,
    'threePointersPercentage': 35,
    'reboundsTotal': 6.0,
    'foulsPersonal': 2.6,
    'turnovers': 8.0,
    'fieldGoalsMade': 6.54,
    'fieldGoalsAttempted': 14,
    'steals': 1.8,
    'gamesPlayed': 68
}

players[('Chris Paul','GSW')]= {
    'WL': 50,
    'minutesParsed': 40.0,
    'points': 13.1,
    'fieldGoalsPercentage': 35.1,
    'threePointersPercentage': 35,
    'reboundsTotal': 6.0,
    'foulsPersonal': 2.6,
    'turnovers': 3.0,
    'fieldGoalsMade': 9.54,
    'fieldGoalsAttempted': 14,
    'steals': 4.5,
    'gamesPlayed': 58
}
players[('Kyrie','DAL')]= {
    'WL': 50,
    'minutesParsed': 40.0,
    'points': 13.1,
    'fieldGoalsPercentage': 35.1,
    'threePointersPercentage': 35,
    'reboundsTotal': 6.0,
    'foulsPersonal': 2.6,
    'turnovers': 3.0,
    'fieldGoalsMade': 9.54,
    'fieldGoalsAttempted': 14,
    'steals': 4.5,
    'gamesPlayed': 58
}

players[('Luka Don','DAL')]= {
    'WL': 50,
    'minutesParsed': 40.0,
    'points': 13.1,
    'fieldGoalsPercentage': 35.1,
    'threePointersPercentage': 35,
    'reboundsTotal': 6.0,
    'foulsPersonal': 2.6,
    'turnovers': 3.0,
    'fieldGoalsMade': 9.54,
    'fieldGoalsAttempted': 14,
    'steals': 4.5,
    'gamesPlayed': 58
}
for pk,ps in players.items():
    # Create DataFrame
    fake_player = pd.DataFrame([ps])
    # Apply scaling
    fake_player_scaled = scaler.transform(fake_player)
    # Predict
    impact = xgb.predict(fake_player_scaled)
    players[pk]=impact
print(players)

{('LeBron', 'LAL'): array([6.1847634], dtype=float32), ('Chris Paul', 'GSW'): array([3.6633775], dtype=float32), ('Kyrie', 'DAL'): array([3.6633775], dtype=float32), ('Luka Don', 'DAL'): array([3.6633775], dtype=float32)}


In [165]:
mt=pd.DataFrame([])
for matchup in all_possible_matchups:
    mt = pd.concat([mt,getMatchupByTeamBySeason(regular_games_total,matchup,season=2023)],ignore_index=True)
display(mt)

Unnamed: 0,SEASON_YEAR,TEAM_ABBREVIATION,GAME_ID,IS_HOME,MATCHUP_STANDARD,WL
0,2023-24,DAL,22300205,True,DAL vs. MIL,0
1,2023-24,DAL,22300702,False,DAL vs. MIL,0
2,2023-24,MIL,22300205,False,MIL vs. DAL,1
3,2023-24,MIL,22300702,True,MIL vs. DAL,1
4,2023-24,DAL,22300634,True,DAL vs. ATL,1
...,...,...,...,...,...,...
2455,2023-24,PHI,22300040,False,PHI vs. CLE,0
2456,2023-24,CLE,22301067,False,CLE vs. PHI,1
2457,2023-24,CLE,22300762,False,CLE vs. PHI,0
2458,2023-24,CLE,22300040,True,CLE vs. PHI,1


In [172]:
def playerMatchUpIntersection(matchup_games,players:Dict[Tuple[str,str],Dict[str,float]]):
    matchup_games = matchup_games.rename(columns={"GAME_ID": "gameId"})
    rows=[]
    for p in  players.keys():
        TEAM=p[1]        
        NAME=p[0]
        player_statistic=regular_season_all_parts[regular_season_all_parts["personName"].str.contains(NAME)]
        player_statistic=player_statistic[player_statistic["teamTricode"]==TEAM]
        player_statistic=player_statistic.filter(items=["season_year","gameId","teamTricode","personName","minutes"])
        player_statistic=player_statistic[player_statistic['minutes'].notna()] # filter the mathcup games that the players hasn't played
        merged_df = pd.merge(player_statistic, matchup_games, on="gameId", how="inner")
        if merged_df.empty:
            continue

        def did_win(row):
            if row['teamTricode'] == row['HOME_TEAM']:
                return row['HOME_WL'] == 1
            elif row['teamTricode'] == row['AWAY_TEAM']:
                return row['AWAY_WL'] == 1
            else:
                return False  # Shouldn't happen

        merged_df["won"] = merged_df.apply(did_win, axis=1)
        total_games = len(merged_df)
        total_wins = merged_df["won"].sum()
        win_percentage = total_wins / total_games if total_games > 0 else 0
        merged_df["matchupWinPercentage"]=win_percentage
        merged_df["PlayerImpactM1"]=players[p][0]
        matchup_standard = merged_df["MATCHUP_STANDARD"].iloc[0]

        rows.append({
            "MATCHUP_STANDARD": matchup_standard,
            "PlayerImpactM1": players[p][0],
            "personName": NAME,
            "teamTricode": TEAM,
            "gamePlayed":total_games,
            "matchupWinPercentage": round(win_percentage, 6)
        })

    # Create final DataFrame
    final_df = pd.DataFrame(rows)
        # all_results=pd.concat([all_results,merged_df.filter(items=["MATCHUP_STANDARD","PlayerImpactM1","personName","gameId","matchupWinPercentage","teamTricode"])]) 
    return final_df 

In [175]:

matchup_games=aggregate_matchup_data(mt,("DAL","LAL"))
playerMatchupStats=playerMatchUpIntersection(matchup_games,players)
if playerMatchupStats is not None:
    display(playerMatchupStats)


Unnamed: 0,MATCHUP_STANDARD,PlayerImpactM1,personName,teamTricode,gamePlayed,matchupWinPercentage
0,LAL vs. DAL,6.184763,LeBron,LAL,3,0.333333
1,DAL vs. LAL,3.663378,Kyrie,DAL,2,0.5
2,DAL vs. LAL,3.663378,Luka Don,DAL,3,0.666667


# Próximos passos
- Percentagem de vitória de cada jogador face a um matchup (Feature M2)
- Output M1 (impacto jogador) 
- Team Abbreviation ()
- Casa/Fora (Boolean)