# Player Neural Network
We are going on an approach to deduce a "worth" or overall quantitative value to a player during a regular season in order to later make a prediction based on.

## Utils

In [1]:
def convert_min_to_float(min_str):
    if isinstance(min_str, str) and ':' in min_str:
        mins, secs = map(int, min_str.split(':'))
        return mins + secs / 60
    return 0.0  # handle empty or malformed entries

In [2]:
def convert_int_season_to_str(season):
    if isinstance(season, int):
        return f"{season}-{season%2000 +1 :02d}" 
    return season

In [3]:
# make me a function that will print the number of unique values in each column of the dataframe
def print_unique_values(df):
    for column in df.columns:
        print(f"{column}: {len(df[column].unique())} unique {column}")


# Imports 

In [4]:
# to install xgboost on the notebook environment
%pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import itertools 
import sklearn
import xgboost as xgb

pd.set_option('future.no_silent_downcasting', True)

In [6]:
NUM_GAMES=82
teams=['DAL','MIL','ATL','DEN','HOU','IND','OKC','CHI','ORL','BOS','DET','NYK'
,'CHA','LAL','SAC','MIA','LAC','GSW','POR','MIN','WAS','BKN','MEM','SAS'
,'PHX','NOP','UTA','TOR','PHI','CLE']
all_possible_matchups=itertools.combinations(teams, 2)
regular_games_total=pd.read_csv("./datasets/NBA_DATA_2010_2024/regular_season_totals_2010_2024.csv",delimiter=',',header=0)
regular_season_all_parts=pd.concat([
        pd.read_csv("./datasets/NBA_DATA_2010_2024/regular_season_box_scores_2010_2024_part_1.csv",delimiter=',',header=0),
        pd.read_csv("./datasets/NBA_DATA_2010_2024/regular_season_box_scores_2010_2024_part_2.csv",delimiter=',',header=0),
        pd.read_csv("./datasets/NBA_DATA_2010_2024/regular_season_box_scores_2010_2024_part_3.csv",delimiter=',',header=0)])

# NBA facts
Regular season each team makes 82 games.
The Best 8 teams of each conference (WEST & EAST), makes to the playoffs.
The goal with this model is to predict the probability of the winning a game between a specific matchup.
## Important Game Features 
- Home/ Away Game 
- Players List 
## Important Player Features
- season (season_year)
- time played (MIN)
- Field Goal Made (fieldGoalsMade)
- Field Goal Percentage (fieldGoalsPercentage)
- Field Goal Made 3PT (treePointersMade)
- Field Goal Percentage 3PT (threePointersPercentage)
- Free throw made (freeThrowsMade)
- Free Throw (percentagefreeThrowsPercentage)
- assists
- rebounds 
- steals 
- turnovers
- foulsPersonal
- blocks 
- points 
- plusMinusPoints

In [7]:
def getMatchAndPlayerStats(game,player,season=None,teamname=None):
    """
    Function to get the average points of a team in a season
    :param teamname: team name :List[str]
    :param season: season: str or int or None (for all seasons) 
    :return: average points of the team 
    """
    season=convert_int_season_to_str(season)
    playerScores = player[player['minutes'].notna()].copy()
    playerScores['minutesParsed'] = playerScores['minutes'].apply(convert_min_to_float)
    game.loc[:,'WL'] = game['WL'].replace({'W': 1, 'L': 0}).infer_objects(copy=False)
    gamePlayer=game.merge(playerScores, how='inner', left_on=['GAME_ID','TEAM_ABBREVIATION'], right_on=['gameId','teamTricode'])
    # add a collumn to count the number of games played by each player
    aggregation= gamePlayer.groupby(['personName','teamTricode','season_year']).agg(
        {
            'WL': 'sum',
            'plusMinusPoints':'mean',
            'minutesParsed': 'mean',
            'points': 'mean',
            'fieldGoalsPercentage': 'mean',
            'threePointersPercentage': 'mean',
            'reboundsTotal': 'mean',
            'foulsPersonal': 'mean',
            'turnovers': 'mean',
            'fieldGoalsMade': 'mean',
            'fieldGoalsAttempted': 'mean',
            'steals':'mean',
        }
    ).reset_index()
    aggregation['gamesPlayed'] = gamePlayer.groupby(['personName','teamTricode','season_year'])['gameId'].count().reset_index(drop=True)

    if season is not None:
        aggregation = aggregation[aggregation['season_year'] == season]
    if teamname is not None:
        aggregation = aggregation[aggregation['teamTricode'].isin(teamname)]
    aggregation['winPercentage'] = aggregation['WL'] / aggregation['gamesPlayed'] 
    return aggregation.reset_index(drop=True)

# MSE & R2

First it is important to notice and define what the MSE (Mean Squared Error) and R2 coeficcient, and in what contexts tthe values might mean.
## MSE
The Mean squared error (MSE) represents the error of the estimator or predictive model created based on the given set of observations in the sample. It measures the average squared difference between the 

# Doing the overall score based on weights of the players

In [18]:
print(getMatchAndPlayerStats(regular_games_total,regular_season_all_parts,season=2013))
weights=[1,.7,.6,.5,.5,-.7]
features=["points","assists","reboundsTotal","steals","blocks","turnovers"]


          personName teamTricode season_year  WL  plusMinusPoints  \
0           AJ Price         MIN     2013-14  15         0.071429   
1       Aaron Brooks         DEN     2013-14  11        -2.724138   
2       Aaron Brooks         HOU     2013-14  31        -0.372093   
3         Aaron Gray         SAC     2013-14  10        -2.393939   
4         Aaron Gray         TOR     2013-14   2        -1.250000   
..               ...         ...         ...  ..              ...   
543     Willie Green         LAC     2013-14  38         1.509091   
544  Wilson Chandler         DEN     2013-14  27        -2.806452   
545     Xavier Henry         LAL     2013-14  16        -0.883721   
546    Zach Randolph         MEM     2013-14  49         2.139241   
547    Zaza Pachulia         MIL     2013-14  10        -3.415094   

     minutesParsed     points  fieldGoalsPercentage  threePointersPercentage  \
0         3.548810   1.571429              0.273821                 0.136893   
1        29

# XGBoostRegressor with Random Forest Regression For each season

In [None]:
import sklearn.ensemble
from xgboost import XGBRegressor

errors_r2=[]
errors_mse=[]

gameP=getMatchAndPlayerStats(regular_games_total,regular_season_all_parts)
# gameP["weighted"] = gameP[features].dot(weights)
X =gameP.drop(['personName','teamTricode','season_year',"plusMinusPoints",'winPercentage'],axis=1)
y = gameP['plusMinusPoints']
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.2, random_state=42)
# applies standardization to the data same 
scaler = sklearn.preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

xgb = XGBRegressor(n_estimators=100)
xgb.fit(X_train, y_train)

# Evaluate
for model, name in zip([ xgb], ["XGBoost"]):
    preds = model.predict(X_test)
    r2_error=sklearn.metrics.r2_score(y_test, preds)
    mean_sq=sklearn.metrics.mean_squared_error(y_test, preds)
    errors_r2.append(r2_error)
    errors_mse.append(mean_sq)
    print(f"{name} R2: {r2_error:.4f}, MSE: {mean_sq:.4f}")
    
# justificar com o MSE tendo em conta os valores absolutos no qual a feature varia
# analisar e comparar com outras features
# justificar com os desvio do valor absoluto porque o R2 não tem em conta os valores absolutos 

Random Forest R2: 0.3534, MSE: 6.1239
XGBoost R2: 0.4304, MSE: 5.3947


# Main Model Trainning features
- ⁠combinacoes de todos os matchups (ir buscar gameIds✅, vitoria/derrota✅ e estatisticas)
- ⁠saber casa/fora
- ⁠winningPercentage each player per matchup
- ⁠impacto do jogador (output Model1)
- ⁠Output TeamTag (percentage vitoria, derrota)

In [69]:
season_year = 2013
current_xgb = xgb # year_model.get(season_year)

aggregation_dict = {
    'WL': 68,
    'minutesParsed': 40.0,
    'points': 17.1,
    'fieldGoalsPercentage': 48.1,
    'threePointersPercentage': 35,
    'reboundsTotal': 6.0,
    'foulsPersonal': 2.6,
    'turnovers': 8.0,
    'fieldGoalsMade': 6.54,
    'fieldGoalsAttempted': 14,
    'steals': 1.8,
    'gamesPlayed': 68
}

# Create DataFrame
fake_player = pd.DataFrame([aggregation_dict])
# Apply scaling
fake_player_scaled = scaler.transform(fake_player)
# Predict
prediction_plus_minus = xgb.predict(fake_player_scaled)
print(prediction_plus_minus)

[6.1847634]


# Próximos passos
- Percentagem de vitória de cada jogador face a um matchup (Feature M2)
- Output M1 (impacto jogador) 
- Team Abbreviation ()
- Casa/Fora (Boolean)