# Player Neural Network
We are going on an approach to deduce a "worth" or overall quantitative value to a player during a regular season in order to later make a prediction based on.

## Utils

In [2]:
def convert_min_to_float(min_str):
    if isinstance(min_str, str) and ':' in min_str:
        mins, secs = map(int, min_str.split(':'))
        return mins + secs / 60
    return 0.0  # handle empty or malformed entries

In [3]:
def convert_int_season_to_str(season):
    if isinstance(season, int):
        return f"{season}-{season%2000 +1 :02d}" 
    return season

In [4]:
# make me a function that will print the number of unique values in each column of the dataframe
def print_unique_values(df):
    for column in df.columns:
        print(f"{column}: {len(df[column].unique())} unique {column}")


# Imports 

In [5]:
# to install xgboost on the notebook environment
%pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import itertools 
import sklearn
import xgboost as xgb

pd.set_option('future.no_silent_downcasting', True)

In [7]:
NUM_GAMES=82
teams=['DAL','MIL','ATL','DEN','HOU','IND','OKC','CHI','ORL','BOS','DET','NYK'
,'CHA','LAL','SAC','MIA','LAC','GSW','POR','MIN','WAS','BKN','MEM','SAS'
,'PHX','NOP','UTA','TOR','PHI','CLE']
all_possible_matchups=itertools.combinations(teams, 2)
regular_games_total=pd.read_csv("./datasets/NBA_DATA_2010_2024/regular_season_totals_2010_2024.csv",delimiter=',',header=0)
regular_season_all_parts=pd.concat([
        pd.read_csv("./datasets/NBA_DATA_2010_2024/regular_season_box_scores_2010_2024_part_1.csv",delimiter=',',header=0),
        pd.read_csv("./datasets/NBA_DATA_2010_2024/regular_season_box_scores_2010_2024_part_2.csv",delimiter=',',header=0),
        pd.read_csv("./datasets/NBA_DATA_2010_2024/regular_season_box_scores_2010_2024_part_3.csv",delimiter=',',header=0)])

# NBA facts
Regular season each team makes 82 games.
The Best 8 teams of each conference (WEST & EAST), makes to the playoffs.
The goal with this model is to predict the probability of the winning a game between a specific matchup.
## Important Game Features 
- Home/ Away Game 
- Players List 
## Important Player Features
- season (season_year)
- time played (MIN)
- Field Goal Made (fieldGoalsMade)
- Field Goal Percentage (fieldGoalsPercentage)
- Field Goal Made 3PT (treePointersMade)
- Field Goal Percentage 3PT (threePointersPercentage)
- Free throw made (freeThrowsMade)
- Free Throw (percentagefreeThrowsPercentage)
- assists
- rebounds 
- steals 
- turnovers
- foulsPersonal
- blocks 
- points 
- plusMinusPoints

In [8]:
def getMatchAndPlayerStats(game,player,season=None,teamname=None):
    """
    Function to get the average points of a team in a season
    :param teamname: team name :List[str]
    :param season: season: str or int or None (for all seasons) 
    :return: average points of the team 
    """
    season=convert_int_season_to_str(season)
    playerScores = player[player['minutes'].notna()].copy()
    playerScores['minutesParsed'] = playerScores['minutes'].apply(convert_min_to_float)
    game.loc[:,'WL'] = game['WL'].replace({'W': 1, 'L': 0}).infer_objects(copy=False)
    gamePlayer=game.merge(playerScores, how='inner', left_on=['GAME_ID','TEAM_ABBREVIATION'], right_on=['gameId','teamTricode'])
    # add a collumn to count the number of games played by each player
    aggregation= gamePlayer.groupby(['personName','teamTricode','season_year']).agg(
        {
            'WL': 'sum',
            'plusMinusPoints':'mean',
            'minutesParsed': 'mean',
            'assists': 'mean',
            'points': 'mean',
            'fieldGoalsPercentage': 'mean',
            'threePointersPercentage': 'mean',
            'reboundsTotal': 'mean',
            'foulsPersonal': 'mean',
            'turnovers': 'mean',
            'fieldGoalsMade': 'mean',
            'fieldGoalsAttempted': 'mean',
            'threePointersMade': 'mean',
            'threePointersAttempted': 'mean',
            'freeThrowsMade': 'mean',
            'freeThrowsAttempted': 'mean',
            'freeThrowsPercentage': 'mean'
        }
    ).reset_index()
    aggregation['gamesPlayed'] = gamePlayer.groupby(['personName','teamTricode','season_year'])['gameId'].count().reset_index(drop=True)

    if season is not None:
        aggregation = aggregation[aggregation['season_year'] == season]
    if teamname is not None:
        aggregation = aggregation[aggregation['teamTricode'].isin(teamname)]
    aggregation['winPercentage'] = aggregation['WL'] / aggregation['gamesPlayed'] 
    return aggregation.reset_index(drop=True)

# XGBoostRegressor with Random Forest Regression For each season

In [21]:
import sklearn.ensemble
from xgboost import XGBRegressor

errors_r2=[]
errors_mse=[]

for year in range(2010,2024):
    gameP=getMatchAndPlayerStats(regular_games_total,regular_season_all_parts,season=2019)
    X =gameP.drop(['personName','teamTricode','season_year','winPercentage'],axis=1)
    y = gameP['winPercentage'] 
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.2, random_state=200)
    # applies standardization to the data same 
    scaler = sklearn.preprocessing.StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Train models
    rf = sklearn.ensemble.RandomForestRegressor(n_estimators=100)
    rf.fit(X_train, y_train)

    xgb = XGBRegressor(n_estimators=100)
    xgb.fit(X_train, y_train)

    # Evaluate
    print(f"Season: {convert_int_season_to_str(year)}")
    for model, name in zip([rf, xgb], ["Random Forest", "XGBoost"]):
        preds = model.predict(X_test)
        r2_error=sklearn.metrics.r2_score(y_test, preds)
        mean_sq=sklearn.metrics.mean_squared_error(y_test, preds)
        errors_r2.append(r2_error)
        errors_mse.append(mean_sq)
        print(f"{name} R2: {r2_error:.4f}, MSE: {mean_sq:.4f}")
    



Season: 2010-11
Random Forest R2: 0.7551, MSE: 0.0093
XGBoost R2: 0.8541, MSE: 0.0056
Season: 2011-12
Random Forest R2: 0.7560, MSE: 0.0093
XGBoost R2: 0.8541, MSE: 0.0056
Season: 2012-13
Random Forest R2: 0.7610, MSE: 0.0091
XGBoost R2: 0.8541, MSE: 0.0056
Season: 2013-14
Random Forest R2: 0.7449, MSE: 0.0097
XGBoost R2: 0.8541, MSE: 0.0056
Season: 2014-15
Random Forest R2: 0.7603, MSE: 0.0091
XGBoost R2: 0.8541, MSE: 0.0056
Season: 2015-16
Random Forest R2: 0.7489, MSE: 0.0096
XGBoost R2: 0.8541, MSE: 0.0056
Season: 2016-17
Random Forest R2: 0.7496, MSE: 0.0095
XGBoost R2: 0.8541, MSE: 0.0056
Season: 2017-18
Random Forest R2: 0.7516, MSE: 0.0095
XGBoost R2: 0.8541, MSE: 0.0056
Season: 2018-19
Random Forest R2: 0.7549, MSE: 0.0093
XGBoost R2: 0.8541, MSE: 0.0056
Season: 2019-20
Random Forest R2: 0.7418, MSE: 0.0098
XGBoost R2: 0.8541, MSE: 0.0056
Season: 2020-21
Random Forest R2: 0.7588, MSE: 0.0092
XGBoost R2: 0.8541, MSE: 0.0056
Season: 2021-22
Random Forest R2: 0.7544, MSE: 0.0094


## Analysis 
...

# Neural Network for value estimation

In [19]:
from sklearn.neural_network import MLPRegressor

X=gameP.drop(['personName','teamTricode','season_year','winPercentage'],axis=1)
y=gameP['winPercentage']
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.2, random_state=200)
# Create a neural network regressor
nn = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=1000,activation='relu',solver='adam', random_state=42)
nn.fit(X_train, y_train)
nn.predictions = nn.predict(X_test)
# Evaluate the model
r2_error=sklearn.metrics.r2_score(y_test, nn.predictions)
mean_sq=sklearn.metrics.mean_squared_error(y_test, nn.predictions)
print(f"Neural Network R2: {r2_error:.4f}, MSE: {mean_sq:.4f}")


Neural Network R2: 0.5195, MSE: 0.0183
