 # Preparing and Cleaning the Data

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

nba_df = pd.read_csv('player_totals.csv', encoding = 'unicode_escape', engine ='python')

print(f"Original data set length: {len(nba_df)} rows")

season2024_condition = nba_df['Season'] > 2023
nba_df.drop(index=nba_df[season2024_condition].index, inplace=True)

# season2005_condition = nba_df['Season'] <= 2005
# nba_df.drop(index=nba_df[season2005_condition].index, inplace=True)

# targeting modern game

print(f"Length of data set after dropping the 2024 season: {len(nba_df)} rows")

gp_condition = nba_df["GP"] <= 10
nba_df.drop(index=nba_df[gp_condition].index, inplace=True)

print(f"Length of data set after dropping all rows with less than 10 games played: {len(nba_df)} rows")

# nba_df = nba_df.dropna()

# print(f"Length of data set after dropping all rows with NA values: {len(nba_df)} rows")

missing_data = nba_df.isnull().sum()

print("Columns with missing data:")
print(missing_data[missing_data > 0])
print("\n")

columns_to_fill = ['3P%', '2P%', 'FT%', 'Age', 'GS', 'MP', '3P', '3PA', 'ORB', 'DRB', 'TRB', 'STL', 'BLK', 'TOV']

nba_df[columns_to_fill] = nba_df[columns_to_fill].fillna(0)

missing_data_updated = nba_df[columns_to_fill].isnull().sum()

print('Columns with missing data after filling:')
print(missing_data_updated)


Original data set length: 31550 rows
Length of data set after dropping the 2024 season: 31135 rows
Length of data set after dropping all rows with less than 10 games played: 27308 rows
Columns with missing data:
Age      12
GS     7706
MP      939
3P     5725
3PA    5725
3P%    8346
2P%       1
FT%      85
ORB    4153
DRB    4153
TRB     769
STL    4969
BLK    4968
TOV    5068
dtype: int64


Columns with missing data after filling:
3P%    0
2P%    0
FT%    0
Age    0
GS     0
MP     0
3P     0
3PA    0
ORB    0
DRB    0
TRB    0
STL    0
BLK    0
TOV    0
dtype: int64


# Training with Linear Regression

In [19]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Extract features and target variable
X = nba_df[["GP", "MP", "FG%", "3P%", "2P%", "FT%", "FG"]]
y_pts = nba_df["PTS"]

# Create and fit the scaler on the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Create linear regression model
model = LinearRegression()

# Create KFold cross-validator
kf = KFold(n_splits=10, shuffle=True, random_state=13)

# Perform cross-validation
cv_mse_pts = cross_val_score(model, X_scaled, y_pts, cv=kf, scoring='neg_mean_squared_error')
cv_mae_pts = cross_val_score(model, X_scaled, y_pts, cv=kf, scoring='neg_mean_absolute_error')
cv_r2_pts = cross_val_score(model, X_scaled, y_pts, cv=kf, scoring='r2')

# Calculate mean values of metrics
mse_pts_mean = -np.mean(cv_mse_pts)
mae_pts_mean = -np.mean(cv_mae_pts)
r2_pts_mean = np.mean(cv_r2_pts)

# Calculate adjusted R-squared
n = len(y_pts)
k = X.shape[1]
adjusted_r2_pts = 1 - ((1 - r2_pts_mean) * (n - 1) / (n - k - 1))

print("Metrics for Points (PTS) with Linear Regression and Cross-Validation:")
print(f"MSE: {mse_pts_mean}")
print(f"MAE: {mae_pts_mean}")
print(f"R-squared: {r2_pts_mean}")
print(f"Adjusted R-squared: {adjusted_r2_pts}")

Metrics for Points (PTS) with Linear Regression and Cross-Validation:
MSE: 3057.3544741968035
MAE: 36.514456396036174
R-squared: 0.9869317159981545
Adjusted R-squared: 0.9869283651561028


# Justification
The choice to use linear regression was due to the fact that the relationship between the features and the target variable was linear. So any changes to one of them would affect the outcome of points.

# Random Forest Regression

In [53]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

# Split the data into training and testing set
X_train, X_test, y_pts_train, y_pts_test = train_test_split(
    X, y_pts, test_size=0.4, random_state=42
)

# Create and fit the scaler on the training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Transform the testing data using the same scaler
X_test_scaled = scaler.transform(X_test)

# Create the Random Forest regression model
rf_model = RandomForestRegressor(random_state=42)

# Fit the model to the scaled data
rf_model.fit(X_train_scaled, y_pts_train)

# Predictions on scaled data
y_pts_pred_rf = rf_model.predict(X_test_scaled)

# Evaluation for Points (PTS) with untuned Random Forest model and scaling
mse_pts_rf = mean_squared_error(y_pts_test, y_pts_pred_rf)
mae_pts_rf = mean_absolute_error(y_pts_test, y_pts_pred_rf)
r2_pts_rf = r2_score(y_pts_test, y_pts_pred_rf)

print("Metrics for Points (PTS) with Untuned Random Forest Model:")
print(f"MSE: {mse_pts_rf}")
print(f"MAE: {mae_pts_rf}")
print(f"R-squared: {r2_pts_rf}")


Metrics for Points (PTS) with Untuned Random Forest Model:
MSE: 26552.490437486267
MAE: 104.67685005492496
R-squared: 0.886211854636577


# Random Forest Retraining with Grid Search


In [16]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
import time
import psutil

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2', None],
    'bootstrap': [True, False]
}


# Define a custom scorer for adjusted R-squared
def adjusted_r2_scorer(estimator, X, y):
    n = len(y)
    k = X.shape[1]
    r2 = r2_score(y, estimator.predict(X))
    return 1 - (1 - r2) * (n - 1) / (n - k - 1)

# Create the Random Forest regression model
rf_model = RandomForestRegressor(random_state=42)

# Create the GridSearchCV object with cross-validation
grid_search = GridSearchCV(
    rf_model,
    param_grid=param_grid,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    scoring={'neg_mean_squared_error': 'neg_mean_squared_error', 'adjusted_r2': make_scorer(adjusted_r2_scorer)},
    refit='neg_mean_squared_error',  # Refit using neg_mean_squared_error for predictions
    return_train_score=True,
    n_jobs=-1
)

# Create and fit the scaler on the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Record the start time
start_time = time.time()

# Monitor CPU and memory usage during grid search
cpu_percentages = []
memory_percentages = []

# Fit the model to the data and perform cross-validation
for i, (train_index, test_index) in enumerate(grid_search.cv.split(X_scaled, y_pts)):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y_pts.iloc[train_index], y_pts.iloc[test_index]
    
    grid_search.fit(X_train, y_train)
    
    # Monitor CPU and memory usage
    cpu_percentages.append(psutil.cpu_percent())
    memory_percentages.append(psutil.virtual_memory().percent)

# Record the end time
end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time

# Round the elapsed time to two decimal places
elapsed_time_rounded = round(elapsed_time, 2)

# Get the best hyperparameters from the grid search
best_params = grid_search.best_params_

# Create a new Random Forest model with the best hyperparameters
best_rf_model = RandomForestRegressor(random_state=42, **best_params).fit(X_scaled, y_pts)

# Calculate adjusted R-squared for the best model
best_rf_model_r2 = r2_score(y_pts, best_rf_model.predict(X_scaled))
adjusted_r2 = 1 - (1 - best_rf_model_r2) * (len(y_pts) - 1) / (len(y_pts) - X_scaled.shape[1] - 1)

# Predictions
y_pts_pred_tuned_rf = best_rf_model.predict(X_scaled)

# Evaluation for Points (PTS) with tuned Random Forest model
mse_pts_tuned_rf = mean_squared_error(y_pts, y_pts_pred_tuned_rf)
mae_pts_tuned_rf = mean_absolute_error(y_pts, y_pts_pred_tuned_rf)
r2_pts_tuned_rf = best_rf_model_r2

print("Metrics for Points (PTS) with Grid Search Random Forest Regression Model (Cross-Validation):")
print(f"MSE: {mse_pts_tuned_rf}")
print(f"MAE: {mae_pts_tuned_rf}")
print(f"R-squared: {r2_pts_tuned_rf}")
print(f"Adjusted R-squared: {adjusted_r2}")
print(f"Grid Search with Cross-Validation took {elapsed_time_rounded} seconds.")

# Print average CPU and memory usage during grid search
print(f"Average CPU Usage: {np.mean(cpu_percentages)}%")
print(f"Average Memory Usage: {np.mean(memory_percentages)}%")


1080 fits failed out of a total of 4320.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
747 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\justi\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\justi\anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "c:\Users\justi\anaconda3\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\justi\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidPara

KeyboardInterrupt: 

# Random Forest Retraining with Randomized Search

In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import time

# Split the data into training and testing sets
X_train, X_test, y_pts_train, y_pts_test = train_test_split(
    X, y_pts, test_size=0.4, random_state=42
)

# Create and fit the scaler on the training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Transform the testing data using the same scaler
X_test_scaled = scaler.transform(X_test)

# Define the hyperparameter grid for Random Forest
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2', None],
    'bootstrap': [True, False]
}

# Create the Random Forest regression model
rf_model = RandomForestRegressor(random_state=42)

# Create the RandomizedSearchCV object
randomized_search = RandomizedSearchCV(rf_model, param_distributions=param_dist, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Record the start time
start_time = time.time()

# Fit the model to the data
randomized_search.fit(X_train_scaled, y_pts_train)

# Record the end time
end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time

# Round the elapsed time to two decimal places
elapsed_time_rounded = round(elapsed_time, 2)

# Get the best hyperparameters
best_params = randomized_search.best_params_

# Create a new Random Forest model with the best hyperparameters
best_rf_model = RandomForestRegressor(random_state=42, **best_params).fit(X_train_scaled, y_pts_train)

# Predictions
y_pts_pred_tuned_rf = best_rf_model.predict(X_test_scaled)

# Evaluation for Points (PTS) with tuned Random Forest model
mse_pts_tuned_rf = mean_squared_error(y_pts_test, y_pts_pred_tuned_rf)
mae_pts_tuned_rf = mean_absolute_error(y_pts_test, y_pts_pred_tuned_rf)
r2_pts_tuned_rf = r2_score(y_pts_test, y_pts_pred_tuned_rf)

print("Metrics for Points (PTS) with Randomized Search Random Forest Regression Model:")
print(f"MSE: {mse_pts_tuned_rf}")
print(f"MAE: {mae_pts_tuned_rf}")
print(f"R-squared: {r2_pts_tuned_rf}")

print(f"Randomized Search took {elapsed_time_rounded} seconds.")

15 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\justi\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\justi\anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "c:\Users\justi\anaconda3\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\justi\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterE

Metrics for Points (PTS) with Randomized Search Random Forest Regression Model:
MSE: 19752.866031005913
MAE: 90.05895923429973
R-squared: 0.8994674418860689
Randomized Search took 11.45 seconds.


# kNN Regression

In [35]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler

# Assuming X and y are your features and target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_pts, test_size=0.4, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create kNN regression model
knn_model = KNeighborsRegressor(n_neighbors=5)  # You can adjust the number of neighbors

# Fit the model to the data
knn_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = knn_model.predict(X_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Metrics for Points (PTS) with Untuned k-Nearest Neigbours Regression")
print(f"MSE: {mse}")
print(f"MAE: {mae}")
print(f"R-squared: {r2}")

Metrics for Points (PTS) with Untuned k-Nearest Neigbours Regression
MSE: 31116.79576711827
MAE: 115.98396191871107
R-squared: 0.8666519628985883


# kNN Retraining with Grid Search

In [16]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import time

# Assuming X and y are your features and target variable
X = nba_df[["GP", "MP", "FG%", "3P%", "2P%", "FT%"]]
y_pts = nba_df["PTS"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_pts, test_size=0.4, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

param_grid= {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
    'leaf_size': [20, 30, 40],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

# Create kNN regression model
knn_model = KNeighborsRegressor()

grid_search = GridSearchCV(knn_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Record the start time
start_time = time.time()

# Fit the model to the data
grid_search.fit(X_train_scaled, y_train)

# Record the end time
end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time

# Round the elapsed time to two decimal places
elapsed_time_rounded = round(elapsed_time, 2)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Create a new KNN model with the best hyperparameters
best_knn_model = KNeighborsRegressor(**best_params).fit(X_train_scaled, y_train)

# Predictions
y_pts_pred_tuned_knn = best_knn_model.predict(X_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, y_pts_pred_tuned_knn)
mae = mean_absolute_error(y_test, y_pts_pred_tuned_knn)
r2 = r2_score(y_test, y_pts_pred_tuned_knn)

print("Metrics for Points (PTS) with Tuned k-Nearest Neighbors Regression")
print(f"MSE: {mse}")
print(f"MAE: {mae}")
print(f"R-squared: {r2}")
print(f"Grid Search took {elapsed_time_rounded} seconds.")


Metrics for Points (PTS) with Tuned k-Nearest Neighbors Regression
MSE: 22665.662973350594
MAE: 99.85248456114738
R-squared: 0.8846427107599278
Grid Search took 8.9 seconds.


# kNN Retraining with Randomized Search

In [17]:
from sklearn.model_selection import RandomizedSearchCV

# Assuming X and y are your features and target variable
X = nba_df[["GP", "MP", "FG%", "3P%", "2P%", "FT%"]]
y_pts = nba_df["PTS"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_pts, test_size=0.4, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

param_dist = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
    'leaf_size': [20, 30, 40],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

# Create kNN regression model
knn_model = KNeighborsRegressor()

random_search = RandomizedSearchCV(knn_model, param_distributions=param_dist, n_iter=10, cv=5, scoring='neg_mean_squared_error', n_jobs=1, random_state=42)

# Record the start time
start_time = time.time()

# Fit the model to the data
random_search.fit(X_train_scaled, y_train)

# Record the end time
end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time

# Round the elapsed time to two decimal places
elapsed_time_rounded = round(elapsed_time, 2)

# Get the best hyperparameters
best_params = random_search.best_params_

# Create a new KNN model with the best hyperparameters
best_knn_model = KNeighborsRegressor(**best_params).fit(X_train_scaled, y_train)

# Predictions
y_pts_pred_tuned_knn = best_knn_model.predict(X_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, y_pts_pred_tuned_knn)
mae = mean_absolute_error(y_test, y_pts_pred_tuned_knn)
r2 = r2_score(y_test, y_pts_pred_tuned_knn)

print("Metrics for Points (PTS) with Tuned k-Nearest Neighbors Regression (Randomized Search)")
print(f"MSE: {mse}")
print(f"MAE: {mae}")
print(f"R-squared: {r2}")
print(f"Randomized Search took {elapsed_time_rounded} seconds.")


Metrics for Points (PTS) with Tuned k-Nearest Neighbors Regression (Randomized Search)
MSE: 22665.662973350594
MAE: 99.85248456114738
R-squared: 0.8846427107599278
Randomized Search took 1.5 seconds.
