 # Preparing and Cleaning the Data

In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

nba_df = pd.read_csv('player_totals.csv', encoding = 'unicode_escape', engine ='python')

print(f"Original data set length: {len(nba_df)} rows")

season2024_condition = nba_df['Season'] > 2023
nba_df.drop(index=nba_df[season2024_condition].index, inplace=True)

season2005_condition = nba_df['Season'] <= 2005
nba_df.drop(index=nba_df[season2005_condition].index, inplace=True)

print(f"Length of data set after dropping all seasons before 2005 and dropping the 2023 season: {len(nba_df)} rows")

gp_condition = nba_df["GP"] <= 10
nba_df.drop(index=nba_df[gp_condition].index, inplace=True)

print(f"Length of data set after dropping all rows with less than 10 games played: {len(nba_df)} rows")
print("\n")


missing_data = nba_df.isnull().sum()

print("Columns with missing data:")
print(missing_data[missing_data > 0])
print("\n")

columns_to_fill = ['FG%', '3P%', '2P%', 'eFG%', 'FT%' ]

nba_df[columns_to_fill] = nba_df[columns_to_fill].fillna(0)

missing_data_updated = nba_df[columns_to_fill].isnull().sum()

print('Columns with missing data after filling:')
print(missing_data_updated)



Original data set length: 31550 rows
Length of data set after dropping all seasons before 2005 and 2032 season: 11236 rows
Length of data set after dropping all rows with less than 10 games played: 9686 rows


Columns with missing data:
3P%    911
2P%      1
FT%     54
dtype: int64


Columns with missing data after filling:
FG%     0
3P%     0
2P%     0
eFG%    0
FT%     0
dtype: int64


# Training with Linear Regression

In [32]:
from sklearn.preprocessing import StandardScaler

# Extract features and target variable
X = nba_df[["GP", "MP", "FG%", "3P%", "2P%", "FT%"]]
y_pts = nba_df["PTS"]

# Split the data into training and testing sets
X_train, X_test, y_pts_train, y_pts_test = train_test_split(
    X, y_pts, test_size=0.4, random_state=42
)

# Create and fit the scaler on the training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Transform the testing data using the same scaler
X_test_scaled = scaler.transform(X_test)

# Create and train linear regression model on scaled data
model_pts_scaled = LinearRegression().fit(X_train_scaled, y_pts_train)

# Predictions on scaled data
y_pts_pred_scaled = model_pts_scaled.predict(X_test_scaled)

# Evaluation for Points (PTS) on scaled data
mse_pts_scaled = mean_squared_error(y_pts_test, y_pts_pred_scaled)
mae_pts_scaled = mean_absolute_error(y_pts_test, y_pts_pred_scaled)
r2_pts_scaled = r2_score(y_pts_test, y_pts_pred_scaled)

print("Metrics for Points (PTS) with Linear Regression:")
print(f"MSE: {mse_pts_scaled}")
print(f"MAE: {mae_pts_scaled}")
print(f"R-squared: {r2_pts_scaled}")


Metrics for Points (PTS) with Linear Regression:
MSE: 26961.91884372399
MAE: 109.11193790799443
R-squared: 0.8627768411548458


# Justification
The choice to use linear regression was due to the fact that the relationship between the features and the target variable was linear. So any changes to one of them would affect the outcome of points.

# Random Forest Regression

In [33]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_pts_train, y_pts_test = train_test_split(
    X, y_pts, test_size=0.4, random_state=42
)

# Create and fit the scaler on the training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Transform the testing data using the same scaler
X_test_scaled = scaler.transform(X_test)

# Create the Random Forest regression model
rf_model = RandomForestRegressor(random_state=42)

# Fit the model to the scaled data
rf_model.fit(X_train_scaled, y_pts_train)

# Predictions on scaled data
y_pts_pred_rf = rf_model.predict(X_test_scaled)

# Evaluation for Points (PTS) with untuned Random Forest model and scaling
mse_pts_rf = mean_squared_error(y_pts_test, y_pts_pred_rf)
mae_pts_rf = mean_absolute_error(y_pts_test, y_pts_pred_rf)
r2_pts_rf = r2_score(y_pts_test, y_pts_pred_rf)

print("Metrics for Points (PTS) with Untuned Random Forest Model:")
print(f"MSE: {mse_pts_rf}")
print(f"MAE: {mae_pts_rf}")
print(f"R-squared: {r2_pts_rf}")


Metrics for Points (PTS) with Untuned Random Forest Model:
MSE: 21178.11296469677
MAE: 91.50268903225808
R-squared: 0.8922136226193829


# Random Forest Retraining with Grid Search


In [38]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import time

# Split the data into training and testing sets
X_train, X_test, y_pts_train, y_pts_test = train_test_split(
    X, y_pts, test_size=0.4, random_state=42
)

# Create and fit the scaler on the training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Transform the testing data using the same scaler
X_test_scaled = scaler.transform(X_test)

# Define the hyperparameter grid for Random Forest
# CHANGE THE PARAMETERS TO 6
param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}


# Create the Random Forest regression model
rf_model = RandomForestRegressor(random_state=42)

# Create the GridSearchCV object
grid_search = GridSearchCV(rf_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

# Record the start time
start_time = time.time()

# Fit the model to the data
grid_search.fit(X_train_scaled, y_pts_train)

# Record the end time
end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time

# Round the elapsed time to two decimal places
elapsed_time_rounded = round(elapsed_time, 2)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Create a new Random Forest model with the best hyperparameters
best_rf_model = RandomForestRegressor(random_state=42, **best_params).fit(X_train_scaled, y_pts_train)

# Predictions
y_pts_pred_tuned_rf = best_rf_model.predict(X_test_scaled)

# Evaluation for Points (PTS) with tuned Random Forest model
mse_pts_tuned_rf = mean_squared_error(y_pts_test, y_pts_pred_tuned_rf)
mae_pts_tuned_rf = mean_absolute_error(y_pts_test, y_pts_pred_tuned_rf)
r2_pts_tuned_rf = r2_score(y_pts_test, y_pts_pred_tuned_rf)

print("Metrics for Points (PTS) with Grid Search Random Forest Regression Model:")
print(f"MSE: {mse_pts_tuned_rf}")
print(f"MAE: {mae_pts_tuned_rf}")
print(f"R-squared: {r2_pts_tuned_rf}")
print(f"Grid Search took {elapsed_time_rounded} seconds.")


Metrics for Points (PTS) with Grid Search Random Forest Regression Model:
MSE: 20796.84414034716
MAE: 90.50302128608358
R-squared: 0.8941540969880538
Grid Search took 804.74 seconds.


# Random Forest Retraining with Randomized Search

In [45]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import time

# Split the data into training and testing sets
X_train, X_test, y_pts_train, y_pts_test = train_test_split(
    X, y_pts, test_size=0.4, random_state=42
)

# Create and fit the scaler on the training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Transform the testing data using the same scaler
X_test_scaled = scaler.transform(X_test)

# Define the hyperparameter grid for Random Forest
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2', None],
    'bootstrap': [True, False]
}

# Create the Random Forest regression model
rf_model = RandomForestRegressor(random_state=42)

# Create the RandomizedSearchCV object
randomized_search = RandomizedSearchCV(rf_model, param_distributions=param_dist, cv=5, scoring='neg_mean_squared_error')

# Record the start time
start_time = time.time()

# Fit the model to the data
randomized_search.fit(X_train_scaled, y_pts_train)

# Record the end time
end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time

# Round the elapsed time to two decimal places
elapsed_time_rounded = round(elapsed_time, 2)

# Get the best hyperparameters
best_params = randomized_search.best_params_

# Create a new Random Forest model with the best hyperparameters
best_rf_model = RandomForestRegressor(random_state=42, **best_params).fit(X_train_scaled, y_pts_train)

# Predictions
y_pts_pred_tuned_rf = best_rf_model.predict(X_test_scaled)

# Evaluation for Points (PTS) with tuned Random Forest model
mse_pts_tuned_rf = mean_squared_error(y_pts_test, y_pts_pred_tuned_rf)
mae_pts_tuned_rf = mean_absolute_error(y_pts_test, y_pts_pred_tuned_rf)
r2_pts_tuned_rf = r2_score(y_pts_test, y_pts_pred_tuned_rf)

print("Metrics for Points (PTS) with Randomized Search Random Forest Regression Model:")
print(f"MSE: {mse_pts_tuned_rf}")
print(f"MAE: {mae_pts_tuned_rf}")
print(f"R-squared: {r2_pts_tuned_rf}")

print(f"Randomized Search took {elapsed_time_rounded} seconds.")

10 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\justi\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\justi\anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "c:\Users\justi\anaconda3\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\justi\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameter

Metrics for Points (PTS) with Randomized Search Random Forest Regression Model:
MSE: 20503.657813162612
MAE: 91.30154959139786
R-squared: 0.8956462739425082
Randomized Search took 83.58 seconds.


# kNN Regression

In [39]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler

# Assuming X and y are your features and target variable
X = nba_df[["GP", "MP", "FG%", "3P%", "2P%", "FT%"]]
y_pts = nba_df["PTS"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_pts, test_size=0.4, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create kNN regression model
knn_model = KNeighborsRegressor(n_neighbors=5)  # You can adjust the number of neighbors

# Fit the model to the data
knn_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = knn_model.predict(X_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Metrics for Points (PTS) with Untuned k-Nearest Neigbours Regression")
print(f"MSE: {mse}")
print(f"MAE: {mae}")
print(f"R-squared: {r2}")

Metrics for Points (PTS) with Untuned k-Nearest Neigbours Regression
MSE: 25386.66116129032
MAE: 103.72490322580646
R-squared: 0.8707941427583242


# kNN Retraining with Grid Search

In [49]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import time

# Assuming X and y are your features and target variable
X = nba_df[["GP", "MP", "FG%", "3P%", "2P%", "FT%"]]
y_pts = nba_df["PTS"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_pts, test_size=0.4, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

param_dist = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
    'leaf_size': [20, 30, 40],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

# Create kNN regression model
knn_model = KNeighborsRegressor()

grid_search = GridSearchCV(knn_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

# Record the start time
start_time = time.time()

# Fit the model to the data
grid_search.fit(X_train_scaled, y_train)

# Record the end time
end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time

# Round the elapsed time to two decimal places
elapsed_time_rounded = round(elapsed_time, 2)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Create a new KNN model with the best hyperparameters
best_knn_model = KNeighborsRegressor(**best_params).fit(X_train_scaled, y_train)

# Predictions
y_pts_pred_tuned_knn = best_knn_model.predict(X_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, y_pts_pred_tuned_knn)
mae = mean_absolute_error(y_test, y_pts_pred_tuned_knn)
r2 = r2_score(y_test, y_pts_pred_tuned_knn)

print("Metrics for Points (PTS) with Tuned k-Nearest Neighbors Regression")
print(f"MSE: {mse}")
print(f"MAE: {mae}")
print(f"R-squared: {r2}")
print(f"Grid Search took {elapsed_time_rounded} seconds.")


Metrics for Points (PTS) with Tuned k-Nearest Neighbors Regression
MSE: 22665.662973350594
MAE: 99.85248456114738
R-squared: 0.8846427107599278
Grid Search took 2.14 seconds.


# kNN Retraining with Randomized Search

In [50]:
from sklearn.model_selection import RandomizedSearchCV

# Assuming X and y are your features and target variable
X = nba_df[["GP", "MP", "FG%", "3P%", "2P%", "FT%"]]
y_pts = nba_df["PTS"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_pts, test_size=0.4, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

param_dist = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
    'leaf_size': [20, 30, 40],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

# Create kNN regression model
knn_model = KNeighborsRegressor()

random_search = RandomizedSearchCV(knn_model, param_distributions=param_dist, n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)

# Record the start time
start_time = time.time()

# Fit the model to the data
random_search.fit(X_train_scaled, y_train)

# Record the end time
end_time = time.time()

# Calculate the elapsed time
elapsed_time = end_time - start_time

# Round the elapsed time to two decimal places
elapsed_time_rounded = round(elapsed_time, 2)

# Get the best hyperparameters
best_params = random_search.best_params_

# Create a new KNN model with the best hyperparameters
best_knn_model = KNeighborsRegressor(**best_params).fit(X_train_scaled, y_train)

# Predictions
y_pts_pred_tuned_knn = best_knn_model.predict(X_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, y_pts_pred_tuned_knn)
mae = mean_absolute_error(y_test, y_pts_pred_tuned_knn)
r2 = r2_score(y_test, y_pts_pred_tuned_knn)

print("Metrics for Points (PTS) with Tuned k-Nearest Neighbors Regression (Randomized Search)")
print(f"MSE: {mse}")
print(f"MAE: {mae}")
print(f"R-squared: {r2}")
print(f"Randomized Search took {elapsed_time_rounded} seconds.")


Metrics for Points (PTS) with Tuned k-Nearest Neighbors Regression (Randomized Search)
MSE: 22665.662973350594
MAE: 99.85248456114738
R-squared: 0.8846427107599278
Randomized Search took 1.82 seconds.
