### Import libraries

In [1]:
import pandas as pd
import numpy as np
import random

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns

### Fit and run RF model on 2023-24 season. Using a random sample to split and training on 70% of the data.

In [2]:
df = pd.read_csv('exports/processed_gamelogs_top200_ppg_season23_24.csv')
df['date_dt'] = pd.to_datetime(df['date_dt'])

RangeIndex(start=0, stop=12277, step=1)

In [4]:
# Drop home_vs_away and win loss column and set plater name and date of the game as our index. Drop any rows that contains NA values.
data = df.drop(columns=['home_vs_away', 'wl'], inplace=False)

data.set_index(['player_name', 'date_dt'], inplace=True)

data = data.dropna()

# Prepare the features and target
X = data[['min_10', 'fgm_10', 'fga_10', 'fg_pct_10', 'fg3m_10', 'fg3a_10', 'fg3_pct_10', 'ftm_10', 'fta_10', 
        'ft_pct_10','pts_10', 'min_5', 'fgm_5', 'fga_5', 'fg_pct_5', 'fg3m_5', 'fg3a_5', 'fg3_pct_5', 'ftm_5',
        'fta_5', 'ft_pct_5', 'pts_5', 'min_3', 'fgm_3', 'fga_3', 'fg_pct_3', 'fg3m_3', 'fg3a_3', 'fg3_pct_3', 
        'ftm_3', 'fta_3', 'ft_pct_3', 'pts_3', 'min_last', 'fgm_last', 'fga_last', 'fg_pct_last', 'fg3m_last',
        'fg3a_last', 'fg3_pct_last', 'ftm_last', 'fta_last', 'ft_pct_last', 'pts_last',
        'min_season', 'fgm_season', 'fga_season', 'fg_pct_season', 
        'fg3m_season', 'fg3a_season', 'fg3_pct_season', 'ftm_season', 'fta_season', 'ft_pct_season','pts_season'
        ]]

y = data['pts']

In [7]:
def run_gbr_with_pca_comparison(features=X, target=y):

    random_state = random.randint(1, 100)
    print(f"Random state: {random_state}\n")

    # Split data into test and training sets.
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=random_state)


    # 1. Train and run model without Scaler and PCA
    # Fit and run the model.
    model = GradientBoostingRegressor(random_state=random_state)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)


    # 2. Train and run model with Scaler and PCA
    # Normalize feature values with Scaler
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Fit PCA model, reduce dataset features to the # of components (n_components) that account for 95% of the variance.
    pca = PCA(n_components=0.95)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    # Fit and run the model.
    model_pca = GradientBoostingRegressor(random_state=random_state)
    model_pca.fit(X_train_pca, y_train)
    predictions_pca = model_pca.predict(X_test_pca)


    # 3. Evaluate and compare the models.    
    # Model 1 - GBR model without PCA and normalization.
    mae = (mean_absolute_error(y_test, predictions))
    mse = (mean_squared_error(y_test, predictions))
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    relative_mae = mae / y_test.mean()  
    r2 = r2_score(y_test, predictions)

    print("GBR model without PCA:")
    print(f"MAE: {round(mae, 4)}")
    print(f"MSE: {round(mse, 4)}")
    print(f"RMSE: {round(rmse, 4)}")
    print(f"R² Score: {round(r2, 4)}") 

    # Model 2 - GBR model with PCA and normalization.
    mae_pca = (mean_absolute_error(y_test, predictions_pca))
    mse_pca = (mean_squared_error(y_test, predictions_pca))
    rmse_pca = np.sqrt(mean_squared_error(y_test, predictions_pca))
    r2_pca = r2_score(y_test, predictions_pca)

    print("\nGBR model with PCA:")
    print(f"MAE: {round(mae_pca, 4)}")
    print(f"MSE (with PCA): {round(mse_pca, 4)}")
    print(f"RMSE (with PCA): {round(rmse_pca, 4)}")
    print(f"R² Score (with PCA): {round(r2_pca, 4)}")

    # Calculate and print the differences between the models
    mae_diff = mae - mae_pca
    mse_diff = mse - mse_pca
    rmse_diff = rmse - rmse_pca
    r2_diff = r2 - r2_pca

    # Calculate percentage differences
    mae_diff_pct = (mae_diff / mae) * 100
    mse_diff_pct = (mse_diff / mse) * 100
    rmse_diff_pct = (rmse_diff / rmse) * 100
    r2_diff_pct = (r2_diff / r2) * 100

    print("\nDiff (Positive Value = GBR w/ PCA performed better):")
    print(f"MAE Difference: {round(mae_diff, 4)} ({round(mae_diff_pct, 2)}%)")
    print(f"MSE Difference: {round(mse_diff, 4)} ({round(mse_diff_pct, 2)}%)")
    print(f"RMSE Difference: {round(rmse_diff, 4)} ({round(rmse_diff_pct, 2)}%)")
    print(f"R² Score Difference: {round(r2_diff, 4)} ({round(r2_diff_pct, 2)}%)")

# Run the function.
run_gbr_with_pca_comparison()

Random state: 53

GBR model without PCA:
MAE: 3.7738
MSE: 24.2397
RMSE: 4.9234
R² Score: 0.717

GBR model with PCA:
MAE: 4.0472
MSE (with PCA): 27.7632
RMSE (with PCA): 5.2691
R² Score (with PCA): 0.6758

Diff (Positive Value = GBR w/ PCA performed better):
MAE Difference: -0.2734 (-7.24%)
MSE Difference: -3.5234 (-14.54%)
RMSE Difference: -0.3457 (-7.02%)
R² Score Difference: 0.0411 (5.74%)
