In [22]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, KFold, train_test_split, cross_val_predict
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Ridge, Lasso

import xgboost as xgb

In [23]:
# Get stats from csv
games_df = pd.read_csv('summary_data.csv')

In [24]:
# Baseline model: Splitting testing and training data with FG3M as the output
#X = games_df[['three_pm_avg_to_date']]
#y = games_df['team_three_pm_next_game']

# Baseline + FT% model:
#X = games_df[['three_pm_avg_to_date', 'free_throw_percent_avg_to_date']]
#y = games_df['team_three_pm_next_game']

# Baseline + Orating model:
#X = games_df[['three_pm_avg_to_date', 'orating_avg_to_date']]
# y = games_df['team_three_pm_next_game']

# Baseline + Orating + Ft%:
#X = games_df[['team_three_pm_avg_to_date', 'team_free_throw_percent_avg_to_date', 'team_orating_avg_to_date']]
#y = games_df['team_three_pm_next_game']

# Everything model
X = games_df[[col for col in games_df.columns if 'avg_to_date' in col or 'last_5' in col]]
y = games_df['team_three_pm_next_game']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
# # Define the random forest regressor model
# rf_model = RandomForestRegressor()

# # Define the number of folds for cross-validation
# k_folds = 5

# # Define KFold cross-validator
# kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# # Perform cross-validation and get scores
# mse_scores = -cross_val_score(rf_model, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')
# r2_scores = cross_val_score(rf_model, X_train, y_train, cv=kf, scoring='r2')

# # Calculate mean scores
# mean_mse = np.mean(mse_scores)
# mean_r2 = np.mean(r2_scores)

# print("Mean Squared Error (MSE) across {} folds: {:.4f}".format(k_folds, mean_mse))
# print("Mean R-squared (R^2) across {} folds: {:.4f}".format(k_folds, mean_r2))


In [26]:
# Define XGBoost Random Forest parameters
params = {
    'objective': 'reg:squarederror',  # Regression task
    'eval_metric': 'rmse',  # Root Mean Squared Error
    'booster': 'gbtree',  # Use tree-based models
    'subsample': 0.8,  # Subsample ratio of the training instances
    'colsample_bynode': 0.8,  # Subsample ratio of columns when constructing each tree
    'tree_method': 'hist'  # Use histogram-based algorithms for approximate tree method
}

# Initialize the XGBoost Random Forest model
xgb_model = xgb.XGBRFRegressor(**params)

# Fit the model to your data
xgb_model.fit(X, y)

# Get feature importances
feature_importances = xgb_model.feature_importances_

# Create a list of feature names
feature_names = X.columns

# Print the importance of each feature
for name, importance in zip(feature_names, feature_importances):
    print(f"{name}: {importance}")

# Define 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation and get predictions
y_pred = cross_val_predict(xgb_model, X, y, cv=kf)

# Calculate Mean Squared Error
mse = mean_squared_error(y, y_pred)

# Calculate R-squared
r2 = r2_score(y, y_pred)

# Print the MSE and R-squared values
print("Mean Squared Error (MSE):", mse)
print("R-squared (R^2):", r2)


team_fga_avg_to_date: 0.02217787131667137
team_fgm_avg_to_date: 0.008329637348651886
team_free_throw_percent_avg_to_date: 0.007545917294919491
team_three_percent_avg_to_date: 0.007407812867313623
team_three_pa_avg_to_date: 0.5514865517616272
team_three_pm_avg_to_date: 0.26705679297447205
team_orating_avg_to_date: 0.0094875143840909
team_oreb_avg_to_date: 0.008693582378327847
team_dreb_avg_to_date: 0.00858371239155531
team_to_avg_to_date: 0.008571433834731579
team_points_avg_to_date: 0.10065922141075134
Mean Squared Error (MSE): 8.25346179632256
R-squared (R^2): 0.3051616860118217


In [27]:
# Initialize a Ridge model
ridge_model = Ridge(alpha=1.0)  # You can adjust the regularization strength (alpha) as needed

# Define k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform k-fold cross-validation on the training set
cv_r2_scores = cross_val_score(ridge_model, X_train, y_train, cv=kf, scoring='r2')
average_cv_r2 = np.mean(cv_r2_scores)

cv_mse_scores = cross_val_score(ridge_model, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')
average_cv_mse = -np.mean(cv_mse_scores)

# Print the results
print(f'Average R-squared value (Cross-validation): {average_cv_r2}')
print(f'Average Mean Squared Error (Cross-validation): {average_cv_mse}')

Average R-squared value (Cross-validation): 0.3035418094274702
Average Mean Squared Error (Cross-validation): 8.289422296594992


In [28]:
# Initialize a Lasso Regression model
lasso_model = Lasso(alpha=1.0)  # You can adjust the regularization strength (alpha) as needed

# Define k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform k-fold cross-validation
cv_r2_scores = cross_val_score(lasso_model, X, y, cv=kf, scoring='r2')
average_cv_r2 = np.mean(cv_r2_scores)

cv_mse_scores = -cross_val_score(lasso_model, X, y, cv=kf, scoring='neg_mean_squared_error')
average_cv_mse = np.mean(cv_mse_scores)

# Print the results
print(f'Average R-squared value (Cross-validation): {average_cv_r2}')
print(f'Average Mean Squared Error (Cross-validation): {average_cv_mse}')

Average R-squared value (Cross-validation): 0.29568782447836295
Average Mean Squared Error (Cross-validation): 8.365343825529681
