In [71]:
from a_pscore_data_reader_preproc import read_and_process_data, model_evaluation
from b_pscore_linear_reg_model import trained_linear_regression
from c_pscore_ridge_reg_model import trained_ridge_model
from d_pscore_random_forest_model import trained_rf_model
from e_pscore_GBM_model import trained_gbm_model
from f_pscore_xgb_model import trained_xgb_model
from g_pscore_tf_dn_model import trained_tf_dn_model
from sklearn.model_selection import GridSearchCV
import pandas as pd

# import for linear model stacked
from sklearn.linear_model import LinearRegression

# import for random forest fitting
from sklearn.ensemble import RandomForestRegressor

# import for xgboost
from xgboost import XGBRegressor

# import for tf
from tensorflow import keras
from tensorflow.keras import Input, layers, models, metrics, regularizers
from tensorflow.keras.optimizers import Adam

In [72]:
# Read in the data using the pre-processing code
x_train, y_train, x_test, y_test, train_team_values, test_team_values = read_and_process_data(
     r"E:\github_repos\Private_Projects\NCAA_FBS_AP_Ranking_Predictions\python_ap\scripts_and_data\data\score_pred_train_data.csv",
     r"E:\github_repos\Private_Projects\NCAA_FBS_AP_Ranking_Predictions\python_ap\scripts_and_data\data\score_pred_test_data.csv",
     True
)

# check to make sure the data looks right
x_train.shape
x_test.shape

train data has shape: (14648, 490)
test data has shape: (132, 490)


(132, 490)

In [17]:
# lets fit all the models
lr_model = trained_linear_regression(x_train, y_train)
ridge_model = trained_ridge_model(x_train, y_train)
rf_model = trained_rf_model(x_train, y_train)
gbm_model = trained_gbm_model(x_train, y_train)
xgb_model = trained_xgb_model(x_train, y_train)
tf_model = trained_tf_dn_model(x_train, y_train)

Epoch 1/25
[1m458/458[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 666us/step - loss: 718.2397 - mae: 22.7803 - mse: 717.6425
Epoch 2/25
[1m458/458[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 610us/step - loss: 218.9368 - mae: 11.7279 - mse: 217.8525
Epoch 3/25
[1m458/458[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 555us/step - loss: 166.2628 - mae: 10.2329 - mse: 164.9634
Epoch 4/25
[1m458/458[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 642us/step - loss: 157.0816 - mae: 9.9837 - mse: 155.6641
Epoch 5/25
[1m458/458[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 591us/step - loss: 149.6927 - mae: 9.7577 - mse: 148.1797
Epoch 6/25
[1m458/458[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 561us/step - loss: 145.2055 - mae: 9.6222 - mse: 143.6184
Epoch 7/25
[1m458/458[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 558us/step - loss: 145.3736 - mae: 9.6193 - mse: 143.7052
Epoch 8/25
[1m458/458[0m [32m━━━━━━━━━━━━━━━━━━━━

In [73]:
# Model evaluations
y_pred_train_lr = lr_model.predict(x_train)
y_pred_train_ridge = ridge_model.predict(x_train)
y_pred_train_rf = rf_model.predict(x_train)
y_pred_train_gbm = gbm_model.predict(x_train)
y_pred_train_xgb = xgb_model.predict(x_train)
y_pred_train_tf = tf_model.predict(x_train)

# also lets add those back to the x_test
x_train_stacked = x_train.copy()
x_train_stacked['lr_pred'] = y_pred_train_lr
x_train_stacked['ridge_pred'] = y_pred_train_ridge
x_train_stacked['rf_pred'] = y_pred_train_rf
x_train_stacked['gbm_pred'] = y_pred_train_gbm
x_train_stacked['xgb_pred'] = y_pred_train_xgb
x_train_stacked['tf_pred'] = y_pred_train_tf

# lets restrict just the train and test data now to the new preds
#####x_train_stacked = x_train_stacked[['lr_pred','ridge_pred','rf_pred','gbm_pred','xgb_pred','tf_pred']]
x_train_stacked = x_train_stacked[['lr_pred','ridge_pred','gbm_pred','xgb_pred','tf_pred']]

# evaluate models
model_evaluation(y_train, y_pred_train_lr)
model_evaluation(y_train, y_pred_train_ridge)
model_evaluation(y_train, y_pred_train_rf)
model_evaluation(y_train, y_pred_train_gbm)
model_evaluation(y_train, y_pred_train_xgb)
model_evaluation(y_train, y_pred_train_tf)


[1m458/458[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 408us/step
R-squared: 0.294
Mean Squared Error (MSE): 137.884
Mean Absolute Error (MAE): 9.412
R-squared: 0.248
Mean Squared Error (MSE): 146.956
Mean Absolute Error (MAE): 9.740
R-squared: 0.890
Mean Squared Error (MSE): 21.432
Mean Absolute Error (MAE): 3.704
R-squared: 0.302
Mean Squared Error (MSE): 136.372
Mean Absolute Error (MAE): 9.408
R-squared: 0.302
Mean Squared Error (MSE): 136.461
Mean Absolute Error (MAE): 9.395
R-squared: 0.356
Mean Squared Error (MSE): 125.900
Mean Absolute Error (MAE): 9.003


In [74]:
# lets predict in the test
y_pred_test_lr = lr_model.predict(x_test)
y_pred_test_ridge = ridge_model.predict(x_test)
y_pred_test_rf = rf_model.predict(x_test)
y_pred_test_gbm = gbm_model.predict(x_test)
y_pred_test_xgb = xgb_model.predict(x_test)
y_pred_test_tf = tf_model.predict(x_test)

x_test['lr_pred'] = y_pred_test_lr
x_test['ridge_pred'] = y_pred_test_ridge
x_test['rf_pred'] = y_pred_test_rf
x_test['gbm_pred'] = y_pred_test_gbm
x_test['xgb_pred'] = y_pred_test_xgb
x_test['tf_pred'] = y_pred_test_tf

# lets restrict just the train and test data now to the new preds
####x_test_stacked = x_test[['lr_pred','ridge_pred','rf_pred','gbm_pred','xgb_pred','tf_pred']].copy()
x_test_stacked = x_test[['lr_pred','ridge_pred','gbm_pred','xgb_pred','tf_pred']].copy() # RF is too over fitted

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 


In [20]:
### First starting with RF model
# Step 1: Define hyperparameter grid
n_estimator_values = [50, 100, 200]  # Number of trees in the forest
max_depth_values = [None, 10, 20, 30]  # Maximum depth of the tree
max_features_values = ['auto', 'sqrt', 'log2']  # Number of features to consider at each split

# Step 2: Create the parameter grid
rf_param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 6, 9],
    'max_features': [None, 'sqrt', 'log2', 0.5]  # Valid options
}

# Step 3: set up regressor
rf_regressor = RandomForestRegressor()

# Step 4: fit the grid search model
rf_grid_search = GridSearchCV(
    estimator=rf_regressor,
    param_grid=rf_param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='neg_mean_squared_error',  # Use negative MSE as the scoring metric
    n_jobs=-3,  # Use all available cores minus 2
    verbose=1  # Output progress
)

In [21]:
# Fit the RF
rf_grid_search.fit(x_train_stacked, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [22]:
# Get the best parameters
best_n_estimator_value = rf_grid_search.best_params_['n_estimators']
best_max_depth_value = rf_grid_search.best_params_['max_depth']
best_max_features_value = rf_grid_search.best_params_['max_features']

# Print the best values
print(f"Best n_estimators: {best_n_estimator_value}")
print(f"Best max_depth: {best_max_depth_value}")
print(f"Best max_features: {best_max_features_value}")

Best n_estimators: 150
Best max_depth: 9
Best max_features: None


In [23]:
# fit final model using the best param
best_rf_regressor = RandomForestRegressor(
    n_estimators=best_n_estimator_value,
    max_depth=best_max_depth_value,
    max_features=best_max_features_value
)

best_rf_regressor.fit(x_train_stacked, y_train)

In [46]:
# Model evaluation
y_pred_stacked_rf = best_rf_regressor.predict(x_train_stacked)

# evaluate model
model_evaluation(y_train, y_pred_stacked_rf)

R-squared: 0.993
Mean Squared Error (MSE): 1.449
Mean Absolute Error (MAE): 0.927


In [25]:
y_pred_stacked_rf = best_rf_regressor.predict(x_test_stacked)
x_test_stacked_rf_pred = x_test_stacked.copy()
x_test_stacked_rf_pred['rf_stacked_predictions'] = y_pred_stacked_rf
x_test_stacked_rf_pred['actual'] = y_test
x_test_stacked_rf_pred = pd.concat([x_test_stacked_rf_pred, test_team_values], axis=1)
x_test_stacked_rf_pred.to_csv("x_test_stacked_rf_pred.csv")

In [75]:
# Lets do a stacked linear model as well
lm_stacked = LinearRegression()

# now fit
lm_stacked.fit(x_train_stacked, y_train)

In [78]:
# Model evaluation
y_pred_stacked_lm = lm_stacked.predict(x_train_stacked)

# evaluate model
model_evaluation(y_train, y_pred_stacked_lm)

R-squared: 0.407
Mean Squared Error (MSE): 115.979
Mean Absolute Error (MAE): 8.587


In [79]:
y_pred_stacked_lm = lm_stacked.predict(x_test_stacked)
x_test_stacked_lm_pred = x_test_stacked.copy()
x_test_stacked_lm_pred['lm_stacked_predictions'] = y_pred_stacked_lm
x_test_stacked_lm_pred['actual'] = y_test
x_test_stacked_lm_pred = pd.concat([x_test_stacked_lm_pred, test_team_values], axis=1)
x_test_stacked_lm_pred.to_csv("x_test_stacked_lm_pred.csv")