In [40]:
import polars as pl
import numpy as np
import joblib

In [37]:
# Read the pitcher salary table from disk and preview its leading rows.
final_data = pl.read_csv(source="data/Pitcher_Salary_Data.csv")
display(final_data.head(n=43))

yearID,playerID,Salary,stint_pitch,W,L,G_pitch,GS_pitch,CG,SHO,SV,IPouts,H,ER,HR,BB,SO,BAOpp,ERA,IBB,WP_pitch,HBP,BK,BFP,GF,R,SH,SF,GIDP,stint_field,G_field,GS_field,InnOuts,PO,A,E,DP,PB,WP_field,SB,CS,ZR,TrainVal
i64,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,i64,f64,i64,f64,i64,i64,f64,f64,f64,i64,i64,f64,f64,i64,i64,f64,i64,f64,f64,f64,f64,f64,str
2007,"""aardsda01""",387500,1,2,1,25,0,0,0,0,97,39,23,4,17,36,0.3,6.4,3.0,2,1.0,0,151.0,7,24,2.0,1.0,1.0,1,25,0.0,97.0,2,4,1.0,0,0.0,0.0,0.0,0.0,0.0,"""Training"""
2008,"""aardsda01""",403250,1,4,2,47,0,0,0,0,146,49,30,4,35,49,0.268,5.55,2.0,3,5.0,0,228.0,7,32,3.0,2.0,4.0,1,47,0.0,146.0,3,6,0.0,0,0.0,0.0,0.0,0.0,0.0,"""Training"""
2009,"""aardsda01""",419000,1,3,6,73,0,0,0,38,214,49,20,4,34,80,0.19,2.52,3.0,2,0.0,0,296.0,53,23,2.0,1.0,2.0,1,73,0.0,214.0,2,5,0.0,1,0.0,0.0,0.0,0.0,0.0,"""Training"""
2010,"""aardsda01""",2750000,1,0,6,53,0,0,0,31,149,33,19,5,25,49,0.198,3.44,5.0,2,2.0,0,202.0,43,19,7.0,1.0,5.0,1,53,0.0,149.0,2,3,1.0,0,0.0,0.0,0.0,0.0,0.0,"""Training"""
2012,"""aardsda01""",500000,1,0,0,1,0,0,0,0,3,1,1,1,1,1,0.25,9.0,0.0,0,0.0,0,5.0,1,1,0.0,0.0,0.0,1,1,0.0,3.0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,"""Training"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2015,"""affelje01""",6000000,1,2,2,52,0,0,0,0,106,43,23,6,14,21,0.293,5.86,2.0,1,2.0,0,163.0,7,24,0.0,0.0,4.0,1,52,0.0,106.0,3,6,0.0,0,0.0,0.0,0.0,0.0,0.0,"""Training"""
2008,"""albaljo01""",393225,1,0,1,7,0,0,0,0,41,15,6,1,6,13,0.294,3.95,0.0,0,0.0,0,58.0,2,6,1.0,0.0,2.0,1,7,0.0,41.0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,"""Training"""
2009,"""albaljo01""",403075,1,5,1,32,0,0,0,0,103,41,20,6,16,21,0.306,5.24,2.0,0,3.0,0,158.0,5,23,1.0,4.0,3.0,1,32,0.0,103.0,1,7,0.0,0,0.0,0.0,0.0,0.0,0.0,"""Training"""
2008,"""alberma01""",395000,1,3,3,28,3,0,0,0,147,43,19,4,22,26,0.24,3.49,1.0,1,2.0,0,208.0,5,21,1.0,3.0,9.0,1,28,3.0,147.0,5,6,0.0,0,0.0,0.0,0.0,0.0,0.0,"""Training"""


In [41]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

In [42]:
# Single seed passed as random_state to the train/test split and to both
# tree-based estimators so that results are repeatable across runs.
RND: int = 42

In [43]:
# Partition the rows using the label already stored in the TrainVal column.
train_all_df = final_data.filter(pl.col("TrainVal").eq("Training"))
validation_df = final_data.filter(pl.col("TrainVal").eq("Validation"))

# Report the size of each partition as a quick sanity check.
print("Training rows:", train_all_df.height)
print("Validation rows:", validation_df.height)

Training rows: 3971
Validation rows: 417


In [44]:
# Name of the column the models are trained to predict.
TARGET: str = "Salary"

In [46]:
# Non-feature columns: the player identifier and the Training/Validation label.
drop_cols = ["playerID", "TrainVal"]
# Every remaining column except the target is used as a feature, in file column order.
feature_cols = [name for name in final_data.columns if name not in (*drop_cols, TARGET)]

In [47]:
# Feature matrices for the training pool and the validation rows, built the same way.
X_all, X_val = (
    frame.select(feature_cols).to_numpy() for frame in (train_all_df, validation_df)
)

# Matching 1-D target vectors (the single selected column is flattened).
y_all, y_val = (
    frame.select(TARGET).to_numpy().ravel() for frame in (train_all_df, validation_df)
)

In [48]:
# Hold out 20% of the "Training" rows as an internal test set; the fixed seed
# keeps the split repeatable.
# NOTE(review): rows are split at random, and the data preview shows the same
# playerID in several seasons, so one player's seasons may land on both sides
# of the split — confirm whether a player-grouped split is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.2, random_state=RND
)

In [49]:
# Learn per-column means from the training split only, then fill missing
# values in every split with those same training means.
imputer = SimpleImputer(strategy="mean").fit(X_train)
X_train, X_test, X_val = (imputer.transform(split) for split in (X_train, X_test, X_val))

In [51]:
# Tune a single regression tree (CART) over depth and minimum-split-size
# settings, scoring each candidate by 5-fold cross-validated RMSE.
cart = DecisionTreeRegressor(random_state=RND)
cart_grid = {"max_depth": [3, 5, 10, None], "min_samples_split": [2, 5, 10]}

cart_search = GridSearchCV(cart, cart_grid, cv=5, scoring="neg_root_mean_squared_error")
cart_search.fit(X_train, y_train)

print("\nCART Results")
print("Best Parameters:", cart_search.best_params_)
# best_score_ is the mean cross-validated score of the best candidate (a
# negated RMSE with this scorer), so it is a cross-validation estimate and
# not an error measured on the training data.
print("Cross-validated RMSE:", -cart_search.best_score_)


CART Results
Best Parameters: {'max_depth': 3, 'min_samples_split': 2}
Training RMSE: 3779233.7455706247


In [52]:
# Take the tree with the best grid-search parameters (GridSearchCV refits it
# on all of X_train by default) and predict the internal hold-out test split.
cart_best = cart_search.best_estimator_
y_pred_test = cart_best.predict(X_test)

In [53]:
# Hold-out test metrics for the tuned CART model.
# (numpy is already imported as np in the notebook's first import cell.)
mse = mean_squared_error(y_test, y_pred_test)
rmse = np.sqrt(mse)
print("Test RMSE:", rmse)
print("Test R²:", r2_score(y_test, y_pred_test))

Test RMSE: 3748584.7650515093
Test R²: 0.17874765043262497


In [54]:
# Predict salaries for the rows labelled "Validation" with the tuned CART model.
y_pred_val = cart_best.predict(X_val)

In [55]:
# Score the tuned CART model on the rows labelled "Validation" in the source file.
mse_val = mean_squared_error(y_true=y_val, y_pred=y_pred_val)
rmse_val = np.sqrt(mse_val)

print("\nValidation RMSE:", rmse_val)
print("Validation R²:", r2_score(y_true=y_val, y_pred=y_pred_val))


Validation RMSE: 5101686.859874753
Validation R²: 0.16284636768860894


In [56]:
# Base random forest plus the hyper-parameter grid explored by the search:
# forest size, tree depth and the minimum node size eligible for splitting.
rf = RandomForestRegressor(random_state=RND)
rf_grid = dict(
    n_estimators=[100, 200],
    max_depth=[5, 10, None],
    min_samples_split=[2, 5],
)

In [58]:
# Exhaustive search over rf_grid, scoring each candidate by 5-fold
# cross-validated RMSE; n_jobs=-1 uses all available cores.
rf_search = GridSearchCV(rf, rf_grid, cv=5, scoring="neg_root_mean_squared_error", n_jobs=-1)
rf_search.fit(X_train, y_train)

print("\nRandom Forest Results")
print("Best Parameters:", rf_search.best_params_)
# best_score_ is the mean cross-validated score of the best candidate (a
# negated RMSE with this scorer), so it is a cross-validation estimate and
# not an error measured on the training data.
print("Cross-validated RMSE:", -rf_search.best_score_)


Random Forest Results
Best Parameters: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}
Training RMSE: 3637242.960921482


In [59]:
# Take the forest with the best grid-search parameters and predict the
# internal hold-out test split.
rf_best = rf_search.best_estimator_
y_pred_rf_test = rf_best.predict(X_test)

# Hold-out test metrics for the tuned random forest.
mse_rf_test = mean_squared_error(y_true=y_test, y_pred=y_pred_rf_test)
rmse_rf_test = np.sqrt(mse_rf_test)

print("Test RMSE:", rmse_rf_test)
print("Test R²:", r2_score(y_true=y_test, y_pred=y_pred_rf_test))

Test RMSE: 3656877.1193011696
Test R²: 0.2184393457692666


In [60]:
# Predict the rows labelled "Validation" with the tuned random forest.
y_pred_rf_val = rf_best.predict(X_val)

# Validation metrics for the tuned random forest.
mse_rf_val = mean_squared_error(y_true=y_val, y_pred=y_pred_rf_val)
rmse_rf_val = np.sqrt(mse_rf_val)

print("\nValidation RMSE:", rmse_rf_val)
print("Validation R²:", r2_score(y_true=y_val, y_pred=y_pred_rf_val))


Validation RMSE: 4743085.392813332
Validation R²: 0.27639850293345414


In [61]:
# Persist both tuned estimators for later reuse.
joblib.dump(cart_best, "cart_best_model.pkl")
joblib.dump(rf_best, "rf_best_model.pkl")
# Both models were fit on mean-imputed features, so the fitted imputer is
# saved alongside them; without it, new data cannot be given the same
# preprocessing before calling predict.
joblib.dump(imputer, "imputer.pkl")

print("\nModels saved successfully.")


Models saved successfully.


We compared two models: a CART decision tree and a Random Forest. The Random Forest was the better of the two, achieving an $R^2$ of 0.2764 on the final validation data (meaning it explains about 27.64\% of the variance in salary) with an RMSE of 4,743,085. The simpler CART model was less accurate, reaching an $R^2$ of only 0.1628 with an RMSE of 5,101,687 on the same data. Even our best model explains only about a quarter of the variation in salary. To improve accuracy, we may need better data or to rethink the modeling approach altogether.