In [1]:
from pathlib import Path

import pandas as pd

# Directory that holds the raw CSV files. Edit this single line to run the
# notebook from another location; the default keeps the original path.
DATA_DIR = Path('/Users/ranch/Desktop/DSCI303')

# Only salary rows whose 'year' equals this value are joined to the batting data.
SALARY_YEAR = 2019

# Load the datasets
salaries_df = pd.read_csv(DATA_DIR / 'mlbSalaries.csv')
batters_df = pd.read_csv(DATA_DIR / 'mlb-player-stats-Batters.csv')

# The salary file calls the player column 'name'; rename it to 'Player' so both
# frames share a join key. rename() returns a new frame (no inplace mutation).
salaries_df = salaries_df.rename(columns={'name': 'Player'})

# Keep only the salary rows for SALARY_YEAR
salaries_2019_df = salaries_df[salaries_df['year'] == SALARY_YEAR]

# Inner join on 'Player': only players present in both files are kept
merged_df = pd.merge(salaries_2019_df, batters_df, on='Player', how='inner')

# Display the first row of the merged dataset
print(merged_df.head(1))

   year                  team           Player   salary  playerID Team Pos  \
0  2019  arizona-diamondbacks  Eduardo Escobar  6166666    500871  ARI  3B   

   Age    G   AB  ...  CS  BB   SO  SH  SF  HBP    AVG   OBP    SLG    OPS  
0   35  158  636  ...   1  50  130   0  10    3  0.269  0.32  0.511  0.831  

[1 rows x 27 columns]


In [2]:
# Data Pre Processing

# Minimum number of at bats ('AB') a player needs to stay in the dataset.
MIN_AT_BATS = 50

# Build filtered_df from merged_df without mutating merged_df:
#   1. drop the 'playerID' and 'team' columns we won't need,
#   2. keep only players with at least MIN_AT_BATS at bats.
# Because nothing is dropped in place, re-running this cell does not raise a
# KeyError on columns that an earlier run already removed.
filtered_df = (
    merged_df
    .drop(columns=['playerID', 'team'])
    .loc[lambda frame: frame['AB'] >= MIN_AT_BATS]
)

In [3]:
# Hold Pete Alonso out of the data so the model can predict his salary later
is_holdout = filtered_df['Player'] == 'Pete Alonso'
chosen_player = filtered_df[is_holdout]

# Keep every other row, so his row is not used for training or testing
filtered_df = filtered_df[~is_holdout]

In [4]:
# Sanity check: show the first five rows of the processed data
print(filtered_df.head(n=5))

   year           Player   salary Team Pos  Age    G   AB   R    H  ...  CS  \
0  2019  Eduardo Escobar  6166666  ARI  3B   35  158  636  94  171  ...   1   
1  2019        Jake Lamb  4825000  ARI  OF   34   78  187  26   36  ...   0   
2  2019       Adam Jones  4500000  ARI  OF   39  137  485  66  126  ...   1   
3  2019       Alex Avila  4250000  ARI   C   37   63  164  22   34  ...   0   
4  2019     Jarrod Dyson  4000000  ARI  OF   40  130  400  65   92  ...   4   

   BB   SO  SH  SF  HBP    AVG    OBP    SLG    OPS  
0  50  130   0  10    3  0.269  0.320  0.511  0.831  
1  32   55   0   2    5  0.193  0.323  0.353  0.676  
2  31  101   0   3    8  0.260  0.313  0.414  0.727  
3  36   68   0   0    1  0.207  0.353  0.421  0.774  
4  47   86   1   2    2  0.230  0.313  0.320  0.633  

[5 rows x 25 columns]


In [17]:
# Support Vector Regression (our main model)
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Predictor columns (X) and the salary target (y)
feature_columns = ['HR', 'H', 'AVG', 'Age', 'SLG', 'OBP', 'OPS', 'BB']
X = filtered_df.loc[:, feature_columns]
y = filtered_df['salary']

# The model is trained on log(1 + salary) instead of the raw salary
y_log = np.log1p(y)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train_log, y_test_log = train_test_split(
    X,
    y_log,
    test_size=0.2,
    random_state=42,
)

# Standardise the features; the scaler is fitted on the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the RBF-kernel SVR on the log-transformed target
svr = SVR(kernel='rbf', C=10, gamma=0.01)
svr.fit(X_train_scaled, y_train_log)

# Predict on the test rows (still on the log scale)
y_pred_log = svr.predict(X_test_scaled)

# Undo log1p so the metrics are reported on the original salary scale
y_pred, y_test = np.expm1(y_pred_log), np.expm1(y_test_log)

# Score the predictions: RMSE and R-squared
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared Score: {r2}")


Root Mean Squared Error: 4508876.223824617
R-squared Score: 0.5269883826991046


In [7]:
from sklearn.inspection import permutation_importance

# Permutation importance of each feature for the fitted SVR, computed on the
# scaled test rows against the log-transformed target (10 shuffles per feature,
# fixed seed for repeatability).
perm_importance = permutation_importance(
    svr, X_test_scaled, y_test_log, n_repeats=10, random_state=42
)

# Label each importance with its feature name. The names are read from
# X_test.columns (the frame X_test_scaled was built from) instead of a
# hand-copied list, so the labels cannot drift out of order or go stale if the
# feature set changes.
feature_importance = pd.DataFrame({
    'Feature': list(X_test.columns),
    'Importance': perm_importance.importances_mean,
})

# Sort features from most to least important
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Display feature importance
print(feature_importance)

  Feature  Importance
3     Age    0.725375
1       H    0.282499
0      HR    0.135981
7      BB    0.102782
5     OBP    0.044643
6     OPS    0.024291
2     AVG    0.016007
4     SLG    0.012440


In [10]:
# Pull the model inputs for the held-out player (Pete Alonso)
player_features = ['HR', 'H', 'AVG', 'Age', 'SLG', 'OBP', 'OPS', 'BB']
X_player = chosen_player[player_features].copy()  # work on a copy to avoid warnings

# What-if: uncomment to override his age and see how the prediction moves
#X_player.loc[:, 'Age'] = 30

# Apply the already-fitted scaler (transform only, no refit)
X_player_scaled = scaler.transform(X_player)

# The SVR predicts on the log scale ...
player_salary_log = svr.predict(X_player_scaled)

# ... so invert log1p to get back to a salary figure
player_salary = np.expm1(player_salary_log)

# Report the prediction for the single held-out row
print(f"Predicted Salary for Pete Alonso: ${player_salary[0]:,.2f}")


Predicted Salary for Pete Alonso: $2,040,880.50


In [15]:
# Hyperparameter search for the RBF-kernel SVR
from sklearn.model_selection import GridSearchCV

# Candidate values: C is the regularization parameter, gamma the kernel coefficient
param_grid = dict(
    C=[0.1, 1, 10, 100],
    gamma=['scale', 'auto', 0.01, 0.1, 1],
)

# 5-fold cross-validated grid search scored by negative mean squared error
grid_search = GridSearchCV(
    estimator=SVR(kernel='rbf'),
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
grid_search.fit(X_train_scaled, y_train_log)

# Keep the best estimator as `svr` and report the winning parameters
svr = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)


Best Parameters: {'C': 10, 'gamma': 0.01}


In [18]:
# Linear regression (control to compare to our other models)
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Predictor columns (X) and the salary target (y)
lr_feature_columns = ['HR', 'H', 'AVG', 'Age', 'SLG', 'OBP', 'OPS', 'BB']
X = filtered_df.loc[:, lr_feature_columns]
y = filtered_df['salary']

# Train on log(1 + salary) instead of the raw salary
y_log = np.log1p(y)

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train_log, y_test_log = train_test_split(
    X,
    y_log,
    test_size=0.2,
    random_state=42,
)

# Standardise the features; the scaler is fitted on the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit ordinary least squares on the log-transformed target
lr = LinearRegression()
lr.fit(X_train_scaled, y_train_log)

# Predict on the test rows (log scale)
y_pred_log = lr.predict(X_test_scaled)

# Undo log1p so the metrics are reported on the original salary scale
y_pred, y_test = np.expm1(y_pred_log), np.expm1(y_test_log)

# Score the predictions: RMSE and R-squared
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared Score: {r2}")


Root Mean Squared Error: 4772336.273188315
R-squared Score: 0.4700959288963734


In [23]:
# Random Forest
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Predictor columns (X) and the salary target (y)
rf_feature_columns = ['HR', 'H', 'AVG', 'Age', 'SLG', 'OBP', 'OPS', 'BB']
X = filtered_df.loc[:, rf_feature_columns]
y = filtered_df['salary']

# Train on log(1 + salary) instead of the raw salary
y_log = np.log1p(y)

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train_log, y_test_log = train_test_split(
    X,
    y_log,
    test_size=0.2,
    random_state=42,
)

# Standardise the features; the scaler is fitted on the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random Forest settings (values from tuning)
rf_settings = dict(
    n_estimators=500,
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=4,
    random_state=42,
)
rf = RandomForestRegressor(**rf_settings)
rf.fit(X_train_scaled, y_train_log)

# Predict on the test rows (log scale)
y_pred_log = rf.predict(X_test_scaled)

# Undo log1p so the metrics are reported on the original salary scale
y_pred, y_test = np.expm1(y_pred_log), np.expm1(y_test_log)

# Score the predictions: RMSE and R-squared
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared Score: {r2}")


Root Mean Squared Error: 4886104.713525955
R-squared Score: 0.4445298564353426


In [20]:
# Hyperparameter search for the Random Forest
from sklearn.model_selection import GridSearchCV

# Candidate values for each Random Forest setting
param_grid = dict(
    n_estimators=[100, 200, 500],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# 5-fold cross-validated grid search scored by negative mean squared error;
# verbose=1 prints the number of fits being run
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=1,
)
grid_search.fit(X_train_scaled, y_train_log)

# Report the winning parameters and keep the best fitted model
print("Best Parameters:", grid_search.best_params_)
best_rf = grid_search.best_estimator_

# Predict with the best model, then undo log1p to return to the salary scale
y_pred_log = best_rf.predict(X_test_scaled)
y_pred = np.expm1(y_pred_log)


Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 500}
