In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [None]:
# Read the CSV that serves as the modelling dataset for the cells below.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df = pd.read_csv(
    filepath_or_buffer="/Users/johndavis/Desktop/df_total_full_arsenal_ERA4.csv"
)

In [None]:
# Keep only rows in which every column has a value.
df_cleaned = df.dropna(axis=0, how='any')

# The regression target is the ERA column.
y = df_cleaned['ERA']

# Features are every remaining column except the target, the 'Unnamed: 0'
# column (presumably a leftover CSV index — confirm) and the pitcher label.
X = df_cleaned.drop(['ERA', 'Unnamed: 0', 'Pitcher'], axis=1)

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardise the features: scaling statistics are learned from the training
# split only and then applied unchanged to the validation split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Fit both base learners on the same scaled training matrix.
ridge_model = Ridge(alpha=1.0).fit(X_train_scaled, y_train)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# Validation-set predictions from each base learner.
y_val_pred_ridge = ridge_model.predict(X_val_scaled)
y_val_pred_rf = rf_model.predict(X_val_scaled)

In [None]:
# Blend the two models' validation predictions with equal weights.
w1 = w2 = 0.5
y_val_pred_ensemble = (w1 * y_val_pred_ridge) + (w2 * y_val_pred_rf)

# Score the blended prediction against the held-out targets.
r2_ensemble = r2_score(y_true=y_val, y_pred=y_val_pred_ensemble)
print(f"R^2 score of the ensemble model: {r2_ensemble}")

In [None]:
# Build the scaled feature matrix used for the 2023 xERA predictions.
# NOTE(review): this reads the same CSV path as the training-data load earlier
# in the notebook, so these rows are the ones the models were fitted and
# validated on — confirm whether a separate 2023 file was intended.
df_2023 = pd.read_csv("/Users/johndavis/Desktop/df_total_full_arsenal_ERA4.csv")

# Keep complete rows only, then renumber them 0..n-1 so the frame has a clean
# positional index.
df_2023_cleaned = df_2023.dropna().reset_index(drop=True)

# Select exactly the columns the scaler and models were fitted on, in the same
# order, instead of repeating the hard-coded drop list from the training cell.
# A missing feature column now fails here with a clear KeyError.
X_2023 = df_2023_cleaned.loc[:, X.columns]

# Reuse the scaler fitted on the training split; do not refit on prediction data.
X_2023_scaled = scaler.transform(X_2023)

# Display (n_rows, n_features) as a quick sanity check.
X_2023_scaled.shape

In [None]:
# Attach each model's prediction and their weighted blend as new columns.
# Keyword arguments to assign() are evaluated in order, so the ensemble column
# can be computed from the two columns created just before it.
df_2023_cleaned = df_2023_cleaned.assign(
    xERA_RIDGE=ridge_model.predict(X_2023_scaled),
    xERA_RF=rf_model.predict(X_2023_scaled),
    xERA_ENSEMBLE=lambda frame: w1 * frame['xERA_RIDGE'] + w2 * frame['xERA_RF'],
)

In [None]:
# Order rows by the ensemble xERA, lowest first.
df_2023_cleaned = df_2023_cleaned.sort_values(by='xERA_ENSEMBLE', ascending=True)

In [79]:
# Keep only rows whose count_x is at least 500
# (count_x is presumably a pitch count — confirm).
df_2023_cleaned_502 = df_2023_cleaned.loc[df_2023_cleaned['count_x'] >= 500]

In [80]:
# Reduce the count_x >= 500 frame to the columns needed for reporting.
# The source is df_2023_cleaned_502 (the filtered frame built just above); the
# result is stored as df_2023_cleaned_501, the name every later cell uses.
# Previously this line read df_2023_cleaned_501 before anything had defined it,
# which raises NameError on a fresh kernel.
# .copy() gives an independent frame so later renames do not act on a slice.
df_2023_cleaned_501 = df_2023_cleaned_502[
    ['Pitcher', 'count_x', 'Stuff+', 'ERA', 'xERA_ENSEMBLE', 'xba', 'whiff_percent']
].copy()

In [None]:
# Expose the ensemble prediction under the shorter column name 'xERA'.
df_2023_cleaned_501 = df_2023_cleaned_501.rename(columns={'xERA_ENSEMBLE': 'xERA'})

In [None]:
# First 25 rows of the reporting frame, in its current row order.
top_25_xERA = df_2023_cleaned_501.iloc[:25]

In [81]:
# Stricter subset: rows whose count_x is at least 1000.
df_2023_cleaned_1000 = df_2023_cleaned.loc[df_2023_cleaned['count_x'] >= 1000]

In [None]:
# Write the top-25 table and the full reporting frame to CSV files.
# NOTE(review): absolute, machine-specific output paths.
top_25_xERA.to_csv(path_or_buf='/Users/johndavis/Desktop/top_25_xERA.csv')
df_2023_cleaned_501.to_csv(path_or_buf='/Users/johndavis/Desktop/df_2023_cleaned_501.csv')
