# models

In [146]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

This file is used to prototype with different models in this project. It is split up into two different sections: one section is for the models predicting the hits for batters, and the other is for strikes for pitchers.

### Section 1: Batter Models

#### Linear Regression

In [147]:
# Load the data
data = pd.read_csv("data/batting_season_summary.csv")

In [148]:
X = data[["SLG", "OPS", "BA", "HR", "2B"]]
y = data["H"]  # Number of hits as target variable

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.01, random_state=42
)

In [149]:
# Initialize and train the model
model = LinearRegression()
model.fit(X_train, y_train)

In [150]:
# Predict on the test set
y_pred = model.predict(X_test)

In [151]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 49.50089244262777


In [152]:
# 6. Evaluation
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [153]:
print("Model Performance:")
print("Mean Squared Error:", mse)
print("R-squared:", r2)

Model Performance:
Mean Squared Error: 49.50089244262777
R-squared: 0.9473248710486875


In [154]:
# Make predictions for all players
predictions = model.predict(X)

# Add predicted hits to the DataFrame
data["Predicted_H"] = predictions

# Group the data by player name and select the player with the highest predicted hits within each group
top_players = data.groupby("Name").apply(lambda x: x.nlargest(1, "Predicted_H"))

# Sort the DataFrame by predicted hits and select the top 25 players
top_players = top_players.sort_values(by="Predicted_H", ascending=False).head(25)

print("Top 25 players with the highest predicted hits:")
print(top_players[["Name", "Predicted_H"]])

Top 25 players with the highest predicted hits:
                                         Name  Predicted_H
Name                                                      
Eric Cyr            2248             Eric Cyr   245.211102
Dusty Ryan          2511           Dusty Ryan   236.602747
Steve Gajkowski     2670      Steve Gajkowski   233.637139
Bill Ortega         1909          Bill Ortega   232.175259
Scott Alexander     1842      Scott Alexander   231.916246
Severino Gonzalez   1587    Severino Gonzalez   231.641507
Chris Reed          1420           Chris Reed   231.511684
Matt Tracy          2610           Matt Tracy   230.095167
Dan Murray          2497           Dan Murray   229.447787
Clay Timpner        881          Clay Timpner   228.647506
Brian Broderick     2192      Brian Broderick   228.271268
Audry Perez         1100          Audry Perez   228.189779
Willie Martinez     1615      Willie Martinez   227.198681
Danny Young         2500          Danny Young   226.423547
Jaff Dec

  top_players = data.groupby("Name").apply(lambda x: x.nlargest(1, "Predicted_H"))


#### Random Forest Regressors

In [155]:
# Select features and target variable
X = data[["PA", "AB", "2B", "3B", "HR", "BB", "SO"]]
y = data["H"]  # Number of hits as target variable

In [156]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [157]:
# Initialize and train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [158]:
# Predict on the test set
y_pred = model.predict(X_test)

In [159]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 140.3367275925926


In [160]:
# Make predictions for all players
predictions = model.predict(X)

# Add predicted hits to the DataFrame
data["Predicted_H"] = predictions

# Group the data by player name and select the player with the highest predicted hits within each group
top_players = data.groupby("Name").apply(lambda x: x.nlargest(1, "Predicted_H"))

# Sort the DataFrame by predicted hits and select all players
top_players = top_players.sort_values(by="Predicted_H", ascending=False).head(500)

print("Top 25 players with the highest predicted hits:")
print(top_players[["Name", "Predicted_H"]])

Top 25 players with the highest predicted hits:
                                     Name  Predicted_H
Name                                                  
Eric Cyr          2248           Eric Cyr       248.25
Scott Alexander   1580    Scott Alexander       240.20
Dusty Ryan        2511         Dusty Ryan       236.53
Danny Young       2500        Danny Young       233.88
Severino Gonzalez 1587  Severino Gonzalez       232.37
...                                   ...          ...
Dan Winkler       75          Dan Winkler       121.84
Brian Slocum      2467       Brian Slocum       120.91
Eddie Kunz        1175         Eddie Kunz       120.69
Alex Castellanos  8      Alex Castellanos       120.63
Shawn Sedlacek    2665     Shawn Sedlacek       120.42

[500 rows x 2 columns]


  top_players = data.groupby("Name").apply(lambda x: x.nlargest(1, "Predicted_H"))


#### Linear Regression using Previous Year Stats