<a href="https://colab.research.google.com/github/Megancodes2017/Megancodes2017/blob/main/Schwarber_pred.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pybaseball

Collecting pybaseball
  Downloading pybaseball-2.2.7-py3-none-any.whl.metadata (11 kB)
Collecting pygithub>=1.51 (from pybaseball)
  Downloading PyGithub-2.6.1-py3-none-any.whl.metadata (3.9 kB)
Collecting pynacl>=1.4.0 (from pygithub>=1.51->pybaseball)
  Downloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (8.6 kB)
Collecting Deprecated (from pygithub>=1.51->pybaseball)
  Downloading Deprecated-1.2.18-py2.py3-none-any.whl.metadata (5.7 kB)
Downloading pybaseball-2.2.7-py3-none-any.whl (426 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m426.1/426.1 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyGithub-2.6.1-py3-none-any.whl (410 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (856 kB)
[2K   [90m━━━━━━━━━

In [None]:
from pybaseball import statcast_batter, batting_stats
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor

In [None]:
# Load FanGraphs season batting lines for 2015-2024, keeping every hitter with
# at least MIN_PA plate appearances in a season.
# batting_stats() only returns *qualified* hitters unless `qual` is given, so the
# threshold has to be passed to the query itself; filtering afterwards cannot
# bring back rows that were never downloaded.
MIN_PA = 100
years = list(range(2015, 2025))
all_stats = pd.concat(
    [batting_stats(year, qual=MIN_PA) for year in years],
    ignore_index=True,
)
# Local guard on the same threshold. The .copy() makes all_stats an independent
# frame so later cells can add columns without a SettingWithCopyWarning.
all_stats = all_stats[all_stats['PA'] >= MIN_PA].copy()

In [None]:
# Create future target (next year's wOBA)
# Each player-season row is paired with the same player's row from the following
# season. The join is keyed on the FanGraphs player id (IDfg) rather than the
# display name, because two different players can share a name and would
# otherwise be matched to each other's seasons.
future = all_stats[['IDfg', 'Season', 'wOBA']].rename(
    columns={'wOBA': 'next_year_woba', 'Season': 'next_year'})

# .assign() adds the join key on a temporary frame, so all_stats itself is left
# untouched and re-running this cell is idempotent.
merged = (
    all_stats
    .assign(next_year=all_stats['Season'] + 1)
    .merge(future, on=['IDfg', 'next_year'])  # inner join: rows without a following season drop out
)

In [None]:
# Feature selection and engineering
# Current-season columns used as model inputs.
features = [
    'Season', 'Age', 'PA',
    'BB%', 'K%',
    'AVG', 'OBP', 'SLG', 'ISO', 'BABIP',
    'wRC+', 'WAR',
]

# Keep only rows where every input and the target are present.
required_cols = [*features, 'next_year_woba']
merged = merged.dropna(subset=required_cols)

X, y = merged[features], merged['next_year_woba']

In [None]:
# Train test split and model fitting
# Hold out 20% of the rows (fixed seed) and fit a default LightGBM regressor
# on the remaining 80%.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)
model = LGBMRegressor().fit(X_train, y_train)  # fit() returns the estimator itself
model

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000260 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1261
[LightGBM] [Info] Number of data points in the train set: 621, number of used features: 12
[LightGBM] [Info] Start training from score 0.342498


In [None]:
# Evaluate
# Mean squared error of the fitted model on the held-out rows.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'MSE Test: {mse:.4f}')

MSE Test: 0.0010


In [None]:
# Predict for Kyle Schwarber
# Take his most recent season in all_stats and feed its feature columns to the model.
is_kyle = all_stats['Name'] == 'Kyle Schwarber'
kyle = all_stats.loc[is_kyle].sort_values(by='Season', ascending=False).iloc[:1]

if kyle.empty:
  print('Kyle Schwarber data not found')
else:
  kyle_input = kyle[features]
  kyle_woba_pred = model.predict(kyle_input)
  print(f'Predicted next year wOBA for Kyle Schwarber: {kyle_woba_pred[0]:.3f}')

Predicted next year wOBA for Kyle Schwarber: 0.347


In [None]:
# Create multi year future targets
# For each horizon i in 1..3, attach to every player-season row (season S) the
# same player's wOBA from season S + i as column wOBA_plus_i.
#
# Direction matters: the lookup frame is re-labelled with Season - i, so the
# wOBA recorded in season S + i lands on the row for season S. Re-labelling
# with Season + i instead would attach the wOBA from i years *earlier*.
#
# The join uses the FanGraphs player id (IDfg) because display names are not
# unique across players.
future_targets = []
for i in range(1, 4):
  future = (
      all_stats[['IDfg', 'Season', 'wOBA']]
      .assign(Season=all_stats['Season'] - i)
      .rename(columns={'wOBA': f'wOBA_plus_{i}'})
  )
  future_targets.append(future)

# Left joins keep every player-season; horizons with no matching later season
# are left as NaN.
merged = all_stats.copy()
for future in future_targets:
  merged = merged.merge(future, on=['IDfg', 'Season'], how='left')

In [None]:
# Feature selection and engineering
# Model inputs (current-season columns) and the three horizon target columns.
features = [
    'Season', 'Age', 'PA',
    'BB%', 'K%',
    'AVG', 'OBP', 'SLG', 'ISO', 'BABIP',
    'wRC+', 'WAR',
]
targets = [
    'wOBA_plus_1',
    'wOBA_plus_2',
    'wOBA_plus_3',
]

# A row is kept only if every input and all three targets are present.
required_cols = [*features, *targets]
merged = merged.dropna(subset=required_cols)

X = merged[features]

In [None]:
# Train one model per future year
# models:      target column name -> fitted LGBMRegressor
# predictions: target column name -> predictions on the held-out rows
models = {}
predictions = {}

# Every target shares the same X and the same seed, so the train/test row
# partition is identical for all of them: split once, then pick the target
# column inside the loop.
X_train, X_test, y_train_all, y_test_all = train_test_split(
    X, merged[targets], test_size=0.2, random_state=42)

for target in targets:
  y_train, y_test = y_train_all[target], y_test_all[target]
  model = LGBMRegressor(verbose=-1)  # silence per-fit LightGBM info logs
  model.fit(X_train, y_train)
  models[target] = model
  y_pred = model.predict(X_test)
  mse = mean_squared_error(y_test, y_pred)
  print(f'MSE Test for {target}: {mse:.4f}')
  predictions[target] = y_pred


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000080 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 617
[LightGBM] [Info] Number of data points in the train set: 220, number of used features: 12
[LightGBM] [Info] Start training from score 0.349432
MSE Test for wOBA_plus_1: 0.0015
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000057 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 617
[LightGBM] [Info] Number of data points in the train set: 220, number of used features: 12
[LightGBM] [Info] Start training from score 0.350214
MSE Test for wOBA_plus_2: 0.0010
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000088 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 617
[LightGBM] [Info] Number of data points in the train set: 220

In [None]:
# Predict for Kyle Schwarber next 3 seasons
# Take his most recent season in all_stats and run it through each per-target model.
is_kyle = all_stats['Name'] == 'Kyle Schwarber'
kyle = all_stats.loc[is_kyle].sort_values(by='Season', ascending=False).iloc[:1]

if kyle.empty:
  print('Kyle Schwarber data not found')
else:
  kyle_input = kyle[features]
  for target in targets:
    kyle_pred = models[target].predict(kyle_input)
    # The trailing piece of the target name (after the last underscore) is the horizon number.
    year_number = target.rsplit('_', 1)[-1]
    print(f'Predicted wOBA for Kyle Schwarber in +{year_number}: {kyle_pred[0]:.3f}')

Predicted wOBA for Kyle Schwarber in +1: 0.363
Predicted wOBA for Kyle Schwarber in +2: 0.369
Predicted wOBA for Kyle Schwarber in +3: 0.386
