In [None]:
!pip install pybaseball

Collecting pybaseball
  Downloading pybaseball-2.2.7-py3-none-any.whl.metadata (11 kB)
Collecting pygithub>=1.51 (from pybaseball)
  Downloading pygithub-2.8.1-py3-none-any.whl.metadata (3.9 kB)
Collecting pynacl>=1.4.0 (from pygithub>=1.51->pybaseball)
  Downloading pynacl-1.6.2-cp38-abi3-manylinux_2_34_x86_64.whl.metadata (10.0 kB)
Downloading pybaseball-2.2.7-py3-none-any.whl (426 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m426.1/426.1 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pygithub-2.8.1-py3-none-any.whl (432 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m432.7/432.7 kB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pynacl-1.6.2-cp38-abi3-manylinux_2_34_x86_64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m55.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynacl, pygithub, pybaseball
Successfully installed pybaseball-2.2.7 pygithub-2

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from pybaseball import statcast_pitcher, playerid_lookup

# Pull Statcast data for multiple seasons

def get_season_data(year, player_id):
  start = f"{year}-03-20"
  end = f"{year}-09-30"
  return statcast_pitcher(start, end, player_id)


In [None]:
playerid_lookup('verlander', 'justin')

Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
0,verlander,justin,434378,verlj001,verlaju01,8700,2005.0,2025.0


In [None]:
player_id = 434378
seasons = range(2017, 2026)
dfs = []

for year in seasons:
  df_year = get_season_data(year, player_id)
  df_year['season'] = year
  dfs.append(df_year)

statcast = pd.concat(dfs, ignore_index=True)


Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data
Gathering Player Data


  statcast = pd.concat(dfs, ignore_index=True)


In [None]:
# Filter to fastballs only

fastballs = statcast[statcast['pitch_type'].isin(['FF'])].copy()

In [None]:
# Compute average fastball velocity per pitcher-season

velo_by_season = (
    fastballs.groupby(['pitcher', 'season']).agg(
        fb_velo=('release_speed', 'mean'),
        pitches=('release_speed', 'count')
    )
    .reset_index()
)

In [None]:
# Remove any small samples
velo_by_season = velo_by_season[velo_by_season['pitches'] >= 200]

In [None]:
# Add pitcher age

age_lookup = {
    434378: {
        2019: 36,
        2020: 37,
        2021: 38,
        2022: 39,
        2023: 40,
        2024: 41,
        2025: 42
    }
}

In [None]:
# Attach age to velo_by_season

def get_age(pitcher_id, season):
  return age_lookup.get(pitcher_id, {}).get(season, np.nan)

In [None]:
velo_by_season['age'] = velo_by_season.apply(
    lambda r: get_age(r['pitcher'], r['season']), axis=1
)

In [None]:
# drop rows where age is missing

velo_by_season = velo_by_season.dropna(subset=['age'])

In [None]:
# Create nonlinear age term

velo_by_season['age_sq'] = velo_by_season['age'] ** 2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  velo_by_season['age_sq'] = velo_by_season['age'] ** 2


In [None]:
# Build paired season data

velo_by_season = velo_by_season.sort_values(['pitcher', 'season'])

In [None]:
velo_by_season['fb_velo_next'] = (
    velo_by_season.groupby('pitcher')['fb_velo'].shift(-1)
)

In [None]:
# Keep only valid pairs

paired = velo_by_season.dropna(subset=['fb_velo_next']).copy()

In [None]:
# Define model inputs

features = [
    'fb_velo',
    'age',
    'age_sq',
    'season',
    'pitches'
]

X = paired[features]
y = paired['fb_velo_next']

In [None]:
# fit the aging model

model = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', Ridge(alpha=1.0))
])

model.fit(X, y)

In [None]:
# Sanity check

preds = model.predict(X)
residuals = y - preds

print('Mean error:', residuals.mean())
print('RMSE:', np.sqrt(np.mean(residuals**2)))

Mean error: 3.552713678800501e-15
RMSE: 0.18106857248596087


In [None]:
# prepare Justin Verlander's most recent season

verlander_id = 434378

In [None]:
verlander_recent = (
    velo_by_season[velo_by_season['pitcher'] == verlander_id]
    .sort_values('season')
    .iloc[-1]
)

In [None]:
# Predict Verlander's next-season velo

verlander_input = pd.DataFrame([{
    'fb_velo': verlander_recent['fb_velo'],
    'age': verlander_recent['age'],
    'age_sq': verlander_recent['age'] ** 2,
    'season': verlander_recent['season'],
    'pitches': verlander_recent['pitches']
}])

In [None]:
# Predict

verlander_pred = model.predict(verlander_input)[0]
print(f'Predicted next-season FB velocity: {verlander_pred:.2f} mph')

Predicted next-season FB velocity: 93.22 mph


In [None]:
# Add uncertainty

rmse = np.sqrt(np.mean(residuals**2))

lower = verlander_pred - 1.28 * rmse    # ~80% interval
upper = verlander_pred + 1.28 * rmse

print(f'80% prediction interval: [{lower:.2f}, {upper:.2f}] mph')

80% prediction interval: [92.99, 93.45] mph
