In [32]:
#Credit to https://github.com/sam-walsh/pitching_model/blob/main/predictive_run_values.ipynb
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, KFold

In [77]:
# Read the two Statcast CSV exports (paths relative to the working directory) into
# separate frames, in the same order as before: 2023 first, then the multi-season file.
# NOTE(review): the variable is named df_2021_22 but the file name says 2020_to_2022 --
# confirm which seasons the file actually covers.
df_2023, df_2021_22 = (
    pd.read_csv(csv_name)
    for csv_name in ("Statcast_2023.csv", "Statcast_2020_to_2022.csv")
)

In [113]:
# Stack both Statcast frames into a single pitch-level table.
# ignore_index=True gives the result a fresh 0..n-1 row index. Without it each input keeps
# its own default row labels from read_csv, so the same labels appear twice in the combined
# frame and any label-aligned operation on it (e.g. `.loc` assignment from a Series whose
# index differs) can fail or misalign.
df_all_years = pd.concat([df_2023, df_2021_22], ignore_index=True)

In [114]:
# Baseline run value: each pitch receives the mean delta_run_exp of all pitches sharing
# its `events` value. Rows whose `events` is missing come out as NaN, because groupby
# drops NaN keys by default.
event_mean_run_value = df_all_years.groupby('events')['delta_run_exp'].transform('mean')
df_all_years['new_run_value'] = event_mean_run_value

In [115]:
# Show every distinct pitch `description` value, in order of first appearance.
df_all_years['description'].drop_duplicates().tolist()

['called_strike',
 'ball',
 'foul',
 'hit_into_play',
 'foul_tip',
 'swinging_strike',
 'blocked_ball',
 'swinging_strike_blocked',
 'foul_bunt',
 'hit_by_pitch',
 'pitchout',
 'missed_bunt',
 'bunt_foul_tip',
 'foul_pitchout']

In [116]:
# Collapse raw pitch `description` values into four coarse outcomes.
# Written outcome-first so it is easy to see which raw values fold into each bucket.
# 'hit_into_play' and 'hit_by_pitch' are not keys, so a `Series.map` with this dict
# leaves those rows as NaN.
_raw_descriptions_by_outcome = {
    'called_strike': ('called_strike',),
    'ball': ('ball', 'blocked_ball', 'pitchout'),
    'foul': ('foul', 'foul_tip', 'foul_bunt', 'bunt_foul_tip', 'foul_pitchout'),
    'swinging_strike': ('swinging_strike', 'swinging_strike_blocked', 'missed_bunt'),
}

# NOTE: despite the `_list` suffix this is a dict: raw description -> simplified outcome.
simple_descriptions_list = {
    raw_description: outcome
    for outcome, raw_descriptions in _raw_descriptions_by_outcome.items()
    for raw_description in raw_descriptions
}

In [117]:
# Translate each raw `description` through the simplification dict; descriptions that are
# not keys of the dict become NaN in the new column.
simplified_description = df_all_years['description'].map(simple_descriptions_list)
df_all_years['simple_description'] = simplified_description

In [118]:
# Mean delta_run_exp per simplified outcome, computed only over pitches that were not
# put in play. The resulting Series is indexed by simple_description.
not_inplay = df_all_years.loc[df_all_years['description'].ne('hit_into_play')]
not_inplay_run_values = (
    not_inplay
    .groupby('simple_description')['delta_run_exp']
    .mean()
)
not_inplay_run_values

simple_description
ball               0.057641
called_strike     -0.064107
foul              -0.040587
swinging_strike   -0.116211
Name: delta_run_exp, dtype: float64

In [119]:
# For every pitch that has a simplified description, replace its event-level run value
# with the mean run value of that simplified outcome. Rows without a simplified
# description keep the value they already had.
has_simple_description = df_all_years['simple_description'].notna()
df_all_years.loc[has_simple_description, 'new_run_value'] = (
    df_all_years.loc[has_simple_description, 'simple_description'].map(not_inplay_run_values)
)

In [120]:
# Partition the pitches into balls put in play and everything else.
# .copy() makes df_inplay an independent frame: a later cell assigns to its
# 'new_run_value' column, and writing to a bare slice of df_all_years triggers pandas'
# SettingWithCopyWarning.
df_inplay = df_all_years[df_all_years['description'] == 'hit_into_play'].copy()
# Sliced again here (rather than reusing the earlier `not_inplay`) so that it carries the
# 'new_run_value' values written to df_all_years after that earlier slice was taken.
df_not_inplay = df_all_years[df_all_years['description'] != 'hit_into_play']

In [121]:
# Model inputs are the three batted-ball columns; the label is the run value column.
target = 'new_run_value'
features = ['launch_speed', 'launch_angle', 'hit_distance_sc']

# Feature matrix and label vector, both taken from the in-play rows only.
X, y = df_inplay[features], df_inplay[target]

In [None]:
# 5-fold cross-validation with shuffled (random) splits.
# Scores are negated mean squared errors (scikit-learn's 'neg_mean_squared_error'),
# so values closer to zero are better.
cv_model_params = dict(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    loss_function='RMSE',
    random_state=12345,
    verbose=100,
)
model = CatBoostRegressor(**cv_model_params)

cv = KFold(n_splits=5, shuffle=True, random_state=12345)
cv_scores = cross_val_score(estimator=model, X=X, y=y, scoring='neg_mean_squared_error', cv=cv)
cv_scores

In [None]:
# Fit the final regressor on every in-play row, using the same hyperparameters as the
# cross-validated model.
final_model_params = {
    'iterations': 1000,
    'learning_rate': 0.1,
    'depth': 6,
    'loss_function': 'RMSE',
    'random_state': 12345,
    'verbose': 100,
}
final_model = CatBoostRegressor(**final_model_params)
final_model.fit(X, y)

In [None]:
# Overwrite the in-play rows' run value with the model's prediction.
# NOTE(review): X is the same data final_model was fitted on, so these are in-sample
# predictions; this also replaces the column that served as the training target, so
# re-running the earlier X/y cell afterwards would pick up predictions as labels.
inplay_predictions = final_model.predict(X)
df_inplay['new_run_value'] = inplay_predictions

In [148]:
# Recombine the two partitions (in-play rows first, then the rest) under a fresh
# 0..n-1 row index.
final_parts = [df_inplay, df_not_inplay]
df_final = pd.concat(final_parts, ignore_index=True)

In [154]:
# Write the combined table to disk.
# index=False omits the row index, which is only the 0..n-1 counter produced by the
# ignore_index=True concat. Writing it would add an unnamed leading column that comes
# back as 'Unnamed: 0' when the file is loaded with a plain pd.read_csv.
df_final.to_csv("Statcast_2020_to_2023.csv", index=False)