In [24]:
import pandas as pd
import numpy as np
from pybaseball import statcast
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from unidecode import unidecode

In [None]:
"""
data20_22 = statcast(start_dt = "2020-04-01", end_dt = "2022-12-01") 
data20_22.to_csv('Statcast_2020_to_2022.csv')
data23 = statcast(start_dt = "2023-04-01", end_dt = "2023-12-01")
data23.to_csv('Statcast_2023.csv')
"""

In [42]:
#Read in data
data20_22 = pd.read_csv("Statcast_2020_to_2022.csv")
data23 = pd.read_csv("Statcast_2023.csv")

data20_22['player_name'] = data20_22['player_name'].apply(unidecode)
data23['player_name'] = data23['player_name'].apply(unidecode)

#Select columns
data20_22 = data20_22[['player_name','pitch_type','release_speed','release_pos_x','release_pos_z','release_spin_rate','release_extension','spin_axis','pfx_x', 'pfx_z','vx0','vy0','vz0','ax','ay','az','game_year','stand','p_throws','delta_run_exp','balls','strikes']]
data23 = data23[['player_name','pitch_type','release_speed','release_pos_x','release_pos_z','release_spin_rate','release_extension','spin_axis','pfx_x', 'pfx_z','vx0','vy0','vz0','ax','ay','az','game_year','stand','p_throws','delta_run_exp','balls','strikes']]

#Drop na values
data20_22 = data20_22.dropna()
data23 = data23.dropna()

#Flip to get view from pitcher
data20_22['pfx_x'] = data20_22['pfx_x'] * -1
data23['pfx_x'] = data23['pfx_x'] * -1

#Change to inches
data20_22['pfx_x'] = data20_22['pfx_x'] * 12
data20_22['pfx_z'] = data20_22['pfx_z'] * 12
data23['pfx_x'] = data23['pfx_x'] * 12
data23['pfx_z'] = data23['pfx_z'] * 12

data20_22['p_throws'] = data20_22['p_throws'].map({'L': 0, 'R': 1})
data23['p_throws'] = data23['p_throws'].map({'L': 0, 'R': 1})

data20_22['stand'] = data20_22['stand'].map({'L': 0, 'R': 1})
data23['stand'] = data23['stand'].map({'L': 0, 'R': 1})

#Remove unnecessary pitches
data20_22 = data20_22[data20_22.pitch_type != 'PO']
data23 = data23[data23.pitch_type != 'PO']

#Retag pitches
data20_22['pitch_type'] = data20_22['pitch_type'].replace(['FF','FA'], 'Fastball')
data23['pitch_type'] = data23['pitch_type'].replace(['FF','FA'], 'Fastball')

data20_22['pitch_type'] = data20_22['pitch_type'].replace(['SI'], 'Sinker')
data23['pitch_type'] = data23['pitch_type'].replace(['SI'], 'Sinker')

data20_22['pitch_type'] = data20_22['pitch_type'].replace(['FC'], 'Cutter')
data23['pitch_type'] = data23['pitch_type'].replace(['FC'], 'Cutter')

data20_22['pitch_type'] = data20_22['pitch_type'].replace(['SL'], 'Slider')
data23['pitch_type'] = data23['pitch_type'].replace(['SL'], 'Slider')

data20_22['pitch_type'] = data20_22['pitch_type'].replace(['ST'], 'Sweeper')
data23['pitch_type'] = data23['pitch_type'].replace(['ST'], 'Sweeper')

data20_22['pitch_type'] = data20_22['pitch_type'].replace(['CU','CS','SV','KC'], 'Curveball')
data23['pitch_type'] = data23['pitch_type'].replace(['CU','CS','SV','KC'], 'Curveball')

data20_22['pitch_type'] = data20_22['pitch_type'].replace(['CH','FO','EP','KN','SC'], 'Changeup')
data23['pitch_type'] = data23['pitch_type'].replace(['CH','FO','EP','KN','SC'], 'Changeup')

data20_22['pitch_type'] = data20_22['pitch_type'].replace(['FS'], 'Splitter')
data23['pitch_type'] = data23['pitch_type'].replace(['FS'], 'Splitter')


In [43]:
#caculate vaa and haa
def approach_angles(df):
    yf = 17/12
    y0 = 50
    df['vy_f'] = -np.sqrt(df['vy0']**2 - (2 * df['ay'] * y0 - yf))
    df['t'] = (df['vy_f'] - df['vy0']) / df['ay']
    df['vz_f'] = df['vz0'] + (df['az'] * df['t'])
    df['vx_f'] = df['vx0'] + (df['ax'] * df['t'])

    df['vaa'] = -np.arctan(df['vz_f'] / df['vy_f']) * (180 / np.pi)
    df['haa'] = -np.arctan(df['vx_f'] / df['vy_f']) * (180 / np.pi)
    return df

data20_22 = approach_angles(data20_22)
data23 = approach_angles(data23)


#caculate estimated axis differential and estimated spin efficiency
def axis_differential(df):
    df['spin_axis'] = (df['spin_axis']+ 180) % 360
    df['calc_spin'] = np.arctan2(df['pfx_z'],df['pfx_x']) * (180 / np.pi)
    df['axis_dif'] = np.minimum(
        np.abs(df['spin_axis'] - df['calc_spin']),
        360 - np.abs(df['spin_axis'] - df['calc_spin'])
    )
    return df

data20_22 = axis_differential(data20_22)
data23 = axis_differential(data23)


def find_most_common_fastball(df):
    fastball_types = df[df['pitch_type'].isin(['Fastball', 'Sinker', 'Cutter'])]
    most_common_fastball = fastball_types.groupby(['player_name', 'pitch_type']).size().reset_index(name='count')
    most_common_fastball = most_common_fastball.loc[most_common_fastball.groupby('player_name')['count'].idxmax()]
    most_common_fastball = most_common_fastball[['player_name', 'pitch_type']]
    most_common_fastball.rename(columns={'pitch_type': 'most_common_fastball'}, inplace=True)
    return most_common_fastball

def add_velocity_and_movement_diff(df):
    most_common_fastball = find_most_common_fastball(df)
    
    df = df.merge(most_common_fastball, on='player_name', how='left')
    
    avg_stats = df[df['pitch_type'] == df['most_common_fastball']].groupby('player_name')[['release_speed', 'pfx_x', 'pfx_z']].mean().reset_index()
    avg_stats.rename(columns={
        'release_speed': 'avg_fastball_velocity',
        'pfx_x': 'avg_fastball_pfx_x',
        'pfx_z': 'avg_fastball_pfx_z'
    }, inplace=True)
    
    df = df.merge(avg_stats, on='player_name', how='left')
    
    df['velocity_diff'] = df['avg_fastball_velocity'] - df['release_speed']
    df['horizontal_movement_diff'] = df['avg_fastball_pfx_x'] - df['pfx_x']
    df['vertical_movement_diff'] = df['avg_fastball_pfx_z'] - df['pfx_z']
    
    return df

data20_22 = add_velocity_and_movement_diff(data20_22)
data23 = add_velocity_and_movement_diff(data23)

In [44]:
# Combining Fastball, Sinker, and Cutter into one dataframe
fastball_20_22 = data20_22[data20_22.pitch_type.isin(['Fastball', 'Sinker', 'Cutter'])]
fastball_23 = data23[data23.pitch_type.isin(['Fastball', 'Sinker', 'Cutter'])]

# Combining Slider and Sweeper into one dataframe
breakingball_20_22 = data20_22[data20_22.pitch_type.isin(['Slider', 'Sweeper', 'Curveball'])]
breakingball_23 = data23[data23.pitch_type.isin(['Slider', 'Sweeper', 'Curveball'])]

# Combining Changeup and Splitter into one dataframe
offspeed_20_22 = data20_22[data20_22.pitch_type.isin(['Changeup', 'Splitter'])]
offspeed_23 = data23[data23.pitch_type.isin(['Changeup', 'Splitter'])]

In [45]:
features = ['release_speed','release_pos_x','release_pos_z','release_extension','pfx_x', 'pfx_z', 'axis_dif', 'velocity_diff', 'horizontal_movement_diff', 'vertical_movement_diff','game_year']
fball_features = ['release_speed','release_pos_x','release_pos_z','release_extension','pfx_x', 'pfx_z', 'axis_dif','game_year']

#Split into train and test

fastball_x = fastball_20_22[fball_features]
fastball_y = fastball_20_22['delta_run_exp']

x_train_fastball, x_test_fastball, y_train_fastball, y_test_fastball = train_test_split(fastball_x, fastball_y, train_size = 0.75, random_state = 12345)

breakingball_x = breakingball_20_22[features]
breakingball_y = breakingball_20_22['delta_run_exp']

x_train_breakingball, x_test_breakingball, y_train_breakingball, y_test_breakingball = train_test_split(breakingball_x, breakingball_y, train_size=0.75, random_state=12345)

offspeed_x = offspeed_20_22[features]
offspeed_y = offspeed_20_22['delta_run_exp']

x_train_offspeed, x_test_offspeed, y_train_offspeed, y_test_offspeed = train_test_split(offspeed_x, offspeed_y, train_size=0.75, random_state=12345)



In [46]:
#Set params for catboost models (Random ones right now)
params = {
    'iterations': 500,
    'depth': 5,
    'learning_rate': 0.11, #manual checks with FIP corr both predictive and descriptive
    'random_seed': 12345
    
}

#Train each model

#Fastball
fastball_model = CatBoostRegressor(**params)
fastball_model.fit(x_train_fastball, y_train_fastball)

#Sinker
breakingball_model = CatBoostRegressor(**params)
breakingball_model.fit(x_train_breakingball, y_train_breakingball)

#Cutter
offspeed_model = CatBoostRegressor(**params)
offspeed_model.fit(x_train_offspeed, y_train_offspeed)

0:	learn: 0.2452046	total: 26.6ms	remaining: 13.3s
1:	learn: 0.2451991	total: 48.8ms	remaining: 12.2s
2:	learn: 0.2451908	total: 71.6ms	remaining: 11.9s
3:	learn: 0.2451833	total: 90ms	remaining: 11.2s
4:	learn: 0.2451767	total: 110ms	remaining: 10.9s
5:	learn: 0.2451721	total: 129ms	remaining: 10.6s
6:	learn: 0.2451669	total: 148ms	remaining: 10.4s
7:	learn: 0.2451614	total: 169ms	remaining: 10.4s
8:	learn: 0.2451574	total: 189ms	remaining: 10.3s
9:	learn: 0.2451540	total: 208ms	remaining: 10.2s
10:	learn: 0.2451503	total: 227ms	remaining: 10.1s
11:	learn: 0.2451468	total: 249ms	remaining: 10.1s
12:	learn: 0.2451431	total: 269ms	remaining: 10.1s
13:	learn: 0.2451405	total: 289ms	remaining: 10s
14:	learn: 0.2451369	total: 308ms	remaining: 9.96s
15:	learn: 0.2451344	total: 326ms	remaining: 9.87s
16:	learn: 0.2451308	total: 346ms	remaining: 9.84s
17:	learn: 0.2451289	total: 367ms	remaining: 9.83s
18:	learn: 0.2451263	total: 385ms	remaining: 9.76s
19:	learn: 0.2451244	total: 405ms	remaini

<catboost.core.CatBoostRegressor at 0x1585174d0>

In [47]:
#Predictions
fastball_23['pred'] = fastball_model.predict(fastball_23[fball_features])

breakingball_23['pred'] = breakingball_model.predict(breakingball_23[features])

offspeed_23['pred'] = offspeed_model.predict(offspeed_23[features])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fastball_23['pred'] = fastball_model.predict(fastball_23[fball_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  breakingball_23['pred'] = breakingball_model.predict(breakingball_23[features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  offspeed_23['pred'] = offspeed_model.predict(offspe

Unnamed: 0,player_name,pitch_type,release_speed,release_pos_x,release_pos_z,release_spin_rate,release_extension,spin_axis,pfx_x,pfx_z,...,calc_spin,axis_dif,most_common_fastball,avg_fastball_velocity,avg_fastball_pfx_x,avg_fastball_pfx_z,velocity_diff,horizontal_movement_diff,vertical_movement_diff,pred
103,"Gallen, Zac",Changeup,85.9,-3.33,5.65,1422.0,6.7,223.0,3.12,12.96,...,76.464144,146.535856,Fastball,93.658800,2.916861,16.988677,7.758800,-0.203139,4.028677,0.059421
106,"Eovaldi, Nathan",Splitter,87.3,-2.61,5.49,1498.0,6.7,240.0,10.32,1.32,...,7.288952,232.711048,Fastball,95.189638,12.576188,13.535718,7.889638,2.256188,12.215718,-0.007973
107,"Eovaldi, Nathan",Splitter,88.6,-2.39,5.61,1638.0,6.7,237.0,11.04,2.88,...,14.620874,222.379126,Fastball,95.189638,12.576188,13.535718,6.589638,1.536188,10.655718,-0.007377
108,"Eovaldi, Nathan",Splitter,85.6,-2.38,5.64,1574.0,6.7,241.0,12.00,2.04,...,9.648045,231.351955,Fastball,95.189638,12.576188,13.535718,9.589638,0.576188,11.495718,-0.005088
109,"Eovaldi, Nathan",Splitter,88.5,-2.51,5.51,1634.0,6.7,236.0,12.96,1.56,...,6.863692,229.136308,Fastball,95.189638,12.576188,13.535718,6.689638,-0.383812,11.975718,-0.008909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
719181,"Gilbert, Logan",Splitter,84.5,-1.34,6.04,1055.0,7.5,232.0,11.16,3.48,...,17.318938,214.681062,Fastball,95.685094,6.130123,17.003145,11.185094,-5.029877,13.523145,-0.011158
719183,"Gilbert, Logan",Splitter,86.0,-1.39,6.02,754.0,7.4,242.0,10.44,6.00,...,29.886527,212.113473,Fastball,95.685094,6.130123,17.003145,9.685094,-4.309877,11.003145,0.001070
719184,"Gilbert, Logan",Splitter,85.8,-1.37,6.17,1055.0,7.3,225.0,14.76,1.92,...,7.411493,217.588507,Fastball,95.685094,6.130123,17.003145,9.885094,-8.629877,15.083145,-0.018010
719217,"Gilbert, Logan",Splitter,87.3,-1.30,6.08,1219.0,7.6,222.0,10.68,2.76,...,14.489763,207.510237,Fastball,95.685094,6.130123,17.003145,8.385094,-4.549877,14.243145,-0.017573


In [41]:
combined_df = pd.concat([fastball_23, breakingball_23, offspeed_23])

def scale_and_score(df):
    df['scaled'] = df['pred'] - df['pred'].max()
    df['scaled'] = abs(df['scaled'])
    df['scaled'] = df['scaled'] - df['scaled'].mean()
    df['scaled'] = df['scaled'] / df['scaled'].std()
    df['score'] = df['scaled'] * 10 + 100
    return df

scale_and_score(combined_df)

fastball_23 = combined_df[combined_df.pitch_type == 'Fastball']
sinker_23 = combined_df[combined_df.pitch_type == 'Sinker']
cutter_23 = combined_df[combined_df.pitch_type == 'Cutter']
slider_23 = combined_df[combined_df.pitch_type == 'Slider']
sweeper_23 = combined_df[combined_df.pitch_type == 'Sweeper']
curveball_23 = combined_df[combined_df.pitch_type == 'Curveball']
changeup_23 = combined_df[combined_df.pitch_type == 'Changeup']
splitter_23 = combined_df[combined_df.pitch_type == 'Splitter']


def calculate_mean_scores(df):
    mean_scores = df.groupby('player_name')['score'].mean().reset_index()
    mean_scores.rename(columns={'score': 'mean_scores'}, inplace=True)
    mean_scores = mean_scores.sort_values(by='mean_scores', ascending=False)
    mean_scores['mean_scores'] = mean_scores['mean_scores'].round(2)
    return mean_scores

mean_scores_fastball = calculate_mean_scores(fastball_23)
mean_scores_sinker = calculate_mean_scores(sinker_23)
mean_scores_cutter = calculate_mean_scores(cutter_23)
mean_scores_slider = calculate_mean_scores(slider_23)
mean_scores_sweeper = calculate_mean_scores(sweeper_23)
mean_scores_curveball = calculate_mean_scores(curveball_23)
mean_scores_changeup = calculate_mean_scores(changeup_23)
mean_scores_splitter = calculate_mean_scores(splitter_23)


combined_scores = mean_scores_fastball
combined_scores = combined_scores.merge(mean_scores_sinker, how = 'outer', on = 'player_name', suffixes=('','_sinker'))
combined_scores = combined_scores.merge(mean_scores_cutter, how = 'outer', on = 'player_name', suffixes=('','_cutter'))
combined_scores = combined_scores.merge(mean_scores_slider, how = 'outer', on = 'player_name', suffixes=('','_slider'))
combined_scores = combined_scores.merge(mean_scores_sweeper, how = 'outer', on = 'player_name', suffixes=('','_sweeper'))
combined_scores = combined_scores.merge(mean_scores_curveball, how = 'outer', on = 'player_name', suffixes=('','_curveball'))
combined_scores = combined_scores.merge(mean_scores_changeup, how = 'outer', on = 'player_name', suffixes=('','_changeup'))
combined_scores = combined_scores.merge(mean_scores_splitter, how = 'outer', on = 'player_name', suffixes=('','_splitter'))



combined_scores = combined_scores.rename(columns = {
    'mean_scores': 'fastball',
    'mean_scores_sinker': 'sinker',
    'mean_scores_cutter': 'cutter',
    'mean_scores_slider': 'slider',
    'mean_scores_sweeper': 'sweeper',
    'mean_scores_curveball': 'curveball',
    'mean_scores_changeup': 'changeup',
    'mean_scores_splitter': 'splitter'
    }
)

pitch_counts = data23.groupby(['player_name', 'pitch_type']).size().unstack(fill_value=0).reset_index()

combined_data = combined_scores.merge(pitch_counts, how='outer', on='player_name')
combined_data.fillna(0, inplace=True)

combined_data['weighted_fastball'] = combined_data['fastball'] * combined_data['Fastball']
combined_data['weighted_sinker'] = combined_data['sinker'] * combined_data['Sinker']
combined_data['weighted_cutter'] = combined_data['cutter'] * combined_data['Cutter']
combined_data['weighted_slider'] = combined_data['slider'] * combined_data['Slider']
combined_data['weighted_sweeper'] = combined_data['sweeper'] * combined_data['Sweeper']
combined_data['weighted_curveball'] = combined_data['curveball'] * combined_data['Curveball']
combined_data['weighted_changeup'] = combined_data['changeup'] * combined_data['Changeup']
combined_data['weighted_splitter'] = combined_data['splitter'] * combined_data['Splitter']

combined_data['total_weighted_score'] = (
    combined_data['weighted_fastball'] +
    combined_data['weighted_sinker'] +
    combined_data['weighted_cutter'] +
    combined_data['weighted_slider'] +
    combined_data['weighted_sweeper'] +
    combined_data['weighted_curveball'] +
    combined_data['weighted_changeup'] +
    combined_data['weighted_splitter']
)

combined_data['total_pitches'] = (
    combined_data['Fastball'] +
    combined_data['Sinker'] +
    combined_data['Cutter'] +
    combined_data['Slider'] +
    combined_data['Sweeper'] +
    combined_data['Curveball'] +
    combined_data['Changeup'] +
    combined_data['Splitter']
)

combined_data['overall'] = combined_data['total_weighted_score'] / combined_data['total_pitches']
combined_data['overall'] = combined_data['overall'].round(2)

min_pitch_count = 100
combined_data = combined_data[combined_data['total_pitches'] >= min_pitch_count]

combined_data = combined_data[['player_name', 'overall', 'fastball', 'sinker', 'cutter', 'slider', 'sweeper', 'curveball', 'changeup', 'splitter']]
combined_data = combined_data.sort_values(by='overall', ascending=False)

combined_data.to_csv("pitch_scores.csv", index=False)
combined_data

Unnamed: 0,player_name,overall,fastball,sinker,cutter,slider,sweeper,curveball,changeup,splitter
311,"Helsley, Ryan",115.26,115.94,0.00,0.00,117.53,0.00,97.98,0.00,0.0
860,"deGrom, Jacob",114.79,110.84,0.00,0.00,121.51,0.00,105.43,110.88,0.0
218,"Fairbanks, Pete",114.53,116.05,0.00,0.00,113.38,0.00,0.00,84.00,0.0
387,"Kerkering, Orion",114.22,102.09,106.28,0.00,0.00,118.77,0.00,0.00,0.0
139,"Clase, Emmanuel",112.20,100.35,0.00,109.33,118.40,0.00,0.00,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...
439,"Logue, Zach",86.77,87.86,91.18,94.42,0.00,0.00,88.58,78.36,0.0
813,"Weber, Ryan",86.68,0.00,104.31,0.00,87.63,0.00,0.00,20.02,0.0
517,"Miller, Tyson",83.59,101.97,84.09,0.00,91.69,0.00,-1.01,77.75,0.0
197,"Doyle, Tommy",83.30,87.87,97.98,77.24,91.03,0.00,0.00,0.00,0.0
