In [None]:
!pip install pybaseball
!pip install pandas
!pip uninstall xgboost
!pip uninstall scikit-learn
!pip install scikit-learn
!pip install xgboost

from pathlib import Path
import numpy as np
import pybaseball as pb
import pandas as pd
import requests
from xgboost import XGBRegressor
from sklearn.preprocessing import MinMaxScaler
import sklearn
import xgboost

def get_mlb_game_data(start, end):
    # Get all the mlb game codes using the given web api link
    all_games = []
    for i in range(start, end+1):
        response = requests.get(f'https://statsapi.mlb.com/api/v1/schedule?sportId=1&startDate={i}-01-01&endDate={i}-12-31&gameType=R&fields=dates,date,games,gamePk')
        json = response.json()
        for date in json['dates']:
            for game in date['games']:
                all_games.append(game['gamePk'])
    
    # Get the game data for all the game codes
    all_data = None
    for code in all_games:
        new_data = None
        try:
            new_data = pb.statcast_single_game(code).iloc[::-1]
        except:
            continue
        if all_data is None:
            all_data = new_data
        else:
            all_data = pd.concat([all_data, new_data])
    
    return all_data

def get_end_of_inn(start, pitches):
    for i in range(start, len(pitches)):
        if i == len(pitches)-1 or pitches.iloc[i]['inning'] != pitches.iloc[i+1]['inning'] or pitches.iloc[i]['inning_topbot'] != pitches.iloc[i+1]['inning_topbot']:
            return pitches.iloc[i]['post_bat_score']
        
def encode_pitch(pitch):
    if pitch['balls'] == 4:
        return f'{pitch["balls"]-1}-{pitch["strikes"]}', f"{'X' if np.isnan(pitch['on_1b']) else 'O'}{'X' if np.isnan(pitch['on_2b']) else 'O'}{'X' if np.isnan(pitch['on_3b']) else 'O'}"
    return f'{pitch["balls"]}-{pitch["strikes"]}', f"{'X' if np.isnan(pitch['on_1b']) else 'O'}{'X' if np.isnan(pitch['on_2b']) else 'O'}{'X' if np.isnan(pitch['on_3b']) else 'O'}"

def build_re288(pitches):
    counts = ['0-2', '1-2', '0-1', '2-2', '1-1', '0-0', '1-0', '2-1', '3-2', '2-0', '3-1', '3-0']
    bases = ['XXX', 'OXX', 'XOX', 'OOX', 'XXO', 'OXO', 'XOO', 'OOO']
    total_runs = np.zeros((12, 24))
    tot_count = np.zeros((12, 24))
    inn_run_end = get_end_of_inn(0, pitches)
    for i in range(len(pitches)):
        count, base = encode_pitch(pitches.iloc[i])
        total_runs[counts.index(count)][pitches.iloc[i]['outs_when_up']*8+bases.index(base)] += inn_run_end - pitches.iloc[i]['bat_score']
        tot_count[counts.index(count)][pitches.iloc[i]['outs_when_up']*8+bases.index(base)] += 1
        if i != len(pitches)-1 and (pitches.iloc[i]['inning'] != pitches.iloc[i+1]['inning'] or pitches.iloc[i]['inning_topbot'] != pitches.iloc[i+1]['inning_topbot']):
            inn_run_end = get_end_of_inn(i+1, pitches)
                
    final_runs = total_runs / tot_count
    return final_runs




In [None]:
re288 = build_re288(get_mlb_game_data(2024, 2024))

[[ 4934.  2094.   723.   870.   125.   397.   294.   388.  1796.  1455.
    618.   826.   347.   501.   376.   578.   373.   524.   331.   381.
    123.   206.   154.   280.]
 [ 7569.  3021.  1241.  1220.   199.   437.   448.   501.  2837.  2097.
   1103.  1327.   442.   678.   574.   786.   799.   799.   598.   557.
    241.   267.   344.   462.]
 [10817.  4296.  1613.  1851.   198.   756.   538.   805.  3962.  3170.
   1366.  1920.   659.  1013.   767.  1161.  1079.  1269.   883.   936.
    336.   584.   354.   632.]
 [ 7109.  2407.  1130.   930.   203.   415.   360.   448.  2691.  1896.
   1012.  1224.   465.   535.   639.   713.   787.   728.   720.   624.
    213.   232.   351.   520.]
 [ 8588.  3338.  1362.  1364.   185.   564.   449.   569.  3288.  2394.
   1203.  1469.   528.   827.   651.   845.  1054.   942.   825.   861.
    350.   459.   428.   599.]
 [21960.  9504.  3208.  3987.   424.  1760.  1061.  1620.  8664.  6981.
   2905.  4195.  1372.  2434.  1723.  2420.  2582.  3

array([[0.40568985, 0.75432277, 0.93410853, 1.32420091, 1.42045455,
        1.64049587, 1.94701987, 1.94974874, 0.19652041, 0.4118313 ,
        0.51715481, 0.68320926, 0.85679012, 1.01212121, 1.22875817,
        1.42716049, 0.05299048, 0.14777214, 0.20936116, 0.25656566,
        0.20230263, 0.32907348, 0.37108434, 0.55445545],
       [0.41766913, 0.82699151, 1.00242326, 1.36618141, 1.2208589 ,
        1.57194245, 1.94782609, 2.02834008, 0.2132762 , 0.4679759 ,
        0.57961114, 0.82833958, 0.81399632, 1.05607477, 1.13663366,
        1.43169399, 0.07553413, 0.17775306, 0.22729   , 0.27197266,
        0.25051975, 0.34407216, 0.46423752, 0.6647482 ],
       [0.454324  , 0.83417476, 1.07748831, 1.40227273, 1.28571429,
        1.81730769, 1.95636364, 2.15817694, 0.22814695, 0.48508034,
        0.62289102, 0.85599643, 0.9077135 , 1.09395248, 1.33391304,
        1.51764706, 0.07918104, 0.1909994 , 0.30617198, 0.33937636,
        0.29577465, 0.43909774, 0.49372385, 0.67956989],
       [0.462

In [22]:
counts = ['0-2', '1-2', '0-1', '2-2', '1-1', '0-0', '1-0', '2-1', '3-2', '2-0', '3-1', '3-0']
bases = ['XXX', 'OXX', 'XOX', 'OOX', 'XXO', 'OXO', 'XOO', 'OOO']
positions = []
for i in range(3):
    for base in bases:
        positions.append(f'{i}{base}')

re288_df = pd.DataFrame({'Position': positions, **{counts[i]: list(re288[i]) for i in range(len(counts))}})
re288_df

Unnamed: 0,Position,0-2,1-2,0-1,2-2,1-1,0-0,1-0,2-1,3-2,2-0,3-1,3-0
0,0XXX,0.40569,0.417669,0.454324,0.462976,0.472024,0.485916,0.514833,0.527267,0.559951,0.564498,0.62906,0.686211
1,0OXX,0.754323,0.826992,0.834175,0.868639,0.894427,0.890722,0.958561,0.948873,1.064369,1.023828,1.01745,1.092838
2,0XOX,0.934109,1.002423,1.077488,1.054104,1.097502,1.074707,1.131229,1.130372,1.145478,1.255556,1.259887,1.373418
3,0OOX,1.324201,1.366181,1.402273,1.288089,1.422315,1.477214,1.561311,1.479508,1.510152,1.660436,1.699507,1.875
4,0XXO,1.420455,1.220859,1.285714,1.492647,1.350365,1.422819,1.406504,1.454545,1.3625,1.511628,1.527778,1.692308
5,0OXO,1.640496,1.571942,1.817308,1.765957,1.819355,1.906826,1.951705,1.948718,1.738854,2.131579,2.104478,2.612903
6,0XOO,1.94702,1.947826,1.956364,1.846154,1.969298,2.013283,2.086957,1.935185,2.025641,1.972603,2.142857,2.142857
7,0OOO,1.949749,2.02834,2.158177,2.206897,2.303644,2.294618,2.441667,2.525424,2.357143,2.441558,2.525,2.470588
8,1XXX,0.19652,0.213276,0.228147,0.235145,0.247162,0.261444,0.294625,0.284915,0.310538,0.343437,0.367392,0.438071
9,1OXX,0.411831,0.467976,0.48508,0.543111,0.519983,0.52548,0.565126,0.563132,0.648272,0.637013,0.681768,0.783158


In [34]:
def generate_run_value(pitches, percent_re = 0.7, percent_woba = 0.3):
    run_values = []
    woba_scale = pd.read_csv(Path('./woba_scale.csv'))
    for i in range(len(pitches)):
        cur_year = pitches.iloc[i]['game_date'].year
        runs_tot = percent_re * (pitches.iloc[i]['post_bat_score']-pitches.iloc[i]['bat_score'])
        count, base = encode_pitch(pitches.iloc[i])
        if i == len(pitches) - 1 or pitches.iloc[i]['inning'] != pitches.iloc[i+1]['inning'] or pitches.iloc[i]['inning_topbot'] != pitches.iloc[i+1]['inning_topbot']:
            if runs_tot == 0:
                runs_tot -= percent_re* re288[counts.index(count)][pitches.iloc[i]['outs_when_up']*8+bases.index(base)]
        else:
            next_count, next_base = encode_pitch(pitches.iloc[i+1])
            runs_tot += percent_re * (re288[counts.index(next_count)][pitches.iloc[i]['outs_when_up']*8+bases.index(next_base)] - re288[counts.index(count)][pitches.iloc[i]['outs_when_up']*8+bases.index(base)])
            
        woba = 0
        if np.isnan(pitches.iloc[i]['estimated_woba_using_speedangle']):
            if pitches.iloc[i]['description'] == 'ball':
                woba = woba_scale[woba_scale['Season'] == cur_year].iloc[0]['wBB']/4
        else:
            woba = pitches.iloc[i]['estimated_woba_using_speedangle']
        
        runs_tot += percent_woba * ((woba - woba_scale[woba_scale['Season'] == cur_year].iloc[0]['wOBA'])/woba_scale[woba_scale['Season'] == cur_year].iloc[0]['wOBAScale'])
        
        run_values.append(-1*runs_tot)
    return run_values

In [35]:
#all_pitches_2024 = get_mlb_game_data(2024, 2024)
run_values = generate_run_value(all_pitches_2024)
run_values

[np.float64(0.013031140441944675),
 np.float64(0.10484558491252326),
 np.float64(-0.005397057051659007),
 np.float64(-0.22114558300368842),
 np.float64(-0.014214861753462371),
 np.float64(0.11977358366045071),
 np.float64(-0.004839551497614926),
 np.float64(0.10133396968724494),
 np.float64(0.10315889655516143),
 np.float64(0.12615354931925446),
 np.float64(-0.006028268517941596),
 np.float64(0.03462645066284937),
 np.float64(0.019523089953210313),
 np.float64(0.23933045799958924),
 np.float64(0.013031140441944675),
 np.float64(0.10484558491252326),
 np.float64(-0.005397057051659007),
 np.float64(0.11988249135982093),
 np.float64(-0.03460937129833905),
 np.float64(0.12477136622508118),
 np.float64(0.052657004830917876),
 np.float64(0.08789388210062088),
 np.float64(0.09321261628348496),
 np.float64(0.017492394777645307),
 np.float64(0.026929379329524952),
 np.float64(0.0748792270531401),
 np.float64(0.13409668378752684),
 np.float64(0.09699360276906069),
 np.float64(0.10892312609291996

In [46]:
y_train = run_values
x_train = all_pitches_2024[['release_speed', 'release_pos_x', 'release_pos_z', 'p_throws', 'pfx_x', 'pfx_z', 'plate_x', 'plate_z', 
                            'release_spin_rate', 'release_extension', 'release_pos_y', 'spin_axis', 'pitch_name']]
x_train = pd.get_dummies(x_train, columns=['p_throws', 'pitch_name']).to_numpy()
x_scaler = MinMaxScaler()
x_train = x_scaler.fit_transform(x_train)

xgb = XGBRegressor()
xgb.fit(x_train, y_train)

ImportError: sklearn needs to be installed in order to use this module