# Predicting Pitches
For this project, we aim to predict the next pitch a Major League Baseball pitcher will throw during an at-bat.

In [1]:
import utils.utils as utils
import pandas as pd

## Part 1: Preprocessing the data

In [2]:
# Load the pitch dataset through the project helper and preview the first
# rows to see which columns are available.
pitch_data = utils.get_pitch_data()
pitch_data.head()

Unnamed: 0,pitch_data_id,season,pitcher_id,team_id,team_abbrev,era,wins,losses,throws,b1_id,...,runs_pitcher_team,runs_batter_team,game_id,inning_id,half_inning_id,at_bat_id,gid,p1_pitch_type,p0_at_bat_o,p0_pitch_des
0,334805,2018,572971,117,HOU,3.57,11.0,10.0,L,572287,...,0,0,e46d949f-eac3-47bb-bd50-2fe05c9feafc,119581e7-5af3-4a4a-b6b9-67f22d760984,41102429-e42f-448e-913e-587b601164c0,5937bd7f-00d6-42b0-bdc5-8745a98309ca,gid_2018_09_19_seamlb_houmlb_1,FF,0,
1,334806,2018,572971,117,HOU,3.57,11.0,10.0,L,572287,...,0,0,e46d949f-eac3-47bb-bd50-2fe05c9feafc,119581e7-5af3-4a4a-b6b9-67f22d760984,41102429-e42f-448e-913e-587b601164c0,5937bd7f-00d6-42b0-bdc5-8745a98309ca,gid_2018_09_19_seamlb_houmlb_1,FF,0,
2,334807,2018,572971,117,HOU,3.57,11.0,10.0,L,572287,...,0,0,e46d949f-eac3-47bb-bd50-2fe05c9feafc,119581e7-5af3-4a4a-b6b9-67f22d760984,41102429-e42f-448e-913e-587b601164c0,5937bd7f-00d6-42b0-bdc5-8745a98309ca,gid_2018_09_19_seamlb_houmlb_1,SL,0,
3,334808,2018,446372,114,CLE,2.02,8.0,2.0,R,456715,...,0,0,0268a063-3acb-498f-b166-019d6080cf3e,5c5faa08-34b4-4570-83cd-0577e4abc0d0,f61b94d3-7da7-4633-a5c4-142806e756de,593958a2-c7f2-4174-a45a-7f9fcb5bd0aa,gid_2018_06_05_milmlb_clemlb_1,SI,0,
4,334809,2018,446372,114,CLE,2.02,8.0,2.0,R,456715,...,0,0,0268a063-3acb-498f-b166-019d6080cf3e,5c5faa08-34b4-4570-83cd-0577e4abc0d0,f61b94d3-7da7-4633-a5c4-142806e756de,593958a2-c7f2-4174-a45a-7f9fcb5bd0aa,gid_2018_06_05_milmlb_clemlb_1,SI,0,


In [3]:
# Summarise each column's non-null count and dtype to spot missing values
# before any cleaning is applied.
pitch_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 74 columns):
pitch_data_id          10000 non-null int64
season                 10000 non-null int64
pitcher_id             10000 non-null int64
team_id                10000 non-null object
team_abbrev            10000 non-null object
era                    10000 non-null float64
wins                   9958 non-null float64
losses                 9958 non-null float64
throws                 10000 non-null object
b1_id                  10000 non-null int64
b1_team_id             10000 non-null object
b1_stand               10000 non-null object
b1_height              10000 non-null int64
b1_bats                10000 non-null object
b1_avg                 10000 non-null float64
b1_hr                  10000 non-null int64
b1_rbi                 10000 non-null int64
b1_bat_order           9419 non-null float64
b1_game_position       9419 non-null object
p1_pitch_id            10000 non-null objec

In [4]:
# Columns removed before modelling (row/pitch identifiers, season, game,
# inning and at-bat keys, and 'result_type').
cols_to_drop = [
    'p1_pitch_id',
    'p0_pitch_id',
    'result_type',
    'pitch_data_id',
    'season',
    'team_id',
    'game_id',
    'inning_id',
    'half_inning_id',
    'at_bat_id',
    'gid',
]
# drop unneeded cols
pitch_data = utils.drop_columns_by_list(pitch_data, cols_to_drop)
# drop pickoffs, automatic balls, strikes, etc.
pitch_data = utils.drop_unwanted_pitches(pitch_data)
# set category of cols
pitch_data = utils.set_dtypes(pitch_data)
# drop every row that still contains a missing value
pitch_data = pitch_data.dropna()
# Keep an independent snapshot of the cleaned frame for the later experiments.
# A plain `all_data = pitch_data` would only alias the same object, so any
# in-place change made through `pitch_data` would also change the baseline.
all_data = pitch_data.copy()

In [5]:
# Re-inspect the frame after cleaning: the dropped columns are gone and,
# because of dropna(), every remaining column is fully populated.
pitch_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8775 entries, 0 to 9999
Data columns (total 63 columns):
pitcher_id             8775 non-null object
team_abbrev            8775 non-null object
era                    8775 non-null float64
wins                   8775 non-null float64
losses                 8775 non-null float64
throws                 8775 non-null object
b1_id                  8775 non-null object
b1_team_id             8775 non-null object
b1_stand               8775 non-null int64
b1_height              8775 non-null int64
b1_bats                8775 non-null object
b1_avg                 8775 non-null float64
b1_hr                  8775 non-null int64
b1_rbi                 8775 non-null int64
b1_bat_order           8775 non-null float64
b1_game_position       8775 non-null object
p1_pitch_seqno         8775 non-null int64
p0_pitch_seqno         8775 non-null float64
p0_inning              8775 non-null float64
result_type_simple     8775 non-null object
x          

In [6]:
# Target: the 'p1_pitch_type' column, passed through the project's
# encode_simple_pitch_types helper to obtain the label vector Y.
pitch_types = pitch_data['p1_pitch_type']
Y = utils.encode_simple_pitch_types(pitch_types)

# Features: every other column, one-hot encoded where the helper deems it
# necessary. info() shows the resulting width of the feature matrix.
X = utils.one_hot_encode(pitch_data.drop(columns='p1_pitch_type'))
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8775 entries, 0 to 9999
Columns: 1258 entries, era to p0_pitch_des_
dtypes: float64(35), int64(16), uint8(1207)
memory usage: 13.6 MB


In [7]:
# Split features and labels into train and test sets with the project helper.
# NOTE(review): the split ratio and any random seed are decided inside the
# helper — confirm it is seeded so the reported accuracies are reproducible.
X_train, X_test, y_train, y_test = utils.split_dataset_into_train_and_test(X,Y)

In [8]:
# Check the row count and column count of the training split.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7020 entries, 5477 to 8262
Columns: 1258 entries, era to p0_pitch_des_
dtypes: float64(35), int64(16), uint8(1207)
memory usage: 10.9 MB


## Part 2: Train an XGBoost multiclass classifier

In [20]:
import models.xgboost_model as xgb_model

In [8]:
# Create the multi-class classifier through the project helper; it is fitted
# in the next cell.
xgb_multi_class_model = xgb_model.get_multi_class_classifier_model()

In [11]:
# Fit the classifier on the training split; the helper prints a report with
# train and test accuracy.
# NOTE(review): `save_location` is presumably where the fitted model is
# written — confirm in models.xgboost_model.
xgb_model.fit_multi_class_model(
    model=xgb_multi_class_model,
    x_train=X_train,
    y_train=y_train,
    x_test=X_test,
    y_test=y_test,
    save_location='./training/xgb_multi_class.bin',
)


Model Report
Accuracy (Train) : 1.0
Accuracy (Test) : 0.4447544642857143


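It looks like we're overfitting: the model reaches an accuracy of 1.0 on the training set but only about 0.44 on the test set. I'll remove the pitcher and batter IDs, which one-hot encode into a very large number of columns, and retrain.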

In [21]:
# Start again from the cleaned snapshot. Work on a copy so that nothing done
# to `pitch_data` in this cell can alter `all_data`, which is reused later.
pitch_data = all_data.copy()
# Remove the pitcher and batter identifiers so they are no longer one-hot
# encoded into the feature matrix.
cols_to_drop = ['pitcher_id', 'b1_id']
# drop unneeded cols
pitch_data = utils.drop_columns_by_list(pitch_data, cols_to_drop)
# Only the features are rebuilt here; the label vector Y computed earlier is
# reused as-is (assumes the rows of `all_data` are unchanged since then).
X = pitch_data.drop('p1_pitch_type', axis=1)
# one hot encode necessary cols
X = utils.one_hot_encode(X)
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8775 entries, 0 to 9998
Columns: 156 entries, era to p0_pitch_des_
dtypes: float64(35), int64(16), uint8(105)
memory usage: 4.4 MB


In [22]:
# Re-split using the reduced feature matrix and the same labels.
# NOTE(review): confirm the helper uses a fixed seed; otherwise this split
# differs from the one used for the first model and the runs are not comparable.
X_train, X_test, y_train, y_test = utils.split_dataset_into_train_and_test(X,Y)

In [23]:
# Check the row count and column count of the new training split.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7020 entries, 5478 to 8314
Columns: 156 entries, era to p0_pitch_des_
dtypes: float64(35), int64(16), uint8(105)
memory usage: 3.5 MB


In [24]:
# Create a new classifier object for the run without pitcher/batter ids,
# replacing the one fitted earlier.
xgb_multi_class_model = xgb_model.get_multi_class_classifier_model()

In [25]:
# Fit on the reduced feature set; the helper prints train and test accuracy.
# NOTE(review): `useTrainCV=True` presumably turns on cross-validation inside
# the helper, and `save_location` is presumably the output path for the
# fitted model — confirm both in models.xgboost_model.
xgb_model.fit_multi_class_model(
    model=xgb_multi_class_model,
    x_train=X_train,
    y_train=y_train,
    x_test=X_test,
    y_test=y_test,
    save_location='./training/xgb_multi_class_no_pitcher_batter_id.bin',
    useTrainCV=True,
)

  if getattr(data, 'base', None) is not None and \



Model Report
Accuracy (Train) : 0.7210826210826211
Accuracy (Test) : 0.4153846153846154


## Part 3: Try a neural network

In [16]:
# Rebuild the id-free feature matrix from the cleaned snapshot for the neural
# network. Work on a copy so that nothing done to `pitch_data` in this cell
# can alter `all_data`.
pitch_data = all_data.copy()
# Remove the pitcher and batter identifiers so they are not one-hot encoded
# into the feature matrix.
cols_to_drop = ['pitcher_id', 'b1_id']
# drop unneeded cols
pitch_data = utils.drop_columns_by_list(pitch_data, cols_to_drop)
# Only the features are rebuilt; the label vector Y computed earlier is reused
# (assumes the rows of `all_data` are unchanged since then).
X = pitch_data.drop('p1_pitch_type', axis=1)
# one hot encode necessary cols
X = utils.one_hot_encode(X)
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8775 entries, 0 to 9999
Columns: 156 entries, era to p0_pitch_des_
dtypes: float64(35), int64(16), uint8(105)
memory usage: 4.7 MB


In [17]:
import models.nn_model as nn_model
import keras

In [18]:
# 16 pitch types
num_pitch_types = 16
num_cols = len(X.iloc[0,:])
X_train, X_test, y_train, y_test = utils.split_dataset_into_train_and_test(X,Y)
y_test = keras.utils.to_categorical(y_test,num_classes=num_pitch_types)
y_train = keras.utils.to_categorical(y_train,num_classes=num_pitch_types)

In [19]:
# Build the network through the project helper, passing the number of feature
# columns and the number of pitch classes.
model = nn_model.get_multi_class_classifier_model(num_cols,num_pitch_types)

In [20]:
# Train the network and keep the helper's return value for inspection below.
# NOTE(review): the returned pair looks like Keras' [loss, accuracy] on the
# test split — confirm in models.nn_model.
score=nn_model.fit_multi_class_model(model,X_train,y_train,X_test,y_test)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155

Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


In [21]:
# Display the value returned by the training helper.
# NOTE(review): presumably [test loss, test accuracy] — confirm against
# nn_model.fit_multi_class_model.
score

[12.86691668638137, 0.20170940287262626]