# Predicting Pitches
For our project, we'll aim to solve the problem of predicting the next pitch a Major League Baseball Pitcher will throw during an at bat

In [1]:
import utils.utils as utils
import pandas as pd

## Part 1: Preprocessing the data

In [2]:
# get the data
pitch_data = utils.get_pitch_data()
pitch_data.head()

Unnamed: 0,pitch_data_id,season,pitcher_id,team_id,team_abbrev,era,wins,losses,throws,b1_id,...,runs_pitcher_team,runs_batter_team,game_id,inning_id,half_inning_id,at_bat_id,gid,p1_pitch_type,p0_at_bat_o,p0_pitch_des
0,344084,2018,642152,133,OAK,1.41,7.0,1.0,R,594828,...,0,0,ce624c44-dd62-4d1a-9b7d-5a43999701f4,a6bfde1c-43c2-44fe-b4f2-fea67da79c8c,ee07b373-da76-43a2-b05e-31a23d6fd8a0,5d179556-61ce-44e8-9436-ce7a5c7b8b15,gid_2018_07_09_oakmlb_houmlb_1,FF,0,
1,344085,2018,642152,133,OAK,1.41,7.0,1.0,R,594828,...,0,0,ce624c44-dd62-4d1a-9b7d-5a43999701f4,a6bfde1c-43c2-44fe-b4f2-fea67da79c8c,ee07b373-da76-43a2-b05e-31a23d6fd8a0,5d179556-61ce-44e8-9436-ce7a5c7b8b15,gid_2018_07_09_oakmlb_houmlb_1,FT,0,
2,344086,2018,642152,133,OAK,1.41,7.0,1.0,R,594828,...,0,0,ce624c44-dd62-4d1a-9b7d-5a43999701f4,a6bfde1c-43c2-44fe-b4f2-fea67da79c8c,ee07b373-da76-43a2-b05e-31a23d6fd8a0,5d179556-61ce-44e8-9436-ce7a5c7b8b15,gid_2018_07_09_oakmlb_houmlb_1,FC,0,
3,344087,2018,457918,147,NYY,3.84,14.0,6.0,L,641820,...,0,0,ec37c36d-2b98-4832-b71c-fd367ef9d4a9,82374928-e870-439e-864a-655395161508,8a2b0aeb-2218-4a4d-9d80-4341f61c1173,5d17ae4f-0501-4b46-a4b2-89bc23ae9714,gid_2018_08_25_nyamlb_balmlb_1,FF,0,
4,344088,2018,457918,147,NYY,3.84,14.0,6.0,L,641820,...,0,0,ec37c36d-2b98-4832-b71c-fd367ef9d4a9,82374928-e870-439e-864a-655395161508,8a2b0aeb-2218-4a4d-9d80-4341f61c1173,5d17ae4f-0501-4b46-a4b2-89bc23ae9714,gid_2018_08_25_nyamlb_balmlb_1,CH,0,


In [3]:
pitch_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 74 columns):
pitch_data_id          10000 non-null int64
season                 10000 non-null int64
pitcher_id             10000 non-null int64
team_id                10000 non-null object
team_abbrev            10000 non-null object
era                    10000 non-null float64
wins                   9962 non-null float64
losses                 9962 non-null float64
throws                 10000 non-null object
b1_id                  10000 non-null int64
b1_team_id             10000 non-null object
b1_stand               10000 non-null object
b1_height              10000 non-null int64
b1_bats                10000 non-null object
b1_avg                 10000 non-null float64
b1_hr                  10000 non-null int64
b1_rbi                 10000 non-null int64
b1_bat_order           9371 non-null float64
b1_game_position       9371 non-null object
p1_pitch_id            10000 non-null objec

In [4]:
cols_to_drop=['p1_pitch_id','p0_pitch_id','result_type','pitch_data_id','season','team_id','game_id','inning_id','half_inning_id','at_bat_id','gid']
# drop unneeded cols
pitch_data = utils.drop_columns_by_list(pitch_data,cols_to_drop)
# drop pickoffs, automatic balls, strikes, etc..
pitch_data = utils.drop_unwanted_pitches(pitch_data)
# set category of cols
pitch_data = utils.set_dtypes(pitch_data)
# drop nans
pitch_data = pitch_data.dropna()
all_data = pitch_data

In [5]:
pitch_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8751 entries, 0 to 9999
Data columns (total 63 columns):
pitcher_id             8751 non-null object
team_abbrev            8751 non-null object
era                    8751 non-null float64
wins                   8751 non-null float64
losses                 8751 non-null float64
throws                 8751 non-null object
b1_id                  8751 non-null object
b1_team_id             8751 non-null object
b1_stand               8751 non-null int64
b1_height              8751 non-null int64
b1_bats                8751 non-null object
b1_avg                 8751 non-null float64
b1_hr                  8751 non-null int64
b1_rbi                 8751 non-null int64
b1_bat_order           8751 non-null float64
b1_game_position       8751 non-null object
p1_pitch_seqno         8751 non-null int64
p0_pitch_seqno         8751 non-null float64
p0_inning              8751 non-null float64
result_type_simple     8751 non-null object
x          

In [6]:
pitch_types = pitch_data.loc[:,'p1_pitch_type']
Y = utils.encode_simple_pitch_types(pitch_types)
X = pitch_data.drop('p1_pitch_type',axis=1)
# one hot encode necessary cols
X = utils.one_hot_encode(X)
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8751 entries, 0 to 9999
Columns: 1260 entries, era to p0_pitch_des_
dtypes: float64(35), int64(16), uint8(1209)
memory usage: 13.6 MB


In [7]:
X_train, X_test, y_train, y_test = utils.split_dataset_into_train_and_test(X,Y)

In [8]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7000 entries, 8831 to 8322
Columns: 1260 entries, era to p0_pitch_des_
dtypes: float64(35), int64(16), uint8(1209)
memory usage: 10.8 MB


## Part2: Train an XGBoost multiclassifier

In [20]:
import models.xgboost_model as xgb_model

In [8]:
xgb_multi_class_model = xgb_model.get_multi_class_classifier_model()

In [11]:
xgb_model.fit_multi_class_model(model=xgb_multi_class_model,x_train=X_train,y_train=y_train,x_test=X_test,y_test=y_test,save_location='./training/xgb_multi_class.bin')


Model Report
Accuracy (Train) : 1.0
Accuracy (Test) : 0.4447544642857143


It looks like we're overfitting. I'll remove the pitcher and batter ids

In [21]:
pitch_data = all_data
# remove the pitch
cols_to_drop=['pitcher_id','b1_id']
# drop unneeded cols
pitch_data = utils.drop_columns_by_list(pitch_data,cols_to_drop)
X = pitch_data.drop('p1_pitch_type',axis=1)
# one hot encode necessary cols
X = utils.one_hot_encode(X)
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8775 entries, 0 to 9998
Columns: 156 entries, era to p0_pitch_des_
dtypes: float64(35), int64(16), uint8(105)
memory usage: 4.4 MB


In [22]:
X_train, X_test, y_train, y_test = utils.split_dataset_into_train_and_test(X,Y)

In [23]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7020 entries, 5478 to 8314
Columns: 156 entries, era to p0_pitch_des_
dtypes: float64(35), int64(16), uint8(105)
memory usage: 3.5 MB


In [24]:
xgb_multi_class_model = xgb_model.get_multi_class_classifier_model()

In [25]:
xgb_model.fit_multi_class_model(model=xgb_multi_class_model,x_train=X_train,y_train=y_train,x_test=X_test,y_test=y_test,save_location='./training/xgb_multi_class_no_pitcher_batter_id.bin',useTrainCV=True)

  if getattr(data, 'base', None) is not None and \



Model Report
Accuracy (Train) : 0.7210826210826211
Accuracy (Test) : 0.4153846153846154


## Part3: Try an NN

In [9]:
pitch_data = all_data
# remove the pitch
cols_to_drop=['pitcher_id','b1_id']
# drop unneeded cols
pitch_data = utils.drop_columns_by_list(pitch_data,cols_to_drop)
X = pitch_data.drop('p1_pitch_type',axis=1)
# one hot encode necessary cols
X = utils.one_hot_encode(X)
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8751 entries, 0 to 9999
Columns: 155 entries, era to p0_pitch_des_
dtypes: float64(35), int64(16), uint8(104)
memory usage: 4.3 MB


In [10]:
import models.nn_model as nn_model

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [12]:
# 16 pitch types
num_pitch_types = 16
num_cols = len(X.iloc[0,:])
X_train, X_test, y_train, y_test = utils.split_dataset_into_train_and_test(X,Y)

In [13]:
model = nn_model.get_multi_class_classifier_model(num_cols,num_pitch_types)

W1208 19:54:53.695750 140020965308224 deprecation_wrapper.py:119] From /home/jpalomares/.local/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W1208 19:54:53.712112 140020965308224 deprecation_wrapper.py:119] From /home/jpalomares/.local/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W1208 19:54:53.714039 140020965308224 deprecation_wrapper.py:119] From /home/jpalomares/.local/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W1208 19:54:53.728638 140020965308224 deprecation_wrapper.py:119] From /home/jpalomares/.local/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.place

In [14]:
score=nn_model.fit_multi_class_model(model,X_train,y_train,X_test,y_test)

ValueError: Error when checking target: expected dense_3 to have shape (16,) but got array with shape (1,)

In [15]:
y_test

2670    2
3183    4
5786    1
7850    2
6448    4
       ..
5695    4
7385    2
4193    2
9023    1
1436    1
Name: p1_pitch_type, Length: 1751, dtype: int64