# Predicting Pitches
For our project, we aim to predict the next pitch a Major League Baseball pitcher will throw during an at-bat.

In [1]:
import utils.utils as utils
import pandas as pd

## Part 1: Preprocessing the data

In [2]:
# Load the pitch dataset through the project helper and preview the first rows.
pitch_data = utils.get_pitch_data()
pitch_data.head()

Unnamed: 0,pitch_data_id,season,pitcher_id,team_id,team_abbrev,era,wins,losses,throws,b1_id,...,is_runner_on_second,is_runner_on_third,runs_pitcher_team,runs_batter_team,game_id,inning_id,half_inning_id,at_bat_id,gid,p1_pitch_type
0,262971,2018,594577,138,STL,3.1,2.0,0.0,R,660162,...,False,False,0,0,18ebcf8e-a95e-440e-afba-0a8109cc8190,e4c0158e-2211-43ac-9fcd-1a2291990694,bcdb17e8-0729-44f1-bee4-32d548d269d9,3a7990d9-3b12-4b2c-b08d-ab17339654c3,gid_2018_07_11_slnmlb_chamlb_1,SL
1,262972,2018,594577,138,STL,3.1,2.0,0.0,R,660162,...,False,False,0,0,18ebcf8e-a95e-440e-afba-0a8109cc8190,e4c0158e-2211-43ac-9fcd-1a2291990694,bcdb17e8-0729-44f1-bee4-32d548d269d9,3a7990d9-3b12-4b2c-b08d-ab17339654c3,gid_2018_07_11_slnmlb_chamlb_1,FF
2,262973,2018,502522,120,WSH,1.42,4.0,0.0,R,607054,...,False,False,0,0,ba1d90b7-1b70-49a7-8ac4-5e3cf2a3aa54,c0dec6a9-c0a9-4ec9-8f72-8e8367080d2c,b3c85d00-dbe8-4b5e-9596-fa855c99fc4f,3a7a53cb-95ba-4542-b4ff-0a304ada8e01,gid_2018_06_19_balmlb_wasmlb_1,FF
3,262974,2018,571945,138,STL,2.8,13.0,3.0,R,467827,...,False,False,0,0,55360ecc-ca53-4f4f-ac6e-9e1870bb63ae,5520a3ea-0c68-44c5-9854-2f6eb523d83c,84f20b62-fde1-468d-8951-ae7f598f1f94,3a7aa9d8-1de0-4c5a-a9e6-882f30d1e727,gid_2018_08_24_slnmlb_colmlb_1,FT
4,262975,2018,571945,138,STL,2.8,13.0,3.0,R,467827,...,False,False,0,0,55360ecc-ca53-4f4f-ac6e-9e1870bb63ae,5520a3ea-0c68-44c5-9854-2f6eb523d83c,84f20b62-fde1-468d-8951-ae7f598f1f94,3a7aa9d8-1de0-4c5a-a9e6-882f30d1e727,gid_2018_08_24_slnmlb_colmlb_1,SL


In [3]:
# Inspect column dtypes and non-null counts of the raw data to spot missing values.
pitch_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 72 columns):
pitch_data_id          10000 non-null int64
season                 10000 non-null int64
pitcher_id             10000 non-null int64
team_id                10000 non-null object
team_abbrev            10000 non-null object
era                    10000 non-null float64
wins                   9981 non-null float64
losses                 9981 non-null float64
throws                 10000 non-null object
b1_id                  10000 non-null int64
b1_team_id             10000 non-null object
b1_stand               10000 non-null object
b1_height              10000 non-null int64
b1_bats                10000 non-null object
b1_avg                 10000 non-null float64
b1_hr                  10000 non-null int64
b1_rbi                 10000 non-null int64
b1_bat_order           9504 non-null float64
b1_game_position       9504 non-null object
p1_pitch_id            10000 non-null objec

In [4]:
# Columns that are not needed for modelling.
cols_to_drop = [
    'p1_pitch_id',
    'p0_pitch_id',
    'result_type',
    'pitch_data_id',
    'season',
    'team_id',
    'game_id',
    'inning_id',
    'half_inning_id',
    'at_bat_id',
    'gid',
]
# Clean in stages: drop the unneeded columns, drop pickoff attempts,
# let the helper set the column dtypes, then discard rows containing NaNs.
pitch_data = (
    pitch_data
    .pipe(utils.drop_columns_by_list, cols_to_drop)
    .pipe(utils.drop_pickoffs)
    .pipe(utils.set_dtypes)
    .dropna()
)
# Keep a second reference to the cleaned frame so later cells can start over from it.
all_data = pitch_data

In [5]:
# Re-inspect dtypes and non-null counts after the cleaning steps.
pitch_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8872 entries, 0 to 9999
Data columns (total 61 columns):
pitcher_id             8872 non-null object
team_abbrev            8872 non-null object
era                    8872 non-null float64
wins                   8872 non-null float64
losses                 8872 non-null float64
throws                 8872 non-null object
b1_id                  8872 non-null object
b1_team_id             8872 non-null object
b1_stand               8872 non-null int64
b1_height              8872 non-null int64
b1_bats                8872 non-null object
b1_avg                 8872 non-null float64
b1_hr                  8872 non-null int64
b1_rbi                 8872 non-null int64
b1_bat_order           8872 non-null float64
b1_game_position       8872 non-null object
p1_pitch_seqno         8872 non-null int64
p0_pitch_seqno         8872 non-null float64
p0_inning              8872 non-null float64
result_type_simple     8872 non-null object
x          

In [6]:
# Target labels: the p1_pitch_type column, encoded by the project helper.
pitch_types = pitch_data['p1_pitch_type']
Y = utils.encode_pitch_types(pitch_types)
# Features: every remaining column, one-hot encoded where the helper deems it necessary.
X = utils.one_hot_encode(pitch_data.drop(columns='p1_pitch_type'))
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8872 entries, 0 to 9999
Columns: 1254 entries, era to inning_15
dtypes: float64(35), int64(15), uint8(1204)
memory usage: 13.6 MB


In [7]:
# Split the features and encoded labels into train and test sets via the project helper.
# NOTE(review): the recorded X_train.info() output reports fewer columns than X.info()
# (1148 vs 1254) — confirm whether the helper drops columns or the outputs come from different runs.
X_train, X_test, y_train, y_test = utils.split_dataset_into_train_and_test(X,Y)

In [8]:
# Inspect the size and dtypes of the training features.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7166 entries, 6012 to 8143
Columns: 1148 entries, era to inning_14
dtypes: float64(35), int64(15), uint8(1098)
memory usage: 10.3 MB


## Part 2: Train an XGBoost multiclass classifier

In [7]:
import models.xgboost_model as xgb_model

In [8]:
# Build the multi-class classifier via the project helper; no arguments are passed,
# so its configuration is whatever models.xgboost_model defines.
xgb_multi_class_model = xgb_model.get_multi_class_classifier_model()

In [11]:
# Fit the model on the training split; the test split and a save path are passed too.
# NOTE(review): presumably the helper evaluates on the test split and writes the model
# to save_location — confirm in models.xgboost_model.
xgb_model.fit_multi_class_model(
    model=xgb_multi_class_model,
    x_train=X_train,
    y_train=y_train,
    x_test=X_test,
    y_test=y_test,
    save_location='./training/xgb_multi_class.bin',
)


Model Report
Accuracy (Train) : 1.0
Accuracy (Test) : 0.4447544642857143


The model is overfitting: training accuracy is 1.0 while test accuracy is only about 0.44. To reduce this, I'll remove the pitcher and batter IDs.

In [9]:
# Start again from the cleaned snapshot. Work on a copy so that `all_data` stays
# intact even if the helper below drops columns in place (its implementation is
# not visible here); this keeps the cell safe to re-run.
pitch_data = all_data.copy()
# remove the pitcher and batter id columns
cols_to_drop = ['pitcher_id', 'b1_id']
pitch_data = utils.drop_columns_by_list(pitch_data, cols_to_drop)
# Rebuild the feature matrix without the target column. Y from the earlier
# encoding cell is reused — assumes the helper above removes columns only, not rows.
X = pitch_data.drop('p1_pitch_type', axis=1)
# one hot encode necessary cols
X = utils.one_hot_encode(X)
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8872 entries, 0 to 9999
Columns: 155 entries, era to inning_15
dtypes: float64(35), int64(15), uint8(105)
memory usage: 4.3 MB


In [10]:
# Re-split using the reduced feature set (pitcher and batter ids removed) and the same encoded labels Y.
X_train, X_test, y_train, y_test = utils.split_dataset_into_train_and_test(X,Y)