# Predicting Pitches
For our project, we aim to predict the next pitch a Major League Baseball pitcher will throw during an at-bat.

In [17]:
import utils.utils as utils
import pandas as pd

## Part 1: Preprocessing the data

In [18]:
# Load the pitch dataset through the project helper and preview the first rows.
pitch_data = utils.get_pitch_data()
pitch_data.head()

Unnamed: 0,pitch_data_id,season,pitcher_id,team_id,team_abbrev,era,wins,losses,throws,b1_id,...,is_runner_on_second,is_runner_on_third,runs_pitcher_team,runs_batter_team,game_id,inning_id,half_inning_id,at_bat_id,gid,p1_pitch_type
0,61213,2018,456501,137,SF,0.45,1.0,0.0,R,545361,...,False,False,0,0,0863a476-aa02-4aa3-b28b-8c6f20aef475,5dbdbd3e-13d6-4be5-9be8-8b0c23b241bd,7cb16c94-a5b0-4655-9aff-f20d2770ed57,6c20aadb-e47c-4d47-8c9c-d74c9f990553,gid_2018_04_22_sfnmlb_anamlb_1,CH
1,61214,2018,456501,137,SF,0.45,1.0,0.0,R,545361,...,False,False,0,0,0863a476-aa02-4aa3-b28b-8c6f20aef475,5dbdbd3e-13d6-4be5-9be8-8b0c23b241bd,7cb16c94-a5b0-4655-9aff-f20d2770ed57,6c20aadb-e47c-4d47-8c9c-d74c9f990553,gid_2018_04_22_sfnmlb_anamlb_1,FF
2,61215,2018,456501,137,SF,0.45,1.0,0.0,R,545361,...,False,False,0,0,0863a476-aa02-4aa3-b28b-8c6f20aef475,5dbdbd3e-13d6-4be5-9be8-8b0c23b241bd,7cb16c94-a5b0-4655-9aff-f20d2770ed57,6c20aadb-e47c-4d47-8c9c-d74c9f990553,gid_2018_04_22_sfnmlb_anamlb_1,SL
3,61216,2018,456501,137,SF,0.45,1.0,0.0,R,545361,...,False,False,0,0,0863a476-aa02-4aa3-b28b-8c6f20aef475,5dbdbd3e-13d6-4be5-9be8-8b0c23b241bd,7cb16c94-a5b0-4655-9aff-f20d2770ed57,6c20aadb-e47c-4d47-8c9c-d74c9f990553,gid_2018_04_22_sfnmlb_anamlb_1,CH
4,61217,2018,607215,118,KC,9.31,0.0,2.0,L,547982,...,False,False,0,0,099c115c-e56d-40d8-938d-30ca878ad873,ccbbd868-1412-448a-8efc-354c0f242555,e89f8a7c-bf1e-4b78-b5a7-73477e6730b0,6c213c7a-efa9-4683-b0c7-7d92621c46d3,gid_2018_04_22_kcamlb_detmlb_1,FF


In [19]:
# Summarize the raw frame: column names, non-null counts and dtypes.
pitch_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 72 columns):
pitch_data_id          10000 non-null int64
season                 10000 non-null int64
pitcher_id             10000 non-null int64
team_id                10000 non-null object
team_abbrev            10000 non-null object
era                    10000 non-null float64
wins                   9990 non-null float64
losses                 9990 non-null float64
throws                 10000 non-null object
b1_id                  10000 non-null int64
b1_team_id             10000 non-null object
b1_stand               10000 non-null object
b1_height              10000 non-null int64
b1_bats                10000 non-null object
b1_avg                 10000 non-null float64
b1_hr                  10000 non-null int64
b1_rbi                 10000 non-null int64
b1_bat_order           9449 non-null float64
b1_game_position       9449 non-null object
p1_pitch_id            10000 non-null objec

In [20]:
# Columns that are not needed as model inputs.
cols_to_drop = [
    'pitch_data_id',
    'season',
    'team_id',
    'game_id',
    'inning_id',
    'half_inning_id',
    'at_bat_id',
    'gid',
]

# Cleaning pipeline, applied in order:
#   1. drop the unneeded columns listed above
#   2. drop pickoff attempts
#   3. set the dtype/category of the remaining columns
#   4. drop every row that still contains a NaN
# NOTE(review): this rebinds `pitch_data`, so re-running the cell without
# re-running the load cell feeds already-cleaned data back through the
# pipeline — confirm the helpers tolerate that, or restart from the load cell.
pitch_data = (
    pitch_data
    .pipe(utils.drop_columns_by_list, cols_to_drop)
    .pipe(utils.drop_pickoffs)
    .pipe(utils.set_dtypes)
    .dropna()
)


In [21]:
# Re-inspect the frame after cleaning to confirm the remaining columns, dtypes and row count.
pitch_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8812 entries, 0 to 9999
Data columns (total 64 columns):
pitcher_id             8812 non-null object
team_abbrev            8812 non-null object
era                    8812 non-null float64
wins                   8812 non-null float64
losses                 8812 non-null float64
throws                 8812 non-null object
b1_id                  8812 non-null object
b1_team_id             8812 non-null object
b1_stand               8812 non-null int64
b1_height              8812 non-null int64
b1_bats                8812 non-null object
b1_avg                 8812 non-null float64
b1_hr                  8812 non-null int64
b1_rbi                 8812 non-null int64
b1_bat_order           8812 non-null float64
b1_game_position       8812 non-null object
p1_pitch_id            8812 non-null object
p0_pitch_id            8812 non-null object
p1_pitch_seqno         8812 non-null int64
p0_pitch_seqno         8812 non-null float64
p0_inning   

In [22]:
# The label is the type of the pitch being predicted.
target_col = 'p1_pitch_type'

# Labels: pull the target column and encode it with the project helper.
pitch_types = pitch_data[target_col]
Y = utils.encode_pitch_types(pitch_types)

# Features: everything except the target, then one-hot encode the necessary columns.
# NOTE(review): the summary below reports 18,650 uint8 dummy columns for 8,812 rows.
# Object-typed id columns (pitcher_id, b1_id, p1_pitch_id, p0_pitch_id) are still
# present at this point — presumably they are being expanded; confirm that is intended.
X = utils.one_hot_encode(pitch_data.drop(columns=target_col))
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8812 entries, 0 to 9999
Columns: 18700 entries, era to inning_15
dtypes: float64(35), int64(15), uint8(18650)
memory usage: 160.2 MB


In [23]:
# Partition the features and encoded labels into train and test sets via the project helper.
# NOTE(review): no seed or split ratio is passed here — confirm the helper fixes a
# random state so the split is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = utils.split_dataset_into_train_and_test(X,Y)

In [24]:
# Check the size and dtypes of the training partition.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7049 entries, 3251 to 8313
Columns: 18700 entries, era to inning_15
dtypes: float64(35), int64(15), uint8(18650)
memory usage: 128.1 MB


## Part 2: Train an XGBoost multiclass classifier

In [25]:
import models.xgboost_model as xgb_model

In [26]:
# Build the multi-class classifier through the project helper; no arguments are
# passed, so its configuration comes entirely from the helper.
xgb_multi_class_model = xgb_model.get_multi_class_classifier_model()

In [None]:
# Fit the classifier on the training split; the save path is passed to the helper.
# NOTE(review): the test split is handed to the training helper. If it is used for
# early stopping or model selection, scores on that split will be optimistic —
# confirm, and consider a separate validation split.
# NOTE(review): assumes the ./training directory already exists — verify.
xgb_model.fit_multi_class_model(
    model=xgb_multi_class_model,
    x_train=X_train,
    y_train=y_train,
    x_test=X_test,
    y_test=y_test,
    save_location='./training/xgb_multi_class.bin',
)