# Predicting Pitches with Neural Network Model
For our project, we'll aim to solve the problem of predicting the next pitch a Major League Baseball Pitcher will throw during an at bat

In [1]:
import utils.utils as utils
import pandas as pd
import numpy as np

# Set Pandas option to display all columns [scrollable]
pd.set_option('display.max_columns', None)


In [2]:
# Read csv files of saved pitch data from the MLB 2016-2019 seasons
pitch_data = pd.read_csv('raw_pitch_data_all_base_v2.csv', index_col=0)
                         

  mask |= (ar1 == a)


In [3]:
#
# Only keep rows with finite (non-NULL/NaN) label values (p1_pitch_type)
#
pre_filter_rows = len(pitch_data.index)
pitch_data = pitch_data[pd.notnull(pitch_data['p1_pitch_type'])]
post_filter_rows = len(pitch_data.index)

filter_diff = pre_filter_rows - post_filter_rows
filter_pcnt = (filter_diff)/pre_filter_rows

print('Removed Null/NaN labeled pitch types rows, filtered %d of %d rows at %f%%' % (filter_diff, pre_filter_rows, filter_pcnt))

Removed Null/NaN labeled pitch types rows, filtered 33043 of 2906621 rows at 0.011368%


In [4]:
#
# Drop rows with unwanted pitchtypes (including automatic ball/strikes, pitchouts, etc)
#
pre_filter_rows = len(pitch_data.index)
pitch_data = utils.drop_unwanted_pitches(pitch_data)
post_filter_rows = len(pitch_data.index)

filter_diff = pre_filter_rows - post_filter_rows
filter_pcnt = (filter_diff)/pre_filter_rows

print('Removed rows w/ unwanted pitch types, filtered %d of %d rows at %f%%' % (filter_diff, pre_filter_rows, filter_pcnt))

Removed rows w/ unwanted pitch types, filtered 5700 of 2873578 rows at 0.001984%


In [5]:
#pitch_data.info()

In [6]:
#
# Drop unwanted dataset columns 
# 

# ID columns to drop
id_cols_to_drop=['p1_pitch_id','p0_pitch_id','pitch_data_id','team_id','game_id',
                 'inning_id','half_inning_id','at_bat_id','gid','b1_id','b1_team_id',
                 'team_abbrev']
pitch_data = utils.drop_columns_by_list(pitch_data,id_cols_to_drop)


In [7]:
# Pitch data columns to drop
pitch_cols_to_drop=['p0_pitch_seqno','p1_pitch_seqno','p0_inning','result_type',
                    'type_confidence','p0_at_bat_o','p0_pitch_des','nasty',
                    'x','y','sz_top','sz_bot','pfx_x','pfx_z','px','pz',
                    'x0','y0','z0','vx0','vy0','vz0','ax','ay','az','break_y']
pitch_data = utils.drop_columns_by_list(pitch_data,pitch_cols_to_drop)


In [8]:
# Optional pitch data columns to drop
opt_pitch_cols_to_drop=['pitch_count_atbat','pitch_count_team','start_speed','spin_dir']
pitch_data = utils.drop_columns_by_list(pitch_data,opt_pitch_cols_to_drop)


In [9]:
#pitch_data.info()

In [10]:
#
# Create new column of run differential
#
pitch_data['run_diff'] = pitch_data['runs_pitcher_team'] - pitch_data['runs_batter_team']


In [11]:
pitch_data.describe()

Unnamed: 0,season,pitcher_id,era,wins,losses,b1_stand,b1_height,b1_avg,b1_hr,b1_rbi,b1_bat_order,end_speed,break_angle,break_length,zone,spin_rate,outcome,inning,balls,strikes,outs,runs_pitcher_team,runs_batter_team,run_diff
count,2867878.0,2867878.0,2867878.0,2862540.0,2862540.0,2867878.0,2867878.0,2867878.0,2867878.0,2867878.0,2754636.0,2780001.0,2779966.0,2779966.0,2060707.0,2060707.0,2785126.0,2867878.0,2867878.0,2867878.0,2867878.0,2867878.0,2867878.0,2867878.0
mean,2017.519,545786.8,4.23949,3.123467,2.857613,0.5854946,73.10459,0.2512505,7.858821,26.87278,4.71898,81.46273,9.89447,6.618559,9.837344,1735.026,0.6057841,5.002174,0.8861036,0.891177,0.9615751,0.0,0.0,0.0
std,1.12509,74956.9,3.188281,3.44098,2.972095,0.4926365,2.170047,0.07250525,8.09173,23.60062,2.528324,5.331172,22.91872,134.9071,4.067332,688.7583,0.4886817,2.669988,0.9679374,0.8258254,0.8211289,0.0,0.0,0.0
min,2016.0,112526.0,0.0,0.0,0.0,0.0,66.0,0.0,0.0,0.0,0.0,32.4,-90.0,0.1,1.0,2.325,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2017.0,500610.0,2.97,1.0,0.0,0.0,72.0,0.227,2.0,8.0,3.0,78.0,-6.5,4.3,6.0,1234.178,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2018.0,547874.0,4.01,2.0,2.0,1.0,73.0,0.258,5.0,21.0,5.0,82.6,12.2,6.0,11.0,1877.886,1.0,5.0,1.0,1.0,1.0,0.0,0.0,0.0
75%,2019.0,605309.0,4.95,5.0,4.0,1.0,75.0,0.287,12.0,41.0,7.0,85.5,28.8,8.2,13.0,2256.24,1.0,7.0,2.0,2.0,2.0,0.0,0.0,0.0
max,2019.0,681915.0,162.0,22.0,19.0,1.0,82.0,1.0,59.0,133.0,9.0,96.9,269.4,224889.3,14.0,6539.259,1.0,19.0,4.0,2.0,3.0,0.0,0.0,0.0


In [12]:
#
# Drop runs columns (pitcher/batter)
cols_to_drop=['runs_pitcher_team','runs_batter_team']
pitch_data = utils.drop_columns_by_list(pitch_data, cols_to_drop)


In [13]:
pitch_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2867878 entries, 0 to 2906620
Data columns (total 31 columns):
season                 int64
pitcher_id             int64
era                    float64
wins                   float64
losses                 float64
throws                 object
b1_stand               int64
b1_height              int64
b1_bats                object
b1_avg                 float64
b1_hr                  int64
b1_rbi                 int64
b1_bat_order           float64
b1_game_position       object
result_type_simple     object
end_speed              float64
break_angle            float64
break_length           float64
p0_pitch_type          object
zone                   float64
spin_rate              float64
outcome                float64
inning                 int64
balls                  int64
strikes                int64
outs                   int64
is_runner_on_first     bool
is_runner_on_second    bool
is_runner_on_third     bool
p1_pitch_type         

In [14]:
# Set intended data types of the remaining columns
pitch_data = utils.set_dtypes(pitch_data)


In [15]:
pitch_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2867878 entries, 0 to 2906620
Data columns (total 31 columns):
season                 int64
pitcher_id             object
era                    float64
wins                   float64
losses                 float64
throws                 object
b1_stand               int64
b1_height              int64
b1_bats                object
b1_avg                 float64
b1_hr                  int64
b1_rbi                 int64
b1_bat_order           float64
b1_game_position       object
result_type_simple     object
end_speed              float64
break_angle            float64
break_length           float64
p0_pitch_type          object
zone                   float64
spin_rate              float64
outcome                float64
inning                 object
balls                  int64
strikes                int64
outs                   int64
is_runner_on_first     int64
is_runner_on_second    int64
is_runner_on_third     int64
p1_pitch_type    

In [16]:
#
# Replace Nulls/NaN values that are left in the remaining object columns
#
pitch_data['p0_pitch_type'] = pitch_data['p0_pitch_type'].fillna('NP') # 'NP' is for No Pitch

pitch_data['result_type_simple'] = pitch_data['result_type_simple'].fillna('X') # 'X' is for in play 

pitch_data['b1_game_position'] = pitch_data['b1_game_position'].fillna('Unknown')

pitch_data['b1_bats'] = pitch_data['b1_bats'].fillna('R') # 'R' is for right handed (Other values are L or S)

pitch_data['throws'] = pitch_data['throws'].fillna('R') # 'R' is for right handed (Other value is L)

print('Current number of dataframe Null/NaN values: %d' % (pitch_data.isnull().sum().sum()))

Current number of dataframe Null/NaN values: 2084713


In [17]:
#
# Fill the rest of Null/NaN values with zero in numeric columns
#
pitch_data = pitch_data.fillna(0)

print('Current number of dataframe Null/NaN values: %d' % (pitch_data.isnull().sum().sum()))


Current number of dataframe Null/NaN values: 0


In [18]:
#
# Split data into train and test sets
#
pd_train = pitch_data[pitch_data['season']!=2019].copy()
pd_test = pitch_data[pitch_data['season']==2019].copy()

print('Shape of training data set is {}'.format(pd_train.shape))
print('Shape of test data set is {}'.format(pd_test.shape))


Shape of training data set is (2125821, 31)
Shape of test data set is (742057, 31)


In [19]:
# Get the top 3 pitcher ids with the most training pitch data available
pd_train['pitcher_id'].value_counts().nlargest(n=3)


434378    10613
453286    10105
519144     9762
Name: pitcher_id, dtype: int64

In [20]:
# Top pitchers of available training pitch data are
#  1) Justin Verlander (id=434378)
#  2) Max Scherzer (id=453286)
#  3) Rick Porcello (id=519144)

pd_train['pitcher_id'] = pd_train['pitcher_id'].astype(dtype='int64')

pd_train_verlander = pd_train[pd_train['pitcher_id']==434378].copy()
pd_test_verlander = pd_test[pd_test['pitcher_id']==434378].copy()
print('Verlander pitch data rows: train=%d, test=%d.' % (len(pd_train_verlander.index), len(pd_test_verlander.index)))

pd_train_scherzer = pd_train[pd_train['pitcher_id']==453286].copy()
pd_test_scherzer = pd_test[pd_test['pitcher_id']==453286].copy()
print('Scherzer pitch data rows: train=%d, test=%d.' % (len(pd_train_scherzer.index), len(pd_test_scherzer.index)))

pd_train_porcello = pd_train[pd_train['pitcher_id']==519144].copy()
pd_test_porcello = pd_test[pd_test['pitcher_id']==519144].copy()
print('Porcello pitch data rows: train=%d, test=%d.' % (len(pd_train_porcello.index), len(pd_test_porcello.index)))


Verlander pitch data rows: train=10613, test=3256.
Scherzer pitch data rows: train=10105, test=2657.
Porcello pitch data rows: train=9762, test=2960.


In [21]:
#
# Lastly drop season and pitch_id columns
#
cols_to_drop=['season','pitcher_id']

pd_test = utils.drop_columns_by_list(pd_test, cols_to_drop)
pd_train = utils.drop_columns_by_list(pd_train, cols_to_drop)

pd_test_verlander = utils.drop_columns_by_list(pd_test_verlander, cols_to_drop)
pd_train_verlander = utils.drop_columns_by_list(pd_train_verlander, cols_to_drop)

pd_test_scherzer = utils.drop_columns_by_list(pd_test_scherzer, cols_to_drop)
pd_train_scherzer = utils.drop_columns_by_list(pd_train_scherzer, cols_to_drop)

pd_test_porcello = utils.drop_columns_by_list(pd_test_porcello, cols_to_drop)
pd_train_porcello = utils.drop_columns_by_list(pd_train_porcello, cols_to_drop)


## Part2: Train an Neural Network multiclassifier