In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder 
from sklearn.model_selection import cross_val_score
# Never truncate wide DataFrames when they are displayed: show every column.
pd.options.display.max_columns = None

In [2]:
# Load the dataset from the CSV file in the working directory.
DATA_PATH = 'top50.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 574057 entries, 0 to 574056
Data columns (total 22 columns):
Unnamed: 0        574057 non-null int64
Pitch_Type        574057 non-null int64
type_of_pitch     574057 non-null object
abid              574057 non-null int64
Pitcher_name      574057 non-null object
Batter_name       574057 non-null object
inning            574057 non-null int64
run_difference    574057 non-null int64
pitcher_score     574057 non-null int64
batter_score      574057 non-null int64
ball_count        574057 non-null int64
strike_count      574057 non-null int64
outs              574057 non-null int64
pitch_num_ab      574057 non-null int64
on_1b             574057 non-null int64
on_2b             574057 non-null int64
on_3b             574057 non-null int64
pitcher_throws    574057 non-null int64
batter_stance     574057 non-null int64
pitcher_id        574057 non-null int64
batter_id         574057 non-null int64
top               574057 non-null int64
dtypes

In [4]:
# Number of rows recorded for each pitcher, most frequent first.
df.loc[:, 'Pitcher_name'].value_counts()

Chris Archer         13643
Chris Sale           13590
Max Scherzer         13092
Jeff Samardzija      13028
Jose Quintana        12971
Justin Verlander     12872
Rick Porcello        12862
Gio Gonzalez         12759
Jon Lester           12581
Julio Teheran        12343
Corey Kluber         12318
Marco Estrada        12232
Trevor Bauer         12154
Carlos Martinez      12071
Wade Miley           12027
Jake Arrieta         11996
Zack Greinke         11995
Gerrit Cole          11711
Kevin Gausman        11680
Jacob deGrom         11643
Johnny Cueto         11586
Tanner Roark         11555
Ian Kennedy          11533
John Lackey          11438
Jason Hammel         11374
Jake Odorizzi        11289
Jimmy Nelson         11247
Cole Hamels          11194
Carlos Carrasco      11081
Ervin Santana        11059
Mike Fiers           11048
Kyle Gibson          10882
Masahiro Tanaka      10838
Mike Leake           10834
Robbie Ray           10829
J.A. Happ            10820
Andrew Cashner       10716
U

### Testing on single Pitcher: Madison Bumgarner (10,162 pitches)

In [5]:
# Keep only the rows thrown by Madison Bumgarner.
is_bumgarner = df['Pitcher_name'] == 'Madison Bumgarner'
dfmb = df.loc[is_bumgarner]

In [6]:
# Preview the first ten rows of the single-pitcher subset.
dfmb.head(n=10)

Unnamed: 0.1,Unnamed: 0,Pitch_Type,type_of_pitch,abid,Pitcher_name,Batter_name,inning,run_difference,pitcher_score,batter_score,ball_count,strike_count,outs,pitch_num_ab,on_1b,on_2b,on_3b,pitcher_throws,batter_stance,pitcher_id,batter_id,top
1072,3167,1,FT,847,Madison Bumgarner,A.J. Pollock,1,0,0,0,0,0,0,1,0,0,0,1,0,518516,572041,0
1073,3168,1,FT,848,Madison Bumgarner,Ender Inciarte,1,0,0,0,0,0,1,1,0,0,0,1,1,518516,542255,0
1074,3169,1,FF,848,Madison Bumgarner,Ender Inciarte,1,0,0,0,1,0,1,2,0,0,0,1,1,518516,542255,0
1075,3170,1,FF,848,Madison Bumgarner,Ender Inciarte,1,0,0,0,1,1,1,3,0,0,0,1,1,518516,542255,0
1076,3171,1,FF,848,Madison Bumgarner,Ender Inciarte,1,0,0,0,2,1,1,4,0,0,0,1,1,518516,542255,0
1077,3172,3,CU,848,Madison Bumgarner,Ender Inciarte,1,0,0,0,2,2,1,5,0,0,0,1,1,518516,542255,0
1078,3173,3,SL,849,Madison Bumgarner,Paul Goldschmidt,1,0,0,0,0,0,1,1,1,0,0,1,0,518516,502671,0
1079,3174,1,FF,849,Madison Bumgarner,Paul Goldschmidt,1,0,0,0,1,0,1,2,1,0,0,1,0,518516,502671,0
1080,3175,1,FF,849,Madison Bumgarner,Paul Goldschmidt,1,0,0,0,1,1,1,3,1,0,0,1,0,518516,502671,0
1081,3176,3,SL,849,Madison Bumgarner,Paul Goldschmidt,1,0,0,0,1,2,1,4,1,0,0,1,0,518516,502671,0


In [7]:
# Pairwise correlations over the numeric columns only. The numeric dtypes are
# selected explicitly because dfmb still contains text columns
# (type_of_pitch, Pitcher_name, Batter_name): older pandas dropped them
# silently inside corr(), while pandas >= 2.0 raises on them instead.
corr_matrix = dfmb.select_dtypes(include='number').corr()
#looking at each attribute's correlation
# Columns that are constant within this single-pitcher subset have zero
# variance, so their correlation with Pitch_Type comes out as NaN.
corr_matrix['Pitch_Type'].sort_values(ascending=False)

Pitch_Type        1.000000
strike_count      0.119269
pitch_num_ab      0.097534
pitcher_score     0.069847
abid              0.063339
run_difference    0.051084
ball_count        0.032018
inning            0.024126
batter_score      0.024092
top               0.019988
on_3b             0.003236
on_1b            -0.001471
outs             -0.004817
on_2b            -0.005157
batter_id        -0.062651
batter_stance    -0.067145
Unnamed: 0       -0.258268
pitcher_throws         NaN
pitcher_id             NaN
Name: Pitch_Type, dtype: float64

In [8]:
# Split data: choose the predictor columns and the target, then hold out 30%
# of the rows for testing (fixed seed so the split is repeatable).
feature_cols = [
    'inning',
    'ball_count',
    'strike_count',
    'on_1b',
    'on_2b',
    'on_3b',
    'run_difference',
    'pitch_num_ab',
    'batter_stance',
]
X = dfmb[feature_cols]
y = dfmb['Pitch_Type']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.30,
    random_state=123,
)

X_train.head()

Unnamed: 0,inning,ball_count,strike_count,on_1b,on_2b,on_3b,run_difference,pitch_num_ab,batter_stance
94602,2,3,2,0,0,0,2,7,1
546155,6,2,2,1,0,0,-1,5,0
183491,1,0,0,0,0,0,0,1,1
425575,5,0,1,0,0,0,2,2,0
125037,1,2,1,0,0,0,0,4,0


In [9]:
# Train model: build the (not yet fitted) decision tree classifier.
# The split criterion may be 'gini' (the default) or 'entropy' (information
# gain); the tree is capped at depth 5 and seeded for repeatability.
clf = DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_depth=5,
    random_state=43,
)

In [10]:
# Fit the decision tree on the training split.
clf.fit(X=X_train, y=y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=43, splitter='best')

In [11]:
# Predict the pitch class for every training row and peek at the first ten.
y_pred = clf.predict(X=X_train)
y_pred[:10]

array([1, 1, 1, 3, 1, 1, 1, 1, 1, 1])

In [12]:
# Per-class probability estimates for each training row (one column per class).
y_pred_proba = clf.predict_proba(X=X_train)
y_pred_proba

array([[0.59677419, 0.01612903, 0.38709677],
       [0.67460317, 0.0026455 , 0.32275132],
       [0.85995086, 0.02457002, 0.11547912],
       ...,
       [0.93103448, 0.        , 0.06896552],
       [0.90384615, 0.        , 0.09615385],
       [0.514862  , 0.09023355, 0.39490446]])

In [13]:
# Accuracy = correct predictions / total number of evaluated instances,
# measured here on the training split.
dt_train_acc = clf.score(X=X_train, y=y_train)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'.format(dt_train_acc))

Accuracy of Decision Tree classifier on training set: 0.62


In [14]:
# Rows are the true classes, columns the predicted classes (training split).
confusion_matrix(y_true=y_train, y_pred=y_pred)

array([[4008,    0,  263],
       [ 245,    0,   25],
       [2159,    0,  413]])

In [15]:
# Per-class precision / recall / F1 on the training split.
dt_train_report = classification_report(y_true=y_train, y_pred=y_pred)
print(dt_train_report)

              precision    recall  f1-score   support

           1       0.63      0.94      0.75      4271
           2       0.00      0.00      0.00       270
           3       0.59      0.16      0.25      2572

    accuracy                           0.62      7113
   macro avg       0.40      0.37      0.33      7113
weighted avg       0.59      0.62      0.54      7113



  'precision', 'predicted', average, warn_for)


In [16]:
# Accuracy of the fitted tree on the held-out test split.
dt_test_acc = clf.score(X=X_test, y=y_test)
print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(dt_test_acc))

Accuracy of Decision Tree classifier on test set: 0.59


### Random Forest

In [17]:
# Create the (not yet fitted) random forest: 100 bootstrapped gini trees,
# at least 3 samples per leaf, no class re-weighting, fixed seed.
rf_params = dict(
    n_estimators=100,
    criterion='gini',
    min_samples_leaf=3,
    bootstrap=True,
    class_weight=None,
    random_state=123,
)
rf = RandomForestClassifier(**rf_params)

In [18]:
# Fit the random forest on the training split.
rf.fit(X=X_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=123,
                       verbose=0, warm_start=False)

In [19]:
# Print the forest's feature importances; values are in the same order as
# the columns of X_train.
rf_importances = rf.feature_importances_
print(rf_importances)

[0.2790715  0.07485868 0.06065769 0.06197872 0.0464945  0.0296627
 0.28416738 0.11565127 0.04745755]


In [20]:
# Predict the pitch class (fastball, offspeed or breaking ball) for every
# training row with the random forest.
y_pred = rf.predict(X=X_train)

In [21]:
# Per-class probability estimates (fastball, offspeed, breaking ball) for
# every training row.
y_pred_proba = rf.predict_proba(X=X_train)

In [22]:
# Evaluate model: accuracy of the random forest on the training split.
rf_train_acc = rf.score(X=X_train, y=y_train)
print('Accuracy of random forest classifier on training set: {:.2f}'.format(rf_train_acc))

Accuracy of random forest classifier on training set: 0.73


In [23]:
# Confusion matrix on the training split: rows are true classes, columns
# are predicted classes.
rf_train_cm = confusion_matrix(y_true=y_train, y_pred=y_pred)
print(rf_train_cm)

[[3829    5  437]
 [ 197   14   59]
 [1255    3 1314]]


In [24]:
# Create a classification report (per-class precision / recall / F1) for the
# training split.
rf_train_report = classification_report(y_true=y_train, y_pred=y_pred)
print(rf_train_report)

              precision    recall  f1-score   support

           1       0.73      0.90      0.80      4271
           2       0.64      0.05      0.10       270
           3       0.73      0.51      0.60      2572

    accuracy                           0.73      7113
   macro avg       0.70      0.49      0.50      7113
weighted avg       0.72      0.73      0.70      7113



In [25]:
# Test model: accuracy of the fitted random forest on the held-out test split.
rf_test_acc = rf.score(X=X_test, y=y_test)
print('Accuracy of random forest classifier on test set: {:.2f}'.format(rf_test_acc))

Accuracy of random forest classifier on test set: 0.59


In [26]:
# Preview the first five rows of the full (all-pitcher) frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Pitch_Type,type_of_pitch,abid,Pitcher_name,Batter_name,inning,run_difference,pitcher_score,batter_score,ball_count,strike_count,outs,pitch_num_ab,on_1b,on_2b,on_3b,pitcher_throws,batter_stance,pitcher_id,batter_id,top
0,0,1,FF,1,Jon Lester,Matt Carpenter,1,0,0,0,0,0,0,1,0,0,0,1,1,452657,572761,1
1,1,1,FF,1,Jon Lester,Matt Carpenter,1,0,0,0,0,1,0,2,0,0,0,1,1,452657,572761,1
2,2,1,FF,1,Jon Lester,Matt Carpenter,1,0,0,0,0,2,0,3,0,0,0,1,1,452657,572761,1
3,3,1,FF,1,Jon Lester,Matt Carpenter,1,0,0,0,0,2,0,4,0,0,0,1,1,452657,572761,1
4,4,3,CU,1,Jon Lester,Matt Carpenter,1,0,0,0,1,2,0,5,0,0,0,1,1,452657,572761,1


## One Hot encoding Pitchers

In [27]:
# One-hot encode the pitcher identity: Pitcher_name is replaced by one
# indicator column per pitcher.
# The dtype is pinned to uint8 so the indicators stay 0/1 integers on every
# pandas version (pandas >= 2.0 would otherwise default them to bool).
df1 = pd.get_dummies(df, columns=["Pitcher_name"], dtype=np.uint8)

In [28]:
# Rebind df to the one-hot encoded frame; from here on df no longer carries
# the original Pitcher_name column.
df=df1

In [29]:
# Remove the columns that are not used as model inputs, leaving the target,
# the game-situation columns and the pitcher indicator columns.
unused_cols = [
    'Unnamed: 0',
    'Batter_name',
    'abid',
    'type_of_pitch',
    'pitcher_score',
    'batter_score',
    'pitcher_throws',
    'pitcher_id',
    'batter_id',
    'top',
]
df = df.drop(columns=unused_cols)

In [30]:
# Preview the first five rows of the encoded, trimmed frame.
df.head(n=5)

Unnamed: 0,Pitch_Type,inning,run_difference,ball_count,strike_count,outs,pitch_num_ab,on_1b,on_2b,on_3b,batter_stance,Pitcher_name_Andrew Cashner,Pitcher_name_CC Sabathia,Pitcher_name_Carlos Carrasco,Pitcher_name_Carlos Martinez,Pitcher_name_Chris Archer,Pitcher_name_Chris Sale,Pitcher_name_Clayton Kershaw,Pitcher_name_Cole Hamels,Pitcher_name_Corey Kluber,Pitcher_name_Dallas Keuchel,Pitcher_name_Drew Pomeranz,Pitcher_name_Ervin Santana,Pitcher_name_Gerrit Cole,Pitcher_name_Gio Gonzalez,Pitcher_name_Ian Kennedy,Pitcher_name_J.A. Happ,Pitcher_name_Jacob deGrom,Pitcher_name_Jake Arrieta,Pitcher_name_Jake Odorizzi,Pitcher_name_James Shields,Pitcher_name_Jason Hammel,Pitcher_name_Jeff Samardzija,Pitcher_name_Jeremy Hellickson,Pitcher_name_Jimmy Nelson,Pitcher_name_John Lackey,Pitcher_name_Johnny Cueto,Pitcher_name_Jon Lester,Pitcher_name_Jose Quintana,Pitcher_name_Julio Teheran,Pitcher_name_Justin Verlander,Pitcher_name_Kevin Gausman,Pitcher_name_Kyle Gibson,Pitcher_name_Kyle Hendricks,Pitcher_name_Madison Bumgarner,Pitcher_name_Marco Estrada,Pitcher_name_Martin Perez,Pitcher_name_Masahiro Tanaka,Pitcher_name_Max Scherzer,Pitcher_name_Michael Wacha,Pitcher_name_Mike Fiers,Pitcher_name_Mike Leake,Pitcher_name_Rick Porcello,Pitcher_name_Robbie Ray,Pitcher_name_Sonny Gray,Pitcher_name_Taijuan Walker,Pitcher_name_Tanner Roark,Pitcher_name_Trevor Bauer,Pitcher_name_Ubaldo Jimenez,Pitcher_name_Wade Miley,Pitcher_name_Zack Greinke
0,1,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,1,0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,2,0,3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,1,0,0,2,0,4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,3,1,0,1,2,0,5,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [31]:
#df.to_csv('OHtop50.csv')

In [52]:
#Split Data
# Features: every column of the encoded frame except the target.
X = df.drop(columns=['Pitch_Type'])
# Target as a 1-D Series (single brackets), matching the single-pitcher split
# earlier in the notebook. A one-column DataFrame (double brackets) is a 2-D
# column vector, which scikit-learn classifiers either warn about or treat as
# a multi-output target.
y = df['Pitch_Type']

# 70/30 train/test split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=123)

X_train.head()

Unnamed: 0,inning,run_difference,ball_count,strike_count,outs,pitch_num_ab,on_1b,on_2b,on_3b,batter_stance,Pitcher_name_Andrew Cashner,Pitcher_name_CC Sabathia,Pitcher_name_Carlos Carrasco,Pitcher_name_Carlos Martinez,Pitcher_name_Chris Archer,Pitcher_name_Chris Sale,Pitcher_name_Clayton Kershaw,Pitcher_name_Cole Hamels,Pitcher_name_Corey Kluber,Pitcher_name_Dallas Keuchel,Pitcher_name_Drew Pomeranz,Pitcher_name_Ervin Santana,Pitcher_name_Gerrit Cole,Pitcher_name_Gio Gonzalez,Pitcher_name_Ian Kennedy,Pitcher_name_J.A. Happ,Pitcher_name_Jacob deGrom,Pitcher_name_Jake Arrieta,Pitcher_name_Jake Odorizzi,Pitcher_name_James Shields,Pitcher_name_Jason Hammel,Pitcher_name_Jeff Samardzija,Pitcher_name_Jeremy Hellickson,Pitcher_name_Jimmy Nelson,Pitcher_name_John Lackey,Pitcher_name_Johnny Cueto,Pitcher_name_Jon Lester,Pitcher_name_Jose Quintana,Pitcher_name_Julio Teheran,Pitcher_name_Justin Verlander,Pitcher_name_Kevin Gausman,Pitcher_name_Kyle Gibson,Pitcher_name_Kyle Hendricks,Pitcher_name_Madison Bumgarner,Pitcher_name_Marco Estrada,Pitcher_name_Martin Perez,Pitcher_name_Masahiro Tanaka,Pitcher_name_Max Scherzer,Pitcher_name_Michael Wacha,Pitcher_name_Mike Fiers,Pitcher_name_Mike Leake,Pitcher_name_Rick Porcello,Pitcher_name_Robbie Ray,Pitcher_name_Sonny Gray,Pitcher_name_Taijuan Walker,Pitcher_name_Tanner Roark,Pitcher_name_Trevor Bauer,Pitcher_name_Ubaldo Jimenez,Pitcher_name_Wade Miley,Pitcher_name_Zack Greinke
363984,2,3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
169610,6,4,1,1,0,3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
476647,8,3,0,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
216049,2,1,1,1,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
166455,6,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [53]:
# Train model: build the (not yet fitted) decision tree for the all-pitcher
# data. The split criterion may be 'gini' (the default) or 'entropy'
# (information gain). No max_depth is given here, so tree depth is not capped.
clf = DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    random_state=43,
)

In [54]:
# Fit the decision tree on the all-pitcher training split.
clf.fit(X=X_train, y=y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=43, splitter='best')

In [55]:
# Predict the pitch class for every training row and peek at the first five.
y_pred = clf.predict(X=X_train)
y_pred[:5]

array([1, 3, 1, 3, 2])

In [56]:
# Per-class probability estimates for each training row (one column per class).
y_pred_proba = clf.predict_proba(X=X_train)
y_pred_proba

array([[1.        , 0.        , 0.        ],
       [0.16666667, 0.33333333, 0.5       ],
       [1.        , 0.        , 0.        ],
       ...,
       [0.5       , 0.        , 0.5       ],
       [1.        , 0.        , 0.        ],
       [0.        , 0.        , 1.        ]])

In [57]:
# Accuracy = correct predictions / total number of evaluated instances,
# measured here on the training split.
dt_all_train_acc = clf.score(X=X_train, y=y_train)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'.format(dt_all_train_acc))

Accuracy of Decision Tree classifier on training set: 0.87


In [58]:
# Rows are the true classes, columns the predicted classes (training split).
confusion_matrix(y_true=y_train, y_pred=y_pred)

array([[210355,   2360,   7826],
       [ 11838,  38381,   2247],
       [ 23526,   3356, 101950]])

In [59]:
# Per-class precision / recall / F1 on the training split.
dt_all_train_report = classification_report(y_true=y_train, y_pred=y_pred)
print(dt_all_train_report)

              precision    recall  f1-score   support

           1       0.86      0.95      0.90    220541
           2       0.87      0.73      0.79     52466
           3       0.91      0.79      0.85    128832

    accuracy                           0.87    401839
   macro avg       0.88      0.83      0.85    401839
weighted avg       0.88      0.87      0.87    401839



In [60]:
# Accuracy of the fitted tree on the held-out test split.
dt_all_test_acc = clf.score(X=X_test, y=y_test)
print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(dt_all_test_acc))

Accuracy of Decision Tree classifier on test set: 0.64
