In [25]:
import pandas as pd
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
pd.set_option('display.max_columns', None)

In [2]:
dftrain= pd.read_csv('dftrain.csv')

In [5]:
#switching batters stance and pitcher throw to numerical 
#Had to fix some features: stance, pitcher's hand
dftrain.pitcher_throws = dftrain.pitcher_throws.replace(['R', 'L'], [0,1])
dftrain.batter_stance = dftrain.batter_stance.replace(['R', 'L'], [0,1])

In [11]:
dftrain.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Pitch_Type,type_of_pitch,Pitcher_name,Batter_name,inning,run_difference,pitcher_score,batter_score,ball_count,strike_count,outs,pitch_num_ab,on_1b,on_2b,on_3b,pitcher_throws,batter_stance,weather,pitcher_id,batter_id,top,year
0,0,0,1,FF,Jon Lester,Matt Carpenter,1,0,0,0,0,0,0,1,0,0,0,1,1,"44 degrees, clear",452657,572761,True,2015
1,1,1,1,FF,Jon Lester,Matt Carpenter,1,0,0,0,0,1,0,2,0,0,0,1,1,"44 degrees, clear",452657,572761,True,2015
2,2,2,1,FF,Jon Lester,Matt Carpenter,1,0,0,0,0,2,0,3,0,0,0,1,1,"44 degrees, clear",452657,572761,True,2015
3,3,3,1,FF,Jon Lester,Matt Carpenter,1,0,0,0,0,2,0,4,0,0,0,1,1,"44 degrees, clear",452657,572761,True,2015
4,4,4,3,CU,Jon Lester,Matt Carpenter,1,0,0,0,1,2,0,5,0,0,0,1,1,"44 degrees, clear",452657,572761,True,2015


In [12]:
dfall= dftrain[['Pitch_Type', 'inning', 'ball_count', 'strike_count', 'on_1b', 'on_2b', 'on_3b', 'run_difference', 'batter_stance', 'pitcher_throws']]

In [14]:
#Split Data
X= dfall[['ball_count', 'inning', 'strike_count', 'on_1b', 'on_2b', 'on_3b', 'run_difference', 'batter_stance', 'pitcher_throws']]
y=dfall['Pitch_Type']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)

X_train.head()

Unnamed: 0,ball_count,inning,strike_count,on_1b,on_2b,on_3b,run_difference,batter_stance,pitcher_throws
780086,3,8,2,1,1,0,-1,1,0
578325,0,5,1,0,0,0,-2,0,1
705263,2,8,1,0,0,0,4,0,0
1047168,1,2,0,0,0,0,1,1,0
1212563,1,7,1,0,0,0,1,1,0


### Decision Tree

In [15]:
#Train Model

#Create the Decision Tree Object

# for classification you can change the algorithm to gini or entropy (information gain).  Default is gini.
clf = DecisionTreeClassifier(criterion='gini', random_state=43, splitter= 'best', max_depth= 10  )

In [16]:
#Fit the model to the training data

clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=43, splitter='best')

In [17]:
#guess pitch

y_pred = clf.predict(X_train)
y_pred[0:10]

array([1, 1, 1, 1, 1, 1, 3, 1, 3, 1])

In [18]:
#Estimate the probability of a pitch type

y_pred_proba = clf.predict_proba(X_train)
y_pred_proba

array([[0.53664921, 0.08900524, 0.37434555],
       [0.44894853, 0.22869397, 0.3223575 ],
       [0.59936659, 0.06571655, 0.33491686],
       ...,
       [0.44417888, 0.2322533 , 0.32356782],
       [0.42677458, 0.09362354, 0.47960188],
       [0.56401263, 0.1788846 , 0.25710277]])

In [19]:
#Accuracy: number of correct predictions over the number of total instances that have been evaluated.

print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.54


In [20]:
confusion_matrix(y_train, y_pred)

array([[478997,     84,  39932],
       [109142,    157,   8205],
       [298054,     91,  45570]])

In [22]:
labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns={1:'fastball', 2:'offspeed', 3:'breaking ball'})

Unnamed: 0,1,2,3
1,478997,84,39932
2,109142,157,8205
3,298054,91,45570


In [23]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           1       0.54      0.92      0.68    519013
           2       0.47      0.00      0.00    117504
           3       0.49      0.13      0.21    343715

    accuracy                           0.54    980232
   macro avg       0.50      0.35      0.30    980232
weighted avg       0.51      0.54      0.43    980232



In [24]:
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))

Accuracy of Decision Tree classifier on test set: 0.53


#### Random forrest

In [28]:
#Create the Random Forest Object
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=1000,
                            max_depth=10, 
                            random_state=123)

In [29]:
#Fit the model to the training data

rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=123,
                       verbose=0, warm_start=False)

In [30]:
#Print Feature Importances

print(rf.feature_importances_)

[0.24098976 0.12521221 0.23318104 0.02288873 0.03540642 0.02435812
 0.06707075 0.07887758 0.17201539]


In [31]:
#Estimate whether pitch is fastball, offspeed, and breaking ball

y_pred = rf.predict(X_train)

In [32]:
#Estimate the probability of a fastball, offspeed, and breaking ball

y_pred_proba = rf.predict_proba(X_train)

In [33]:
#Evaluate Model

#Compute the Accuracy

print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

Accuracy of random forest classifier on training set: 0.54


In [35]:
#Create a confusion matrix

print(confusion_matrix(y_train, y_pred))

[[495405      0  23608]
 [112869      3   4632]
 [314052      0  29663]]


In [37]:
#Create a classificaiton report

print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           1       0.54      0.95      0.69    519013
           2       1.00      0.00      0.00    117504
           3       0.51      0.09      0.15    343715

    accuracy                           0.54    980232
   macro avg       0.68      0.35      0.28    980232
weighted avg       0.58      0.54      0.42    980232



In [None]:
#Test Model

#Compute the accuracy of the model when run on the test data

print('Accuracy of random forest classifier on test set: {:.2f}'
     .format(rf.score(X_test, y_test)))