In [15]:
import pandas as pd
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
pd.set_option('display.max_columns', None)

In [2]:
dftrain= pd.read_csv('dftrain.csv')

In [3]:
dfscore= dftrain[['Pitch_Type','ball_count', 'strike_count', 'on_1b', 'on_2b', 'on_3b', 'run_difference']]

In [4]:
#Split Data
X= dfscore[['ball_count', 'strike_count', 'on_1b', 'on_2b', 'on_3b', 'run_difference']]
y=dfscore['Pitch_Type']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)

X_train.head()

Unnamed: 0,ball_count,strike_count,on_1b,on_2b,on_3b,run_difference
780086,3,2,1,1,0,-1
578325,0,1,0,0,0,-2
705263,2,1,0,0,0,4
1047168,1,0,0,0,0,1
1212563,1,1,0,0,0,1


In [5]:
#Train Model

#Create the Decision Tree Object

# for classification you can change the algorithm to gini or entropy (information gain).  Default is gini.
clf = DecisionTreeClassifier(criterion='gini', random_state=43, splitter= 'best', max_depth= 10  )

In [6]:
#Fit the model to the training data

clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=43, splitter='best')

In [7]:
#guess pitch

y_pred = clf.predict(X_train)
y_pred[0:5]

array([1, 1, 1, 1, 1])

In [8]:
#Estimate the probability of a pitch type

y_pred_proba = clf.predict_proba(X_train)
y_pred_proba

array([[0.58910891, 0.12871287, 0.28217822],
       [0.44670355, 0.15592439, 0.39737206],
       [0.60083309, 0.11749497, 0.28167193],
       ...,
       [0.45289266, 0.14929866, 0.39780868],
       [0.40409207, 0.15473146, 0.44117647],
       [0.61392494, 0.10926919, 0.27680587]])

In [9]:
#Accuracy: number of correct predictions over the number of total instances that have been evaluated.

print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.53


In [10]:
confusion_matrix(y_train, y_pred)

array([[502121,     12,  16880],
       [112050,     66,   5388],
       [325286,     19,  18410]])

In [11]:
import pandas as pd

labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)


Unnamed: 0,1,2,3
1,502121,12,16880
2,112050,66,5388
3,325286,19,18410


In [12]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           1       0.53      0.97      0.69    519013
           2       0.68      0.00      0.00    117504
           3       0.45      0.05      0.10    343715

    accuracy                           0.53    980232
   macro avg       0.56      0.34      0.26    980232
weighted avg       0.52      0.53      0.40    980232



In [13]:
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))

Accuracy of Decision Tree classifier on test set: 0.53


In [30]:
###Add More Features outs and inning
dfx= dftrain[['inning', 'outs']]
df=dfscore.append(dfx)

In [31]:
df.head()

Unnamed: 0,Pitch_Type,ball_count,inning,on_1b,on_2b,on_3b,outs,run_difference,strike_count
0,1.0,0.0,,0.0,0.0,0.0,,0.0,0.0
1,1.0,0.0,,0.0,0.0,0.0,,0.0,1.0
2,1.0,0.0,,0.0,0.0,0.0,,0.0,2.0
3,1.0,0.0,,0.0,0.0,0.0,,0.0,2.0
4,3.0,1.0,,0.0,0.0,0.0,,0.0,2.0


In [32]:
df=  pd.concat([dfscore, dfx], axis=1)

In [35]:
corr_matrix= df.corr()
#looking at each attribute's correlation
corr_matrix['Pitch_Type'].sort_values(ascending=False)

Pitch_Type        1.000000
strike_count      0.078061
inning            0.039954
outs              0.024042
on_2b             0.023975
on_3b             0.016815
on_1b            -0.009317
run_difference   -0.016543
ball_count       -0.054307
Name: Pitch_Type, dtype: float64

In [37]:
#Split Data
X= df[['ball_count', 'strike_count', 'on_1b', 'on_2b', 'on_3b', 'run_difference', 'inning', 'outs']]
y=df['Pitch_Type']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)

X_train.head()

Unnamed: 0,ball_count,strike_count,on_1b,on_2b,on_3b,run_difference,inning,outs
780086,3,2,1,1,0,-1,8,2
578325,0,1,0,0,0,-2,5,2
705263,2,1,0,0,0,4,8,0
1047168,1,0,0,0,0,1,2,0
1212563,1,1,0,0,0,1,7,0


In [38]:
#Train Model

#Create the Decision Tree Object

# for classification you can change the algorithm to gini or entropy (information gain).  Default is gini.
clf = DecisionTreeClassifier(criterion='gini', random_state=43, splitter= 'best', max_depth= 12  )

In [39]:
#Fit the model to the training data

clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=12,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=43, splitter='best')

In [40]:
#guess pitch

y_pred = clf.predict(X_train)
y_pred[0:5]

array([1, 1, 1, 1, 1])

In [41]:
#Estimate the probability of a pitch type

y_pred_proba = clf.predict_proba(X_train)
y_pred_proba

array([[0.58356941, 0.09915014, 0.31728045],
       [0.44223946, 0.17277046, 0.38499008],
       [0.65346535, 0.06883545, 0.2776992 ],
       ...,
       [0.45281124, 0.13679719, 0.41039157],
       [0.40344828, 0.14827586, 0.44827586],
       [0.6264308 , 0.11654527, 0.25702393]])

In [42]:
#Accuracy: number of correct predictions over the number of total instances that have been evaluated.

print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))

Accuracy of Decision Tree classifier on training set: 0.53


In [43]:
confusion_matrix(y_train, y_pred)

array([[481334,     92,  37587],
       [105956,    311,  11237],
       [301767,    104,  41844]])

In [44]:
import pandas as pd

labels = sorted(y_train.unique())

pd.DataFrame(confusion_matrix(y_train, y_pred), index=labels, columns=labels)



Unnamed: 0,1,2,3
1,481334,92,37587
2,105956,311,11237
3,301767,104,41844


In [45]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           1       0.54      0.93      0.68    519013
           2       0.61      0.00      0.01    117504
           3       0.46      0.12      0.19    343715

    accuracy                           0.53    980232
   macro avg       0.54      0.35      0.29    980232
weighted avg       0.52      0.53      0.43    980232



In [46]:
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))

Accuracy of Decision Tree classifier on test set: 0.53


In [None]:
###No Improvement with more features