In [1]:
import warnings

warnings.filterwarnings("ignore")
import pyspark
import pyspark.sql
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = SparkSession.builder.getOrCreate()

import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
dftrain=pd.read_csv('dftrain.csv')

In [3]:
dftrain.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Pitch_Type,type_of_pitch,Pitcher_name,Batter_name,inning,run_difference,pitcher_score,batter_score,...,on_1b,on_2b,on_3b,pitcher_throws,batter_stance,weather,pitcher_id,batter_id,top,year
0,0,0,1,FF,Jon Lester,Matt Carpenter,1,0,0,0,...,0,0,0,L,L,"44 degrees, clear",452657,572761,True,2015
1,1,1,1,FF,Jon Lester,Matt Carpenter,1,0,0,0,...,0,0,0,L,L,"44 degrees, clear",452657,572761,True,2015
2,2,2,1,FF,Jon Lester,Matt Carpenter,1,0,0,0,...,0,0,0,L,L,"44 degrees, clear",452657,572761,True,2015
3,3,3,1,FF,Jon Lester,Matt Carpenter,1,0,0,0,...,0,0,0,L,L,"44 degrees, clear",452657,572761,True,2015
4,4,4,3,CU,Jon Lester,Matt Carpenter,1,0,0,0,...,0,0,0,L,L,"44 degrees, clear",452657,572761,True,2015


In [4]:
dfruncount= dftrain[['Pitch_Type','ball_count', 'strike_count', 'on_1b', 'on_2b', 'on_3b']]

In [5]:
dfruncount.head()

Unnamed: 0,Pitch_Type,ball_count,strike_count,on_1b,on_2b,on_3b
0,1,0,0,0,0,0
1,1,0,1,0,0,0
2,1,0,2,0,0,0
3,1,0,2,0,0,0
4,3,1,2,0,0,0


In [6]:
corr_matrix= dfruncount.corr()
#looking at each attribute's correlation
corr_matrix['Pitch_Type'].sort_values(ascending=False)

Pitch_Type      1.000000
strike_count    0.078061
on_2b           0.023975
on_3b           0.016815
on_1b          -0.009317
ball_count     -0.054307
Name: Pitch_Type, dtype: float64

In [7]:
dfruncount.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1400332 entries, 0 to 1400331
Data columns (total 6 columns):
Pitch_Type      1400332 non-null int64
ball_count      1400332 non-null int64
strike_count    1400332 non-null int64
on_1b           1400332 non-null int64
on_2b           1400332 non-null int64
on_3b           1400332 non-null int64
dtypes: int64(6)
memory usage: 64.1 MB


In [9]:
#Split Data
X= dfruncount[['ball_count', 'strike_count', 'on_1b', 'on_2b', 'on_3b']]
y=dfruncount['Pitch_Type']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)

X_train.head()

Unnamed: 0,ball_count,strike_count,on_1b,on_2b,on_3b
780086,3,2,1,1,0
578325,0,1,0,0,0
705263,2,1,0,0,0
1047168,1,0,0,0,0
1212563,1,1,0,0,0


In [10]:
#Train Model

#Create the logistic regression object


logit = LogisticRegression(C=1, random_state = 123, class_weight= 'balanced', multi_class='multinomial', solver='newton-cg')

In [11]:
#Fit the model to the training data

logit.fit(X_train, y_train)

LogisticRegression(C=1, class_weight='balanced', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=123, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
#Print the coefficients and intercept of the model

print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[ 0.10889827 -0.19405876  0.04016246 -0.07996727 -0.03746403]
 [ 0.02475743  0.07867264 -0.01070763  0.01472636 -0.04522622]
 [-0.1336557   0.11538612 -0.02945483  0.06524091  0.08269025]]
Intercept: 
 [ 0.08352965 -0.08382235  0.0002927 ]


In [13]:
#Estimate pitch 

y_pred = logit.predict(X_train)

In [14]:
#Estimate the probability each type of pitch

y_pred_proba = logit.predict_proba(X_train)

In [15]:
#Evaluate Model

#Compute the accuracy

print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.42


In [16]:
#Create a confusion matrix

print(confusion_matrix(y_train, y_pred))

[[263842 127619 127552]
 [ 46768  37856  32880]
 [139537  98691 105487]]


In [17]:
#Compute Precision, Recall, F1-score, and Support

print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           1       0.59      0.51      0.54    519013
           2       0.14      0.32      0.20    117504
           3       0.40      0.31      0.35    343715

    accuracy                           0.42    980232
   macro avg       0.38      0.38      0.36    980232
weighted avg       0.47      0.42      0.43    980232



### will Include Score 

In [18]:
dfscore= dftrain[['Pitch_Type','ball_count', 'strike_count', 'on_1b', 'on_2b', 'on_3b', 'run_difference']]

In [20]:
corr_matrix= dfscore.corr()
#looking at each attribute's correlation
corr_matrix['Pitch_Type'].sort_values(ascending=False)

Pitch_Type        1.000000
strike_count      0.078061
on_2b             0.023975
on_3b             0.016815
on_1b            -0.009317
run_difference   -0.016543
ball_count       -0.054307
Name: Pitch_Type, dtype: float64

In [21]:
#Split Data
X= dfscore[['ball_count', 'strike_count', 'on_1b', 'on_2b', 'on_3b', 'run_difference']]
y=dfscore['Pitch_Type']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)

X_train.head()

Unnamed: 0,ball_count,strike_count,on_1b,on_2b,on_3b,run_difference
780086,3,2,1,1,0,-1
578325,0,1,0,0,0,-2
705263,2,1,0,0,0,4
1047168,1,0,0,0,0,1
1212563,1,1,0,0,0,1


In [22]:
#Split Data
X= dfruncount[['ball_count', 'strike_count', 'on_1b', 'on_2b', 'on_3b']]
y=dfruncount['Pitch_Type']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)

X_train.head()

Unnamed: 0,ball_count,strike_count,on_1b,on_2b,on_3b
780086,3,2,1,1,0
578325,0,1,0,0,0
705263,2,1,0,0,0
1047168,1,0,0,0,0
1212563,1,1,0,0,0


In [23]:
#Fit the model to the training data

logit.fit(X_train, y_train)

LogisticRegression(C=1, class_weight='balanced', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=123, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

In [24]:
#Print the coefficients and intercept of the model

print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[ 0.10889827 -0.19405876  0.04016246 -0.07996727 -0.03746403]
 [ 0.02475743  0.07867264 -0.01070763  0.01472636 -0.04522622]
 [-0.1336557   0.11538612 -0.02945483  0.06524091  0.08269025]]
Intercept: 
 [ 0.08352965 -0.08382235  0.0002927 ]


In [25]:
#Estimate pitch 

y_pred = logit.predict(X_train)

In [26]:
#Estimate the probability each type of pitch

y_pred_proba = logit.predict_proba(X_train)

In [27]:
#Evaluate Model

#Compute the accuracy

print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.42


In [28]:
print('Accuracy of Logistic Regression classifier on test set: {:.2f}'
     .format(logit.score(X_test, y_test)))

Accuracy of Logistic Regression classifier on test set: 0.42
