In [29]:
# Notebook setup: imports, warning policy, Spark session and plotting backend.
import warnings

# Silences every warning for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings raised by the
# libraries imported below — confirm that blanket suppression is intended.
warnings.filterwarnings("ignore")
import pyspark
import pyspark.sql
# NOTE(review): wildcard import; pyspark.sql.functions exports names such as
# sum/min/max/round, which then shadow the Python builtins of the same name.
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
# NOTE(review): repeats the wildcard import from a few lines above.
from pyspark.sql.functions import *

# Get (or create) the Spark session. `spark` is not referenced by any of the
# cells visible in this section of the notebook.
spark = SparkSession.builder.getOrCreate()

import numpy as np
import pandas as pd

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# scikit-learn pieces used below: data splitting, the classifier, and metrics.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [18]:
# Load the training data; the path is relative to the working directory.
dftrain = pd.read_csv(filepath_or_buffer='dftrain.csv')

In [19]:
# Keep only the pitch label and the ball/strike count columns for this analysis.
dfcount = dftrain.loc[:, ['Pitch_Type', 'ball_count', 'strike_count']]

In [25]:
# Column totals for each ball count.
# NOTE(review): Pitch_Type is used as the classification target further down,
# so its summed value is presumably not meaningful on its own — a
# pd.crosstab of ball_count vs. Pitch_Type may be what was intended; confirm.
# NOTE(review): the stored output contains a ball_count == 4 group; verify
# whether those rows are valid data.
dfcount.groupby(by='ball_count').sum()

Unnamed: 0_level_0,Pitch_Type,strike_count
ball_count,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1171636,362823
1,780656,408577
2,417927,297488
3,179815,163445
4,3,5


In [28]:
# Pairwise correlations between the three columns.
corr_matrix = dfcount.corr()
# Rank every column by its correlation with the pitch label, strongest first.
# NOTE(review): Pitch_Type is treated as a class label by the model below, so
# a linear correlation against its numeric codes should be read with care.
corr_matrix.loc[:, 'Pitch_Type'].sort_values(ascending=False)

Pitch_Type      1.000000
strike_count    0.078061
ball_count     -0.054307
Name: Pitch_Type, dtype: float64

In [30]:
# Summarize the working frame: column dtypes, non-null counts, memory usage.
dfcount.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1400332 entries, 0 to 1400331
Data columns (total 3 columns):
Pitch_Type      1400332 non-null int64
ball_count      1400332 non-null int64
strike_count    1400332 non-null int64
dtypes: int64(3)
memory usage: 32.1 MB


In [31]:
# Split data
# Features are the ball/strike count; the target is the pitch label.
X = dfcount.loc[:, ['ball_count', 'strike_count']]
y = dfcount.loc[:, 'Pitch_Type']

# Hold out 30% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=123,
)

# Peek at the first training rows.
X_train.head(n=5)

Unnamed: 0,ball_count,strike_count
780086,3,2
578325,0,1
705263,2,1
1047168,1,0
1212563,1,1


In [32]:
# Train model

# Build the (not yet fitted) multinomial logistic regression estimator.
# class_weight='balanced' is used because the pitch classes differ in size
# (see the support column of the classification report below).
# NOTE(review): newer scikit-learn releases deprecate the multi_class
# argument, and the warning is hidden by the filterwarnings("ignore") call in
# the setup cell — check against the installed version before upgrading.
logit = LogisticRegression(
    solver='newton-cg',
    multi_class='multinomial',
    class_weight='balanced',
    C=1,
    random_state=123,
)

In [34]:
# Fit the model to the training data (the fitted estimator is displayed).
logit.fit(X=X_train, y=y_train)

LogisticRegression(C=1, class_weight='balanced', dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=123, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

In [35]:
# Print the coefficients and intercept of the model.
# coef_ has one row per pitch class and one column per feature
# (ball_count, strike_count); intercept_ has one entry per class.
for _label, _values in (
    ('Coefficient: \n', logit.coef_),
    ('Intercept: \n', logit.intercept_),
):
    print(_label, _values)

Coefficient: 
 [[ 0.10721616 -0.19362715]
 [ 0.02473169  0.07908288]
 [-0.13194786  0.11454427]]
Intercept: 
 [ 0.0784202  -0.08929525  0.01087505]


In [36]:
# Estimate pitch: predicted class label for every training row.
# NOTE(review): these are in-sample predictions; the X_test produced by the
# split is not used here.
y_pred = logit.predict(X=X_train)

In [37]:
# Estimate the probability of each pitch type for every training row.
y_pred_proba = logit.predict_proba(X=X_train)

In [38]:
# Evaluate model

# Compute the accuracy on the training rows and on the held-out rows.
# The split above reserves 30% of the data as X_test / y_test; reporting only
# the training score would leave that hold-out unused and give no estimate of
# how the model performs on unseen pitches.
train_accuracy = logit.score(X_train, y_train)
test_accuracy = logit.score(X_test, y_test)

print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(train_accuracy))
print('Accuracy of Logistic Regression classifier on test set: {:.2f}'
     .format(test_accuracy))

Accuracy of Logistic Regression classifier on training set: 0.43


In [39]:
# Create a confusion matrix for the training-set predictions.
# Rows are the true pitch types and columns the predicted ones.
print(confusion_matrix(y_true=y_train, y_pred=y_pred))

[[274370 112756 131887]
 [ 48340  32759  36405]
 [148753  79987 114975]]


In [40]:
# Compute precision, recall, F1-score, and support per pitch type
# for the training-set predictions.
print(classification_report(y_true=y_train, y_pred=y_pred))

              precision    recall  f1-score   support

           1       0.58      0.53      0.55    519013
           2       0.15      0.28      0.19    117504
           3       0.41      0.33      0.37    343715

    accuracy                           0.43    980232
   macro avg       0.38      0.38      0.37    980232
weighted avg       0.47      0.43      0.44    980232

