In [28]:
# Import pandas
import pandas as pd

# Import model dependencies
from sklearn import tree
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import GradientBoostingClassifier

# Import pickle
import pickle

# Import sqlalchemy and sqlite dependencies
import sqlalchemy
import sqlite3 as sq
from sqlalchemy import create_engine, func
from sqlalchemy.orm import Session
from sqlalchemy.ext.automap import automap_base
import psycopg2

# Create global seed
yogi = 8

':'

In [4]:
# Combine 'ball' and 'in play' values into 'not strike' bin
not_strike = ['B', 'X']

pitchers_df['type'].replace(not_strike, 'N', inplace = True)

In [8]:
# Drop superfluous columns 
pitchers_df.drop(columns = ['stand_L', 'p_throws_L'], axis = 1, inplace = True)
pitchers_df.head()


Unnamed: 0,release_speed,player_name,zone,type,balls,strikes,inning,release_spin_rate,release_extension,pitch_number,pitch_type_CH,pitch_type_CU,pitch_type_FC,pitch_type_FF,pitch_type_FT,pitch_type_KC,pitch_type_SI,pitch_type_SL,stand_R,p_throws_R
0,80.8,"Bauer, Trevor",14.0,S,0,2,8,2881.0,6.4,3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,79.6,"Bauer, Trevor",13.0,S,0,1,8,2842.0,6.2,2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,78.1,"Bauer, Trevor",7.0,S,0,0,8,2866.0,6.2,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,79.5,"Bauer, Trevor",3.0,N,0,0,8,2793.0,6.2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
4,77.8,"Bauer, Trevor",14.0,S,1,2,8,3061.0,6.4,4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


### Compiling, Training, and Testing Data

In [9]:
# Define features set
X = pitchers_df.copy()
X.drop(columns = ['player_name', 'type'], axis = 1, inplace = True)
X.head()

Unnamed: 0,release_speed,zone,balls,strikes,inning,release_spin_rate,release_extension,pitch_number,pitch_type_CH,pitch_type_CU,pitch_type_FC,pitch_type_FF,pitch_type_FT,pitch_type_KC,pitch_type_SI,pitch_type_SL,stand_R,p_throws_R
0,80.8,14.0,0,2,8,2881.0,6.4,3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,79.6,13.0,0,1,8,2842.0,6.2,2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,78.1,7.0,0,0,8,2866.0,6.2,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,79.5,3.0,0,0,8,2793.0,6.2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
4,77.8,14.0,1,2,8,3061.0,6.4,4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [10]:
# Define target vector
y = pitchers_df['type'].values
y[:5]

array(['S', 'S', 'S', 'N', 'S'], dtype=object)

In [11]:
# Split the data into training and testing sets - stratify by pitcher
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = yogi)

In [12]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit Standard Scaler 
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Find Best Learning Rate

In [13]:
# Create a classifier object
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
best_rate = 0
best_acc = 0

for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(
        n_estimators = 20,
        learning_rate = learning_rate,
        max_features = 7,
        max_depth = 4,
        random_state = yogi)
    
    # Fit the model
    classifier.fit(X_train_scaled, y_train)
    print(f'Learning Rate: {learning_rate}')

    # Score the model
    print('Accuracy Score (training): {0:.3f}'.format(
        classifier.score(
            X_train_scaled,
            y_train
        )
    ))
    print('Accuracy Score (validation): {0:.3f}'.format(
        classifier.score(
            X_test_scaled,
            y_test
        )
    ))
    print()

    if classifier.score(X_test_scaled, y_test) > best_acc:
        best_acc = classifier.score(X_test_scaled, y_test)
        best_rate = learning_rate

Learning Rate: 0.05
Accuracy Score (training): 0.702
Accuracy Score (validation): 0.713

Learning Rate: 0.1
Accuracy Score (training): 0.702
Accuracy Score (validation): 0.713

Learning Rate: 0.25
Accuracy Score (training): 0.705
Accuracy Score (validation): 0.714

Learning Rate: 0.5
Accuracy Score (training): 0.710
Accuracy Score (validation): 0.711

Learning Rate: 0.75
Accuracy Score (training): 0.711
Accuracy Score (validation): 0.710

Learning Rate: 1
Accuracy Score (training): 0.714
Accuracy Score (validation): 0.700



### Create model with best learning rate

In [14]:
# Choose a learning rate and create classifiers
classifier = GradientBoostingClassifier(
    n_estimators = 20,
    learning_rate = best_rate,
    max_features = 7,
    max_depth = 3,
    random_state = yogi
)

# Fit the model
classifier.fit(X_train_scaled, y_train)

# Make predictions
predictions = classifier.predict(X_test_scaled)
pd.DataFrame({'Prediction': predictions, 'Actual': y_test}).head(20)

Unnamed: 0,Prediction,Actual
0,N,N
1,N,N
2,N,S
3,S,N
4,S,S
5,S,S
6,S,S
7,N,N
8,N,N
9,N,N


### Evaluate Model

In [15]:
# Calculate accuracy score
acc_score = accuracy_score(y_test, predictions)
print(f'Accuracy Score: {acc_score}')

Accuracy Score: 0.7128953771289538


In [16]:
# Generate classification report
print('Classification Report')
print(classification_report(y_test, predictions))

Classification Report
              precision    recall  f1-score   support

           N       0.70      0.73      0.71      4207
           S       0.73      0.69      0.71      4424

    accuracy                           0.71      8631
   macro avg       0.71      0.71      0.71      8631
weighted avg       0.71      0.71      0.71      8631



In [17]:
# Save model file
filename = '../Saved/all_pitchers_gbc_pitch.sav'
pickle.dump(classifier, open(filename, 'wb'))