In [32]:
# Import standard library dependencies
import os
import pickle
import sqlite3 as sq

# Import pandas
import pandas as pd

# Import sqlalchemy and database driver dependencies
import psycopg2
import sqlalchemy
from sqlalchemy import create_engine, func
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session

# Import model dependencies
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Global random seed: passed as `random_state` to train_test_split and to every
# GradientBoostingClassifier in this notebook so the split and the fitted models
# are repeatable between runs.
yogi = 8

In [33]:
# Define engine path
# The full SQLAlchemy URL (user, password, host, port, database) is read from the
# PITCHING_DB_URL environment variable so that credentials are never stored in
# the notebook. Expected shape:
#   postgresql://<user>:<password>@<host>:5432/<database>
# NOTE(review): the password that used to be embedded in this cell should be
# treated as exposed and rotated.
engine_path = os.environ.get('PITCHING_DB_URL')
if not engine_path:
    raise RuntimeError(
        'Set the PITCHING_DB_URL environment variable to the SQLAlchemy '
        'database URL before running this notebook.'
    )

# Create Engine
engine = create_engine(engine_path)

# Create Base
Base = automap_base()

# Reflect the existing tables into mapped classes
# NOTE(review): `reflect = True` is the legacy calling form; newer SQLAlchemy
# releases expect `Base.prepare(autoload_with = engine)` - confirm against the
# installed version before upgrading.
Base.prepare(engine, reflect = True)

# Save Base Class mapped to the `pitching_data` table
Pitching = Base.classes.pitching_data

# Create Session (reused by the query cell that follows)
session = Session(engine)


In [34]:
# Columns to read from the reflected pitching_data table, in the order they
# should appear in the DataFrame. Declared once so the query and the frame
# labels cannot drift apart.
pitching_columns = [
    'ID',
    'release_speed',
    'zone',
    'balls',
    'strikes',
    'inning',
    'release_spin_rate',
    'release_extension',
    'pitch_number',
    'pitch_type_CH',
    'pitch_type_CS',
    'pitch_type_CU',
    'pitch_type_EP',
    'pitch_type_FA',
    'pitch_type_FC',
    'pitch_type_FF',
    'pitch_type_FO',
    'pitch_type_FS',
    'pitch_type_FT',
    'pitch_type_KC',
    'pitch_type_KN',
    'pitch_type_SI',
    'pitch_type_SL',
    'pitch_type_nan',
    'stand_R',
    'p_throws_L',
    'p_throws_R'
]

# Query Pitching Data: one mapped attribute per requested column
results = session.query(
    *(getattr(Pitching, column) for column in pitching_columns)
).all()

# Create DataFrame
# The table attribute is `stand_R`, but this notebook has always labelled the
# frame column `stand_r`; that label is kept so the feature names seen by the
# model do not change.
frame_columns = [
    'stand_r' if column == 'stand_R' else column
    for column in pitching_columns
]
pitchers_df = pd.DataFrame(results, columns = frame_columns)

In [35]:
# Keep only rows with no missing values; rebinding the name (instead of
# mutating in place) keeps this cell safe to re-run.
pitchers_df = pitchers_df.dropna()

### Compiling, Training, and Testing Data

In [36]:
# Define features set: every column of pitchers_df except the target `zone`.
# `drop` returns a new frame, so pitchers_df itself is left untouched.
# NOTE(review): `ID` stays in the feature matrix; it appears to be a row
# identifier rather than a property of the pitch - confirm it is meant to be
# a model input.
X = pitchers_df.drop(columns = ['zone'])
X.head()

Unnamed: 0,ID,release_speed,balls,strikes,inning,release_spin_rate,release_extension,pitch_number,pitch_type_CH,pitch_type_CS,...,pitch_type_FS,pitch_type_FT,pitch_type_KC,pitch_type_KN,pitch_type_SI,pitch_type_SL,pitch_type_nan,stand_r,p_throws_L,p_throws_R
0,18993,80.1,1,2,3,2243.0,7.0,4,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,18994,82.1,1,0,4,1555.0,6.4,2,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,18995,85.6,1,1,3,1839.0,7.0,3,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,18996,75.5,0,0,4,1783.0,6.3,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,18997,87.4,0,1,3,1908.0,6.9,2,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [37]:
# Define target vector: the `zone` column as a NumPy array
y = pitchers_df['zone'].to_numpy()
y[:5]

array([ 7.,  4.,  7., 11., 13.])

In [38]:
# Split the data into training and testing sets.
# Stratifying on y keeps the zone proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify = y,
    random_state = yogi
)

### Find Best Learning Rate

In [39]:
# Try each candidate learning rate and remember the one with the highest
# accuracy on the held-out split.
# Note: the split reported below as "validation" is X_test / y_test, the same
# data later used for the final evaluation, so the final accuracy is
# optimistically biased by this selection step.
# NOTE(review): the search uses max_depth = 4 while the final model cell uses
# max_depth = 3 - confirm which depth is intended.
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
best_rate = 0
best_acc = 0

for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(
        n_estimators = 20,
        learning_rate = learning_rate,
        max_features = 7,
        max_depth = 4,
        random_state = yogi)

    # Fit the model
    classifier.fit(X_train, y_train)
    print(f'Learning Rate: {learning_rate}')

    # Score the model once per split and reuse the values below, instead of
    # re-running prediction over the test set for every comparison.
    train_acc = classifier.score(X_train, y_train)
    test_acc = classifier.score(X_test, y_test)

    print('Accuracy Score (training): {0:.3f}'.format(train_acc))
    print('Accuracy Score (validation): {0:.3f}'.format(test_acc))
    print()

    # Strict '>' keeps the earliest (smallest) rate when two rates tie
    if test_acc > best_acc:
        best_acc = test_acc
        best_rate = learning_rate

Learning Rate: 0.05
Accuracy Score (training): 0.233
Accuracy Score (validation): 0.219

Learning Rate: 0.1
Accuracy Score (training): 0.264
Accuracy Score (validation): 0.228

Learning Rate: 0.25
Accuracy Score (training): 0.295
Accuracy Score (validation): 0.231

Learning Rate: 0.5
Accuracy Score (training): 0.307
Accuracy Score (validation): 0.221

Learning Rate: 0.75
Accuracy Score (training): 0.312
Accuracy Score (validation): 0.219

Learning Rate: 1
Accuracy Score (training): 0.313
Accuracy Score (validation): 0.212



### Create Model with Best Learning Rate

In [40]:
# Build and fit the final classifier with the learning rate selected by the
# search loop (`best_rate`). `fit` returns the estimator itself, so the fitted
# model is bound to `classifier` directly.
# NOTE(review): max_depth is 3 here, whereas the learning-rate search fitted
# its models with max_depth = 4 - confirm which depth is intended.
classifier = GradientBoostingClassifier(
    random_state = yogi,
    max_depth = 3,
    max_features = 7,
    learning_rate = best_rate,
    n_estimators = 20
).fit(X_train, y_train)

# Predict the zone for every held-out row
predictions = classifier.predict(X_test)

# Show the first 20 predictions next to the true zones
pd.DataFrame(
    {
        'Prediction': predictions,
        'Actual': y_test
    }
).head(20)

Unnamed: 0,Prediction,Actual
0,13.0,4.0
1,11.0,4.0
2,14.0,11.0
3,11.0,11.0
4,14.0,12.0
5,14.0,7.0
6,14.0,13.0
7,11.0,12.0
8,14.0,9.0
9,11.0,14.0


### Evaluate Model

In [41]:
# Fraction of held-out rows whose predicted zone matches the true zone
acc_score = accuracy_score(
    y_true = y_test,
    y_pred = predictions
)
print(f'Accuracy Score: {acc_score}')

Accuracy Score: 0.22758532857870606


In [42]:
# Per-zone precision / recall / F1 on the held-out set
report_text = classification_report(
    y_true = y_test,
    y_pred = predictions
)
print('Classification Report')
print(report_text)

Classification Report
              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00       319
         2.0       0.00      0.00      0.00       385
         3.0       0.10      0.01      0.01       291
         4.0       0.20      0.00      0.01       439
         5.0       0.10      0.01      0.01       561
         6.0       0.18      0.01      0.01       489
         7.0       0.00      0.00      0.00       346
         8.0       0.00      0.00      0.00       492
         9.0       0.08      0.00      0.01       486
        11.0       0.15      0.31      0.20       842
        12.0       0.16      0.14      0.15       620
        13.0       0.25      0.26      0.25       928
        14.0       0.27      0.72      0.39      1654

    accuracy                           0.23      7852
   macro avg       0.11      0.11      0.08      7852
weighted avg       0.15      0.23      0.15      7852



In [43]:
# Save model file
# The `with` block guarantees the file handle is flushed and closed even if
# pickling fails part-way through.
# Reminder: only load this file back with pickle from a trusted location,
# since unpickling can execute arbitrary code.
filename = '../Saved/all_pitchers_gbc_pitch.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)