In [14]:
# Import pandas
import pandas as pd

# Import model dependencies
from sklearn import tree
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import GradientBoostingClassifier

# Import pickle
import pickle

# Import sqlalchemy and sqlite dependencies
import sqlalchemy
import sqlite3 as sq
from sqlalchemy import create_engine, func
from sqlalchemy.orm import Session
from sqlalchemy.ext.automap import automap_base
import psycopg2

# Import config information
from config import USER_NAME, PASSWORD


# Create global seed
yogi = 8

In [15]:
# Define engine path
engine_path = f'postgresql://{USER_NAME}:{PASSWORD}@team-country-music.czuzjwoakfyk.us-east-2.rds.amazonaws.com:5432/team_country_music'

# Create Engine
engine = create_engine(engine_path)

# Create Base
Base = automap_base()

# Reflect
Base.prepare(engine, reflect = True)

# Save Base Class
Pitching = Base.classes.pitching_data

# Create Session
session = Session(engine)


In [16]:
# Query Pitching Data
results = []
results = session.query(
    Pitching.ID,
    Pitching.release_speed,
    Pitching.zone,
    Pitching.type,
    Pitching.balls,
    Pitching.strikes,
    Pitching.inning,
    Pitching.release_spin_rate,
    Pitching.release_extension,
    Pitching.pitch_number,
    Pitching.pitch_type_CH,
    Pitching.pitch_type_CS,
    Pitching.pitch_type_CU,
    Pitching.pitch_type_EP,
    Pitching.pitch_type_FA,
    Pitching.pitch_type_FC,
    Pitching.pitch_type_FF,
    Pitching.pitch_type_FO,
    Pitching.pitch_type_FS,
    Pitching.pitch_type_FT,
    Pitching.pitch_type_KC,
    Pitching.pitch_type_KN,
    Pitching.pitch_type_SI,
    Pitching.pitch_type_SL,
    Pitching.pitch_type_nan,
    Pitching.stand_R,
    Pitching.p_throws_L,
    Pitching.p_throws_R
).all()

# Create DataFrame
pitchers_df = pd.DataFrame(
    results, 
    columns = [
        'ID',
        'release_speed',
        'zone',
        'type',
        'balls',
        'strikes',
        'inning',
        'release_spin_rate',
        'release_extension',
        'pitch_number',
        'pitch_type_CH',
        'pitch_type_CS',
        'pitch_type_CU',
        'pitch_type_EP',
        'pitch_type_FA',
        'pitch_type_FC',
        'pitch_type_FF',
        'pitch_type_FO',
        'pitch_type_FS',
        'pitch_type_FT',
        'pitch_type_KC',
        'pitch_type_KN',
        'pitch_type_SI',
        'pitch_type_SL',
        'pitch_type_nan',
        'stand_r',
        'p_throws_L',
        'p_throws_R'
    ]
)

In [17]:
# Combine 'ball' and 'in play' values into 'not strike' bin
not_strike = ['B', 'X']

pitchers_df['type'].replace(not_strike, 'N', inplace = True)

In [18]:
# Drop rows with NAs
pitchers_df.dropna(inplace = True)

### Compiling, Training, and Testing Data

In [19]:
# Define features set
X = pitchers_df.copy()
X.drop(columns = ['type', 'ID', 'p_throws_L'], axis = 1, inplace = True)
X.head()

Unnamed: 0,release_speed,zone,balls,strikes,inning,release_spin_rate,release_extension,pitch_number,pitch_type_CH,pitch_type_CS,...,pitch_type_FO,pitch_type_FS,pitch_type_FT,pitch_type_KC,pitch_type_KN,pitch_type_SI,pitch_type_SL,pitch_type_nan,stand_r,p_throws_R
0,80.1,7.0,1,2,3,2243.0,7.0,4,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,82.1,4.0,1,0,4,1555.0,6.4,2,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,85.6,7.0,1,1,3,1839.0,7.0,3,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,75.5,11.0,0,0,4,1783.0,6.3,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,87.4,13.0,0,1,3,1908.0,6.9,2,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [20]:
# Define target vector
y = pitchers_df['type'].values
y[:5]

array(['N', 'S', 'S', 'N', 'N'], dtype=object)

In [21]:
# Split the data into training and testing sets - stratify by pitcher
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = yogi)

### Find Best Learning Rate

In [22]:
# Create a classifier object
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
best_rate = 0
best_acc = 0

for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(
        n_estimators = 20,
        learning_rate = learning_rate,
        max_features = 7,
        max_depth = 4,
        random_state = yogi)
    
    # Fit the model
    classifier.fit(X_train, y_train)
    print(f'Learning Rate: {learning_rate}')

    # Score the model
    print('Accuracy Score (training): {0:.3f}'.format(
        classifier.score(
            X_train,
            y_train
        )
    ))
    print('Accuracy Score (validation): {0:.3f}'.format(
        classifier.score(
            X_test,
            y_test
        )
    ))
    print()

    if classifier.score(X_test, y_test) > best_acc:
        best_acc = classifier.score(X_test, y_test)
        best_rate = learning_rate

Learning Rate: 0.05
Accuracy Score (training): 0.710
Accuracy Score (validation): 0.717

Learning Rate: 0.1
Accuracy Score (training): 0.711
Accuracy Score (validation): 0.717

Learning Rate: 0.25
Accuracy Score (training): 0.712
Accuracy Score (validation): 0.716

Learning Rate: 0.5
Accuracy Score (training): 0.716
Accuracy Score (validation): 0.716

Learning Rate: 0.75
Accuracy Score (training): 0.718
Accuracy Score (validation): 0.712

Learning Rate: 1
Accuracy Score (training): 0.720
Accuracy Score (validation): 0.708



### Create model with best learning rate

In [23]:
# Choose a learning rate and create classifiers
classifier = GradientBoostingClassifier(
    n_estimators = 20,
    learning_rate = best_rate,
    max_features = 7,
    max_depth = 3,
    random_state = yogi
)

# Fit the model
classifier.fit(X_train, y_train)

# Make predictions
predictions = classifier.predict(X_test)
pd.DataFrame({'Prediction': predictions, 'Actual': y_test}).head(20)

Unnamed: 0,Prediction,Actual
0,S,S
1,S,S
2,N,N
3,S,S
4,S,S
5,S,N
6,S,S
7,S,N
8,S,S
9,S,N


### Evaluate Model

In [24]:
# Calculate accuracy score
acc_score = accuracy_score(y_test, predictions)
print(f'Accuracy Score: {acc_score}')

Accuracy Score: 0.716760061130922


In [25]:
# Generate classification report
print('Classification Report')
print(classification_report(y_test, predictions))

Classification Report
              precision    recall  f1-score   support

           N       0.73      0.72      0.72      4060
           S       0.70      0.71      0.71      3792

    accuracy                           0.72      7852
   macro avg       0.72      0.72      0.72      7852
weighted avg       0.72      0.72      0.72      7852



In [26]:
# Save model file
filename = '../Saved/all_pitchers_gbc_pitch.sav'
pickle.dump(classifier, open(filename, 'wb'))