In [17]:
# Import dependencies
import pandas as pd
from sklearn import tree
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import GradientBoostingClassifier
import pickle

# Global seed: passed as random_state to train_test_split and to every
# GradientBoostingClassifier below so that reruns give the same results
yogi = 8

In [18]:
# Load the pitcher CSV and discard every row that has a missing value
file_path = '../Data/Pitchers/top_nine_pitchers.csv'
pitchers_df = pd.read_csv(file_path).dropna()

# Preview the first ten rows
pitchers_df.head(10)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,pitch_type,release_speed,release_pos_x,release_pos_z,player_name,zone,game_type,stand,...,effective_speed,release_spin_rate,release_extension,release_pos_y,at_bat_number,pitch_number,pitch_name,spin_axis,delta_home_win_exp,delta_run_exp
0,0,18993,KC,80.8,-1.25,5.89,"Bauer, Trevor",14.0,R,L,...,80.1,2881.0,6.4,54.15,59,3,Knuckle Curve,23.0,0.004,-0.159
1,1,18994,KC,79.6,-1.44,5.91,"Bauer, Trevor",13.0,R,L,...,79.1,2842.0,6.2,54.26,59,2,Knuckle Curve,22.0,0.0,-0.053
2,2,18995,KC,78.1,-1.38,5.99,"Bauer, Trevor",7.0,R,L,...,77.6,2866.0,6.2,54.29,59,1,Knuckle Curve,24.0,0.0,-0.041
3,3,18996,SL,79.5,-1.73,5.58,"Bauer, Trevor",3.0,R,R,...,79.2,2793.0,6.2,54.3,58,1,Slider,52.0,-0.002,0.139
4,4,18997,KC,77.8,-1.28,5.98,"Bauer, Trevor",14.0,R,L,...,77.2,3061.0,6.4,54.12,57,4,Knuckle Curve,21.0,0.003,-0.118
5,5,18998,SL,77.7,-1.86,5.58,"Bauer, Trevor",4.0,R,L,...,77.3,2743.0,6.1,54.41,57,3,Slider,53.0,0.0,-0.044
6,6,18999,FF,92.3,-1.46,5.76,"Bauer, Trevor",3.0,R,L,...,92.0,2605.0,6.3,54.23,57,2,4-Seam Fastball,211.0,0.0,-0.033
7,7,19000,SL,77.9,-1.93,5.56,"Bauer, Trevor",8.0,R,L,...,77.6,2664.0,6.2,54.29,57,1,Slider,55.0,0.0,0.027
8,8,19001,FF,90.6,-1.59,5.64,"Bauer, Trevor",2.0,R,L,...,90.6,2604.0,6.4,54.11,56,1,4-Seam Fastball,209.0,0.006,-0.25
9,9,19016,SL,79.4,-1.82,5.59,"Bauer, Trevor",8.0,R,R,...,79.4,2821.0,6.3,54.24,52,3,Slider,51.0,0.003,-0.108


In [19]:
# Show every column name as a plain Python list
pitchers_df.columns.tolist()

['Unnamed: 0',
 'Unnamed: 0.1',
 'pitch_type',
 'release_speed',
 'release_pos_x',
 'release_pos_z',
 'player_name',
 'zone',
 'game_type',
 'stand',
 'p_throws',
 'type',
 'balls',
 'strikes',
 'pfx_x',
 'pfx_z',
 'plate_x',
 'plate_z',
 'inning',
 'vx0',
 'vy0',
 'ax',
 'ay',
 'az',
 'sz_top',
 'sz_bot',
 'effective_speed',
 'release_spin_rate',
 'release_extension',
 'release_pos_y',
 'at_bat_number',
 'pitch_number',
 'pitch_name',
 'spin_axis',
 'delta_home_win_exp',
 'delta_run_exp']

In [20]:
# Distinct strike counts present in the data, in order of first appearance
pd.unique(pitchers_df['strikes'])

array([2, 1, 0], dtype=int64)

In [21]:
# Drop the leftover 'Unnamed' index columns together with every other
# column that is not kept as a model input or as the target
pitchers_df = pitchers_df.drop(
    columns = [
        'Unnamed: 0', 'Unnamed: 0.1',
        'pitch_name', 'game_type',
        'plate_x', 'plate_z',
        'vx0', 'vy0', 'ax', 'ay', 'az',
        'effective_speed', 'release_pos_y', 'at_bat_number', 'spin_axis',
        'type', 'delta_home_win_exp',
        'pfx_x', 'pfx_z', 'sz_top', 'sz_bot',
        'delta_run_exp',
    ]
)

In [22]:
# Disabled experiment: collapse the zone codes into a binary target
# (0 for the codes listed in ball_zone, 1 for the codes listed in strike_zone).
# It is left commented out, so 'zone' keeps its raw codes.
# ball_zone = [11.0, 12.0, 13.0, 14.0]
# strike_zone = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]

# pitchers_df['zone'].replace(ball_zone, 0, inplace = True)

# pitchers_df['zone'].replace(strike_zone, 1, inplace = True)

# List the distinct zone codes present in the data
pitchers_df['zone'].unique()

array([14., 13.,  7.,  3.,  4.,  8.,  2., 11.,  1.,  6.,  5.,  9., 12.])

In [23]:
# Collect the names of the object-dtype columns; these are the categorical
# inputs to one-hot encode. 'player_name' is taken out of the list because
# it is not encoded.
pitch_cat = [
    column for column, dtype in pitchers_df.dtypes.items() if dtype == 'object'
]
pitch_cat.remove('player_name')
pitch_cat

['pitch_type', 'stand', 'p_throws']

In [24]:
# Create a OneHotEncoder instance that returns a dense array.
# The keyword for dense output is `sparse_output` in newer scikit-learn
# releases and `sparse` in older ones; try the new spelling first and fall
# back so the cell runs on either.
try:
    enc = OneHotEncoder(sparse_output = False)
except TypeError:
    enc = OneHotEncoder(sparse = False)

# Fit and transform the OneHotEncoder using the categorical variable list.
# The encoded frame must carry the SAME index as pitchers_df: rows were
# removed with dropna(), so pitchers_df's index can have gaps, and a default
# 0..n-1 index here would pair encoded rows with the wrong pitches (and lose
# rows) when the two frames are later merged on their index.
encode_df = pd.DataFrame(
    enc.fit_transform(pitchers_df[pitch_cat]),
    index = pitchers_df.index
)

# Add the encoded variable names to the DataFrame
# (get_feature_names was replaced by get_feature_names_out in newer releases)
if hasattr(enc, 'get_feature_names_out'):
    encode_df.columns = enc.get_feature_names_out(pitch_cat)
else:
    encode_df.columns = enc.get_feature_names(pitch_cat)
encode_df.head()

Unnamed: 0,pitch_type_CH,pitch_type_CU,pitch_type_FC,pitch_type_FF,pitch_type_FT,pitch_type_KC,pitch_type_SI,pitch_type_SL,stand_L,stand_R,p_throws_L,p_throws_R
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


In [25]:
# Merge one-hot encoded features and drop the originals.
# The merge is an inner join on the row index, so only index labels present
# in both frames survive and rows are paired purely by index label.
# The columns to drop are passed by keyword: the positional form
# `.drop(pitch_cat, 1)` is rejected by pandas 2.0 and later.
pitchers_df = pitchers_df.merge(
    encode_df,
    left_index = True,
    right_index = True
).drop(columns = pitch_cat)

pitchers_df.head()

Unnamed: 0,release_speed,release_pos_x,release_pos_z,player_name,zone,balls,strikes,inning,release_spin_rate,release_extension,...,pitch_type_FC,pitch_type_FF,pitch_type_FT,pitch_type_KC,pitch_type_SI,pitch_type_SL,stand_L,stand_R,p_throws_L,p_throws_R
0,80.8,-1.25,5.89,"Bauer, Trevor",14.0,0,2,8,2881.0,6.4,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,79.6,-1.44,5.91,"Bauer, Trevor",13.0,0,1,8,2842.0,6.2,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
2,78.1,-1.38,5.99,"Bauer, Trevor",7.0,0,0,8,2866.0,6.2,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
3,79.5,-1.73,5.58,"Bauer, Trevor",3.0,0,0,8,2793.0,6.2,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,77.8,-1.28,5.98,"Bauer, Trevor",14.0,1,2,8,3061.0,6.4,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


In [26]:
# 'stand' and 'p_throws' each produced an L/R pair of indicator columns;
# drop the *_L member of each pair and keep stand_R / p_throws_R
pitchers_df = pitchers_df.drop(columns = ['stand_L', 'p_throws_L'])
pitchers_df.head()


Unnamed: 0,release_speed,release_pos_x,release_pos_z,player_name,zone,balls,strikes,inning,release_spin_rate,release_extension,...,pitch_type_CH,pitch_type_CU,pitch_type_FC,pitch_type_FF,pitch_type_FT,pitch_type_KC,pitch_type_SI,pitch_type_SL,stand_R,p_throws_R
0,80.8,-1.25,5.89,"Bauer, Trevor",14.0,0,2,8,2881.0,6.4,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,79.6,-1.44,5.91,"Bauer, Trevor",13.0,0,1,8,2842.0,6.2,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,78.1,-1.38,5.99,"Bauer, Trevor",7.0,0,0,8,2866.0,6.2,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,79.5,-1.73,5.58,"Bauer, Trevor",3.0,0,0,8,2793.0,6.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
4,77.8,-1.28,5.98,"Bauer, Trevor",14.0,1,2,8,3061.0,6.4,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


### Compiling, Training, and Testing Data

In [27]:
# Feature matrix: every column except the pitcher's name and the target 'zone'
X = pitchers_df.drop(columns = ['player_name', 'zone'])
X.head()

Unnamed: 0,release_speed,release_pos_x,release_pos_z,balls,strikes,inning,release_spin_rate,release_extension,pitch_number,pitch_type_CH,pitch_type_CU,pitch_type_FC,pitch_type_FF,pitch_type_FT,pitch_type_KC,pitch_type_SI,pitch_type_SL,stand_R,p_throws_R
0,80.8,-1.25,5.89,0,2,8,2881.0,6.4,3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,79.6,-1.44,5.91,0,1,8,2842.0,6.2,2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,78.1,-1.38,5.99,0,0,8,2866.0,6.2,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,79.5,-1.73,5.58,0,0,8,2793.0,6.2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
4,77.8,-1.28,5.98,1,2,8,3061.0,6.4,4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [28]:
# Target vector: the zone code of every pitch, as a NumPy array
y = pitchers_df['zone'].to_numpy()
y[:5]

array([14., 13.,  7.,  3., 14.])

In [29]:
# Split the data into training and testing sets, stratified on the target
# labels in y (the zone codes) and seeded with the global seed.
# NOTE(review): the previous comment said "stratify by pitcher", but the
# code stratifies on y, not on player_name - confirm which was intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify = y,
    random_state = yogi
)

In [30]:
# Standardise the features: the scaler is fitted on the training split only
# and the fitted scaler is then applied to both splits
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [31]:
# Show the names of the feature columns in X
X.columns

Index(['release_speed', 'release_pos_x', 'release_pos_z', 'balls', 'strikes',
       'inning', 'release_spin_rate', 'release_extension', 'pitch_number',
       'pitch_type_CH', 'pitch_type_CU', 'pitch_type_FC', 'pitch_type_FF',
       'pitch_type_FT', 'pitch_type_KC', 'pitch_type_SI', 'pitch_type_SL',
       'stand_R', 'p_throws_R'],
      dtype='object')

### Find Best Learning Rate

In [32]:
# Try each candidate learning rate and remember the one with the highest
# accuracy on the held-out split.
# NOTE(review): the "validation" score below is computed on X_test_scaled /
# y_test, the same split the final model is evaluated on afterwards, so the
# reported test accuracy is optimistic. Consider a separate validation split.
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
best_rate = 0
best_acc = 0

for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(
        n_estimators = 20,
        learning_rate = learning_rate,
        max_features = 9,
        max_depth = 4,
        random_state = yogi)

    # Fit the model
    classifier.fit(X_train_scaled, y_train)

    # Score each split exactly once and reuse the values for both the
    # printout and the best-rate comparison
    train_acc = classifier.score(X_train_scaled, y_train)
    test_acc = classifier.score(X_test_scaled, y_test)

    print(f'Learning Rate: {learning_rate}')
    print('Accuracy Score (training): {0:.3f}'.format(train_acc))
    print('Accuracy Score (validation): {0:.3f}'.format(test_acc))
    print()

    # Keep the first rate that achieves the highest held-out accuracy
    if test_acc > best_acc:
        best_acc = test_acc
        best_rate = learning_rate

Learning Rate: 0.05
Accuracy Score (training): 0.285
Accuracy Score (validation): 0.278

Learning Rate: 0.1
Accuracy Score (training): 0.315
Accuracy Score (validation): 0.289

Learning Rate: 0.25
Accuracy Score (training): 0.344
Accuracy Score (validation): 0.290

Learning Rate: 0.5
Accuracy Score (training): 0.366
Accuracy Score (validation): 0.283

Learning Rate: 0.75
Accuracy Score (training): 0.367
Accuracy Score (validation): 0.278

Learning Rate: 1
Accuracy Score (training): 0.334
Accuracy Score (validation): 0.260



### Create model with best learning rate

In [33]:
# Rebuild the classifier with the learning rate selected by the search
# (best_rate) and train it on the scaled training data
classifier = GradientBoostingClassifier(
    random_state = yogi,
    max_depth = 4,
    max_features = 9,
    learning_rate = best_rate,
    n_estimators = 20
).fit(X_train_scaled, y_train)

# Predict the zone for every test pitch and show the first rows of
# predictions next to the true labels
predictions = classifier.predict(X_test_scaled)
pd.DataFrame(
    {'Prediction': predictions, 'Actual': y_test}
).head(20)

Unnamed: 0,Prediction,Actual
0,12.0,4.0
1,12.0,3.0
2,14.0,14.0
3,11.0,13.0
4,14.0,14.0
5,14.0,9.0
6,14.0,7.0
7,14.0,11.0
8,12.0,3.0
9,14.0,4.0


### Evaluate Model

In [34]:
# Fraction of test pitches whose zone was predicted exactly
acc_score = accuracy_score(y_true = y_test, y_pred = predictions)
print(f'Accuracy Score: {acc_score}')

Accuracy Score: 0.29046460433321747


In [35]:
# Print per-zone precision, recall and F1 for the test predictions
print(
    'Classification Report',
    classification_report(y_test, predictions),
    sep = '\n'
)

Classification Report
              precision    recall  f1-score   support

         1.0       0.05      0.01      0.01       362
         2.0       0.09      0.01      0.01       443
         3.0       0.11      0.01      0.02       338
         4.0       0.06      0.01      0.02       460
         5.0       0.14      0.03      0.05       619
         6.0       0.12      0.01      0.02       571
         7.0       0.06      0.01      0.02       337
         8.0       0.07      0.01      0.01       516
         9.0       0.12      0.01      0.01       527
        11.0       0.20      0.39      0.26       963
        12.0       0.19      0.18      0.19       672
        13.0       0.35      0.44      0.39       782
        14.0       0.35      0.79      0.49      2041

    accuracy                           0.29      8631
   macro avg       0.15      0.15      0.11      8631
weighted avg       0.20      0.29      0.20      8631



In [36]:
# Save model file
# filename = 'all_pitchers_zone.sav'
# pickle.dump(classifier, open(filename, 'wb'))

In [37]:
# Load model
# loaded_model = pickle.load(open('all_pitchers_zone.sav', 'rb'))
# result = loaded_model.predict(X_test_scaled[0].reshape(1, -1))
# print(result)