In [13]:
# Import dependcies
import pandas as pd
from sklearn import tree
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import GradientBoostingClassifier

# Create global seed
yogi = 8

In [14]:
# Read in data 
file_path = '../Data/Pitchers/bauer.csv'
bauer_df = pd.read_csv(file_path)

bauer_df.dropna(inplace = True)

bauer_df.head(10)

Unnamed: 0.1,Unnamed: 0,pitch_type,release_speed,release_pos_x,release_pos_z,player_name,zone,game_type,stand,p_throws,...,effective_speed,release_spin_rate,release_extension,release_pos_y,at_bat_number,pitch_number,pitch_name,spin_axis,delta_home_win_exp,delta_run_exp
0,18993,KC,80.8,-1.25,5.89,"Bauer, Trevor",14.0,R,L,R,...,80.1,2881.0,6.4,54.15,59,3,Knuckle Curve,23.0,0.004,-0.159
1,18994,KC,79.6,-1.44,5.91,"Bauer, Trevor",13.0,R,L,R,...,79.1,2842.0,6.2,54.26,59,2,Knuckle Curve,22.0,0.0,-0.053
2,18995,KC,78.1,-1.38,5.99,"Bauer, Trevor",7.0,R,L,R,...,77.6,2866.0,6.2,54.29,59,1,Knuckle Curve,24.0,0.0,-0.041
3,18996,SL,79.5,-1.73,5.58,"Bauer, Trevor",3.0,R,R,R,...,79.2,2793.0,6.2,54.3,58,1,Slider,52.0,-0.002,0.139
4,18997,KC,77.8,-1.28,5.98,"Bauer, Trevor",14.0,R,L,R,...,77.2,3061.0,6.4,54.12,57,4,Knuckle Curve,21.0,0.003,-0.118
5,18998,SL,77.7,-1.86,5.58,"Bauer, Trevor",4.0,R,L,R,...,77.3,2743.0,6.1,54.41,57,3,Slider,53.0,0.0,-0.044
6,18999,FF,92.3,-1.46,5.76,"Bauer, Trevor",3.0,R,L,R,...,92.0,2605.0,6.3,54.23,57,2,4-Seam Fastball,211.0,0.0,-0.033
7,19000,SL,77.9,-1.93,5.56,"Bauer, Trevor",8.0,R,L,R,...,77.6,2664.0,6.2,54.29,57,1,Slider,55.0,0.0,0.027
8,19001,FF,90.6,-1.59,5.64,"Bauer, Trevor",2.0,R,L,R,...,90.6,2604.0,6.4,54.11,56,1,4-Seam Fastball,209.0,0.006,-0.25
9,19016,SL,79.4,-1.82,5.59,"Bauer, Trevor",8.0,R,R,R,...,79.4,2821.0,6.3,54.24,52,3,Slider,51.0,0.003,-0.108


In [15]:
# Drop Unnamed columns
bauer_df.drop(['Unnamed: 0', 'pitch_name'], axis = 1, inplace = True)

In [16]:
# Generate categorical variable list
pitch_cat = bauer_df.dtypes[bauer_df.dtypes == 'object'].index.tolist()
pitch_cat.remove('player_name')
pitch_cat

['pitch_type', 'game_type', 'stand', 'p_throws', 'type']

In [17]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse = False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(bauer_df[pitch_cat]))

# Add the encoded varibale names to the DataFrame
encode_df.columns = enc.get_feature_names(pitch_cat)
encode_df.head()

Unnamed: 0,pitch_type_CH,pitch_type_FC,pitch_type_FF,pitch_type_KC,pitch_type_SI,pitch_type_SL,game_type_R,stand_L,stand_R,p_throws_R,type_B,type_S,type_X
0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0


In [18]:
# Merge one-hot encoded features and drop the originals
bauer_df = bauer_df.merge(
    encode_df,
    left_index = True,
    right_index = True
).drop(pitch_cat, 1)

bauer_df.head()

Unnamed: 0,release_speed,release_pos_x,release_pos_z,player_name,zone,balls,strikes,pfx_x,pfx_z,plate_x,...,pitch_type_KC,pitch_type_SI,pitch_type_SL,game_type_R,stand_L,stand_R,p_throws_R,type_B,type_S,type_X
0,80.8,-1.25,5.89,"Bauer, Trevor",14.0,0,2,0.85,-1.73,1.08,...,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
1,79.6,-1.44,5.91,"Bauer, Trevor",13.0,0,1,0.64,-1.6,-0.32,...,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
2,78.1,-1.38,5.99,"Bauer, Trevor",7.0,0,0,0.73,-1.65,-0.33,...,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
3,79.5,-1.73,5.58,"Bauer, Trevor",3.0,0,0,1.57,-0.03,0.28,...,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0
4,77.8,-1.28,5.98,"Bauer, Trevor",14.0,1,2,0.84,-1.68,0.54,...,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0


In [19]:
# Drop superfluous columns 
bauer_df.drop(columns = ['stand_L'], axis = 1, inplace = True)
bauer_df.head()


Unnamed: 0,release_speed,release_pos_x,release_pos_z,player_name,zone,balls,strikes,pfx_x,pfx_z,plate_x,...,pitch_type_FF,pitch_type_KC,pitch_type_SI,pitch_type_SL,game_type_R,stand_R,p_throws_R,type_B,type_S,type_X
0,80.8,-1.25,5.89,"Bauer, Trevor",14.0,0,2,0.85,-1.73,1.08,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,79.6,-1.44,5.91,"Bauer, Trevor",13.0,0,1,0.64,-1.6,-0.32,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,78.1,-1.38,5.99,"Bauer, Trevor",7.0,0,0,0.73,-1.65,-0.33,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,79.5,-1.73,5.58,"Bauer, Trevor",3.0,0,0,1.57,-0.03,0.28,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
4,77.8,-1.28,5.98,"Bauer, Trevor",14.0,1,2,0.84,-1.68,0.54,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


### Compiling, Training, and Testing Data

In [20]:
# Define features set
X = bauer_df.copy()
X.drop(columns = ['player_name', 'zone', 'balls', 'strikes', 'delta_run_exp', 'delta_home_win_exp'], axis = 1, inplace = True)
X.head()

Unnamed: 0,release_speed,release_pos_x,release_pos_z,pfx_x,pfx_z,plate_x,plate_z,inning,vx0,vy0,...,pitch_type_FF,pitch_type_KC,pitch_type_SI,pitch_type_SL,game_type_R,stand_R,p_throws_R,type_B,type_S,type_X
0,80.8,-1.25,5.89,0.85,-1.73,1.08,-1.33,8,3.55877,-117.379924,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,79.6,-1.44,5.91,0.64,-1.6,-0.32,1.4,8,1.241716,-115.865455,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,78.1,-1.38,5.99,0.73,-1.65,-0.33,1.79,8,0.901278,-113.644059,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,79.5,-1.73,5.58,1.57,-0.03,0.28,3.19,8,1.466798,-115.776629,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
4,77.8,-1.28,5.98,0.84,-1.68,0.54,0.82,8,2.364114,-113.203327,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [21]:
# Define target vector
y = bauer_df['zone'].values
y[:5]

array([14., 13.,  7.,  3., 14.])

In [22]:
# Split the data into training and testing sets - stratify by pitcher
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = yogi)

In [23]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit Standard Scaler 
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Find Best Learning Rate

In [24]:
# Create a classifier object
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
best_rate = 0
best_acc = 0

for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(
        n_estimators = 20,
        learning_rate = learning_rate,
        max_features = 7,
        max_depth = 4,
        random_state = yogi)
    
    # Fit the model
    classifier.fit(X_train_scaled, y_train)
    print(f'Learning Rate: {learning_rate}')

    # Score the model
    print('Accuracy Score (training): {0:.3f}'.format(
        classifier.score(
            X_train_scaled,
            y_train
        )
    ))
    print('Accuracy Score (validation): {0:.3f}'.format(
        classifier.score(
            X_test_scaled,
            y_test
        )
    ))
    print()

    if classifier.score(X_test_scaled, y_test) > best_acc:
        best_acc = classifier.score(X_test_scaled, y_test)
        best_rate = learning_rate

Learning Rate: 0.05
Accuracy Score (training): 0.915
Accuracy Score (validation): 0.839

Learning Rate: 0.1
Accuracy Score (training): 0.956
Accuracy Score (validation): 0.887

Learning Rate: 0.25
Accuracy Score (training): 0.994
Accuracy Score (validation): 0.899

Learning Rate: 0.5
Accuracy Score (training): 1.000
Accuracy Score (validation): 0.898

Learning Rate: 0.75
Accuracy Score (training): 0.794
Accuracy Score (validation): 0.679

Learning Rate: 1
Accuracy Score (training): 0.638
Accuracy Score (validation): 0.553



### Create model with best learning rate

In [25]:
# Choose a learning rate and create classifiers
classifier = GradientBoostingClassifier(
    n_estimators = 20,
    learning_rate = best_rate,
    max_features = 7,
    max_depth = 3,
    random_state = yogi
)

# Fit the model
classifier.fit(X_train_scaled, y_train)

# Make predictions
predictions = classifier.predict(X_test_scaled)
pd.DataFrame({'Prediction': predictions, 'Actual': y_test}).head(20)

Unnamed: 0,Prediction,Actual
0,4.0,4.0
1,9.0,9.0
2,6.0,3.0
3,14.0,14.0
4,5.0,5.0
5,6.0,6.0
6,12.0,12.0
7,9.0,9.0
8,4.0,4.0
9,8.0,8.0


### Evaluate Model

In [26]:
# Calculate accuracy score
acc_score = accuracy_score(y_test, predictions)
print(f'Accuracy Score: {acc_score}')

Accuracy Score: 0.893048128342246


In [27]:
# Generate classification report
print('Classification Report')
print(classification_report(y_test, predictions))

Classification Report
              precision    recall  f1-score   support

         1.0       0.81      0.84      0.83        57
         2.0       0.81      0.83      0.82        58
         3.0       0.98      0.75      0.85        73
         4.0       0.86      0.79      0.82        56
         5.0       0.86      0.85      0.85        59
         6.0       0.83      0.97      0.89        88
         7.0       0.83      0.90      0.86        21
         8.0       0.88      0.85      0.87        68
         9.0       0.92      0.91      0.91        64
        11.0       0.94      0.90      0.92       132
        12.0       0.90      0.87      0.88       114
        13.0       0.54      0.72      0.62        18
        14.0       0.94      0.97      0.96       314

    accuracy                           0.89      1122
   macro avg       0.86      0.86      0.85      1122
weighted avg       0.90      0.89      0.89      1122

