In [3]:
# Import dependencies
import pandas as pd
from sklearn import tree
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import GradientBoostingClassifier

# Create global seed. It is passed as random_state to train_test_split and to
# every GradientBoostingClassifier below so the split and the fits use one seed.
yogi = 8

In [4]:
# Load the pitch-level CSV and discard every row that has a missing value
file_path = '../Data/Pitchers/buehler.csv'
buehler_df = pd.read_csv(file_path).dropna()

# Preview the first ten remaining rows
buehler_df.head(10)

Unnamed: 0.1,Unnamed: 0,pitch_type,release_speed,release_pos_x,release_pos_z,player_name,zone,game_type,stand,p_throws,...,effective_speed,release_spin_rate,release_extension,release_pos_y,at_bat_number,pitch_number,pitch_name,spin_axis,delta_home_win_exp,delta_run_exp
0,16707,FF,97.4,-0.88,5.83,"Buehler, Walker",11.0,R,L,R,...,97.9,2567.0,6.2,54.27,27,6,4-Seam Fastball,198.0,0.035,-0.344
1,16708,KC,84.6,-0.72,5.78,"Buehler, Walker",14.0,R,L,R,...,84.4,3240.0,6.5,53.99,27,5,Knuckle Curve,22.0,-0.01,0.062
2,16709,FF,97.7,-1.02,5.66,"Buehler, Walker",9.0,R,L,R,...,98.0,2427.0,6.4,54.09,27,4,4-Seam Fastball,204.0,0.0,-0.064
3,16710,FF,96.9,-1.19,5.67,"Buehler, Walker",4.0,R,L,R,...,97.3,2508.0,6.5,54.02,27,3,4-Seam Fastball,205.0,0.0,-0.05
4,16711,KC,82.6,-0.89,5.76,"Buehler, Walker",14.0,R,L,R,...,82.6,3291.0,6.5,54.03,27,2,Knuckle Curve,24.0,0.0,0.044
5,16712,FF,98.3,-1.19,5.6,"Buehler, Walker",13.0,R,L,R,...,98.3,2731.0,6.5,54.02,27,1,4-Seam Fastball,204.0,0.0,0.033
6,16713,FF,98.8,-0.86,5.68,"Buehler, Walker",14.0,R,R,R,...,99.0,2658.0,6.6,53.93,26,9,4-Seam Fastball,202.0,-0.014,0.116
7,16714,FF,97.7,-0.89,5.68,"Buehler, Walker",3.0,R,R,R,...,97.8,2632.0,6.3,54.16,26,8,4-Seam Fastball,201.0,0.0,0.0
8,16715,FF,97.6,-0.82,5.8,"Buehler, Walker",8.0,R,R,R,...,97.6,2719.0,6.4,54.07,26,7,4-Seam Fastball,199.0,0.0,0.0
9,16716,SL,89.1,-0.86,5.62,"Buehler, Walker",14.0,R,R,R,...,89.9,2797.0,6.5,54.02,26,6,Slider,275.0,0.0,0.039


In [5]:
# Remove the leftover 'Unnamed: 0' column and the 'pitch_name' label column
buehler_df = buehler_df.drop(columns = ['Unnamed: 0', 'pitch_name'])

In [6]:
# Collect the names of all object-dtype columns (the categorical variables)
pitch_cat = [
    column
    for column, dtype in buehler_df.dtypes.items()
    if dtype == 'object'
]

# player_name is an identifier, not a feature to encode
pitch_cat.remove('player_name')
pitch_cat

['pitch_type', 'game_type', 'stand', 'p_throws', 'type']

In [7]:
# Create a OneHotEncoder instance. The default (sparse) output is densified
# below with .toarray(), which avoids the `sparse=` keyword that newer
# scikit-learn releases renamed to `sparse_output=`.
enc = OneHotEncoder()

# Fit and transform the OneHotEncoder using the categorical variable list.
# The encoded frame reuses buehler_df's index: dropna() may have removed rows
# earlier, and a fresh 0..n-1 index would then pair encodings with the wrong
# rows (and lose rows) when the two frames are merged on their index.
encode_df = pd.DataFrame(
    enc.fit_transform(buehler_df[pitch_cat]).toarray(),
    index = buehler_df.index
)

# Add the encoded variable names to the DataFrame. get_feature_names() was
# replaced by get_feature_names_out() in newer scikit-learn, so use whichever
# method this installation provides.
if hasattr(enc, 'get_feature_names_out'):
    encode_df.columns = enc.get_feature_names_out(pitch_cat)
else:
    encode_df.columns = enc.get_feature_names(pitch_cat)
encode_df.head()

Unnamed: 0,pitch_type_CH,pitch_type_FC,pitch_type_FF,pitch_type_FT,pitch_type_KC,pitch_type_SI,pitch_type_SL,game_type_R,stand_L,stand_R,p_throws_R,type_B,type_S,type_X
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0


In [8]:
# Merge one-hot encoded features (joined on the row index) and drop the
# original categorical columns. `columns=` is used because pandas 2.0 no
# longer accepts the axis as a second positional argument to drop().
buehler_df = buehler_df.merge(
    encode_df,
    left_index = True,
    right_index = True
).drop(columns = pitch_cat)

buehler_df.head()

Unnamed: 0,release_speed,release_pos_x,release_pos_z,player_name,zone,balls,strikes,pfx_x,pfx_z,plate_x,...,pitch_type_KC,pitch_type_SI,pitch_type_SL,game_type_R,stand_L,stand_R,p_throws_R,type_B,type_S,type_X
0,97.4,-0.88,5.83,"Buehler, Walker",11.0,3,2,-0.36,1.62,-0.37,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
1,84.6,-0.72,5.78,"Buehler, Walker",14.0,2,2,1.05,-1.41,1.24,...,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
2,97.7,-1.02,5.66,"Buehler, Walker",9.0,2,1,-0.35,1.53,0.66,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
3,96.9,-1.19,5.67,"Buehler, Walker",4.0,2,0,-0.56,1.64,-0.59,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
4,82.6,-0.89,5.76,"Buehler, Walker",14.0,1,0,1.16,-1.2,1.95,...,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0


In [9]:
# stand_L is the complement of stand_R, so keeping both adds no information
buehler_df = buehler_df.drop(columns = 'stand_L')
buehler_df.head()


Unnamed: 0,release_speed,release_pos_x,release_pos_z,player_name,zone,balls,strikes,pfx_x,pfx_z,plate_x,...,pitch_type_FT,pitch_type_KC,pitch_type_SI,pitch_type_SL,game_type_R,stand_R,p_throws_R,type_B,type_S,type_X
0,97.4,-0.88,5.83,"Buehler, Walker",11.0,3,2,-0.36,1.62,-0.37,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,84.6,-0.72,5.78,"Buehler, Walker",14.0,2,2,1.05,-1.41,1.24,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
2,97.7,-1.02,5.66,"Buehler, Walker",9.0,2,1,-0.35,1.53,0.66,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,96.9,-1.19,5.67,"Buehler, Walker",4.0,2,0,-0.56,1.64,-0.59,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
4,82.6,-0.89,5.76,"Buehler, Walker",14.0,1,0,1.16,-1.2,1.95,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0


### Compiling, Training, and Testing Data

In [10]:
# Build the feature matrix: everything except the identifier, the target
# ('zone'), the count columns and the two delta_* columns.
# NOTE(review): plate_x / plate_z remain in X; if 'zone' is derived from the
# pitch location these columns leak the target - confirm this is intended.
non_feature_columns = [
    'player_name',
    'zone',
    'balls',
    'strikes',
    'delta_run_exp',
    'delta_home_win_exp',
]
X = buehler_df.drop(columns = non_feature_columns)
X.head()

Unnamed: 0,release_speed,release_pos_x,release_pos_z,pfx_x,pfx_z,plate_x,plate_z,inning,vx0,vy0,...,pitch_type_FT,pitch_type_KC,pitch_type_SI,pitch_type_SL,game_type_R,stand_R,p_throws_R,type_B,type_S,type_X
0,97.4,-0.88,5.83,-0.36,1.62,-0.37,3.65,4,2.16791,-141.999094,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,84.6,-0.72,5.78,1.05,-1.41,1.24,-1.41,4,2.454581,-123.012175,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
2,97.7,-1.02,5.66,-0.35,1.53,0.66,2.08,4,5.325439,-142.065986,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,96.9,-1.19,5.67,-0.56,1.64,-0.59,2.83,4,2.879675,-141.016565,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
4,82.6,-0.89,5.76,1.16,-1.2,1.95,0.16,4,4.190735,-120.242117,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0


In [11]:
# Target vector: the 'zone' column as a NumPy array
y = buehler_df['zone'].to_numpy()
y[:5]

array([11., 14.,  9.,  4., 14.])

In [12]:
# Split the data into training and testing sets using train_test_split's
# default proportions. No `stratify` argument is passed, so the split is
# seeded but not stratified.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state = yogi
)

In [13]:
# Learn the scaling statistics from the training split only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both splits
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

### Find Best Learning Rate

In [14]:
# Try each candidate learning rate and remember the one with the highest
# held-out accuracy (best_rate / best_acc).
# NOTE(review): the rate is selected on X_test / y_test, the same split used
# for the final evaluation, so the reported test accuracy is optimistic;
# consider a separate validation split or cross-validation.
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
best_rate = 0
best_acc = 0

for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(
        n_estimators = 20,
        learning_rate = learning_rate,
        max_features = 7,
        max_depth = 4,
        random_state = yogi)

    # Fit the model
    classifier.fit(X_train_scaled, y_train)
    print(f'Learning Rate: {learning_rate}')

    # Score the model once per split; the values are reused for the printed
    # report and for the best-rate comparison instead of re-scoring each time
    train_acc = classifier.score(X_train_scaled, y_train)
    val_acc = classifier.score(X_test_scaled, y_test)
    print('Accuracy Score (training): {0:.3f}'.format(train_acc))
    print('Accuracy Score (validation): {0:.3f}'.format(val_acc))
    print()

    # Strict '>' keeps the earliest rate when two rates tie
    if val_acc > best_acc:
        best_acc = val_acc
        best_rate = learning_rate

Learning Rate: 0.05
Accuracy Score (training): 0.929
Accuracy Score (validation): 0.863

Learning Rate: 0.1
Accuracy Score (training): 0.963
Accuracy Score (validation): 0.883

Learning Rate: 0.25
Accuracy Score (training): 0.998
Accuracy Score (validation): 0.887

Learning Rate: 0.5
Accuracy Score (training): 1.000
Accuracy Score (validation): 0.870

Learning Rate: 0.75
Accuracy Score (training): 1.000
Accuracy Score (validation): 0.863

Learning Rate: 1
Accuracy Score (training): 0.150
Accuracy Score (validation): 0.144



### Create model with best learning rate

In [15]:
# Build the final classifier with the learning rate picked by the search and
# fit it on the scaled training data (fit() returns the estimator itself).
# NOTE(review): the search above used max_depth = 4 while this model uses
# max_depth = 3 - confirm the difference is intentional.
classifier = GradientBoostingClassifier(
    n_estimators = 20,
    learning_rate = best_rate,
    max_features = 7,
    max_depth = 3,
    random_state = yogi
).fit(X_train_scaled, y_train)

# Predict the held-out split and show predictions next to the true zones
predictions = classifier.predict(X_test_scaled)
comparison = {'Prediction': predictions, 'Actual': y_test}
pd.DataFrame(comparison).head(20)

Unnamed: 0,Prediction,Actual
0,12.0,12.0
1,1.0,1.0
2,6.0,6.0
3,3.0,12.0
4,5.0,5.0
5,12.0,12.0
6,2.0,2.0
7,14.0,14.0
8,14.0,14.0
9,5.0,8.0


### Evaluate Model

In [16]:
# Fraction of held-out pitches whose predicted zone equals the true zone
acc_score = accuracy_score(
    y_true = y_test,
    y_pred = predictions
)
print(f'Accuracy Score: {acc_score}')

Accuracy Score: 0.8880866425992779


In [17]:
# Print a heading followed by the per-class precision / recall / F1 table
print(
    'Classification Report',
    classification_report(y_test, predictions),
    sep = '\n'
)

Classification Report
              precision    recall  f1-score   support

         1.0       0.93      0.78      0.85        36
         2.0       0.81      0.79      0.80        53
         3.0       0.78      0.76      0.77        41
         4.0       0.82      0.93      0.87        43
         5.0       0.84      0.90      0.87        70
         6.0       0.84      0.89      0.86        64
         7.0       0.83      0.86      0.84        22
         8.0       0.88      0.83      0.85        53
         9.0       0.87      0.94      0.90        49
        11.0       0.92      0.97      0.95       110
        12.0       0.92      0.75      0.83        64
        13.0       1.00      0.92      0.96        39
        14.0       0.95      0.95      0.95       187

    accuracy                           0.89       831
   macro avg       0.88      0.87      0.87       831
weighted avg       0.89      0.89      0.89       831

