In [1]:
# Import dependcies
import pandas as pd
from sklearn import tree
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import GradientBoostingClassifier

# Create global seed
yogi = 8

In [2]:
# Read in data 
file_path = '../Data/Pitchers/bieber.csv'
bieber_df = pd.read_csv(file_path)

bieber_df.dropna(inplace = True)

bieber_df.head(10)

Unnamed: 0.1,Unnamed: 0,pitch_type,release_speed,release_pos_x,release_pos_z,player_name,zone,game_type,stand,p_throws,...,effective_speed,release_spin_rate,release_extension,release_pos_y,at_bat_number,pitch_number,pitch_name,spin_axis,delta_home_win_exp,delta_run_exp
0,999,FF,92.9,-1.08,5.66,"Bieber, Shane",5.0,F,R,R,...,93.7,2403.0,6.7,53.84,42,6,4-Seam Fastball,215.0,-0.102,1.896
1,1000,FF,93.0,-1.06,5.64,"Bieber, Shane",6.0,F,R,R,...,93.8,2452.0,6.7,53.76,42,5,4-Seam Fastball,213.0,0.0,0.0
2,1001,KC,83.7,-1.23,5.64,"Bieber, Shane",13.0,F,R,R,...,84.0,2441.0,6.6,53.92,42,4,Knuckle Curve,30.0,0.0,0.039
3,1002,KC,84.4,-1.21,5.65,"Bieber, Shane",14.0,F,R,R,...,84.5,2385.0,6.4,54.08,42,3,Knuckle Curve,30.0,0.0,0.018
4,1003,FF,93.2,-1.15,5.61,"Bieber, Shane",9.0,F,R,R,...,94.2,2380.0,6.9,53.65,42,2,4-Seam Fastball,211.0,0.0,-0.053
5,1004,FF,93.1,-1.14,5.66,"Bieber, Shane",5.0,F,R,R,...,93.9,2436.0,6.7,53.78,42,1,4-Seam Fastball,212.0,0.0,-0.041
6,1005,KC,83.4,-1.21,5.68,"Bieber, Shane",13.0,F,R,R,...,84.2,2308.0,6.8,53.7,41,4,Knuckle Curve,31.0,-0.007,0.172
7,1006,KC,84.0,-1.27,5.71,"Bieber, Shane",4.0,F,R,R,...,84.2,2369.0,6.5,53.99,41,3,Knuckle Curve,30.0,0.0,-0.029
8,1007,KC,84.6,-1.2,5.73,"Bieber, Shane",14.0,F,R,R,...,84.9,2658.0,6.6,53.88,41,2,Knuckle Curve,25.0,0.0,0.013
9,1008,KC,84.4,-1.31,5.72,"Bieber, Shane",14.0,F,R,R,...,84.4,2498.0,6.5,53.95,41,1,Knuckle Curve,30.0,0.0,-0.017


In [3]:
# Drop Unnamed columns
bieber_df.drop(['Unnamed: 0', 'pitch_name'], axis = 1, inplace = True)

In [4]:
# Generate categorical variable list
pitch_cat = bieber_df.dtypes[bieber_df.dtypes == 'object'].index.tolist()
pitch_cat.remove('player_name')
pitch_cat

['pitch_type', 'game_type', 'stand', 'p_throws', 'type']

In [5]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse = False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(bieber_df[pitch_cat]))

# Add the encoded varibale names to the DataFrame
encode_df.columns = enc.get_feature_names(pitch_cat)
encode_df.head()

Unnamed: 0,pitch_type_CH,pitch_type_FC,pitch_type_FF,pitch_type_FT,pitch_type_KC,pitch_type_SL,game_type_F,game_type_R,stand_L,stand_R,p_throws_R,type_B,type_S,type_X
0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0


In [6]:
# Merge one-hot encoded features and drop the originals
bieber_df = bieber_df.merge(
    encode_df,
    left_index = True,
    right_index = True
).drop(pitch_cat, 1)

bieber_df.head()

Unnamed: 0,release_speed,release_pos_x,release_pos_z,player_name,zone,balls,strikes,pfx_x,pfx_z,plate_x,...,pitch_type_KC,pitch_type_SL,game_type_F,game_type_R,stand_L,stand_R,p_throws_R,type_B,type_S,type_X
0,92.9,-1.08,5.66,"Bieber, Shane",5.0,2,2,-0.72,1.47,0.03,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
1,93.0,-1.06,5.64,"Bieber, Shane",6.0,2,2,-0.77,1.52,0.42,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
2,83.7,-1.23,5.64,"Bieber, Shane",13.0,1,2,0.57,-1.17,-0.18,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
3,84.4,-1.21,5.65,"Bieber, Shane",14.0,0,2,0.9,-1.04,1.24,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
4,93.2,-1.15,5.61,"Bieber, Shane",9.0,0,1,-0.72,1.45,0.34,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0


In [7]:
# Drop superfluous columns 
bieber_df.drop(columns = ['stand_L'], axis = 1, inplace = True)
bieber_df.head()


Unnamed: 0,release_speed,release_pos_x,release_pos_z,player_name,zone,balls,strikes,pfx_x,pfx_z,plate_x,...,pitch_type_FT,pitch_type_KC,pitch_type_SL,game_type_F,game_type_R,stand_R,p_throws_R,type_B,type_S,type_X
0,92.9,-1.08,5.66,"Bieber, Shane",5.0,2,2,-0.72,1.47,0.03,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0
1,93.0,-1.06,5.64,"Bieber, Shane",6.0,2,2,-0.77,1.52,0.42,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
2,83.7,-1.23,5.64,"Bieber, Shane",13.0,1,2,0.57,-1.17,-0.18,...,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0
3,84.4,-1.21,5.65,"Bieber, Shane",14.0,0,2,0.9,-1.04,1.24,...,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0
4,93.2,-1.15,5.61,"Bieber, Shane",9.0,0,1,-0.72,1.45,0.34,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0


### Compiling, Training, and Testing Data

In [8]:
# Define features set
X = bieber_df.copy()
X.drop(columns = ['player_name', 'zone', 'balls', 'strikes', 'delta_run_exp', 'delta_home_win_exp'], axis = 1, inplace = True)
X.head()

Unnamed: 0,release_speed,release_pos_x,release_pos_z,pfx_x,pfx_z,plate_x,plate_z,inning,vx0,vy0,...,pitch_type_FT,pitch_type_KC,pitch_type_SL,game_type_F,game_type_R,stand_R,p_throws_R,type_B,type_S,type_X
0,92.9,-1.08,5.66,-0.72,1.47,0.03,2.05,5,4.44276,-135.189851,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0
1,93.0,-1.06,5.64,-0.77,1.52,0.42,2.7,5,5.544992,-135.308691,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
2,83.7,-1.23,5.64,0.57,-1.17,-0.18,0.89,5,1.307688,-121.903161,...,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0
3,84.4,-1.21,5.65,0.9,-1.04,1.24,1.43,5,3.926764,-122.899988,...,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0
4,93.2,-1.15,5.61,-0.72,1.45,0.34,2.06,5,5.463886,-135.483875,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0


In [9]:
# Define target vector
y = bieber_df['zone'].values
y[:5]

array([ 5.,  6., 13., 14.,  9.])

In [10]:
# Split the data into training and testing sets - stratify by pitcher
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = yogi)

In [11]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit Standard Scaler 
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Find Best Learning Rate

In [12]:
# Create a classifier object
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
best_rate = 0
best_acc = 0

for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(
        n_estimators = 20,
        learning_rate = learning_rate,
        max_features = 7,
        max_depth = 4,
        random_state = yogi)
    
    # Fit the model
    classifier.fit(X_train_scaled, y_train)
    print(f'Learning Rate: {learning_rate}')

    # Score the model
    print('Accuracy Score (training): {0:.3f}'.format(
        classifier.score(
            X_train_scaled,
            y_train
        )
    ))
    print('Accuracy Score (validation): {0:.3f}'.format(
        classifier.score(
            X_test_scaled,
            y_test
        )
    ))
    print()

    if classifier.score(X_test_scaled, y_test) > best_acc:
        best_acc = classifier.score(X_test_scaled, y_test)
        best_rate = learning_rate

Learning Rate: 0.05
Accuracy Score (training): 0.933
Accuracy Score (validation): 0.859

Learning Rate: 0.1
Accuracy Score (training): 0.957
Accuracy Score (validation): 0.894

Learning Rate: 0.25
Accuracy Score (training): 0.996
Accuracy Score (validation): 0.907

Learning Rate: 0.5
Accuracy Score (training): 1.000
Accuracy Score (validation): 0.891

Learning Rate: 0.75
Accuracy Score (training): 0.783
Accuracy Score (validation): 0.633

Learning Rate: 1
Accuracy Score (training): 0.056
Accuracy Score (validation): 0.066



### Create model with best learning rate

In [13]:
# Choose a learning rate and create classifiers
classifier = GradientBoostingClassifier(
    n_estimators = 20,
    learning_rate = best_rate,
    max_features = 7,
    max_depth = 3,
    random_state = yogi
)

# Fit the model
classifier.fit(X_train_scaled, y_train)

# Make predictions
predictions = classifier.predict(X_test_scaled)
pd.DataFrame({'Prediction': predictions, 'Actual': y_test}).head(20)

Unnamed: 0,Prediction,Actual
0,3.0,3.0
1,5.0,5.0
2,14.0,14.0
3,4.0,4.0
4,7.0,7.0
5,5.0,5.0
6,14.0,14.0
7,13.0,13.0
8,5.0,5.0
9,14.0,14.0


### Evaluate Model

In [14]:
# Calculate accuracy score
acc_score = accuracy_score(y_test, predictions)
print(f'Accuracy Score: {acc_score}')

Accuracy Score: 0.9056603773584906


In [15]:
# Generate classification report
print('Classification Report')
print(classification_report(y_test, predictions))

Classification Report
              precision    recall  f1-score   support

         1.0       0.83      0.77      0.80        31
         2.0       0.89      0.81      0.85        48
         3.0       0.76      0.90      0.83        29
         4.0       0.83      0.83      0.83        53
         5.0       0.84      0.97      0.90        72
         6.0       0.91      0.83      0.87        64
         7.0       0.80      0.75      0.78        44
         8.0       0.84      0.89      0.86        64
         9.0       0.89      0.91      0.90        74
        11.0       0.90      0.89      0.90        91
        12.0       0.85      0.81      0.83        63
        13.0       0.91      0.87      0.89        83
        14.0       0.98      0.98      0.98       397

    accuracy                           0.91      1113
   macro avg       0.86      0.86      0.86      1113
weighted avg       0.91      0.91      0.91      1113

