In [1]:
# Import dependcies
import pandas as pd
from sklearn import tree
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import GradientBoostingClassifier

# Create global seed
yogi = 8

In [2]:
# Read in data 
file_path = '../Data/Pitchers/degrom.csv'
degrom_df = pd.read_csv(file_path)

degrom_df.dropna(inplace = True)

degrom_df.head(10)

Unnamed: 0.1,Unnamed: 0,pitch_type,release_speed,release_pos_x,release_pos_z,player_name,zone,game_type,stand,p_throws,...,effective_speed,release_spin_rate,release_extension,release_pos_y,at_bat_number,pitch_number,pitch_name,spin_axis,delta_home_win_exp,delta_run_exp
0,9175,SL,92.8,-1.03,5.53,"deGrom, Jacob",9.0,R,L,R,...,94.2,2603.0,7.0,53.51,43,10,Slider,162.0,-0.064,-0.263
1,9176,FF,99.3,-1.06,5.46,"deGrom, Jacob",2.0,R,L,R,...,100.8,2570.0,7.0,53.47,43,9,4-Seam Fastball,228.0,0.0,0.0
2,9177,CH,92.2,-1.01,5.39,"deGrom, Jacob",11.0,R,L,R,...,93.7,1480.0,7.0,53.48,43,8,Changeup,231.0,0.0,0.0
3,9178,CH,89.8,-0.93,5.45,"deGrom, Jacob",11.0,R,L,R,...,91.2,1468.0,7.0,53.49,43,7,Changeup,230.0,0.0,0.014
4,9179,CH,91.8,-0.97,5.37,"deGrom, Jacob",7.0,R,L,R,...,92.9,1519.0,7.0,53.52,43,6,Changeup,234.0,0.0,0.0
5,9180,SL,89.8,-1.03,5.52,"deGrom, Jacob",14.0,R,L,R,...,91.3,2618.0,6.9,53.61,43,5,Slider,147.0,0.0,0.0
6,9181,FF,98.8,-1.03,5.46,"deGrom, Jacob",11.0,R,L,R,...,99.8,2557.0,6.9,53.56,43,4,4-Seam Fastball,215.0,0.0,0.044
7,9182,SL,91.8,-0.94,5.56,"deGrom, Jacob",14.0,R,L,R,...,93.2,2446.0,7.0,53.54,43,3,Slider,143.0,0.0,0.0
8,9183,SL,91.0,-0.81,5.5,"deGrom, Jacob",6.0,R,L,R,...,91.9,2621.0,6.8,53.72,43,2,Slider,136.0,0.023,-0.053
9,9184,SL,92.3,-0.97,5.5,"deGrom, Jacob",6.0,R,L,R,...,93.8,2632.0,7.0,53.49,43,1,Slider,155.0,0.0,-0.041


In [3]:
# Drop Unnamed columns
degrom_df.drop(['Unnamed: 0', 'pitch_name'], axis = 1, inplace = True)

In [4]:
# Generate categorical variable list
pitch_cat = degrom_df.dtypes[degrom_df.dtypes == 'object'].index.tolist()
pitch_cat.remove('player_name')
pitch_cat

['pitch_type', 'game_type', 'stand', 'p_throws', 'type']

In [5]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse = False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(degrom_df[pitch_cat]))

# Add the encoded varibale names to the DataFrame
encode_df.columns = enc.get_feature_names(pitch_cat)
encode_df.head()

Unnamed: 0,pitch_type_CH,pitch_type_CU,pitch_type_FF,pitch_type_FT,pitch_type_SL,game_type_R,stand_L,stand_R,p_throws_R,type_B,type_S,type_X
0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0


In [6]:
# Merge one-hot encoded features and drop the originals
degrom_df = degrom_df.merge(
    encode_df,
    left_index = True,
    right_index = True
).drop(pitch_cat, 1)

degrom_df.head()

Unnamed: 0,release_speed,release_pos_x,release_pos_z,player_name,zone,balls,strikes,pfx_x,pfx_z,plate_x,...,pitch_type_FF,pitch_type_FT,pitch_type_SL,game_type_R,stand_L,stand_R,p_throws_R,type_B,type_S,type_X
0,92.8,-1.03,5.53,"deGrom, Jacob",9.0,2,2,0.5,0.27,0.76,...,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
1,99.3,-1.06,5.46,"deGrom, Jacob",2.0,2,2,-0.74,1.43,0.18,...,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
2,92.2,-1.01,5.39,"deGrom, Jacob",11.0,2,2,-1.17,0.73,-1.01,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
3,89.8,-0.93,5.45,"deGrom, Jacob",11.0,1,2,-1.23,0.59,-1.57,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
4,91.8,-0.97,5.37,"deGrom, Jacob",7.0,1,2,-0.96,0.73,-0.65,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0


In [7]:
# Drop superfluous columns 
degrom_df.drop(columns = ['stand_L'], axis = 1, inplace = True)
degrom_df.head()


Unnamed: 0,release_speed,release_pos_x,release_pos_z,player_name,zone,balls,strikes,pfx_x,pfx_z,plate_x,...,pitch_type_CU,pitch_type_FF,pitch_type_FT,pitch_type_SL,game_type_R,stand_R,p_throws_R,type_B,type_S,type_X
0,92.8,-1.03,5.53,"deGrom, Jacob",9.0,2,2,0.5,0.27,0.76,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
1,99.3,-1.06,5.46,"deGrom, Jacob",2.0,2,2,-0.74,1.43,0.18,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,92.2,-1.01,5.39,"deGrom, Jacob",11.0,2,2,-1.17,0.73,-1.01,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,89.8,-0.93,5.45,"deGrom, Jacob",11.0,1,2,-1.23,0.59,-1.57,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
4,91.8,-0.97,5.37,"deGrom, Jacob",7.0,1,2,-0.96,0.73,-0.65,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


### Compiling, Training, and Testing Data

In [8]:
# Define features set
X = degrom_df.copy()
X.drop(columns = ['player_name', 'zone', 'balls', 'strikes', 'delta_run_exp', 'delta_home_win_exp'], axis = 1, inplace = True)
X.head()

Unnamed: 0,release_speed,release_pos_x,release_pos_z,pfx_x,pfx_z,plate_x,plate_z,inning,vx0,vy0,...,pitch_type_CU,pitch_type_FF,pitch_type_FT,pitch_type_SL,game_type_R,stand_R,p_throws_R,type_B,type_S,type_X
0,92.8,-1.03,5.53,0.5,0.27,0.76,1.92,5,3.542409,-135.183439,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
1,99.3,-1.06,5.46,-0.74,1.43,0.18,2.94,5,5.210126,-144.565725,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,92.2,-1.01,5.39,-1.17,0.73,-1.01,2.4,5,2.572459,-134.270492,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,89.8,-0.93,5.45,-1.23,0.59,-1.57,3.69,5,1.04322,-130.808454,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
4,91.8,-0.97,5.37,-0.96,0.73,-0.65,1.77,5,2.930863,-133.637327,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [9]:
# Define target vector
y = degrom_df['zone'].values
y[:5]

array([ 9.,  2., 11., 11.,  7.])

In [10]:
# Split the data into training and testing sets - stratify by pitcher
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = yogi)

In [11]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit Standard Scaler 
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Find Best Learning Rate

In [12]:
# Create a classifier object
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
best_rate = 0
best_acc = 0

for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(
        n_estimators = 20,
        learning_rate = learning_rate,
        max_features = 7,
        max_depth = 4,
        random_state = yogi)
    
    # Fit the model
    classifier.fit(X_train_scaled, y_train)
    print(f'Learning Rate: {learning_rate}')

    # Score the model
    print('Accuracy Score (training): {0:.3f}'.format(
        classifier.score(
            X_train_scaled,
            y_train
        )
    ))
    print('Accuracy Score (validation): {0:.3f}'.format(
        classifier.score(
            X_test_scaled,
            y_test
        )
    ))
    print()

    if classifier.score(X_test_scaled, y_test) > best_acc:
        best_acc = classifier.score(X_test_scaled, y_test)
        best_rate = learning_rate

Learning Rate: 0.05
Accuracy Score (training): 0.938
Accuracy Score (validation): 0.823

Learning Rate: 0.1
Accuracy Score (training): 0.959
Accuracy Score (validation): 0.856

Learning Rate: 0.25
Accuracy Score (training): 0.998
Accuracy Score (validation): 0.874

Learning Rate: 0.5
Accuracy Score (training): 1.000
Accuracy Score (validation): 0.849

Learning Rate: 0.75
Accuracy Score (training): 0.859
Accuracy Score (validation): 0.688

Learning Rate: 1
Accuracy Score (training): 0.254
Accuracy Score (validation): 0.232



### Create model with best learning rate

In [13]:
# Choose a learning rate and create classifiers
classifier = GradientBoostingClassifier(
    n_estimators = 20,
    learning_rate = best_rate,
    max_features = 7,
    max_depth = 3,
    random_state = yogi
)

# Fit the model
classifier.fit(X_train_scaled, y_train)

# Make predictions
predictions = classifier.predict(X_test_scaled)
pd.DataFrame({'Prediction': predictions, 'Actual': y_test}).head(20)

Unnamed: 0,Prediction,Actual
0,3.0,3.0
1,14.0,14.0
2,14.0,14.0
3,14.0,14.0
4,1.0,1.0
5,14.0,14.0
6,1.0,1.0
7,5.0,5.0
8,9.0,9.0
9,14.0,14.0


### Evaluate Model

In [14]:
# Calculate accuracy score
acc_score = accuracy_score(y_test, predictions)
print(f'Accuracy Score: {acc_score}')

Accuracy Score: 0.8635437881873728


In [15]:
# Generate classification report
print('Classification Report')
print(classification_report(y_test, predictions))

Classification Report
              precision    recall  f1-score   support

         1.0       0.84      0.65      0.73        55
         2.0       0.77      0.85      0.81        48
         3.0       0.74      0.80      0.77        44
         4.0       0.69      0.88      0.77        40
         5.0       0.84      0.82      0.83        56
         6.0       0.82      0.79      0.80        62
         7.0       0.74      0.70      0.72        20
         8.0       0.75      0.88      0.81        43
         9.0       0.84      0.82      0.83        56
        11.0       0.90      0.90      0.90       145
        12.0       0.89      0.92      0.90       118
        13.0       0.89      0.72      0.80        57
        14.0       0.97      0.96      0.96       238

    accuracy                           0.86       982
   macro avg       0.82      0.82      0.82       982
weighted avg       0.87      0.86      0.86       982

