In [1]:
# Import dependcies
import pandas as pd
from sklearn import tree
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import GradientBoostingClassifier

# Create global seed
yogi = 8

In [2]:
# Read in data 
file_path = '../Data/Pitchers/scherzer.csv'
scherzer_df = pd.read_csv(file_path)

scherzer_df.dropna(inplace = True)

scherzer_df.head(10)

Unnamed: 0.1,Unnamed: 0,pitch_type,release_speed,release_pos_x,release_pos_z,player_name,zone,game_type,stand,p_throws,...,effective_speed,release_spin_rate,release_extension,release_pos_y,at_bat_number,pitch_number,pitch_name,spin_axis,delta_home_win_exp,delta_run_exp
0,9156,FF,96.6,-3.11,5.51,"Scherzer, Max",12.0,R,L,R,...,96.5,2575.0,6.3,54.24,48,4,4-Seam Fastball,221.0,0.106,-0.405
1,9157,FF,97.1,-3.01,5.6,"Scherzer, Max",12.0,R,L,R,...,96.9,2560.0,6.3,54.18,48,3,4-Seam Fastball,214.0,0.0,0.043
2,9158,FC,91.6,-3.28,5.26,"Scherzer, Max",2.0,R,L,R,...,91.7,2471.0,6.2,54.27,48,2,Cutter,217.0,0.0,-0.115
3,9159,FF,95.4,-3.17,5.37,"Scherzer, Max",1.0,R,L,R,...,95.7,2353.0,6.3,54.17,48,1,4-Seam Fastball,225.0,0.0,-0.071
4,9160,FF,96.6,-3.35,5.28,"Scherzer, Max",5.0,R,R,R,...,97.0,2489.0,6.5,54.04,47,5,4-Seam Fastball,226.0,0.14,-0.586
5,9161,SL,86.9,-3.25,5.11,"Scherzer, Max",14.0,R,R,R,...,86.5,2325.0,6.3,54.21,47,4,Slider,130.0,0.0,0.063
6,9162,SL,86.7,-3.34,5.22,"Scherzer, Max",13.0,R,R,R,...,86.6,2262.0,6.2,54.3,47,3,Slider,116.0,0.0,-0.128
7,9163,CH,86.0,-3.29,5.25,"Scherzer, Max",13.0,R,R,R,...,86.8,1512.0,6.7,53.83,47,2,Changeup,248.0,0.0,-0.136
8,9164,SL,86.8,-3.33,5.17,"Scherzer, Max",14.0,R,R,R,...,87.1,2512.0,6.4,54.07,47,1,Slider,187.0,0.0,0.077
9,9165,SL,86.7,-3.35,5.17,"Scherzer, Max",14.0,R,R,R,...,86.8,2376.0,6.3,54.16,46,5,Slider,172.0,-0.011,0.131


In [3]:
# Drop Unnamed columns
scherzer_df.drop(['Unnamed: 0', 'pitch_name'], axis = 1, inplace = True)

In [4]:
# Generate categorical variable list
pitch_cat = scherzer_df.dtypes[scherzer_df.dtypes == 'object'].index.tolist()
pitch_cat.remove('type')
pitch_cat.remove('player_name')
pitch_cat

['pitch_type', 'game_type', 'stand', 'p_throws']

In [5]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse = False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(scherzer_df[pitch_cat]))

# Add the encoded varibale names to the DataFrame
encode_df.columns = enc.get_feature_names(pitch_cat)
encode_df.head()

Unnamed: 0,pitch_type_CH,pitch_type_CU,pitch_type_FC,pitch_type_FF,pitch_type_SL,game_type_R,stand_L,stand_R,p_throws_R
0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0
1,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0
2,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0
3,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0
4,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0


In [6]:
# Merge one-hot encoded features and drop the originals
scherzer_df = scherzer_df.merge(
    encode_df,
    left_index = True,
    right_index = True
).drop(pitch_cat, 1)

scherzer_df.head()

Unnamed: 0,release_speed,release_pos_x,release_pos_z,player_name,zone,type,balls,strikes,pfx_x,pfx_z,...,delta_run_exp,pitch_type_CH,pitch_type_CU,pitch_type_FC,pitch_type_FF,pitch_type_SL,game_type_R,stand_L,stand_R,p_throws_R
0,96.6,-3.11,5.51,"Scherzer, Max",12.0,S,1,2,-0.87,1.33,...,-0.405,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0
1,97.1,-3.01,5.6,"Scherzer, Max",12.0,B,0,2,-0.79,1.39,...,0.043,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0
2,91.6,-3.28,5.26,"Scherzer, Max",2.0,S,0,1,-0.02,0.76,...,-0.115,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0
3,95.4,-3.17,5.37,"Scherzer, Max",1.0,S,0,0,-0.86,1.24,...,-0.071,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0
4,96.6,-3.35,5.28,"Scherzer, Max",5.0,S,2,2,-0.77,1.32,...,-0.586,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0


In [7]:
# Drop superfluous columns 
scherzer_df.drop(columns = ['stand_L'], axis = 1, inplace = True)
scherzer_df.head()


Unnamed: 0,release_speed,release_pos_x,release_pos_z,player_name,zone,type,balls,strikes,pfx_x,pfx_z,...,delta_home_win_exp,delta_run_exp,pitch_type_CH,pitch_type_CU,pitch_type_FC,pitch_type_FF,pitch_type_SL,game_type_R,stand_R,p_throws_R
0,96.6,-3.11,5.51,"Scherzer, Max",12.0,S,1,2,-0.87,1.33,...,0.106,-0.405,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
1,97.1,-3.01,5.6,"Scherzer, Max",12.0,B,0,2,-0.79,1.39,...,0.0,0.043,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
2,91.6,-3.28,5.26,"Scherzer, Max",2.0,S,0,1,-0.02,0.76,...,0.0,-0.115,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
3,95.4,-3.17,5.37,"Scherzer, Max",1.0,S,0,0,-0.86,1.24,...,0.0,-0.071,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,96.6,-3.35,5.28,"Scherzer, Max",5.0,S,2,2,-0.77,1.32,...,0.14,-0.586,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0


### Compiling, Training, and Testing Data

In [8]:
# Define features set
X = scherzer_df.copy()
X.drop(columns = ['player_name', 'type', 'balls', 'strikes', 'delta_run_exp', 'delta_home_win_exp'], axis = 1, inplace = True)
X.head()

Unnamed: 0,release_speed,release_pos_x,release_pos_z,zone,pfx_x,pfx_z,plate_x,plate_z,inning,vx0,...,pitch_number,spin_axis,pitch_type_CH,pitch_type_CU,pitch_type_FC,pitch_type_FF,pitch_type_SL,game_type_R,stand_R,p_throws_R
0,96.6,-3.11,5.51,12.0,-0.87,1.33,0.26,3.91,6,10.898094,...,4,221.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
1,97.1,-3.01,5.6,12.0,-0.79,1.39,1.03,3.28,6,12.582908,...,3,214.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
2,91.6,-3.28,5.26,2.0,-0.02,0.76,0.19,3.11,6,8.800288,...,2,217.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
3,95.4,-3.17,5.37,1.0,-0.86,1.24,-0.39,2.63,6,9.244157,...,1,225.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,96.6,-3.35,5.28,5.0,-0.77,1.32,0.25,2.31,6,11.344196,...,5,226.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0


In [9]:
# Define target vector
y = scherzer_df['type'].values
y[:5]

array(['S', 'B', 'S', 'S', 'S'], dtype=object)

In [10]:
# Split the data into training and testing sets - stratify by pitcher
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = yogi)

In [11]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit Standard Scaler 
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Find Best Learning Rate

In [12]:
# Create a classifier object
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
best_rate = 0
best_acc = 0

for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(
        n_estimators = 20,
        learning_rate = learning_rate,
        max_features = 7,
        max_depth = 4,
        random_state = yogi)
    
    # Fit the model
    classifier.fit(X_train_scaled, y_train)
    print(f'Learning Rate: {learning_rate}')

    # Score the model
    print('Accuracy Score (training): {0:.3f}'.format(
        classifier.score(
            X_train_scaled,
            y_train
        )
    ))
    print('Accuracy Score (validation): {0:.3f}'.format(
        classifier.score(
            X_test_scaled,
            y_test
        )
    ))
    print()

    if classifier.score(X_test_scaled, y_test) > best_acc:
        best_acc = classifier.score(X_test_scaled, y_test)
        best_rate = learning_rate

Learning Rate: 0.05
Accuracy Score (training): 0.728
Accuracy Score (validation): 0.693

Learning Rate: 0.1
Accuracy Score (training): 0.753
Accuracy Score (validation): 0.702

Learning Rate: 0.25
Accuracy Score (training): 0.797
Accuracy Score (validation): 0.693

Learning Rate: 0.5
Accuracy Score (training): 0.855
Accuracy Score (validation): 0.697

Learning Rate: 0.75
Accuracy Score (training): 0.903
Accuracy Score (validation): 0.690

Learning Rate: 1
Accuracy Score (training): 0.918
Accuracy Score (validation): 0.662



### Create model with best learning rate

In [13]:
# Choose a learning rate and create classifiers
classifier = GradientBoostingClassifier(
    n_estimators = 20,
    learning_rate = best_rate,
    max_features = 7,
    max_depth = 3,
    random_state = yogi
)

# Fit the model
classifier.fit(X_train_scaled, y_train)

# Make predictions
predictions = classifier.predict(X_test_scaled)
pd.DataFrame({'Prediction': predictions, 'Actual': y_test}).head(20)

Unnamed: 0,Prediction,Actual
0,S,S
1,S,X
2,B,B
3,S,S
4,S,S
5,B,B
6,S,S
7,S,S
8,S,B
9,B,S


### Evaluate Model

In [14]:
# Calculate accuracy score
acc_score = accuracy_score(y_test, predictions)
print(f'Accuracy Score: {acc_score}')

Accuracy Score: 0.6992790937178167


In [15]:
# Generate classification report
print('Classification Report')
print(classification_report(y_test, predictions))

Classification Report
              precision    recall  f1-score   support

           B       0.73      0.79      0.76       287
           S       0.69      0.86      0.76       523
           X       0.00      0.00      0.00       161

    accuracy                           0.70       971
   macro avg       0.47      0.55      0.51       971
weighted avg       0.58      0.70      0.64       971



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
