In [1]:
# Import dependcies
import pandas as pd
from sklearn import tree
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import GradientBoostingClassifier

# Create global seed
yogi = 8

In [2]:
# Read in data 
file_path = '../Data/Pitchers/cole.csv'
cole_df = pd.read_csv(file_path)

cole_df.dropna(inplace = True)

cole_df.head(10)

Unnamed: 0.1,Unnamed: 0,pitch_type,release_speed,release_pos_x,release_pos_z,player_name,zone,game_type,stand,p_throws,...,effective_speed,release_spin_rate,release_extension,release_pos_y,at_bat_number,pitch_number,pitch_name,spin_axis,delta_home_win_exp,delta_run_exp
0,914,KC,82.9,-1.65,5.8,"Cole, Gerrit",9.0,F,R,R,...,83.7,2803.0,6.7,53.84,63,3,Knuckle Curve,43.0,0.0,-0.156
1,915,FF,96.7,-2.07,5.53,"Cole, Gerrit",8.0,F,R,R,...,97.3,2598.0,6.7,53.8,63,2,4-Seam Fastball,219.0,0.0,-0.053
2,916,SL,86.5,-1.89,5.82,"Cole, Gerrit",1.0,F,R,R,...,87.6,2586.0,6.6,53.91,63,1,Slider,134.0,0.0,-0.041
3,917,KC,82.6,-1.88,5.79,"Cole, Gerrit",12.0,F,L,R,...,83.1,2772.0,6.7,53.81,62,4,Knuckle Curve,48.0,-0.001,-0.206
4,918,FF,96.5,-1.94,5.78,"Cole, Gerrit",11.0,F,L,R,...,97.2,2609.0,6.6,53.86,62,3,4-Seam Fastball,216.0,0.0,0.016
5,919,FF,96.1,-1.93,5.8,"Cole, Gerrit",1.0,F,L,R,...,96.8,2596.0,6.6,53.88,62,2,4-Seam Fastball,213.0,0.0,-0.059
6,920,FF,94.7,-1.98,5.77,"Cole, Gerrit",5.0,F,L,R,...,94.9,2649.0,6.6,53.93,62,1,4-Seam Fastball,213.0,0.0,-0.044
7,921,FF,96.4,-1.93,5.7,"Cole, Gerrit",4.0,F,R,R,...,97.1,2604.0,6.8,53.67,61,7,4-Seam Fastball,209.0,-0.002,-0.496
8,922,KC,82.9,-1.73,5.93,"Cole, Gerrit",14.0,F,R,R,...,83.0,2795.0,6.6,53.95,61,6,Knuckle Curve,36.0,0.0,0.16
9,923,SL,87.7,-1.9,5.73,"Cole, Gerrit",1.0,F,R,R,...,88.8,2595.0,6.7,53.84,61,5,Slider,153.0,0.0,0.0


In [3]:
# Drop Unnamed columns
cole_df.drop(['Unnamed: 0', 'pitch_name'], axis = 1, inplace = True)

In [4]:
# Generate categorical variable list
pitch_cat = cole_df.dtypes[cole_df.dtypes == 'object'].index.tolist()
pitch_cat.remove('type')
pitch_cat.remove('player_name')
pitch_cat

['pitch_type', 'game_type', 'stand', 'p_throws', 'pitch_name']

In [5]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse = False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(cole_df[pitch_cat]))

# Add the encoded varibale names to the DataFrame
encode_df.columns = enc.get_feature_names(pitch_cat)
encode_df.head()

Unnamed: 0,pitch_type_CH,pitch_type_FF,pitch_type_FT,pitch_type_KC,pitch_type_SL,game_type_F,game_type_R,stand_L,stand_R,p_throws_R,pitch_name_2-Seam Fastball,pitch_name_4-Seam Fastball,pitch_name_Changeup,pitch_name_Knuckle Curve,pitch_name_Slider
0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [6]:
# Merge one-hot encoded features and drop the originals
cole_df = cole_df.merge(
    encode_df,
    left_index = True,
    right_index = True
).drop(pitch_cat, 1)

cole_df.head()

Unnamed: 0,release_speed,release_pos_x,release_pos_z,player_name,zone,type,balls,strikes,pfx_x,pfx_z,...,game_type_F,game_type_R,stand_L,stand_R,p_throws_R,pitch_name_2-Seam Fastball,pitch_name_4-Seam Fastball,pitch_name_Changeup,pitch_name_Knuckle Curve,pitch_name_Slider
0,82.9,-1.65,5.8,"Cole, Gerrit",9.0,X,0,2,0.77,-1.01,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
1,96.7,-2.07,5.53,"Cole, Gerrit",8.0,S,0,1,-0.92,1.53,...,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
2,86.5,-1.89,5.82,"Cole, Gerrit",1.0,S,0,0,0.47,0.39,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
3,82.6,-1.88,5.79,"Cole, Gerrit",12.0,S,1,2,1.27,-0.96,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,96.5,-1.94,5.78,"Cole, Gerrit",11.0,B,0,2,-0.67,1.51,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [7]:
# Drop superfluous columns 
cole_df.drop(columns = ['stand_L'], axis = 1, inplace = True)
cole_df.head()


Unnamed: 0,release_speed,release_pos_x,release_pos_z,player_name,zone,type,balls,strikes,pfx_x,pfx_z,...,pitch_type_SL,game_type_F,game_type_R,stand_R,p_throws_R,pitch_name_2-Seam Fastball,pitch_name_4-Seam Fastball,pitch_name_Changeup,pitch_name_Knuckle Curve,pitch_name_Slider
0,82.9,-1.65,5.8,"Cole, Gerrit",9.0,X,0,2,0.77,-1.01,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
1,96.7,-2.07,5.53,"Cole, Gerrit",8.0,S,0,1,-0.92,1.53,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
2,86.5,-1.89,5.82,"Cole, Gerrit",1.0,S,0,0,0.47,0.39,...,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
3,82.6,-1.88,5.79,"Cole, Gerrit",12.0,S,1,2,1.27,-0.96,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,96.5,-1.94,5.78,"Cole, Gerrit",11.0,B,0,2,-0.67,1.51,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


### Compiling, Training, and Testing Data

In [8]:
# Define features set
X = cole_df.copy()
X.drop(columns = ['player_name', 'type', 'balls', 'strikes', 'delta_run_exp', 'delta_home_win_exp'], axis = 1, inplace = True)
X.head()

Unnamed: 0,release_speed,release_pos_x,release_pos_z,zone,pfx_x,pfx_z,plate_x,plate_z,inning,vx0,...,spin_axis,pitch_type_CH,pitch_type_FF,pitch_type_FT,pitch_type_KC,pitch_type_SL,game_type_F,game_type_R,stand_R,p_throws_R
0,82.9,-1.65,5.8,9.0,0.77,-1.01,0.39,1.84,7,3.167373,...,43.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
1,96.7,-2.07,5.53,8.0,-0.92,1.53,-0.24,1.93,7,7.04267,...,219.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
2,86.5,-1.89,5.82,1.0,0.47,0.39,-0.42,2.84,7,2.567003,...,134.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0
3,82.6,-1.88,5.79,12.0,1.27,-0.96,0.87,2.44,7,3.810374,...,48.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
4,96.5,-1.94,5.78,11.0,-0.67,1.51,-0.96,3.4,7,4.176389,...,216.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [9]:
# Define target vector
y = cole_df['type'].values
y[:5]

array(['X', 'S', 'S', 'S', 'B'], dtype=object)

In [10]:
# Split the data into training and testing sets - stratify by pitcher
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = yogi)

In [11]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit Standard Scaler 
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Find Best Learning Rate

In [12]:
# Create a classifier object
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
best_rate = 0
best_acc = 0

for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(
        n_estimators = 20,
        learning_rate = learning_rate,
        max_features = 7,
        max_depth = 4,
        random_state = yogi)
    
    # Fit the model
    classifier.fit(X_train_scaled, y_train)
    print(f'Learning Rate: {learning_rate}')

    # Score the model
    print('Accuracy Score (training): {0:.3f}'.format(
        classifier.score(
            X_train_scaled,
            y_train
        )
    ))
    print('Accuracy Score (validation): {0:.3f}'.format(
        classifier.score(
            X_test_scaled,
            y_test
        )
    ))
    print()

    if classifier.score(X_test_scaled, y_test) > best_acc:
        best_acc = classifier.score(X_test_scaled, y_test)
        best_rate = learning_rate

Learning Rate: 0.05
Accuracy Score (training): 0.745
Accuracy Score (validation): 0.722

Learning Rate: 0.1
Accuracy Score (training): 0.756
Accuracy Score (validation): 0.724

Learning Rate: 0.25
Accuracy Score (training): 0.796
Accuracy Score (validation): 0.728

Learning Rate: 0.5
Accuracy Score (training): 0.856
Accuracy Score (validation): 0.699

Learning Rate: 0.75
Accuracy Score (training): 0.881
Accuracy Score (validation): 0.698

Learning Rate: 1
Accuracy Score (training): 0.917
Accuracy Score (validation): 0.699



### Create model with best learning rate

In [13]:
# Choose a learning rate and create classifiers
classifier = GradientBoostingClassifier(
    n_estimators = 20,
    learning_rate = best_rate,
    max_features = 7,
    max_depth = 3,
    random_state = yogi
)

# Fit the model
classifier.fit(X_train_scaled, y_train)

# Make predictions
predictions = classifier.predict(X_test_scaled)
pd.DataFrame({'Prediction': predictions, 'Actual': y_test}).head(20)

Unnamed: 0,Prediction,Actual
0,S,X
1,B,B
2,S,S
3,B,B
4,B,S
5,S,S
6,S,X
7,B,B
8,S,S
9,S,S


### Evaluate Model

In [14]:
# Calculate accuracy score
acc_score = accuracy_score(y_test, predictions)
print(f'Accuracy Score: {acc_score}')

Accuracy Score: 0.729004329004329


In [15]:
# Generate classification report
print('Classification Report')
print(classification_report(y_test, predictions))

Classification Report
              precision    recall  f1-score   support

           B       0.77      0.83      0.80       385
           S       0.71      0.85      0.77       608
           X       0.43      0.02      0.04       162

    accuracy                           0.73      1155
   macro avg       0.64      0.57      0.54      1155
weighted avg       0.69      0.73      0.68      1155

