In [1]:
# Import dependencies
import pandas as pd
from path import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import GradientBoostingClassifier

# Global random seed: passed as `random_state` to train_test_split and to
# every GradientBoostingClassifier below so that reruns use the same seed
yogi = 8

In [2]:
# Load the saved 2020 pitching data, using the 'Unnamed: 0' column as the row index
file_path = Path('Data/Saved/pitching_2020.csv')
df_pitch = pd.read_csv(
    file_path,
    index_col='Unnamed: 0',
)
df_pitch.head()

Unnamed: 0,type,pitch_type,release_speed,effective_speed,release_spin_rate,release_pos_x,release_pos_z,zone,stand,p_throws,pitch_number
0,X,FF,98.8,100.2,2483.0,-0.41,6.46,14.0,L,R,2
1,B,FF,98.7,100.0,2522.0,-0.33,6.62,12.0,L,R,1
2,S,SL,89.6,90.8,2537.0,-0.1,6.79,5.0,R,R,6
3,S,FF,100.4,101.7,2469.0,-0.38,6.5,6.0,R,R,5
4,S,FF,97.6,98.9,2339.0,-0.18,6.63,12.0,R,R,4


### Preprocessing Data

In [3]:
# Collect the names of all object-dtype (categorical) columns ...
pitch_cat = [
    column_name
    for column_name, column_dtype in df_pitch.dtypes.items()
    if column_dtype == 'object'
]
# ... and exclude the target column 'type', which must not be encoded as a feature
pitch_cat.remove('type')
pitch_cat

['pitch_type', 'stand', 'p_throws']

In [4]:
# Create a OneHotEncoder instance. The default sparse output is densified with
# .toarray() below instead of passing `sparse=False`, because that keyword was
# renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4.
enc = OneHotEncoder()

# Fit and transform the OneHotEncoder using the categorical variable list
encoded_values = enc.fit_transform(df_pitch[pitch_cat]).toarray()

# `get_feature_names` was removed in scikit-learn 1.2; prefer its replacement
# `get_feature_names_out` and fall back to the old name on older installations.
feature_name_getter = getattr(enc, 'get_feature_names_out', None) or enc.get_feature_names

# Build the encoded DataFrame with the encoded variable names as columns.
# Reusing df_pitch's index (instead of a fresh 0..n-1 RangeIndex) keeps every
# encoded row aligned with its source row when the frames are merged on index.
encode_df = pd.DataFrame(
    encoded_values,
    index=df_pitch.index,
    columns=feature_name_getter(pitch_cat),
)
encode_df.head()

Unnamed: 0,pitch_type_CH,pitch_type_CS,pitch_type_CU,pitch_type_EP,pitch_type_FA,pitch_type_FC,pitch_type_FF,pitch_type_FO,pitch_type_FS,pitch_type_KC,pitch_type_KN,pitch_type_SI,pitch_type_SL,stand_L,stand_R,p_throws_L,p_throws_R
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [5]:
# Merge one-hot encoded features (matched on the row index) and drop the originals.
# `columns=` is used because passing the axis positionally to DataFrame.drop
# was deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 on.
df_pitch = df_pitch.merge(
    encode_df,
    left_index=True,
    right_index=True,
).drop(columns=pitch_cat)

df_pitch.head()

Unnamed: 0,type,release_speed,effective_speed,release_spin_rate,release_pos_x,release_pos_z,zone,pitch_number,pitch_type_CH,pitch_type_CS,...,pitch_type_FO,pitch_type_FS,pitch_type_KC,pitch_type_KN,pitch_type_SI,pitch_type_SL,stand_L,stand_R,p_throws_L,p_throws_R
0,X,98.8,100.2,2483.0,-0.41,6.46,14.0,2,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,B,98.7,100.0,2522.0,-0.33,6.62,12.0,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,S,89.6,90.8,2537.0,-0.1,6.79,5.0,6,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
3,S,100.4,101.7,2469.0,-0.38,6.5,6.0,5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,S,97.6,98.9,2339.0,-0.18,6.63,12.0,4,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [6]:
# 'stand_L' and 'p_throws_L' are the complements of 'stand_R' and 'p_throws_R',
# so they carry no extra information and are removed
redundant_columns = ['stand_L', 'p_throws_L']
df_pitch = df_pitch.drop(columns=redundant_columns)
df_pitch.head()

Unnamed: 0,type,release_speed,effective_speed,release_spin_rate,release_pos_x,release_pos_z,zone,pitch_number,pitch_type_CH,pitch_type_CS,...,pitch_type_FC,pitch_type_FF,pitch_type_FO,pitch_type_FS,pitch_type_KC,pitch_type_KN,pitch_type_SI,pitch_type_SL,stand_R,p_throws_R
0,X,98.8,100.2,2483.0,-0.41,6.46,14.0,2,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,B,98.7,100.0,2522.0,-0.33,6.62,12.0,1,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,S,89.6,90.8,2537.0,-0.1,6.79,5.0,6,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
3,S,100.4,101.7,2469.0,-0.38,6.5,6.0,5,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
4,S,97.6,98.9,2339.0,-0.18,6.63,12.0,4,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


### Defining Features and Target, Splitting, and Scaling Data

In [7]:
# Feature matrix: every column except the target 'type'
# (drop returns a new frame, so df_pitch itself is left untouched)
X = df_pitch.drop(columns=['type'])
X.head()

Unnamed: 0,release_speed,effective_speed,release_spin_rate,release_pos_x,release_pos_z,zone,pitch_number,pitch_type_CH,pitch_type_CS,pitch_type_CU,...,pitch_type_FC,pitch_type_FF,pitch_type_FO,pitch_type_FS,pitch_type_KC,pitch_type_KN,pitch_type_SI,pitch_type_SL,stand_R,p_throws_R
0,98.8,100.2,2483.0,-0.41,6.46,14.0,2,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,98.7,100.0,2522.0,-0.33,6.62,12.0,1,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,89.6,90.8,2537.0,-0.1,6.79,5.0,6,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
3,100.4,101.7,2469.0,-0.38,6.5,6.0,5,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
4,97.6,98.9,2339.0,-0.18,6.63,12.0,4,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [8]:
# Target vector: the 'type' column as a NumPy array
y = df_pitch['type'].to_numpy()
y[:5]

array(['X', 'B', 'S', 'S', 'S'], dtype=object)

In [9]:
# Hold out a test set (default split proportions), seeded for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=yogi,
)

In [10]:
# Standardize the features. The scaler is fitted on the training split only,
# then the same fitted transform is applied to both splits.
scaler = StandardScaler()

# fit_transform == fit followed by transform on the same data
X_train_scaled = scaler.fit_transform(X_train)

# Keep the fitted scaler available under its original name
X_scaler = scaler

# Scale the test split with the statistics learned from the training split
X_test_scaled = X_scaler.transform(X_test)

### Find Best Learning Rate

In [16]:
# Number of feature columns in the scaled training matrix
X_train_scaled.shape[1]

22

In [19]:
# Candidate learning rates to compare
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]

# NOTE(review): this sweep uses max_depth = 20 while the final model further
# down uses max_depth = 3 — confirm which depth is intended.
# NOTE(review): the "validation" score below is computed on the test split.
for learning_rate in learning_rates:
    # Build and fit one model per candidate rate (fit returns the estimator itself)
    classifier = GradientBoostingClassifier(
        n_estimators=40,
        learning_rate=learning_rate,
        max_features=11,
        max_depth=20,
        random_state=yogi,
    ).fit(X_train_scaled, y_train)
    print(f'Learning Rate: {learning_rate}')

    # Mean accuracy on the training split
    train_accuracy = classifier.score(X_train_scaled, y_train)
    print('Accuracy Score (training): {0:.3f}'.format(train_accuracy))

    # Mean accuracy on the held-out split
    holdout_accuracy = classifier.score(X_test_scaled, y_test)
    print('Accuracy Score (validation): {0:.3f}'.format(holdout_accuracy))
    print()



### Create Model with Best Learning Rate


In [None]:
# Choose a learning rate and create the classifier.
# TODO: set this to the rate with the best validation accuracy from the
# learning-rate sweep; 0.1 (scikit-learn's default, and one of the swept
# candidates) is only a placeholder so that this cell is valid and runnable.
best_learning_rate = 0.1

classifier = GradientBoostingClassifier(
    n_estimators=40,
    learning_rate=best_learning_rate,
    max_features=11,
    max_depth=3,
    random_state=yogi,
)

# Fit the model
classifier.fit(X_train_scaled, y_train)

# Make predictions on the held-out split and show the first 20 next to the truth
predictions = classifier.predict(X_test_scaled)
pd.DataFrame({'Prediction': predictions, 'Actual': y_test}).head(20)

### Evaluate Model

In [None]:
# Fraction of held-out pitches whose predicted type matches the actual type
acc_score = accuracy_score(
    y_test,
    predictions,
)
print(f'Accuracy Score: {acc_score}')

In [None]:
# Generate confusion matrix. The label order is fixed explicitly (sorted union
# of actual and predicted labels) so the rows and columns can be labelled.
labels = sorted(set(y_test) | set(predictions))
cm = confusion_matrix(y_test, predictions, labels=labels)

# Rows are the actual classes, columns the predicted classes
cm_df = pd.DataFrame(
    cm,
    index=[f'Actual {label}' for label in labels],
    columns=[f'Predicted {label}' for label in labels],
)

# Display confusion matrix
display(cm_df)

In [None]:
# Per-class precision / recall / F1 summary for the held-out predictions
report_text = classification_report(y_test, predictions)
print('Classification Report', report_text, sep='\n')