In [283]:
# Import dependencies
import pandas as pd
from path import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import GradientBoostingClassifier
import tensorflow as tf


# Create global seed
yogi = 8

In [284]:
# Read in data
file_path = Path('Data/Saved/pitching_all.csv')
df_pitch = pd.read_csv(file_path, index_col = 'Unnamed: 0')
df_pitch.head()

  mask |= (ar1 == a)


Unnamed: 0,type,pitch_type,release_speed,effective_speed,release_spin_rate,release_pos_x,release_pos_z,zone,stand,p_throws,pitch_number,if_fielding_alignment,of_fielding_alignment,spin_axis
0,X,FF,98.8,100.2,2483.0,-0.41,6.46,14.0,L,R,2,Strategic,Standard,198.0
1,B,FF,98.7,100.0,2522.0,-0.33,6.62,12.0,L,R,1,Strategic,Standard,189.0
2,S,SL,89.6,90.8,2537.0,-0.1,6.79,5.0,R,R,6,Infield shift,Standard,58.0
3,S,FF,100.4,101.7,2469.0,-0.38,6.5,6.0,R,R,5,Infield shift,Standard,191.0
4,S,FF,97.6,98.9,2339.0,-0.18,6.63,12.0,R,R,4,Infield shift,Standard,191.0


### Preprocessing Data

In [285]:
# Generate categorical variable list
pitch_cat = df_pitch.dtypes[df_pitch.dtypes == 'object'].index.tolist()
# pitch_cat.remove('type')
pitch_cat

['type',
 'pitch_type',
 'stand',
 'p_throws',
 'if_fielding_alignment',
 'of_fielding_alignment']

In [286]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse = False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df_pitch[pitch_cat]))

# Add the encoded varibale names to the DataFrame
encode_df.columns = enc.get_feature_names(pitch_cat)
encode_df.head()

Unnamed: 0,type_B,type_S,type_X,pitch_type_CH,pitch_type_CS,pitch_type_CU,pitch_type_EP,pitch_type_FA,pitch_type_FC,pitch_type_FF,...,stand_R,p_throws_L,p_throws_R,if_fielding_alignment_Infield shift,if_fielding_alignment_Standard,if_fielding_alignment_Strategic,of_fielding_alignment_4th outfielder,of_fielding_alignment_Extreme outfield shift,of_fielding_alignment_Standard,of_fielding_alignment_Strategic
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [287]:
# Merge one-hot encoded features and drop the originals
df_pitch = df_pitch.merge(
    encode_df,
    left_index = True,
    right_index = True
).drop(pitch_cat, 1)

df_pitch.head()

Unnamed: 0,release_speed,effective_speed,release_spin_rate,release_pos_x,release_pos_z,zone,pitch_number,spin_axis,type_B,type_S,...,stand_R,p_throws_L,p_throws_R,if_fielding_alignment_Infield shift,if_fielding_alignment_Standard,if_fielding_alignment_Strategic,of_fielding_alignment_4th outfielder,of_fielding_alignment_Extreme outfield shift,of_fielding_alignment_Standard,of_fielding_alignment_Strategic
0,98.8,100.2,2483.0,-0.41,6.46,14.0,2,198.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
0,91.1,89.9,2347.0,-2.16,6.08,14.0,5,187.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
0,92.2,94.1,2629.0,-1.97,6.26,12.0,4,164.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
0,91.9,91.0,2422.0,-1.19,5.99,1.0,6,183.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,98.7,100.0,2522.0,-0.33,6.62,12.0,1,189.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [288]:
# Drop superfluous columns 'stand_L' and 'p_throws_L'
df_pitch.drop(columns = ['stand_L', 'p_throws_L'], axis = 1, inplace = True)
df_pitch.head()

Unnamed: 0,release_speed,effective_speed,release_spin_rate,release_pos_x,release_pos_z,zone,pitch_number,spin_axis,type_B,type_S,...,pitch_type_SL,stand_R,p_throws_R,if_fielding_alignment_Infield shift,if_fielding_alignment_Standard,if_fielding_alignment_Strategic,of_fielding_alignment_4th outfielder,of_fielding_alignment_Extreme outfield shift,of_fielding_alignment_Standard,of_fielding_alignment_Strategic
0,98.8,100.2,2483.0,-0.41,6.46,14.0,2,198.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
0,91.1,89.9,2347.0,-2.16,6.08,14.0,5,187.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
0,92.2,94.1,2629.0,-1.97,6.26,12.0,4,164.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
0,91.9,91.0,2422.0,-1.19,5.99,1.0,6,183.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,98.7,100.0,2522.0,-0.33,6.62,12.0,1,189.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


### Compiling, Training, and Testing Data

In [289]:
# Define features set
X = df_pitch.drop(columns = ['type_S', 'type_B', 'type_X'], axis = 1).values

In [290]:
# # Dictionary for mapping
# types = {'S': 1, 'B': 2, 'X': 3}

# # Map numbers onto 'type' column
# df_pitch['type'] = [types[item] for item in df_pitch['type']]
        

In [291]:
# Define target vector
y = df_pitch[['type_S', 'type_B', 'type_X']].values
y[:5]

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.]])

In [292]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = yogi)

In [293]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit Standard Scaler
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Find best learning rate

In [294]:

# Define the neural network model
number_input_features = len(X_train[0])
hidden_nodes_layer1 = 80
hidden_nodes_layer2 = 30

# Define activation function for keras layers
activation_function = 'relu'

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(
        units = hidden_nodes_layer1,
        input_dim = number_input_features,
        activation = activation_function
    )
)

# Second hidden layer
nn.add(
    tf.keras.layers.Dense(
        units = hidden_nodes_layer2,
        activation = activation_function
    )
)

# Output layer
nn.add(
    tf.keras.layers.Dense(
        units = 3,
        activation = 'softmax'
    )
)

nn.summary()

Model: "sequential_18"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_44 (Dense)             (None, 80)                2720      
_________________________________________________________________
dense_45 (Dense)             (None, 30)                2430      
_________________________________________________________________
dense_46 (Dense)             (None, 3)                 93        
Total params: 5,243
Trainable params: 5,243
Non-trainable params: 0
_________________________________________________________________


### Create model with best learning rate


In [295]:
# Compile the model
nn.compile(
    loss = 'categorical_crossentropy',
    optimizer = 'adam',
    metrics = ['accuracy']
)

### Evaluate Model

In [296]:
# Train the model 
fit_model = nn.fit(
    X_train_scaled,
    y_train,
    epochs = 100
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100

In [None]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose = 2)
print(f'Loss: {model_loss} \nAccuracy: {model_accuracy}')

2064/2064 - 2s - loss: 1.0232 - accuracy: 0.4637
Loss: 1.0232396125793457 
Accuracy: 0.46374595165252686


In [None]:
# # Generate confusion matrix
# cm = confusion_matrix(y_test, predictions)
# cm_df = pd.DataFrame(
#     cm
# )

# # Display confusion matrix
# display(cm_df)

In [None]:
# # Generate classification report
# print('Classification Report')
# print(classification_report(y_test, predictions))