In [122]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#import decisiontreeclassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from IPython.display import SVG
#from graphviz import Source
from IPython.display import display
#import logisticregression classifier
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
#import knn classifier
from sklearn.neighbors import KNeighborsClassifier

#for validating your classification model
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split, GridSearchCV 
from sklearn import metrics
from sklearn.metrics import roc_curve, auc

# feature selection
from sklearn.feature_selection import RFE
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

#pip install scikit-plot (optional)
import scikitplot as skplt

import warnings
warnings.filterwarnings("ignore")

In [123]:
import numpy as np
import pandas as pd

# Load the merged CSV from an absolute, machine-specific location.
merged_csv_path = r"C:\Users\joshd\OneDrive - Kansas State University\Adv Marketing Analytics\8. Project Guideline\merged_file.csv"
data = pd.read_csv(merged_csv_path)



In [124]:
# Columns carried through the analysis: pitcher identity and handedness,
# pitch type and outcome, and the pitch-tracking measurements.
features = [
    'Pitcher', 'PitcherThrows', 'AutoPitchType', 'PitchCall',
    'RelSpeed', 'VertRelAngle', 'HorzRelAngle', 'SpinRate', 'SpinAxis', 'Tilt',
    'RelHeight', 'RelSide', 'Extension', 'InducedVertBreak', 'HorzBreak',
    'VertApprAngle', 'HorzApprAngle', 'EffectiveVelo',
]

# .copy() makes df an independent frame: later cells drop rows from it and add
# columns to it, which pandas flags (SettingWithCopyWarning) when done on a
# selection taken from another DataFrame.
df = data[features].copy()

In [125]:
# (rows, columns) of the selected columns; shown as the cell output.
df.shape

(96787, 18)

In [126]:
# Keep only rows that have a value in every selected column.
# The result is rebound to df instead of using inplace=True: df was selected
# out of `data`, and in-place mutation of such a selection is what pandas'
# SettingWithCopyWarning guards against. Rebinding is also safe to re-run.
df = df.dropna(axis=0, how='any')


In [127]:
# Binary target: 1 when the pitch call is a swinging strike, otherwise 0.
is_swinging_strike = df['PitchCall'].eq('StrikeSwinging')
df.loc[:, 'SwingAndMiss'] = is_swinging_strike.astype('int64')

In [128]:
# Pitch types (as they appear in AutoPitchType) listed under the group they
# are modelled with.
pitch_types_by_group = {
    'Fastball': ['Fastball', 'Four-Seam', 'Sinker'],
    'Breaking': ['Curveball', 'Cutter', 'Slider'],
    'Offspeed': ['Changeup', 'ChangeUp', 'NaN', 'Other', 'Splitter', 'Undefined'],
}

# Invert to the pitch type -> group lookup used by Series.map.
pitch_group_mapping = {
    pitch_type: group
    for group, pitch_types in pitch_types_by_group.items()
    for pitch_type in pitch_types
}

# Pitch types missing from the lookup map to NaN and therefore fall into none
# of the Fastball / Breaking / Offspeed subsets built later.
df['PitchGroup'] = df['AutoPitchType'].map(pitch_group_mapping)

In [129]:
# Partition the pitches by the pitcher's throwing hand.
throws = df['PitcherThrows']
df_Left = df.loc[throws == 'Left']
df_Right = df.loc[throws == 'Right']

In [130]:
# Split each handedness subset by pitch group.
# Every subset is an explicit .copy(): a 'Whiff Score' column is assigned onto
# each of them later, and writing a column onto a filtered selection of another
# frame is what pandas' SettingWithCopyWarning guards against.

# Fastball data frames
df_Left_FB = df_Left[df_Left['PitchGroup'] == 'Fastball'].copy()
df_Right_FB = df_Right[df_Right['PitchGroup'] == 'Fastball'].copy()

# Breaking ball data frames
df_Left_BB = df_Left[df_Left['PitchGroup'] == 'Breaking'].copy()
df_Right_BB = df_Right[df_Right['PitchGroup'] == 'Breaking'].copy()

# Offspeed data frames
df_Left_OS = df_Left[df_Left['PitchGroup'] == 'Offspeed'].copy()
df_Right_OS = df_Right[df_Right['PitchGroup'] == 'Offspeed'].copy()

# Correlation matrices for the three pitch groups (right-handed pitchers)

In [131]:
# Fastball correlations (right-handed pitchers only)

# Correlate the numeric columns only. The frame also holds string columns
# (Pitcher, PitchCall, Tilt, ...), and DataFrame.corr() raises on those in
# pandas >= 2.0 instead of silently skipping them.
corr_matrix = df_Right_FB.select_dtypes(include='number').corr()
print(corr_matrix['SwingAndMiss'].sort_values(ascending=False))

SwingAndMiss        1.000000
VertApprAngle       0.092114
RelSpeed            0.044613
VertRelAngle        0.041805
InducedVertBreak    0.041335
EffectiveVelo       0.039944
SpinRate            0.027500
HorzRelAngle        0.026145
HorzApprAngle       0.007928
RelSide             0.002029
RelHeight          -0.000725
Extension          -0.006998
HorzBreak          -0.025131
SpinAxis           -0.032955
Name: SwingAndMiss, dtype: float64


In [132]:
# Breaking-ball correlations (right-handed pitchers only)

# Correlate the numeric columns only. The frame also holds string columns
# (Pitcher, PitchCall, Tilt, ...), and DataFrame.corr() raises on those in
# pandas >= 2.0 instead of silently skipping them.
corr_matrix = df_Right_BB.select_dtypes(include='number').corr()
print(corr_matrix['SwingAndMiss'].sort_values(ascending=False))

SwingAndMiss        1.000000
RelSpeed            0.051972
EffectiveVelo       0.047510
SpinRate            0.039066
SpinAxis            0.021899
InducedVertBreak    0.017134
Extension           0.016368
RelHeight           0.009988
HorzBreak           0.008442
RelSide            -0.007131
HorzApprAngle      -0.053014
HorzRelAngle       -0.064703
VertApprAngle      -0.073315
VertRelAngle       -0.125667
Name: SwingAndMiss, dtype: float64


In [133]:
# Offspeed correlations (right-handed pitchers only)

# Correlate the numeric columns only. The frame also holds string columns
# (Pitcher, PitchCall, Tilt, ...), and DataFrame.corr() raises on those in
# pandas >= 2.0 instead of silently skipping them.
corr_matrix = df_Right_OS.select_dtypes(include='number').corr()
print(corr_matrix['SwingAndMiss'].sort_values(ascending=False))

SwingAndMiss        1.000000
SpinAxis            0.054259
HorzBreak           0.043496
HorzApprAngle       0.042268
RelHeight           0.018039
HorzRelAngle        0.005685
Extension          -0.001380
RelSide            -0.012507
EffectiveVelo      -0.043475
SpinRate           -0.043482
RelSpeed           -0.044589
InducedVertBreak   -0.053571
VertRelAngle       -0.057801
VertApprAngle      -0.119447
Name: SwingAndMiss, dtype: float64


All correlations with SwingAndMiss appear weak (the largest magnitude above is about 0.13), which may be due in part to the small positive class.

# Creating Lefty Fastball Model

In [134]:
# Target and predictors for the left-handed fastball model.
# Predictors are listed explicitly instead of being derived by dropping
# columns, so a column added to df_Left_FB later (for example 'Whiff Score'
# when the notebook is re-run without a restart) can never slip into the
# feature matrix.
feature_cols = ['RelSpeed', 'RelSide', 'Extension', 'InducedVertBreak',
                'HorzBreak', 'VertApprAngle', 'HorzApprAngle']

y = df_Left_FB['SwingAndMiss']
Xi = df_Left_FB[feature_cols]

print(Xi.shape, y.shape)

(10765, 7) (10765,)


In [135]:
from sklearn.feature_selection import mutual_info_classif

# Estimated mutual information between each predictor column of Xi and the
# SwingAndMiss target, in Xi's column order.
# The estimator adds small random noise to continuous features, so the seed is
# fixed to make the printed values reproducible across runs.
mutual_info = mutual_info_classif(Xi, y, random_state=42)
print(mutual_info)

[0.00627767 0.00497053 0.00066327 0.00222376 0.         0.00890697
 0.00060832]


In [136]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Standardise the left-handed fastball predictors. Xi holds predictors only;
# the target lives in y.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(Xi)

# Re-wrap the scaled array with the original column names. The new frame gets
# a fresh 0..n-1 index rather than Xi's original row labels.
X = pd.DataFrame(scaled_values, columns=Xi.columns)

In [137]:
# Prepend a constant 'const' column to the predictors (statsmodels defaults
# written out: placed first, skipped if a constant column already exists).
X = sm.add_constant(X, prepend=True, has_constant='skip')

In [138]:
from sklearn.ensemble import RandomForestClassifier

# Exploratory forest used only to rank the predictors. It is fitted on every
# row (no train/test split has been made yet). random_state is fixed because
# an unseeded forest gives a different importance ranking on every run.
model = RandomForestClassifier(random_state=42)
model.fit(X, y)

# Impurity-based importance of each column of X, in column order.
importances = model.feature_importances_

# Columns whose importance exceeds the cut-off.
threshold = 0.01
selected_features = X.columns[importances > threshold]


In [139]:
# Pair each column name with its importance, then rank from most to least
# important.
importance_table = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
feature_importances = importance_table.sort_values(by='Importance', ascending=False)

print(feature_importances)


            Feature  Importance
6     VertApprAngle    0.156906
1          RelSpeed    0.147616
7     HorzApprAngle    0.144841
4  InducedVertBreak    0.141812
2           RelSide    0.139290
5         HorzBreak    0.136858
3         Extension    0.132676
0             const    0.000000


In [140]:
# Give predictors and target the same fresh 0..n-1 index so they stay aligned
# row-for-row (y still carries the row labels of the original frame).
X, y = X.reset_index(drop=True), y.reset_index(drop=True)

In [141]:
from collections import Counter

from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Hold out 20% of the left-handed fastballs for evaluation.
# NOTE(review): the split is not stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Oversample the minority class with SMOTE, on the training rows only.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Class counts before and after resampling.
for caption, labels in (
    ("Original class distribution:", y_train),
    ("Resampled class distribution:", y_train_resampled),
):
    print(caption, Counter(labels))

# Left-handed fastball model, fitted on the balanced training set.
rf_model_LFB = RandomForestClassifier(random_state=42)
rf_model_LFB.fit(X_train_resampled, y_train_resampled)



Original class distribution: Counter({0: 7855, 1: 757})
Resampled class distribution: Counter({0: 7855, 1: 7855})


In [142]:
# Predicted probability of the positive class (SwingAndMiss == 1) for the
# held-out rows.
y_probs = rf_model_LFB.predict_proba(X_test)[:, 1]

# Label a pitch as a swing-and-miss when its probability exceeds this cut-off.
threshold = 0.4
y_pred_custom_threshold = (y_probs > threshold).astype(int)

# Hold-out performance at the chosen cut-off.
print("Confusion Matrix:", confusion_matrix(y_test, y_pred_custom_threshold), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred_custom_threshold), sep="\n")

Confusion Matrix:
[[1646  313]
 [ 123   71]]

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.84      0.88      1959
           1       0.18      0.37      0.25       194

    accuracy                           0.80      2153
   macro avg       0.56      0.60      0.56      2153
weighted avg       0.86      0.80      0.83      2153



In [143]:
# Score every left-handed fastball and store the predicted swing-and-miss
# probability as a 0-100 'Whiff Score' on the left-handed fastball frame.
# NOTE(review): X includes the rows the model was trained on, so scores for
# those rows are in-sample and will look more confident than held-out scores.
probabilities = rf_model_LFB.predict_proba(X)[:, 1]

# Assign onto an explicit copy: df_Left_FB was filtered out of df_Left, and
# adding a column to such a selection triggers pandas' SettingWithCopyWarning.
# The assignment is positional; it assumes X is still the matrix built from
# df_Left_FB, in the same row order.
df_Left_FB = df_Left_FB.copy()
df_Left_FB['Whiff Score'] = probabilities * 100

df_Left_FB.head()

Unnamed: 0,Pitcher,PitcherThrows,AutoPitchType,PitchCall,RelSpeed,VertRelAngle,HorzRelAngle,SpinRate,SpinAxis,Tilt,...,RelSide,Extension,InducedVertBreak,HorzBreak,VertApprAngle,HorzApprAngle,EffectiveVelo,SwingAndMiss,PitchGroup,Whiff Score
176,"Pence, Dalton",Left,Four-Seam,BallCalled,91.62001,-4.364787,0.836477,2400.557581,147.988996,11:00,...,-0.77789,6.17226,19.09914,-11.22839,-7.077883,-1.190162,90.82482,0,Fastball,21.0
178,"Pence, Dalton",Left,Four-Seam,InPlay,91.87491,-2.43653,1.874777,2451.933188,147.637747,11:00,...,-0.89843,5.9718,21.08415,-12.63221,-4.816753,-0.394708,90.77824,0,Fastball,22.0
180,"Pence, Dalton",Left,Four-Seam,StrikeCalled,91.4685,-2.28687,2.229315,2362.975795,145.656413,10:45,...,-0.81266,5.9952,21.18564,-13.68009,-4.704606,-0.228793,90.39419,0,Fastball,30.0
181,"Pence, Dalton",Left,Four-Seam,BallCalled,92.22518,-0.886696,2.566607,2363.808015,149.205179,11:00,...,-0.88825,6.11416,21.09237,-11.90666,-3.192384,0.423491,91.42261,0,Fastball,40.0
183,"Pence, Dalton",Left,Four-Seam,FoulBall,92.21408,-1.514507,1.889752,2427.368372,146.805346,11:00,...,-0.74116,6.08071,21.07277,-13.01272,-3.87091,-0.452787,91.11686,0,Fastball,20.0


# Creating Righty Fastball Model

In [144]:
# Target and predictors for the right-handed fastball model.
# Predictors are listed explicitly instead of being derived by dropping
# columns, so a column added to df_Right_FB later (for example 'Whiff Score'
# when the notebook is re-run without a restart) can never slip into the
# feature matrix.
feature_cols = ['RelSpeed', 'RelSide', 'Extension', 'InducedVertBreak',
                'HorzBreak', 'VertApprAngle', 'HorzApprAngle']

y = df_Right_FB['SwingAndMiss']
Xi = df_Right_FB[feature_cols]

print(Xi.shape, y.shape)

(31694, 7) (31694,)


In [145]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Standardise the right-handed fastball predictors. Xi holds predictors only;
# the target lives in y.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(Xi)

# Re-wrap the scaled array with the original column names (fresh 0..n-1
# index), then prepend the constant 'const' column.
X = sm.add_constant(pd.DataFrame(scaled_values, columns=Xi.columns))

In [146]:
from sklearn.ensemble import RandomForestClassifier

# Exploratory forest used only to rank the predictors. It is fitted on every
# row (no train/test split has been made yet). random_state is fixed because
# an unseeded forest gives a different importance ranking on every run.
model = RandomForestClassifier(random_state=42)
model.fit(X, y)

# Impurity-based importance of each column of X, in column order.
importances = model.feature_importances_

# Columns whose importance exceeds the cut-off.
threshold = 0.01
selected_features = X.columns[importances > threshold]

# Rank the columns from most to least important.
feature_importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

print(feature_importances)

            Feature  Importance
6     VertApprAngle    0.155543
4  InducedVertBreak    0.143355
1          RelSpeed    0.142154
7     HorzApprAngle    0.141408
5         HorzBreak    0.140454
2           RelSide    0.138760
3         Extension    0.138327
0             const    0.000000


In [147]:
# Give predictors and target the same fresh 0..n-1 index so they stay aligned
# row-for-row (y still carries the row labels of the original frame).
X, y = X.reset_index(drop=True), y.reset_index(drop=True)

In [148]:
from collections import Counter

from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Hold out 20% of the right-handed fastballs for evaluation.
# NOTE(review): the split is not stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Oversample the minority class with SMOTE, on the training rows only.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Class counts before and after resampling.
for caption, labels in (
    ("Original class distribution:", y_train),
    ("Resampled class distribution:", y_train_resampled),
):
    print(caption, Counter(labels))

# Right-handed fastball model, fitted on the balanced training set.
rf_model_RFB = RandomForestClassifier(random_state=42)
rf_model_RFB.fit(X_train_resampled, y_train_resampled)

Original class distribution: Counter({0: 23328, 1: 2027})
Resampled class distribution: Counter({0: 23328, 1: 23328})


In [149]:
# Predicted probability of the positive class (SwingAndMiss == 1) for the
# held-out rows.
y_probs = rf_model_RFB.predict_proba(X_test)[:, 1]

# Label a pitch as a swing-and-miss when its probability exceeds this cut-off.
threshold = 0.4
y_pred_custom_threshold = (y_probs > threshold).astype(int)

# Hold-out performance at the chosen cut-off.
print("Confusion Matrix:", confusion_matrix(y_test, y_pred_custom_threshold), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred_custom_threshold), sep="\n")

Confusion Matrix:
[[4945  893]
 [ 321  180]]

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.85      0.89      5838
           1       0.17      0.36      0.23       501

    accuracy                           0.81      6339
   macro avg       0.55      0.60      0.56      6339
weighted avg       0.88      0.81      0.84      6339



In [150]:
# Score every right-handed fastball and store the predicted swing-and-miss
# probability as a 0-100 'Whiff Score' on the right-handed fastball frame.
# NOTE(review): X includes the rows the model was trained on, so scores for
# those rows are in-sample and will look more confident than held-out scores.
probabilities = rf_model_RFB.predict_proba(X)[:, 1]

# Assign onto an explicit copy: df_Right_FB was filtered out of df_Right, and
# adding a column to such a selection triggers pandas' SettingWithCopyWarning.
# The assignment is positional; it assumes X is still the matrix built from
# df_Right_FB, in the same row order.
df_Right_FB = df_Right_FB.copy()
df_Right_FB['Whiff Score'] = probabilities * 100

df_Right_FB.head()

Unnamed: 0,Pitcher,PitcherThrows,AutoPitchType,PitchCall,RelSpeed,VertRelAngle,HorzRelAngle,SpinRate,SpinAxis,Tilt,...,RelSide,Extension,InducedVertBreak,HorzBreak,VertApprAngle,HorzApprAngle,EffectiveVelo,SwingAndMiss,PitchGroup,Whiff Score
0,"Bovair, Connor",Right,Four-Seam,BallCalled,92.15578,-0.364869,-2.156256,2372.702159,220.485967,1:15,...,1.7254,5.06307,16.8354,13.33139,-3.66641,0.198079,89.35897,0,Fastball,9.0
1,"Bovair, Connor",Right,Four-Seam,InPlay,91.36741,-2.497417,-3.270117,2397.052783,221.459454,1:30,...,1.54931,4.91384,17.12261,14.02154,-5.872263,-0.803152,88.29696,0,Fastball,4.0
3,"Bovair, Connor",Right,Four-Seam,StrikeCalled,91.89762,-1.271536,-3.095063,2406.896522,211.035908,1:00,...,1.50492,5.17286,19.19225,10.78737,-4.200468,-1.188605,89.10257,0,Fastball,22.0
4,"Bovair, Connor",Right,Four-Seam,FoulBall,92.18498,-1.986803,-2.473535,2450.300166,215.993413,1:15,...,1.5312,5.01993,17.89607,12.06662,-5.136346,-0.344633,89.08026,0,Fastball,6.0
5,"Bovair, Connor",Right,Four-Seam,BallCalled,93.16575,-0.198022,-2.075262,2427.043762,221.875686,1:30,...,1.83919,4.9904,14.84391,12.25233,-3.715963,0.085762,90.28281,0,Fastball,5.0


# Creating Lefty Breaking Ball Model

In [151]:
# Target and predictors for the left-handed breaking-ball model.
# Predictors are listed explicitly instead of being derived by dropping
# columns, so a column added to df_Left_BB later (for example 'Whiff Score'
# when the notebook is re-run without a restart) can never slip into the
# feature matrix.
feature_cols = ['RelSpeed', 'SpinRate', 'Extension', 'InducedVertBreak',
                'VertApprAngle', 'HorzApprAngle']

y = df_Left_BB['SwingAndMiss']
Xi = df_Left_BB[feature_cols]

print(Xi.shape, y.shape)

(8053, 6) (8053,)


In [152]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Standardise the left-handed breaking-ball predictors. Xi holds predictors
# only; the target lives in y.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(Xi)

# Re-wrap the scaled array with the original column names (fresh 0..n-1
# index), then prepend the constant 'const' column.
X = sm.add_constant(pd.DataFrame(scaled_values, columns=Xi.columns))

In [153]:
from sklearn.ensemble import RandomForestClassifier

# Exploratory forest used only to rank the predictors. It is fitted on every
# row (no train/test split has been made yet). random_state is fixed because
# an unseeded forest gives a different importance ranking on every run.
model = RandomForestClassifier(random_state=42)
model.fit(X, y)

# Impurity-based importance of each column of X, in column order.
importances = model.feature_importances_

# Columns whose importance exceeds the cut-off.
threshold = 0.01
selected_features = X.columns[importances > threshold]

# Rank the columns from most to least important.
feature_importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

print(feature_importances)

            Feature  Importance
5     VertApprAngle    0.177033
1          RelSpeed    0.171552
6     HorzApprAngle    0.170431
4  InducedVertBreak    0.164840
2          SpinRate    0.159887
3         Extension    0.156257
0             const    0.000000


In [154]:
# Inverse-frequency weight per class: n_samples / (2 * rows in that class).
n_samples = len(y)
class_counts = np.bincount(y)
class_weights = {
    0: n_samples / (2 * class_counts[0]),
    1: n_samples / (2 * class_counts[1]),
}

# One weight per row, looked up from that row's class label.
# NOTE(review): `weights` is not referenced by any later cell visible here.
weights = np.array([class_weights[label] for label in y])

# Give predictors and target the same fresh 0..n-1 index.
X, y = X.reset_index(drop=True), y.reset_index(drop=True)

In [155]:
from collections import Counter

from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Hold out 20% of the left-handed breaking balls for evaluation.
# NOTE(review): the split is not stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Oversample the minority class with SMOTE, on the training rows only.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Class counts before and after resampling.
for caption, labels in (
    ("Original class distribution:", y_train),
    ("Resampled class distribution:", y_train_resampled),
):
    print(caption, Counter(labels))

# Left-handed breaking-ball model, fitted on the balanced training set.
rf_model_LBB = RandomForestClassifier(random_state=42)
rf_model_LBB.fit(X_train_resampled, y_train_resampled)

Original class distribution: Counter({0: 5655, 1: 787})
Resampled class distribution: Counter({0: 5655, 1: 5655})


In [156]:
# Predicted probability of the positive class (SwingAndMiss == 1) for the
# held-out rows.
y_probs = rf_model_LBB.predict_proba(X_test)[:, 1]

# Label a pitch as a swing-and-miss when its probability exceeds this cut-off.
threshold = 0.5
y_pred_custom_threshold = (y_probs > threshold).astype(int)

# Hold-out performance at the chosen cut-off.
print("Confusion Matrix:", confusion_matrix(y_test, y_pred_custom_threshold), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred_custom_threshold), sep="\n")

Confusion Matrix:
[[1241  172]
 [ 152   46]]

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.88      0.88      1413
           1       0.21      0.23      0.22       198

    accuracy                           0.80      1611
   macro avg       0.55      0.56      0.55      1611
weighted avg       0.81      0.80      0.80      1611



In [157]:
# Score every left-handed breaking ball and store the predicted swing-and-miss
# probability as a 0-100 'Whiff Score' on the left-handed breaking-ball frame.
# NOTE(review): X includes the rows the model was trained on, so scores for
# those rows are in-sample and will look more confident than held-out scores.
probabilities = rf_model_LBB.predict_proba(X)[:, 1]

# Assign onto an explicit copy: df_Left_BB was filtered out of df_Left, and
# adding a column to such a selection triggers pandas' SettingWithCopyWarning.
# The assignment is positional; it assumes X is still the matrix built from
# df_Left_BB, in the same row order.
df_Left_BB = df_Left_BB.copy()
df_Left_BB['Whiff Score'] = probabilities * 100

df_Left_BB.head()

Unnamed: 0,Pitcher,PitcherThrows,AutoPitchType,PitchCall,RelSpeed,VertRelAngle,HorzRelAngle,SpinRate,SpinAxis,Tilt,...,RelSide,Extension,InducedVertBreak,HorzBreak,VertApprAngle,HorzApprAngle,EffectiveVelo,SwingAndMiss,PitchGroup,Whiff Score
173,"Pence, Dalton",Left,Slider,InPlay,78.16803,0.329745,1.134805,2181.509071,229.912981,1:45,...,-1.16047,6.22398,7.11178,6.69933,-6.79793,2.345058,77.79266,0,Breaking,4.0
179,"Pence, Dalton",Left,Slider,BallCalled,79.79371,-0.188497,1.688791,2316.051305,216.819485,1:15,...,-1.01086,6.10861,7.22615,4.40414,-6.918327,2.4825,79.43264,0,Breaking,5.0
182,"Pence, Dalton",Left,Slider,BallCalled,80.12805,-0.885522,1.470697,2298.848261,247.960203,2:15,...,-0.92018,6.10886,2.51815,3.11315,-8.325928,2.032509,79.95627,0,Breaking,22.0
255,"Pence, Dalton",Left,Slider,InPlay,79.65584,0.256401,0.443132,2145.068475,175.736232,11:45,...,-0.91455,6.15402,7.28756,-0.44909,-6.428116,0.36207,79.62128,0,Breaking,13.0
257,"Pence, Dalton",Left,Slider,StrikeSwinging,78.40215,-0.356785,1.008064,2137.069207,254.970453,2:30,...,-1.01912,5.80916,3.79855,7.99563,-8.199302,2.442842,77.01116,1,Breaking,98.0


# Creating Righty Breaking Ball Model

In [158]:
# Target and predictors for the right-handed breaking-ball model.
# Predictors are listed explicitly instead of being derived by dropping
# columns, so a column added to df_Right_BB later (for example 'Whiff Score'
# when the notebook is re-run without a restart) can never slip into the
# feature matrix.
feature_cols = ['RelSpeed', 'SpinRate', 'Extension', 'InducedVertBreak',
                'VertApprAngle', 'HorzApprAngle']

y = df_Right_BB['SwingAndMiss']
Xi = df_Right_BB[feature_cols]

print(Xi.shape, y.shape)

(24560, 6) (24560,)


In [159]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Standardise the right-handed breaking-ball predictors. Xi holds predictors
# only; the target lives in y.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(Xi)

# Re-wrap the scaled array with the original column names (fresh 0..n-1
# index), then prepend the constant 'const' column.
X = sm.add_constant(pd.DataFrame(scaled_values, columns=Xi.columns))

In [160]:
from sklearn.ensemble import RandomForestClassifier

# Exploratory forest used only to rank the predictors. It is fitted on every
# row (no train/test split has been made yet). random_state is fixed because
# an unseeded forest gives a different importance ranking on every run.
model = RandomForestClassifier(random_state=42)
model.fit(X, y)

# Impurity-based importance of each column of X, in column order.
importances = model.feature_importances_

# Columns whose importance exceeds the cut-off.
threshold = 0.01
selected_features = X.columns[importances > threshold]

# Rank the columns from most to least important.
feature_importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

print(feature_importances)

            Feature  Importance
5     VertApprAngle    0.177477
4  InducedVertBreak    0.167706
6     HorzApprAngle    0.166385
1          RelSpeed    0.166208
2          SpinRate    0.161117
3         Extension    0.161107
0             const    0.000000


In [161]:
# Inverse-frequency weight per class: n_samples / (2 * rows in that class).
n_samples = len(y)
class_counts = np.bincount(y)
class_weights = {
    0: n_samples / (2 * class_counts[0]),
    1: n_samples / (2 * class_counts[1]),
}

# One weight per row, looked up from that row's class label.
# NOTE(review): `weights` is not referenced by any later cell visible here.
weights = np.array([class_weights[label] for label in y])

# Give predictors and target the same fresh 0..n-1 index.
X, y = X.reset_index(drop=True), y.reset_index(drop=True)

In [162]:
from collections import Counter

from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Hold out 20% of the right-handed breaking balls for evaluation.
# NOTE(review): the split is not stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Oversample the minority class with SMOTE, on the training rows only.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Class counts before and after resampling.
for caption, labels in (
    ("Original class distribution:", y_train),
    ("Resampled class distribution:", y_train_resampled),
):
    print(caption, Counter(labels))

# Right-handed breaking-ball model, fitted on the balanced training set.
rf_model_RBB = RandomForestClassifier(random_state=42)
rf_model_RBB.fit(X_train_resampled, y_train_resampled)

Original class distribution: Counter({0: 17180, 1: 2468})
Resampled class distribution: Counter({0: 17180, 1: 17180})


In [163]:
# Predicted probability of the positive class (SwingAndMiss == 1) for the
# held-out rows.
y_probs = rf_model_RBB.predict_proba(X_test)[:, 1]

# Label a pitch as a swing-and-miss when its probability exceeds this cut-off.
threshold = 0.4
y_pred_custom_threshold = (y_probs > threshold).astype(int)

# Hold-out performance at the chosen cut-off.
print("Confusion Matrix:", confusion_matrix(y_test, y_pred_custom_threshold), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred_custom_threshold), sep="\n")

Confusion Matrix:
[[3268 1039]
 [ 336  269]]

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.76      0.83      4307
           1       0.21      0.44      0.28       605

    accuracy                           0.72      4912
   macro avg       0.56      0.60      0.55      4912
weighted avg       0.82      0.72      0.76      4912



In [164]:
# Score every right-handed breaking ball and store the predicted
# swing-and-miss probability as a 0-100 'Whiff Score' on the right-handed
# breaking-ball frame.
# NOTE(review): X includes the rows the model was trained on, so scores for
# those rows are in-sample and will look more confident than held-out scores.
probabilities = rf_model_RBB.predict_proba(X)[:, 1]

# Assign onto an explicit copy: df_Right_BB was filtered out of df_Right, and
# adding a column to such a selection triggers pandas' SettingWithCopyWarning.
# The assignment is positional; it assumes X is still the matrix built from
# df_Right_BB, in the same row order.
df_Right_BB = df_Right_BB.copy()
df_Right_BB['Whiff Score'] = probabilities * 100

df_Right_BB.head()

Unnamed: 0,Pitcher,PitcherThrows,AutoPitchType,PitchCall,RelSpeed,VertRelAngle,HorzRelAngle,SpinRate,SpinAxis,Tilt,...,RelSide,Extension,InducedVertBreak,HorzBreak,VertApprAngle,HorzApprAngle,EffectiveVelo,SwingAndMiss,PitchGroup,Whiff Score
2,"Bovair, Connor",Right,Slider,BallCalled,80.86597,2.322348,-0.253187,2239.165062,133.4788,10:30,...,2.03354,4.8823,3.04697,-1.28571,-5.495913,-0.479592,77.48598,0,Breaking,4.0
6,"Bovair, Connor",Right,Curveball,BallCalled,75.39519,5.279901,0.078064,2137.676625,33.279313,7:00,...,1.45655,4.89766,-11.44071,-8.73374,-6.213807,-1.457652,72.54646,0,Breaking,2.0
13,"Bovair, Connor",Right,Curveball,BallCalled,77.12524,0.07912,-1.907429,2290.646465,5.808623,6:15,...,2.06986,4.93139,-14.85812,-1.6835,-11.419241,-2.205435,74.57065,0,Breaking,25.0
21,"Bovair, Connor",Right,Slider,StrikeCalled,82.84563,1.085199,-2.168553,2343.211542,91.04071,9:00,...,1.89005,5.05848,1.47795,-2.05961,-6.373484,-2.53223,80.41364,0,Breaking,6.0
23,"Bovair, Connor",Right,Slider,BallCalled,81.39566,1.940022,-1.273791,2254.02092,110.082261,9:45,...,2.23915,5.13807,3.48795,-5.75048,-5.370806,-2.290706,79.38901,0,Breaking,7.0


# Creating Lefty OffSpeed Model

In [165]:
# Target and predictors for the left-handed offspeed model.
# Predictors are listed explicitly instead of being derived by dropping
# columns, so a column added to df_Left_OS later (for example 'Whiff Score'
# when the notebook is re-run without a restart) can never slip into the
# feature matrix.
feature_cols = ['RelSpeed', 'Extension', 'InducedVertBreak', 'HorzBreak',
                'VertApprAngle', 'HorzApprAngle']

y = df_Left_OS['SwingAndMiss']
Xi = df_Left_OS[feature_cols]

print(Xi.shape, y.shape)

(6678, 6) (6678,)


In [166]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Standardise the left-handed offspeed predictors. Xi holds predictors only;
# the target lives in y.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(Xi)

# Re-wrap the scaled array with the original column names (fresh 0..n-1
# index), then prepend the constant 'const' column.
X = sm.add_constant(pd.DataFrame(scaled_values, columns=Xi.columns))

In [167]:
from sklearn.ensemble import RandomForestClassifier

# Exploratory forest used only to rank the predictors. It is fitted on every
# row (no train/test split has been made yet). random_state is fixed because
# an unseeded forest gives a different importance ranking on every run.
model = RandomForestClassifier(random_state=42)
model.fit(X, y)

# Impurity-based importance of each column of X, in column order.
importances = model.feature_importances_

# Columns whose importance exceeds the cut-off.
threshold = 0.01
selected_features = X.columns[importances > threshold]

# Rank the columns from most to least important.
feature_importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

print(feature_importances)

            Feature  Importance
5     VertApprAngle    0.183385
1          RelSpeed    0.169942
3  InducedVertBreak    0.166378
6     HorzApprAngle    0.164318
4         HorzBreak    0.158337
2         Extension    0.157639
0             const    0.000000


In [168]:
# Inverse-frequency weight per class: n_samples / (2 * rows in that class).
n_samples = len(y)
class_counts = np.bincount(y)
class_weights = {
    0: n_samples / (2 * class_counts[0]),
    1: n_samples / (2 * class_counts[1]),
}

# One weight per row, looked up from that row's class label.
# NOTE(review): `weights` is not referenced by any later cell visible here.
weights = np.array([class_weights[label] for label in y])

# Give predictors and target the same fresh 0..n-1 index.
X, y = X.reset_index(drop=True), y.reset_index(drop=True)

In [169]:
from collections import Counter

# Hold out 20% of the left-handed offspeed pitches for evaluation.
# NOTE(review): the split is not stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Oversample the minority class with SMOTE, on the training rows only.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Class counts before and after resampling.
for caption, labels in (
    ("Original class distribution:", y_train),
    ("Resampled class distribution:", y_train_resampled),
):
    print(caption, Counter(labels))

# Left-handed offspeed model, fitted on the balanced training set.
rf_model_LOS = RandomForestClassifier(random_state=42)
rf_model_LOS.fit(X_train_resampled, y_train_resampled)

Original class distribution: Counter({0: 4715, 1: 627})
Resampled class distribution: Counter({0: 4715, 1: 4715})


In [170]:
# Predicted probability of the positive class (SwingAndMiss == 1) for the
# held-out rows.
y_probs = rf_model_LOS.predict_proba(X_test)[:, 1]

# Label a pitch as a swing-and-miss when its probability exceeds this cut-off.
threshold = 0.5
y_pred_custom_threshold = (y_probs > threshold).astype(int)

# Hold-out performance at the chosen cut-off.
print("Confusion Matrix:", confusion_matrix(y_test, y_pred_custom_threshold), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred_custom_threshold), sep="\n")

Confusion Matrix:
[[1041  132]
 [ 111   52]]

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.89      0.90      1173
           1       0.28      0.32      0.30       163

    accuracy                           0.82      1336
   macro avg       0.59      0.60      0.60      1336
weighted avg       0.83      0.82      0.82      1336



In [171]:
# Score every left-handed offspeed pitch and store the predicted
# swing-and-miss probability as a 0-100 'Whiff Score' on the left-handed
# offspeed frame.
# NOTE(review): X includes the rows the model was trained on, so scores for
# those rows are in-sample and will look more confident than held-out scores.
probabilities = rf_model_LOS.predict_proba(X)[:, 1]

# Assign onto an explicit copy: df_Left_OS was filtered out of df_Left, and
# adding a column to such a selection triggers pandas' SettingWithCopyWarning.
# The assignment is positional; it assumes X is still the matrix built from
# df_Left_OS, in the same row order.
df_Left_OS = df_Left_OS.copy()
df_Left_OS['Whiff Score'] = probabilities * 100

df_Left_OS.head()

Unnamed: 0,Pitcher,PitcherThrows,AutoPitchType,PitchCall,RelSpeed,VertRelAngle,HorzRelAngle,SpinRate,SpinAxis,Tilt,...,RelSide,Extension,InducedVertBreak,HorzBreak,VertApprAngle,HorzApprAngle,EffectiveVelo,SwingAndMiss,PitchGroup,Whiff Score
174,"Pence, Dalton",Left,Changeup,BallCalled,82.67204,-2.779197,1.859972,2073.735899,140.081427,10:45,...,-1.09023,6.13176,15.16257,-11.52575,-7.59559,-0.219367,81.97693,0,Offspeed,13.0
175,"Pence, Dalton",Left,Changeup,BallCalled,83.58908,-2.396361,1.525543,2083.645532,142.261598,10:45,...,-1.03848,6.14618,17.18288,-12.28327,-6.658301,-0.690558,83.06903,0,Offspeed,20.0
177,"Pence, Dalton",Left,Changeup,BallCalled,83.59194,-2.651072,1.328026,2039.199502,135.901581,10:30,...,-1.10487,6.26664,14.99757,-13.24595,-7.293269,-1.068371,83.17562,0,Offspeed,7.0
186,"Pence, Dalton",Left,Changeup,BallCalled,84.72321,-0.218227,0.742055,1806.39349,110.902706,9:45,...,-1.14759,6.25455,7.64453,-16.0445,-6.181805,-2.159749,83.56681,0,Offspeed,2.0
221,"Pence, Dalton",Left,Changeup,BallCalled,83.76325,-0.907063,0.295212,2097.829605,142.893587,10:45,...,-0.81612,6.16449,17.01395,-11.84335,-5.221145,-1.841975,83.11203,0,Offspeed,4.0


# Creating Righty Offspeed Model

In [172]:
# Target and predictors for the right-handed offspeed model.
# Predictors are listed explicitly instead of being derived by dropping
# columns, so a column added to df_Right_OS later (for example 'Whiff Score'
# when the notebook is re-run without a restart) can never slip into the
# feature matrix.
feature_cols = ['RelSpeed', 'Extension', 'InducedVertBreak', 'HorzBreak',
                'VertApprAngle', 'HorzApprAngle']

y = df_Right_OS['SwingAndMiss']
Xi = df_Right_OS[feature_cols]

print(Xi.shape, y.shape)

(12575, 6) (12575,)


In [173]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Standardise the right-handed offspeed predictors. Xi holds predictors only;
# the target lives in y.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(Xi)

# Re-wrap the scaled array with the original column names (fresh 0..n-1
# index), then prepend the constant 'const' column.
X = sm.add_constant(pd.DataFrame(scaled_values, columns=Xi.columns))

In [174]:
from sklearn.ensemble import RandomForestClassifier

# Exploratory forest used only to rank the predictors. It is fitted on every
# row (no train/test split has been made yet). random_state is fixed because
# an unseeded forest gives a different importance ranking on every run.
model = RandomForestClassifier(random_state=42)
model.fit(X, y)

# Impurity-based importance of each column of X, in column order.
importances = model.feature_importances_

# Columns whose importance exceeds the cut-off.
threshold = 0.01
selected_features = X.columns[importances > threshold]

# Rank the columns from most to least important.
feature_importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

print(feature_importances)

            Feature  Importance
5     VertApprAngle    0.177934
6     HorzApprAngle    0.166732
4         HorzBreak    0.166186
1          RelSpeed    0.163973
3  InducedVertBreak    0.162987
2         Extension    0.162187
0             const    0.000000


In [175]:
# Inverse-frequency weight per class: n_samples / (2 * rows in that class).
n_samples = len(y)
class_counts = np.bincount(y)
class_weights = {
    0: n_samples / (2 * class_counts[0]),
    1: n_samples / (2 * class_counts[1]),
}

# One weight per row, looked up from that row's class label.
# NOTE(review): `weights` is not referenced by any later cell visible here.
weights = np.array([class_weights[label] for label in y])

# Give predictors and target the same fresh 0..n-1 index.
X, y = X.reset_index(drop=True), y.reset_index(drop=True)

In [176]:
from collections import Counter

# Hold out 20% of the right-handed offspeed pitches for evaluation.
# NOTE(review): the split is not stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Oversample the minority class with SMOTE, on the training rows only.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Class counts before and after resampling.
for caption, labels in (
    ("Original class distribution:", y_train),
    ("Resampled class distribution:", y_train_resampled),
):
    print(caption, Counter(labels))

# Right-handed offspeed model, fitted on the balanced training set.
rf_model_ROS = RandomForestClassifier(random_state=42)
rf_model_ROS.fit(X_train_resampled, y_train_resampled)

Original class distribution: Counter({0: 8876, 1: 1184})
Resampled class distribution: Counter({0: 8876, 1: 8876})


In [177]:
# Predicted probability of the positive class (SwingAndMiss == 1) for the
# held-out rows.
y_probs = rf_model_ROS.predict_proba(X_test)[:, 1]

# Label a pitch as a swing-and-miss when its probability exceeds this cut-off.
threshold = 0.4
y_pred_custom_threshold = (y_probs > threshold).astype(int)

# Hold-out performance at the chosen cut-off.
print("Confusion Matrix:", confusion_matrix(y_test, y_pred_custom_threshold), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred_custom_threshold), sep="\n")

Confusion Matrix:
[[1750  467]
 [ 155  143]]

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.79      0.85      2217
           1       0.23      0.48      0.31       298

    accuracy                           0.75      2515
   macro avg       0.58      0.63      0.58      2515
weighted avg       0.84      0.75      0.79      2515



In [178]:
# Score every right-handed offspeed pitch and store the predicted
# swing-and-miss probability as a 0-100 'Whiff Score' on the right-handed
# offspeed frame.
# NOTE(review): X includes the rows the model was trained on, so scores for
# those rows are in-sample and will look more confident than held-out scores.
probabilities = rf_model_ROS.predict_proba(X)[:, 1]

# Assign onto an explicit copy: df_Right_OS was filtered out of df_Right, and
# adding a column to such a selection triggers pandas' SettingWithCopyWarning.
# The assignment is positional; it assumes X is still the matrix built from
# df_Right_OS, in the same row order.
df_Right_OS = df_Right_OS.copy()
df_Right_OS['Whiff Score'] = probabilities * 100

df_Right_OS.head()

Unnamed: 0,Pitcher,PitcherThrows,AutoPitchType,PitchCall,RelSpeed,VertRelAngle,HorzRelAngle,SpinRate,SpinAxis,Tilt,...,RelSide,Extension,InducedVertBreak,HorzBreak,VertApprAngle,HorzApprAngle,EffectiveVelo,SwingAndMiss,PitchGroup,Whiff Score
8,"Bovair, Connor",Right,Changeup,StrikeCalled,82.49102,-1.502017,-3.745831,1404.617967,233.231183,1:45,...,2.03756,5.4212,9.62693,11.08371,-7.432896,-1.777574,80.87667,0,Offspeed,56.0
14,"Bovair, Connor",Right,Changeup,BallCalled,82.62289,-3.105843,-4.371646,1569.284442,240.350336,2:00,...,2.28917,5.32157,9.67075,14.54784,-9.03547,-1.793429,80.65466,0,Offspeed,6.0
15,"Bovair, Connor",Right,Changeup,FoulBall,82.01757,0.644821,-3.550937,1544.888803,241.944493,2:00,...,2.12215,4.99122,9.66603,15.37072,-5.534233,-0.844199,79.53841,0,Offspeed,7.0
16,"Bovair, Connor",Right,Changeup,BallCalled,80.95613,1.320871,-2.569774,1554.342338,243.56185,2:00,...,2.12942,5.21896,8.60772,14.63438,-5.10071,0.021,79.29437,0,Offspeed,7.0
30,"Peterson, Ben",Right,Splitter,StrikeSwinging,88.22605,-1.688718,-2.394854,939.696651,289.655073,3:45,...,1.12489,5.14712,-1.06843,6.18926,-8.554382,-1.297883,86.18322,1,Offspeed,96.0


# Recombine data frames

In [179]:
# Stack the six scored subsets back into a single frame with a fresh
# 0..n-1 index.
scored_subsets = [
    df_Left_FB, df_Right_FB,
    df_Left_BB, df_Right_BB,
    df_Left_OS, df_Right_OS,
]
Whiff_df = pd.concat(scored_subsets, axis=0, ignore_index=True)

In [182]:
# Destination for the combined, scored pitches (absolute, machine-specific path).
file_path = r"C:\Users\joshd\OneDrive - Kansas State University\Adv Marketing Analytics\8. Project Guideline\Whiff_DF.csv"

# index=False leaves the row index out of the CSV.
Whiff_df.to_csv(file_path, index=False)
print(f"DataFrame saved to {file_path}")

DataFrame saved to C:\Users\joshd\OneDrive - Kansas State University\Adv Marketing Analytics\8. Project Guideline\Whiff_DF.csv
