In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
import xgboost as xgb

Unnamed: 0.1,Unnamed: 0,gameid,GameDate,Date,Time,ab,pitchnum,inning,teambat,balls,...,horzbreak,inducedvertbreak,platelocside,platelocheight,hometeam_id,Home,awayteam_id,Visitor,venue_id,venue_name
0,97300,2021/04/01/arimlb-sdnmlb-1,2021-04-01 13:10:00,2021-04-01,13:10:00,2,4,1.0,0,1.0,...,10.892494,4.379483,-0.529703,0.398841,135,San Diego Padres,109,Arizona Diamondbacks,2680,Petco Park
3,97293,2021/04/01/arimlb-sdnmlb-1,2021-04-01 13:10:00,2021-04-01,13:10:00,4,3,1.0,0,0.0,...,-8.808213,-16.031368,-0.488700,1.254835,135,San Diego Padres,109,Arizona Diamondbacks,2680,Petco Park
6,97292,2021/04/01/arimlb-sdnmlb-1,2021-04-01 13:10:00,2021-04-01,13:10:00,4,4,1.0,0,1.0,...,9.534775,19.782555,0.058789,1.842813,135,San Diego Padres,109,Arizona Diamondbacks,2680,Petco Park
9,97289,2021/04/01/arimlb-sdnmlb-1,2021-04-01 13:10:00,2021-04-01,13:10:00,5,3,1.0,0,0.0,...,9.347733,18.952282,-0.094753,1.162178,135,San Diego Padres,109,Arizona Diamondbacks,2680,Petco Park
12,97288,2021/04/01/arimlb-sdnmlb-1,2021-04-01 13:10:00,2021-04-01,13:10:00,5,4,1.0,0,0.0,...,-7.859403,-10.739815,-1.059788,1.983340,135,San Diego Padres,109,Arizona Diamondbacks,2680,Petco Park
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
244029,537828,2021/10/03/tbamlb-nyamlb-1,2021-10-03 19:05:00,2021-10-03,19:05:00,55,3,8.0,0,0.0,...,15.862026,6.957945,0.715664,1.011082,147,New York Yankees,139,Tampa Bay Rays,3313,Yankee Stadium
244032,537827,2021/10/03/tbamlb-nyamlb-1,2021-10-03 19:05:00,2021-10-03,19:05:00,55,4,8.0,0,1.0,...,16.558018,9.060814,-0.145601,0.500451,147,New York Yankees,139,Tampa Bay Rays,3313,Yankee Stadium
244035,696378,2021/10/03/tbamlb-nyamlb-1,2021-10-03 19:05:00,2021-10-03,19:05:00,58,3,8.0,1,0.0,...,-6.043284,-9.898354,1.894779,-0.808150,147,New York Yankees,139,Tampa Bay Rays,3313,Yankee Stadium
244038,697016,2021/10/03/tbamlb-nyamlb-1,2021-10-03 19:05:00,2021-10-03,19:05:00,58,4,8.0,1,1.0,...,-8.813749,-15.589659,1.358258,-0.218744,147,New York Yankees,139,Tampa Bay Rays,3313,Yankee Stadium


In [14]:
# Load the dataset and drop repeated rows, using the 'Unnamed: 0' column
# as the de-duplication key.
df = pd.read_csv('data/filtered_df.csv')
df = df.drop_duplicates(subset='Unnamed: 0')

# Keep rows with strikes == 2, balls <= 1 and pitcherthrows == 'R'.
# .copy() makes filtered_df an independent frame, so adding the target column
# below does not write into a slice of `df` (avoids SettingWithCopyWarning).
row_mask = (df['strikes'] == 2) & (df['balls'] <= 1) & (df['pitcherthrows'] == 'R')
filtered_df = df.loc[row_mask].copy()

# Binary target: 1 when eventtype is exactly 'strikeout', otherwise 0
# (missing eventtype values compare unequal and therefore map to 0).
filtered_df['strikeout'] = filtered_df['eventtype'].eq('strikeout').astype(int)

# Feature selection and target definition
features = ['spinrate', 'platelocheight', 'relspeed']
target = 'strikeout'

X = filtered_df[features]
y = filtered_df[target]

# 80/20 train/test split; stratify=y keeps the class proportions of y in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Oversample with SMOTE on the training split only; the test split is left
# untouched so evaluation happens on the original class balance.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Train the Random Forest on the resampled training data
rf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
rf.fit(X_train_smote, y_train_smote)

# Predictions and evaluation on the held-out split
y_pred = rf.predict(X_test)
print("Classification Report for Random Forest (SMOTE):")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

Classification Report for Random Forest (SMOTE):
              precision    recall  f1-score   support

           0       0.86      0.46      0.60     36224
           1       0.23      0.69      0.35      8600

    accuracy                           0.50     44824
   macro avg       0.55      0.57      0.47     44824
weighted avg       0.74      0.50      0.55     44824

Accuracy: 0.5022755666607175


In [11]:
# XGBoost trained on the original (non-resampled) training split, with
# scale_pos_weight set to the ratio of class-0 rows to class-1 rows.
# Depends on X_train, y_train, X_test and y_test from the train/test split cell.
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())
strikeout_ratio = n_negative / n_positive  # count of class 0 per class 1

gbm = xgb.XGBClassifier(
    n_estimators=500,
    max_depth=10,
    learning_rate=0.1,
    random_state=42,
    scale_pos_weight=strikeout_ratio,
)
gbm.fit(X_train, y_train)

# Score the held-out split and report the same metrics as the other model
y_pred_gbm = gbm.predict(X_test)
print("Classification Report for XGBoost (scale_pos_weight):")
print(classification_report(y_test, y_pred_gbm))
print("Accuracy:", accuracy_score(y_test, y_pred_gbm))

Classification Report for XGBoost (scale_pos_weight):
              precision    recall  f1-score   support

           0       0.82      0.82      0.82     13046
           1       0.26      0.26      0.26      3224

    accuracy                           0.71     16270
   macro avg       0.54      0.54      0.54     16270
weighted avg       0.71      0.71      0.71     16270

Accuracy: 0.7052243392747388
