## Introduction
In this assessment we want to use the tools of data science and analysis to estimate the probability that the outcome of each pitch will be a swing. The final deliverable will be a column with the predicted probability of a swing on that pitch. This will help us in identifying pitching talent in years to come, even if they have not pitched in the MLB.

In [1]:
# Importing necessary libraries

# Data manipulation
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import seaborn as sns
import missingno as msno

# Machine Learning Modeling
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import joblib
from sklearn.pipeline import Pipeline

## Read in Data

In [4]:
path = '/Users/williamearley/Documents/Professional/Jobs/Sports/Marlins/'

In [5]:
year1 = pd.read_csv(path + 'year1.csv')
year2 = pd.read_csv(path + 'year2.csv')
year3 = pd.read_csv(path + 'year3.csv')
documentation = pd.read_csv(path + 'documentation.csv')

## Exploring Data

In [39]:
year1.head(15)

Unnamed: 0,season,pitch_id,release_speed,batter,pitcher,description,stand,p_throws,pitch_type,balls,strikes,pfx_x,pfx_z,plate_x,plate_z,sz_top,sz_bot
0,1,2697762.0,93.800003,5782,5738,ball,R,R,SI,1,1,-0.98,0.72,-1.23,4.1,3.7,1.82
1,1,2697773.0,95.599998,5782,5738,foul,R,R,FF,3,2,-0.38,0.96,-0.45,3.1,3.41,1.56
2,1,2697902.0,94.800003,5782,5842,foul,R,R,FF,1,0,-0.62,1.16,0.31,2.4,3.41,1.56
3,1,2697993.0,87.300003,5782,5041,ball,R,R,SL,2,1,0.82,-0.06,1.11,2.21,3.7,1.73
4,1,2697995.0,98.599998,5782,5041,called_strike,R,R,FF,3,1,-0.33,0.95,-0.35,3.39,3.7,1.7
5,1,2697997.0,98.900002,5782,5041,foul,R,R,FF,3,2,-0.42,0.89,-0.75,2.76,3.41,1.56
6,1,2697712.0,95.400002,5782,5738,foul,R,R,FF,0,0,-0.58,0.98,-0.16,2.36,3.41,1.56
7,1,2697720.0,87.400002,5782,5738,ball,R,R,SL,0,1,0.24,0.15,-0.25,3.97,3.61,1.79
8,1,2697768.0,87.199997,5782,5738,called_strike,R,R,SL,2,1,0.16,0.24,-0.54,3.09,3.64,1.7
9,1,2697772.0,86.300003,5782,5738,blocked_ball,R,R,KC,2,2,0.02,-0.39,0.61,-0.19,3.67,1.64


In [7]:
year2.head(5)

Unnamed: 0,season,pitch_id,release_speed,batter,pitcher,description,stand,p_throws,pitch_type,balls,strikes,pfx_x,pfx_z,plate_x,plate_z,sz_top,sz_bot
0,2,3398682.0,93.0,6351,5763,ball,L,R,FF,2,1,-1.08,0.68,-1.04,4.14,3.3,1.48
1,2,3398692.0,87.800003,6351,5763,ball,L,R,SL,3,2,0.8,0.06,2.16,1.38,3.32,1.51
2,2,3398660.0,87.099998,6859,6222,ball,R,R,SL,1,0,0.49,0.44,0.05,3.53,3.23,1.47
3,2,3398685.0,94.400002,6859,6222,called_strike,R,R,FF,3,1,-0.5,1.39,0.7,2.19,3.2,1.33
4,2,3398652.0,88.300003,6411,6222,called_strike,L,R,CH,0,0,-0.98,0.3,0.51,2.15,3.66,1.74


In [8]:
year3.head(5)

Unnamed: 0,season,pitch_id,release_speed,batter,pitcher,stand,p_throws,pitch_type,balls,strikes,pfx_x,pfx_z,plate_x,plate_z,sz_top,sz_bot
0,3,4135978.0,78.800003,5464,6936,L,R,ST,1,1,1.11,0.3,-0.33,0.49,3.58,1.66
1,3,4135989.0,93.699997,5464,6936,L,R,FF,2,2,-1.16,1.36,-1.57,2.49,3.58,1.69
2,3,4135993.0,94.199997,5464,6936,L,R,FF,3,2,-1.24,1.26,-1.31,3.48,3.68,1.69
3,3,4131576.0,91.199997,6446,6727,R,R,FF,0,0,-1.03,1.38,1.02,2.31,3.29,1.58
4,3,4131602.0,84.5,5667,6727,R,R,SL,3,2,0.2,0.12,0.61,1.36,3.41,1.63


In [9]:
documentation

Unnamed: 0,Column,Definition
0,season,Id for Season
1,pitch_id,Unique Id for Pitch
2,release_speed,Pitch velocity reported out-of-hand.
3,batter,Player Id tied to the play event.
4,pitcher,Player Id tied to the play event.
5,description,Description of the resulting pitch.
6,stand,Side of the plate batter is standing.
7,p_throws,Hand pitcher throws with.
8,pitch_type,The type of pitch derived from Statcast.
9,balls,Pre-pitch number of balls in count.


In [10]:
year1.shape

(709852, 17)

In [11]:
year2.shape

(708540, 17)

In [12]:
year3.shape

(717945, 16)

In [13]:
year3.dtypes

season             int64
pitch_id         float64
release_speed    float64
batter             int64
pitcher            int64
stand             object
p_throws          object
pitch_type        object
balls              int64
strikes            int64
pfx_x            float64
pfx_z            float64
plate_x          float64
plate_z          float64
sz_top           float64
sz_bot           float64
dtype: object

In [14]:
# This describe command allows us to see some summary statistics of the numerical categories

year1.describe()

Unnamed: 0,season,pitch_id,release_speed,batter,pitcher,balls,strikes,pfx_x,pfx_z,plate_x,plate_z,sz_top,sz_bot
count,709852.0,708471.0,709485.0,709852.0,709852.0,709852.0,709852.0,706819.0,708792.0,709485.0,709452.0,709485.0,709440.0
mean,1.0,2355107.0,88.848689,5787.098457,5792.682917,0.883648,0.895387,-0.10355,0.65788,0.043603,2.275044,3.391976,1.57385
std,0.0,204915.2,6.050308,521.396565,525.705395,0.968486,0.828389,0.866855,0.747155,0.846102,0.984652,0.164649,0.089665
min,1.0,2000001.0,30.1,5001.0,5003.0,0.0,0.0,-2.56,-2.13,-6.1,-5.07,2.5,0.77
25%,1.0,2177472.0,84.599998,5334.0,5351.0,0.0,0.0,-0.83,0.2,-0.53,1.64,3.3,1.51
50%,1.0,2355156.0,89.900002,5720.0,5728.0,1.0,1.0,-0.17,0.76,0.04,2.28,3.41,1.56
75%,1.0,2532674.0,93.699997,6177.0,6176.0,2.0,2.0,0.59,1.28,0.61,2.92,3.49,1.62
max,1.0,2709852.0,103.400002,7100.0,7100.0,4.0,2.0,2.84,2.58,9.11,9.39,4.47,2.26


In [15]:
# We see that we do have some null values here
year1.isnull().sum()

season              0
pitch_id         1381
release_speed     367
batter              0
pitcher             0
description         0
stand               0
p_throws            0
pitch_type        367
balls               0
strikes             0
pfx_x            3033
pfx_z            1060
plate_x           367
plate_z           400
sz_top            367
sz_bot            412
dtype: int64

## Data Merging

Here we will combine the year1 and year2 data set to make essentially one big train set for our model.

In [16]:
combined = pd.concat([year1, year2])

In [17]:
combined.head(5)

Unnamed: 0,season,pitch_id,release_speed,batter,pitcher,description,stand,p_throws,pitch_type,balls,strikes,pfx_x,pfx_z,plate_x,plate_z,sz_top,sz_bot
0,1,2697762.0,93.800003,5782,5738,ball,R,R,SI,1,1,-0.98,0.72,-1.23,4.1,3.7,1.82
1,1,2697773.0,95.599998,5782,5738,foul,R,R,FF,3,2,-0.38,0.96,-0.45,3.1,3.41,1.56
2,1,2697902.0,94.800003,5782,5842,foul,R,R,FF,1,0,-0.62,1.16,0.31,2.4,3.41,1.56
3,1,2697993.0,87.300003,5782,5041,ball,R,R,SL,2,1,0.82,-0.06,1.11,2.21,3.7,1.73
4,1,2697995.0,98.599998,5782,5041,called_strike,R,R,FF,3,1,-0.33,0.95,-0.35,3.39,3.7,1.7


In [18]:
combined.shape

(1418392, 17)

## Feature Engineering

We have a good setup for machine learning here. We have mostly numeric features and columns match across years. What we need to do is a bit of feature engineering to get the dataframes in a good place to be able to work with machine learning models.

To begin, let's fill in the NaN values with averages from their rows, and use dummie variables to replace the object variables.

In [19]:
combined.isnull().sum()

season              0
pitch_id         1611
release_speed     779
batter              0
pitcher             0
description         0
stand               0
p_throws            0
pitch_type        740
balls               0
strikes             0
pfx_x            3460
pfx_z            1477
plate_x           779
plate_z           812
sz_top            779
sz_bot            824
dtype: int64

In [20]:
year3.isnull().sum()

season              0
pitch_id         1076
release_speed     270
batter              0
pitcher             0
stand               0
p_throws            0
pitch_type        269
balls               0
strikes             0
pfx_x            2835
pfx_z             769
plate_x          5726
plate_z           296
sz_top           2539
sz_bot            315
dtype: int64

In [21]:
# Filling in columns with NaN values with the mean from that column.
# pitch_type has some NaN values but we will ignore them for now as that is an object variable type

columns_to_fill = ['pitch_id', 'release_speed', 'pfx_x', 'pfx_z', 'plate_x', 'plate_z', 'sz_top', 'sz_bot']
combined_filled = combined.copy()  # Make a copy of the original DataFrame
combined_filled[columns_to_fill] = combined_filled[columns_to_fill].fillna(combined[columns_to_fill].mean())

In [22]:
combined_filled.head(5)

Unnamed: 0,season,pitch_id,release_speed,batter,pitcher,description,stand,p_throws,pitch_type,balls,strikes,pfx_x,pfx_z,plate_x,plate_z,sz_top,sz_bot
0,1,2697762.0,93.800003,5782,5738,ball,R,R,SI,1,1,-0.98,0.72,-1.23,4.1,3.7,1.82
1,1,2697773.0,95.599998,5782,5738,foul,R,R,FF,3,2,-0.38,0.96,-0.45,3.1,3.41,1.56
2,1,2697902.0,94.800003,5782,5842,foul,R,R,FF,1,0,-0.62,1.16,0.31,2.4,3.41,1.56
3,1,2697993.0,87.300003,5782,5041,ball,R,R,SL,2,1,0.82,-0.06,1.11,2.21,3.7,1.73
4,1,2697995.0,98.599998,5782,5041,called_strike,R,R,FF,3,1,-0.33,0.95,-0.35,3.39,3.7,1.7


In [23]:
combined_filled.isnull().sum()

season             0
pitch_id           0
release_speed      0
batter             0
pitcher            0
description        0
stand              0
p_throws           0
pitch_type       740
balls              0
strikes            0
pfx_x              0
pfx_z              0
plate_x            0
plate_z            0
sz_top             0
sz_bot             0
dtype: int64

In [24]:
# Filling in columns with NaN values with the mean from that column for year1.
# pitch_type has some NaN values but we will ignore them for now as that is an object variable type

year1_filled = year1.copy()  # Make a copy of the original DataFrame
year1_filled[columns_to_fill] = year1_filled[columns_to_fill].fillna(year1[columns_to_fill].mean())

In [25]:
year1_filled.isnull().sum()

season             0
pitch_id           0
release_speed      0
batter             0
pitcher            0
description        0
stand              0
p_throws           0
pitch_type       367
balls              0
strikes            0
pfx_x              0
pfx_z              0
plate_x            0
plate_z            0
sz_top             0
sz_bot             0
dtype: int64

### Label Encoding
When we encode features, it allows us to more easily utilize them in our machine learning models. Here we will encode the object datatypes.

In [26]:
combined_filled.dtypes

season             int64
pitch_id         float64
release_speed    float64
batter             int64
pitcher            int64
description       object
stand             object
p_throws          object
pitch_type        object
balls              int64
strikes            int64
pfx_x            float64
pfx_z            float64
plate_x          float64
plate_z          float64
sz_top           float64
sz_bot           float64
dtype: object

In [27]:
combined_filled['description'].unique()

array(['ball', 'foul', 'called_strike', 'blocked_ball', 'hit_into_play',
       'hit_by_pitch', 'swinging_strike', 'foul_tip', 'foul_bunt',
       'swinging_strike_blocked', 'missed_bunt', 'pitchout',
       'bunt_foul_tip', 'foul_pitchout'], dtype=object)

In [28]:
# We need to encode the object categories for modeling

from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Encode each necessary column in combined_filled
for column in ['stand', 'p_throws', 'pitch_type']:
    combined_filled[column] = label_encoder.fit_transform(combined_filled[column])

In [29]:
# Encode each necessary column in year3
for column in ['stand', 'p_throws', 'pitch_type']:
    year3[column] = label_encoder.fit_transform(year3[column])

### Creating target column

In [30]:
# Define a function to map swing outcomes to 1 and non-swing outcomes to 0
def map_swing(description):
    swing_outcomes = ['swinging_strike', 'hit_into_play', 'foul', 'foul_tip', 'foul_bunt',
                      'swinging_strike_blocked', 'missed_bunt', 'bunt_foul_tip', 'foul_pitchout']
    if description in swing_outcomes:
        return 1
    else:
        return 0

In [31]:
# Apply the function to create a new column called 'Swing'
combined_filled['Swing'] = combined_filled['description'].apply(map_swing)

In [32]:
# drop description column
combined_filled = combined_filled.drop(columns='description')

# Modeling
This is an interesting problem as we want to assign a probability based on a binary classification task. One of my favorite machine learning algorithms, XGBoost (eXtreme Gradient Boosting) fits well here. XGBoost uses gradient boosting framework and performs well in scenarios like we have here. Its efficiency and scalability will suit well for predicting the probability of a swing and a miss.

In [33]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

# Separate features (X) and target variable (y)
X = combined_filled.drop(columns=['Swing'])  # these are the features we will feed into our model
y = combined_filled['Swing']  # This is our guide, our target variable about what we would like to predict.

# Split the data into training and holdout set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize XGBoost classifier
xgb_model = XGBClassifier(objective='binary:logistic', random_state=42)

# Train the model
xgb_model.fit(X_train, y_train)

# Make predictions on the test set
probabilities = xgb_model.predict_proba(X_test)[:, 1]  # Probability of swing and miss.

### Hyperparameter tuning

Going to be implementing a randomized search to fine-tune our XGBoost model. 

In [34]:
xgb_param_dist = {
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [100, 200, 400],
    'max_depth': [4, 6, 8]
}

# Instance of model
model_xgb = xgb.XGBClassifier(objective='binary:logistic', random_state=42)

# Perform search
random_search = RandomizedSearchCV(
    model_xgb,
    param_distributions = xgb_param_dist,
    n_iter=10,
    cv=10,
    scoring='neg_mean_squared_error',
    random_state=42
)

# Fit the model
random_search.fit(X_train, y_train)

# Get the best params
best_xgb_params = random_search.best_params_

# Print out
print("Best XGBoost parameters:", best_xgb_params)

# Get best estimator
best_xgb_model = random_search.best_estimator_

# Print out
best_xgb_model

Best XGBoost parameters: {'n_estimators': 400, 'max_depth': 8, 'learning_rate': 0.1}


In [35]:
# Make a prediction with the best model
forecast_xgb_best = best_xgb_model.predict(X_test)

In [36]:
# Add predicted probabilities to the test set
X_test['SwingProbability'] = forecast_xgb_best

In [37]:
# Here we want to assess the accuracy of our model using mean squared error.

from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, forecast_xgb_best)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.08798677378304351


The incredibly low MSE shows that our model is very accurate. This is a great sign and we will continue to move forward. If I had a bit more time I would try some other models and compare their MSE. Logistic Regression, Random Forests, Support Vector Machines, and even Neural Networks could be good candidates here. I also would implement hyperparameter tuning with my XGBoost mode to achieve the highest accuracy possible. 

In [38]:
year3 = year3.drop(columns='SwingProbability')

KeyError: "['SwingProbability'] not found in axis"

In [None]:
# Now we use our model to predict on year3
probabilities_year3 = best_xgb_model.predict_proba(year3)[:,1]  # Probability of swing

In [None]:
# Add predicted probabilities to the year3
year3['SwingProbability'] = probabilities_year3

In [None]:
year3.head(4)

In [None]:
# Exporting to csv
year3.to_csv('validation.csv', index=True)

# Question 3

In [None]:
# Define criteria for middle-middle pitches
plate_x_center = 0  # Center of the strike zone on the x-axis
plate_z_center = (3.58 + 1.66) / 2  # Center of the strike zone on the z-axis

# Filter the DataFrame for middle-middle pitches
middle_middle_pitches = combined_filled[(combined_filled['plate_x'].between(plate_x_center - 0.5, plate_x_center + 0.5)) & 
                           (combined_filled['plate_z'].between(plate_z_center - 0.5, plate_z_center + 0.5))]

In [None]:
middle_middle_pitches.head(5)

In [None]:
# Fitting model to middle middle pitches mm
# Separate features (X) and target variable (y)
X = middle_middle_pitches.drop(columns=['Swing'])  # these are the features we will feed into our model
y = middle_middle_pitches['Swing']  # This is our guide, our target variable about what we would like to predict.

# Split the data into training and holdout set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize XGBoost classifier
xgb_model_mm = XGBClassifier(objective='binary:logistic', random_state=42)

# Train the model
xgb_model_mm.fit(X_train, y_train)

# Make predictions on the test set
probabilities_mm = xgb_model_mm.predict_proba(X_test)[:, 1]  # Probability of swing and miss.

In [None]:
#Hyperparams for mm
xgb_param_dist = {
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [100, 200, 400],
    'max_depth': [4, 6, 8]
}

# Instance of model
model_xgb_mm = xgb.XGBClassifier(objective='reg:squarederror')

# Perform search
random_search = RandomizedSearchCV(
    model_xgb_mm,
    param_distributions = xgb_param_dist,
    n_iter=10,
    cv=10,
    scoring='neg_mean_squared_error',
    random_state=42
)

# Fit the model
random_search.fit(X_train, y_train)

# Get the best params
best_xgb_params_mm = random_search.best_params_

# Print out
print("Best XGBoost parameters for middle-middle pitches:", best_xgb_params_mm)

# Get best estimator
best_xgb_model_mm = random_search.best_estimator_

# Print out
best_xgb_model_mm

In [None]:
# Extract feature importance scores from the trained XGBoost model
feature_importance = best_xgb_model_mm.feature_importances_

# Match feature importance scores with corresponding column names
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance})

# Sort the feature importance scores in descending order
feature_importance_df_sorted = feature_importance_df.sort_values(by='Importance', ascending=False)

# Print or visualize the sorted feature importance scores
print(feature_importance_df_sorted)

# Optionally, you can visualize the feature importance scores using a bar plot
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df_sorted, orient='h')
plt.title('Feature Importance for Swing Probability in Middle-Middle Pitches')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()


# Question 4

In [None]:
only_2 = combined_filled[combined_filled['season'] == 2]

In [None]:
# Separate features (X) and target variable (y)
X_2= only_2.drop(columns=['Swing'])  # these are the features we will feed into our model
y_2 = only_2['Swing']  # This is our guide, our target variable about what we would like to predict.

In [None]:
# Let's focus just on season 2 for this problem
# Let's begin by using our model to make predictions just for year2
forecast_xgb_best_2 = best_xgb_model.predict_proba(X_2)[:, 1]

In [None]:
# Add predicted probabilities to the test set
X_2['SwingProbability'] = forecast_xgb_best_2

In [None]:
X_2

In [None]:
# Calculate Player Swing Probability (PSP)
player_swing_prob = X_2.groupby('batter')['SwingProbability'].mean().reset_index()
player_swing_prob.rename(columns={'SwingProbability': 'PSP'}, inplace=True)

# Calculate League Average Swing Probability (LASP)
league_avg_swing_prob = X_2['SwingProbability'].mean()

In [None]:
# Calculate Swing Performance Index (SPI)
player_swing_prob['SPI'] = (player_swing_prob['PSP'] - league_avg_swing_prob) / league_avg_swing_prob

# Sort players based on SPI values
player_swing_prob_sorted = player_swing_prob.sort_values(by='SPI', ascending=False)

In [None]:
# Top 10 players
player_swing_prob_sorted.head(10)

In [None]:
# Bottom 10 players
player_swing_prob_sorted.tail(10)