In [92]:
#importing libraries
import pandas as pd
import pickle
from pandas import DataFrame, Series
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
import numpy as np
import seaborn as sns
import statsmodels.api as sm
from sklearn import linear_model, naive_bayes
from sklearn.datasets import make_regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn import svm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error,r2_score
import pybaseball as pyb
from pybaseball import statcast
import xgboost as xgb
from xgboost import XGBRegressor, XGBClassifier

In [93]:
# Read the MLB run-value export into a DataFrame.
# low_memory=False has pandas parse the file in one pass instead of in chunks.
# NOTE(review): the path is absolute and specific to one machine.
df = pd.read_csv(
    "/Users/johndavis/Desktop/MLBRunValues.csv",
    low_memory=False,
)

In [94]:
# Give the "Pitch Type" column the identifier-friendly name `pitch_type`
# (renamed in place on df).
df.rename({"Pitch Type": "pitch_type"}, axis="columns", inplace=True)

In [95]:
# Keep only the rows whose pitch_type is "FF" (four-seam fastballs)
# and preview the first five of them.
is_four_seam = df["pitch_type"] == "FF"
dffb = df.loc[is_four_seam]
dffb.head(n=5)


Unnamed: 0,Player Name,Velocity,pitch_type,Spin Rate,Vertical Break,Horizontal Break,Release Height,Side Release,Extension,Description,Run Values
13,"Hearn, Taylor",96.2,FF,2314.0,18.12,7.68,6.23,2.99,6.3,ball,0.063688
14,"Hearn, Taylor",96.3,FF,2377.0,16.44,9.84,6.25,2.86,6.5,foul,-0.03805
18,"Hearn, Taylor",95.0,FF,2368.0,17.04,7.56,6.23,2.8,6.6,foul,-0.03805
21,"Hearn, Taylor",96.5,FF,2324.0,16.2,10.56,6.2,2.85,6.3,called_strike,-0.065093
27,"Hearn, Taylor",97.4,FF,2439.0,16.32,9.48,6.33,2.81,6.5,single,0.467293


In [96]:
# Add "Differential Break": vertical break minus the absolute horizontal break.
# assign() writes the column by name, so re-running this cell overwrites the
# existing column instead of adding a second one with the same label.
dffb = dffb.assign(
    **{"Differential Break": dffb["Vertical Break"] - dffb["Horizontal Break"].abs()}
)

# Keep only the columns used from here on, in a fixed order.
dffb = dffb[
    [
        "Player Name",
        "Velocity",
        "Spin Rate",
        "Horizontal Break",
        "Vertical Break",
        "Differential Break",
        "Release Height",
        "Side Release",
        "Extension",
        "Run Values",
    ]
]

In [97]:
# Turn +/-infinity into NaN, discard every row that contains a NaN, then
# renumber the rows. reset_index() is called without drop=True, so the old
# row labels are kept as a new leading "index" column.
dffb = (
    dffb
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
    .reset_index()
)

In [102]:
# Feature matrix (X) and regression target (y)
X = dffb.loc[
    :,
    ["Velocity", "Differential Break", "Release Height", "Side Release", "Extension"],
]
y = dffb.loc[:, "Run Values"]

In [99]:
# Predictor columns, held in their own frame
x_variables = dffb.loc[
    :,
    ["Velocity", "Differential Break", "Release Height", "Side Release", "Extension"],
]

In [100]:
#Check for Multicollinearity
# One row per predictor: its name and its variance inflation factor, computed
# from the raw values of x_variables.
# NOTE(review): x_variables has no constant column, so these are presumably
# uncentered VIFs - confirm that is what is wanted.
vif_data = pd.DataFrame(
    {
        "feature": x_variables.columns,
        "VIF": [
            variance_inflation_factor(x_variables.values, col_idx)
            for col_idx in range(len(x_variables.columns))
        ],
    }
)

In [None]:
# Show the VIF table (last expression of the cell, so the notebook renders it)
vif_data

In [104]:
# Hold out 20% of the rows for testing. random_state=0 fixes the shuffle so
# the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [None]:
# XGBoost regressor with 100 estimators, fitted on the training split
model_XGB = XGBRegressor(
    n_estimators=100,
)
model_XGB.fit(X=X_train, y=y_train)


In [None]:
# Random forest regressor: 100 trees, each limited to depth 10, seeded with 0;
# fitted on the training split
model_RF = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=0,
)
model_RF.fit(X=X_train, y=y_train)


In [None]:
# Single decision-tree regressor (no depth limit is set), seeded with 0;
# fitted on the training split
model_DT = DecisionTreeRegressor(
    random_state=0,
)
model_DT.fit(X=X_train, y=y_train)


In [None]:
# Ordinary least-squares linear regression, fitted on the training split
model_LR = linear_model.LinearRegression()
model_LR.fit(X=X_train, y=y_train)

In [109]:
# Test-set predictions from every fitted model, evaluated in the order
# linear regression, random forest, decision tree, XGBoost
y_pred_LR, y_pred_RF, y_pred_DT, y_pred_XGB = (
    fitted.predict(X_test)
    for fitted in (model_LR, model_RF, model_DT, model_XGB)
)

In [110]:
# Model evaluation: root mean squared error of each model on the test split
print("Root Mean Squared Error")
for label, predicted in (
    ('Linear Regression:', y_pred_LR),
    ('Random Forest:', y_pred_RF),
    ('Decision Tree:', y_pred_DT),
    ('XGBoost:', y_pred_XGB),
):
    # RMSE = square root of the mean squared error
    print(label, np.sqrt(mean_squared_error(y_test, predicted)))


Root Mean Squared Error
Linear Regression: 0.19818852353765376
Random Forest: 0.1983476875373093
Decision Tree: 0.30395237095550504
XGBoost: 0.19944324243978095


In [111]:
# Append one prediction column per model. The predictions are made on the full
# feature matrix X, i.e. on the rows the models were trained on as well as the
# held-out rows.
# assign() places new columns at the end in the order given and overwrites a
# column that already exists, so the cell does not rely on the current column
# count and can be re-run without producing duplicate columns.
dffb = dffb.assign(
    **{
        "Predicted Run Values LR": model_LR.predict(X),
        "Predicted Run Values RF": model_RF.predict(X),
        "Predicted Run Values DT": model_DT.predict(X),
        "Predicted Run Values XGB": model_XGB.predict(X),
    }
)


In [112]:
# Narrow the frame to the pitch attributes, the observed run value and the
# four model predictions; every column not listed here is dropped.
dffb = dffb.loc[
    :,
    [
        "Player Name",
        "Velocity",
        "Spin Rate",
        "Vertical Break",
        "Horizontal Break",
        "Release Height",
        "Side Release",
        "Extension",
        "Run Values",
        "Predicted Run Values LR",
        "Predicted Run Values RF",
        "Predicted Run Values DT",
        "Predicted Run Values XGB",
    ],
]

In [113]:
# Row-wise mean of the four model predictions, stored as
# "Predicted Run Values Mean" immediately after the "Run Values" column.
model_pred_cols = [
    "Predicted Run Values LR",
    "Predicted Run Values RF",
    "Predicted Run Values DT",
    "Predicted Run Values XGB",
]
mean_prediction = dffb[model_pred_cols].mean(axis=1)

# Remove a mean column left by an earlier run of this cell so that re-running
# replaces it rather than adding a duplicate.
dffb = dffb.drop(columns="Predicted Run Values Mean", errors="ignore")

# Position by name instead of by a hard-coded column number.
dffb.insert(
    dffb.columns.get_loc("Run Values") + 1,
    "Predicted Run Values Mean",
    mean_prediction,
)


In [120]:
# Final column order: pitch attributes, observed run value, mean prediction,
# then the individual model predictions
dffb = dffb.loc[
    :,
    [
        "Player Name",
        "Velocity",
        "Spin Rate",
        "Vertical Break",
        "Horizontal Break",
        "Release Height",
        "Side Release",
        "Extension",
        "Run Values",
        "Predicted Run Values Mean",
        "Predicted Run Values LR",
        "Predicted Run Values RF",
        "Predicted Run Values DT",
        "Predicted Run Values XGB",
    ],
]

In [None]:
# League-wide average of the mean predicted run value over all four-seam rows
League_Average_Expected = dffb.loc[:, "Predicted Run Values Mean"].mean()
print(League_Average_Expected)

In [124]:
# Write the frame with its prediction columns to CSV, without the row index.
# NOTE(review): the path is absolute and specific to one machine.
dffb.to_csv(
    path_or_buf="/Users/johndavis/Desktop/MLBFOURSEAMRunValuesPredicted.csv",
    index=False,
)