# **Distance Predictor Part 3**
Author: Declan Costello

Date: 8/19/2023

## **Part 3 Description**

Here I Create pipelines with hyperparameter tuning Imputation, Scalling, One Hot encoding, and then use grid search for hyper parameter tuning utilizing the new features created in part 3

## **Table of Context**

1. [Installation](#Installation)
2. [Model Comparisons](#gridsearch-for-best-model)
3. [Results](#Results)

# **Installation**

The following installs the necessary packages

In [2]:
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import  StandardScaler, PolynomialFeatures

In [3]:
from sklearn import linear_model
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression, Ridge, LinearRegression, ElasticNet

In [10]:
from bokeh.io import output_notebook, show
from bokeh.palettes import Spectral11
from bokeh.plotting import figure
from bokeh.layouts import row
output_notebook()

# **Data Import and Train Test Split**

In [5]:
data = pd.read_csv('FE_data.csv')

data.pop('Unnamed: 0')
data.pop('hc_x')
data.pop('hc_y')
data.pop('events')#...................
data.pop('woba_value')
data.pop('hit_distance_sc_percentile')
data.pop('launch_speed_percentile')
data.pop('release_speed_percentile')
data.pop('launch_angle_binned')
data.pop('pull_percent_binned')
data.pop('Pop_percentile')
data.pop('pitch_type')

feature_cols = ['launch_angle','launch_speed',"release_speed","fav_platoon_split_for_batter","grouped_pitch_type","domed","game_elevation","is_barrel","Pop","pull_percent", "spray_angle"]
X = data.loc[:, feature_cols]

target_cols = ['hit_distance_sc']
y = data.loc[:, target_cols]

X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,random_state=0)

# **Using GridSearch to Find Best Model**

# **Possible Models**

In [6]:
#LogisticRegression
pipe_lr = Pipeline([('scl', StandardScaler()),
                    ('LR', LogisticRegression(random_state=42))])

#Ridge
pipe_ridge = Pipeline([('scl', StandardScaler()),
                       ('PR', Ridge(random_state=42))])

#RandomForestRegressor
pipe_rf = Pipeline([('scl', StandardScaler()),
                    ('RF',RandomForestRegressor(random_state=42))])

#KNeighborsRegressor
pipe_knn = Pipeline([('scl', StandardScaler()),
                    ('KNN', KNeighborsRegressor())])

#XGBRegressor
pipe_xgb = Pipeline([('scl', StandardScaler()),
                     ('XGB', XGBRegressor(random_state=42))])

#PolynomialRegression
def PolynomialRegression(degree=2, **kwargs):
    return make_pipeline(PolynomialFeatures(degree),
                         LinearRegression(**kwargs))

#ElasticNet
pipe_en = Pipeline([('scl', StandardScaler()),
                     ('EN', ElasticNet(random_state=42))])

#PLSRegression
pipe_pls = Pipeline([('scl', StandardScaler()),
                     ('PLS', PLSRegression())])

#Lasso
pipe_lasso = Pipeline([('scl', StandardScaler()),
                     ('LAS', linear_model.Lasso(random_state=42))])

#MLPRegressor
pipe_MLPR = Pipeline([('scl', StandardScaler()),
                      ('MLPR', MLPRegressor())])

#GradientBoostingRegressor
pipe_GBR = Pipeline([('scl', StandardScaler()),
                     ('GBR', GradientBoostingRegressor(random_state=42))])



# **Param Grids**

In [7]:
param_range = [1, 2, 3, 4, 5, 6,10]
param_range_fl = [1.0, 0.5, 0.1]
n_estimators = [5,10,15]
learning_rates = [.1,.2,.3]

#LogisticRegression
lr_param_grid = [{'LR__penalty': ['l1', 'l2']}]

#Ridge
ridge_param_grid = [{'PR__alpha':param_range,
                     'PR__fit_intercept':[True,False],
                     'PR__copy_X':[True,False],
                     'PR__solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']}]

#RandomForestRegressor
rf_param_grid = [{'RF__max_depth': param_range,
                  'RF__n_estimators': param_range}]

#KNeighborsRegressor
knn_param_grid = [{'KNN__n_neighbors': param_range,
                   'KNN__weights': ['uniform', 'distance'],
                   'KNN__metric': ['euclidean', 'manhattan']}]

#XGBRegressor
xgb_param_grid = [{'XGB__learning_rate': learning_rates,
                    'XGB__max_depth': param_range,
                    'XGB__min_child_weight': param_range[:2],
                    'XGB__subsample': param_range_fl,
                    'XGB__n_estimators': n_estimators}]

#PolynomialRegression
pyr_param_grid = {'polynomialfeatures__degree': param_range[:4],
              'linearregression__fit_intercept': [True, False]}

#ElasticNet
en_param_grid = [{'EN__alpha': param_range_fl,
                    'EN__l1_ratio': param_range_fl,
                    'EN__fit_intercept':[True,False],
                    'EN__precompute':[True,False],
                    'EN__copy_X':[True,False],
                    'EN__warm_start': [True,False],
                    'EN__selection': ['cyclic', 'random'],}]

#PLSRegression
pls_param_grid = [{'PLS__n_components': param_range,
                    'PLS__scale': [True,False],}]

#Lasso
lasso_param_grid = [{'LAS__alpha': param_range_fl,
                     'LAS__fit_intercept':[True, False],
                     'LAS__precompute':[True, False],
                     'LAS__copy_X':[True, False],
                     'LAS__warm_start':[True, False],
                     'LAS__positive': [True,False]}]


# #MLPRegressor
# pipe_MLPR = Pipeline([('scl', StandardScaler()),
#                       ('MLPR', MLPRegressor())])

#MLPRegressor
MLPR_param_grid = [{'MLPR__alpha': [0.0001,0.0002,0.0003],
                    'MLPR__max_iter': [150,200,250],
                    'MLPR__shuffle': [False, True],
                    'MLPR__verbose': [True, False],
                    'MLPR__momentum': [0.7,0.9,1.1],
                    'MLPR__nesterovs_momentum': [False,True],
                    'MLPR__early_stopping': [True],
                    'MLPR__warm_start': [True,False]}]

#GradientBoostingRegressor
GBR_param_grid = [{'GBR__max_depth': param_range,
                   'GBR__learning_rate': learning_rates,
                   'GBR__n_estimators': param_range,
                   'GBR__loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
                   'GBR__criterion': ['friedman_mse', 'squared_error'],
                   'GBR__min_samples_split': [1,2,3],
                   'GBR__min_samples_leaf': [1,2,3],
                   'GBR__max_features': ['auto', 'sqrt', 'log2']}]


# **Grid Search**

In [8]:
#LogisticRegression
lr_grid_search = GridSearchCV(estimator=pipe_lr,
        param_grid=lr_param_grid,
        scoring='neg_mean_absolute_error',
        cv=3)

#Ridge
ridge_grid_search = GridSearchCV(estimator=pipe_ridge,
        param_grid=ridge_param_grid,
        scoring='neg_mean_squared_error',
        cv=3)

#RandomForestRegressor
rf_grid_search = GridSearchCV(estimator=pipe_rf,
        param_grid=rf_param_grid,
        scoring='neg_mean_absolute_error',
        cv=3)

#KNeighborsRegressor
knn_grid_search = GridSearchCV(estimator=pipe_knn,
        param_grid=knn_param_grid,
        scoring='neg_mean_absolute_error',
        cv=3)

#XGBRegressor
xgb_grid_search = GridSearchCV(estimator=pipe_xgb,
        param_grid=xgb_param_grid,
        scoring='neg_mean_absolute_error',
        cv=3)

#PolynomialRegression
pyr_grid_search = GridSearchCV(PolynomialRegression(), 
                               param_grid=pyr_param_grid,
                               scoring='neg_mean_absolute_error', 
                               cv=3)

#ElasticNet
en_grid_search = GridSearchCV(estimator=pipe_en,
        param_grid=en_param_grid,
        scoring='neg_mean_absolute_error',
        cv=3)

#PLSRegression
pls_grid_search = GridSearchCV(estimator=pipe_pls,
        param_grid=pls_param_grid,
        scoring='neg_mean_absolute_error',
        cv=3)

#Lasso
lasso_grid_search = GridSearchCV(estimator=pipe_lasso,
        param_grid=lasso_param_grid,
        scoring='neg_mean_absolute_error',
        cv=3)

#MLPRegressor
MLPR_grid_search = GridSearchCV(estimator=pipe_MLPR,
        param_grid=MLPR_param_grid,
        scoring='neg_mean_absolute_error',
        cv=3)

#GradientBoostingRegressor
GBR_grid_search = GridSearchCV(estimator=pipe_GBR,
        param_grid=GBR_param_grid,
        scoring='neg_mean_absolute_error',
        cv=3)



# **MODEL FITTING**

In [8]:
grids = [lr_grid_search, 
         ridge_grid_search, 
         rf_grid_search, 
         knn_grid_search, 
         xgb_grid_search, 
         pyr_grid_search,
         en_grid_search, 
         pls_grid_search, 
         lasso_grid_search,
         MLPR_grid_search,
         GBR_grid_search]

for pipe in grids:
    pipe.fit(X_train, y_train.values.ravel())

Iteration 1, loss = 21777.51620845
Validation score: -1.019492
Iteration 2, loss = 12959.45386489
Validation score: 0.077823
Iteration 3, loss = 6166.63568507
Validation score: 0.433408
Iteration 4, loss = 4652.79555510
Validation score: 0.501778
Iteration 5, loss = 4202.37394071
Validation score: 0.540798
Iteration 6, loss = 3865.88776579
Validation score: 0.578959
Iteration 7, loss = 3538.90830614
Validation score: 0.616116
Iteration 8, loss = 3232.20591226
Validation score: 0.650586
Iteration 9, loss = 2955.17026299
Validation score: 0.681358
Iteration 10, loss = 2712.61649321
Validation score: 0.708058
Iteration 11, loss = 2504.17638015
Validation score: 0.730921
Iteration 12, loss = 2326.02859987
Validation score: 0.750419
Iteration 13, loss = 2172.28320442
Validation score: 0.767211
Iteration 14, loss = 2036.30333363
Validation score: 0.782084
Iteration 15, loss = 1912.33113723
Validation score: 0.795662
Iteration 16, loss = 1795.97721331
Validation score: 0.808437
Iteration 17, 



Iteration 1, loss = 22206.20676916
Validation score: -1.087817
Iteration 2, loss = 14016.53930448
Validation score: -0.030725
Iteration 3, loss = 6868.94045752
Validation score: 0.409540
Iteration 4, loss = 4786.82275385
Validation score: 0.519784
Iteration 5, loss = 4076.50404486
Validation score: 0.577679
Iteration 6, loss = 3643.65740620
Validation score: 0.617786
Iteration 7, loss = 3318.58234147
Validation score: 0.649612
Iteration 8, loss = 3050.27994077
Validation score: 0.676321
Iteration 9, loss = 2811.08960566
Validation score: 0.701669
Iteration 10, loss = 2588.87711376
Validation score: 0.724555
Iteration 11, loss = 2389.49222855
Validation score: 0.744515
Iteration 12, loss = 2218.07798576
Validation score: 0.761732
Iteration 13, loss = 2072.98763013
Validation score: 0.776339
Iteration 14, loss = 1950.13414201
Validation score: 0.788773
Iteration 15, loss = 1844.96951364
Validation score: 0.799804
Iteration 16, loss = 1753.38366912
Validation score: 0.809415
Iteration 17,

# **The Best Model is XGBoost!**

In [None]:
grid_dict = {#0: 'Logistic Regression', 
            #  1: 'Ridge', 
            #  2: 'Random Forest', 
            #  3: 'K-Nearest Neighbors', 
            #  4: 'XGBoost', 
            #  5: 'Poly', 
            #  6: 'Elastic Net',
            #  7: 'PLS', 
            #  8: 'Lasso',
             0: 'MLPR',
             1: 'GBR',
             2: 'BR'}

results_df = pd.DataFrame(columns=['Model','Test Accuracy','MAE','R2','Test Accuracy'])

for i, model in enumerate(grids):
    preds = model.predict(X_valid)
    results_df.loc[i] = [grid_dict[i],model.score(X_valid,y_valid),mean_absolute_error(y_valid, preds),r2_score(y_valid, preds), model.score(X_valid,y_valid)]

results_df = results_df.sort_values('MAE', ascending=False)

# Set the x_range to the list of categories above
p = figure(x_range=results_df.Model, width=700, height=700, title="MAE")

# Categorical values can also be used as coordinates
p.vbar(x=results_df.Model, top=results_df.MAE, width=0.9,color=Spectral11)

# Set some properties to make the plot look better
p.xgrid.grid_line_color = None
p.y_range.start = 0

p.xaxis.major_label_orientation = 1
p.xaxis.axis_label = 'Models'
p.yaxis.axis_label = 'MAE (Feet)'



# Set the x_range to the list of categories above
z = figure(x_range=results_df.Model, width=700, height=700, title="R2 (we want higher score)")

# Categorical values can also be used as coordinates
z.vbar(x=results_df.Model, top=results_df.R2, width=0.9, color=Spectral11)

# Set some properties to make the plot look better
z.xgrid.grid_line_color = None
z.y_range.start = 0

z.xaxis.major_label_orientation = 1
z.xaxis.axis_label = 'Models'
z.yaxis.axis_label = 'R^2'

show(row(z,p))



# **TODO**

- Use XGBoost in Part 4
