# **Distance Predictor Part 4**
Author: Declan Costello

Date: 8/19/2023

## **Part 4 Description**

Here I Create pipelines with hyperparameter tuning Imputation, Scalling, One Hot encoding, and then use grid search for hyper parameter tuning utilizing the new features created in part 3

## **Table of Context**

1. [Installation](#Installation)
2. [Best Model ROI](#roi-on-chosen-model-from-grid-search-of-models)

# **Installation**

The following installs the necessary packages

In [1]:
import pandas as pd
import seaborn as sns
from sklearn import set_config
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import  StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

In [2]:
from sklearn import linear_model
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import LogisticRegression, Ridge, LinearRegression, ElasticNet

In [3]:
from bokeh.io import output_notebook, show
from bokeh.models import ColumnDataSource
from bokeh.models.tools import HoverTool
from bokeh.palettes import Spectral9
from bokeh.plotting import figure
from bokeh.layouts import row
output_notebook()

# **Data Import and Train Test Split**

In [4]:
data = pd.read_csv('FE_data.csv')

data.pop('Unnamed: 0')
data.pop('hc_x')
data.pop('hc_y')
data.pop('events')#...................
data.pop('woba_value')
data.pop('hit_distance_sc_percentile')
data.pop('launch_speed_percentile')
data.pop('release_speed_percentile')
data.pop('launch_angle_binned')
data.pop('pull_percent_binned')
data.pop('Pop_percentile')
data.pop('pitch_type')

feature_cols = ['launch_angle','launch_speed',"release_speed","fav_platoon_split_for_batter","grouped_pitch_type","domed","game_elevation","is_barrel","Pop","pull_percent", "spray_angle"]
X = data.loc[:, feature_cols]

target_cols = ['hit_distance_sc']
y = data.loc[:, target_cols]

X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,random_state=0)

# **Search for Best XGBR Params**

In [10]:
param_range = [1, 2, 3, 4, 5, 6,10]
param_range_fl = [1.0, 0.5, 0.1]
n_estimators = [5,10,15]
learning_rates = [.1,.2,.3]

# **Best XGB__learning_rate**

In [11]:
#XGBRegressor
pipe_xgb = Pipeline([('scl', StandardScaler()),
                     ('XGB', XGBRegressor(random_state=42))])

learning_rates = [.1,.2,.3, .4, .5, .6, .7, .8, .9]

#XGBRegressor
xgb_param_grid = [{'XGB__learning_rate': learning_rates}]

#XGBRegressor
xgb_grid_search = GridSearchCV(estimator=pipe_xgb,
        param_grid=xgb_param_grid,
        scoring='neg_mean_absolute_error',
        cv=3)

xgb_grid_search.fit(X_train, y_train.values.ravel())


In [12]:
test = pd.DataFrame(xgb_grid_search.cv_results_)

x= test.param_XGB__learning_rate
y = test.mean_test_score

# Basic plot setup
p = figure(width=700, height=700, title='param_XGB__learning_rate VS Mean Test Score')
p.xaxis.axis_label = 'param_XGB__learning_rate'
p.yaxis.axis_label = 'Mean Test Score'

p.line(x, y, line_width=2, color='Blue')

cr = p.circle(x, y, size=10,
              fill_color="white", hover_fill_color="blue",
              fill_alpha=0.05, hover_alpha=0.8,
              line_color=None, hover_line_color="white")

p.add_tools(HoverTool(tooltips=None, renderers=[cr], mode='hline'))

#plot 2
x= test.param_XGB__learning_rate
y = test.mean_fit_time


z = figure(width=700, height=700, title='param_XGB__learning_rate VS Time')
z.xaxis.axis_label = 'param_XGB__learning_rate'
z.yaxis.axis_label = 'Time (Sec)'

z.line(x, y, line_width=2, color='red')

zr = z.triangle(x, y, size=10,
              fill_color="white", hover_fill_color="firebrick",
              fill_alpha=0.05, hover_alpha=0.8,
              line_color=None, hover_line_color="white")

z.add_tools(HoverTool(tooltips=None, renderers=[zr], mode='hline'))

show(row(p,z))

In [13]:
xgb_grid_search.best_params_

{'XGB__learning_rate': 0.1}

# **Best XGB__max_depth**

In [14]:
#XGBRegressor
pipe_xgb = Pipeline([('scl', StandardScaler()),
                     ('XGB', XGBRegressor(random_state=42))])

param_range = [1, 5, 10, 15, 20]

#XGBRegressor
xgb_param_grid = [{'XGB__max_depth': param_range}]

#XGBRegressor
xgb_grid_search = GridSearchCV(estimator=pipe_xgb,
        param_grid=xgb_param_grid,
        scoring='neg_mean_absolute_error',
        cv=3)

xgb_grid_search.fit(X_train, y_train.values.ravel())

In [15]:
test = pd.DataFrame(xgb_grid_search.cv_results_)

x= test.param_XGB__max_depth
y = test.mean_test_score

# Basic plot setup
p = figure(width=700, height=700, title='param_XGB__max_depth VS Mean Test Score')
p.xaxis.axis_label = 'param_XGB__max_depth'
p.yaxis.axis_label = 'Mean Test Score'

p.line(x, y, line_width=2, color='Blue')

cr = p.circle(x, y, size=10,
              fill_color="white", hover_fill_color="blue",
              fill_alpha=0.05, hover_alpha=0.8,
              line_color=None, hover_line_color="white")

p.add_tools(HoverTool(tooltips=None, renderers=[cr], mode='hline'))

#plot 2
x= test.param_XGB__max_depth
y = test.mean_fit_time


z = figure(width=700, height=700, title='param_XGB__max_depth VS Time')
z.xaxis.axis_label = 'param_XGB__max_depth'
z.yaxis.axis_label = 'Time (Sec)'

z.line(x, y, line_width=2, color='red')

zr = z.triangle(x, y, size=10,
              fill_color="white", hover_fill_color="firebrick",
              fill_alpha=0.05, hover_alpha=0.8,
              line_color=None, hover_line_color="white")

z.add_tools(HoverTool(tooltips=None, renderers=[zr], mode='hline'))

show(row(p,z))

In [16]:
xgb_grid_search.best_params_

{'XGB__max_depth': 5}

# **Best XGB__min_child_weight**

In [17]:
##XGBRegressor
pipe_xgb = Pipeline([('scl', StandardScaler()),
                     ('XGB', XGBRegressor(random_state=42))])

min_child_weights = [1, 2, 3, 4, 5, 6, 7, 8, 9]

#XGBRegressor
xgb_param_grid = [{'XGB__min_child_weight': min_child_weights}]

#XGBRegressor
xgb_grid_search = GridSearchCV(estimator=pipe_xgb,
        param_grid=xgb_param_grid,
        scoring='neg_mean_absolute_error',
        cv=3)

xgb_grid_search.fit(X_train, y_train.values.ravel())

In [18]:
test = pd.DataFrame(xgb_grid_search.cv_results_)

x= test.param_XGB__min_child_weight
y = test.mean_test_score

# Basic plot setup
p = figure(width=700, height=700, title='param_XGB__min_child_weight VS Mean Test Score')
p.xaxis.axis_label = 'param_XGB__min_child_weight'
p.yaxis.axis_label = 'Mean Test Score'

p.line(x, y, line_width=2, color='Blue')

cr = p.circle(x, y, size=10,
              fill_color="white", hover_fill_color="blue",
              fill_alpha=0.05, hover_alpha=0.8,
              line_color=None, hover_line_color="white")

p.add_tools(HoverTool(tooltips=None, renderers=[cr], mode='hline'))

#plot 2
x= test.param_XGB__min_child_weight
y = test.mean_fit_time


z = figure(width=700, height=700, title='param_XGB__min_child_weight VS Time')
z.xaxis.axis_label = 'param_XGB__min_child_weight'
z.yaxis.axis_label = 'Time (Sec)'

z.line(x, y, line_width=2, color='red')

zr = z.triangle(x, y, size=10,
              fill_color="white", hover_fill_color="firebrick",
              fill_alpha=0.05, hover_alpha=0.8,
              line_color=None, hover_line_color="white")

z.add_tools(HoverTool(tooltips=None, renderers=[zr], mode='hline'))

show(row(p,z))

In [19]:
xgb_grid_search.best_params_

{'XGB__min_child_weight': 1}

# **Best XGB__subsample**

In [20]:
#XGBRegressor
pipe_xgb = Pipeline([('scl', StandardScaler()),
                     ('XGB', XGBRegressor(random_state=42))])

param_range_fl = [2.0, 1.5, 1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]

#XGBRegressor
xgb_param_grid = [{'XGB__subsample': param_range_fl}]

#XGBRegressor
xgb_grid_search = GridSearchCV(estimator=pipe_xgb,
        param_grid=xgb_param_grid,
        scoring='neg_mean_absolute_error',
        cv=3)

xgb_grid_search.fit(X_train, y_train.values.ravel())

6 fits failed out of a total of 36.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/pipeline.py", line 420, in fit
    self._final_estimator.

In [21]:
test = pd.DataFrame(xgb_grid_search.cv_results_)

x= test.param_XGB__subsample
y = test.mean_test_score

# Basic plot setup
p = figure(width=700, height=700, title='param_XGB__subsample VS Mean Test Score')
p.xaxis.axis_label = 'param_XGB__subsample'
p.yaxis.axis_label = 'Mean Test Score'

p.line(x, y, line_width=2, color='Blue')

cr = p.circle(x, y, size=10,
              fill_color="white", hover_fill_color="blue",
              fill_alpha=0.05, hover_alpha=0.8,
              line_color=None, hover_line_color="white")

p.add_tools(HoverTool(tooltips=None, renderers=[cr], mode='hline'))

#plot 2
x= test.param_XGB__subsample
y = test.mean_fit_time


z = figure(width=700, height=700, title='param_XGB__subsample VS Time')
z.xaxis.axis_label = 'param_XGB__subsample'
z.yaxis.axis_label = 'Time (Sec)'

z.line(x, y, line_width=2, color='red')

zr = z.triangle(x, y, size=10,
              fill_color="white", hover_fill_color="firebrick",
              fill_alpha=0.05, hover_alpha=0.8,
              line_color=None, hover_line_color="white")

z.add_tools(HoverTool(tooltips=None, renderers=[zr], mode='hline'))

show(row(p,z))

In [22]:
xgb_grid_search.best_params_

{'XGB__subsample': 1.0}

# **Best XGB__n_estimators [Using Random Search](https://medium.com/@senapati.dipak97/grid-search-vs-random-search-d34c92946318)**

In [23]:
param_grid = {
    'n_estimators': [1,5,10,15,20]
}

rnd_search = RandomizedSearchCV(XGBRegressor(), param_grid, scoring='neg_mean_squared_error')

rnd_search.fit(X_train, y_train.values.ravel())



In [24]:
test = pd.DataFrame(rnd_search.cv_results_)

x= test.param_n_estimators
y = test.mean_test_score

# Basic plot setup
p = figure(width=700, height=700, title='param_n_estimators VS Mean Test Score')
p.xaxis.axis_label = 'param_n_estimators'
p.yaxis.axis_label = 'Mean Test Score'

p.line(x, y, line_width=2, color='Blue')

cr = p.circle(x, y, size=10,
              fill_color="white", hover_fill_color="blue",
              fill_alpha=0.05, hover_alpha=0.8,
              line_color=None, hover_line_color="white")

p.add_tools(HoverTool(tooltips=None, renderers=[cr], mode='hline'))

#plot 2
x= test.param_n_estimators
y = test.mean_fit_time


z = figure(width=700, height=700, title='param_n_estimators VS Time')
z.xaxis.axis_label = 'param_n_estimators'
z.yaxis.axis_label = 'Time (Sec)'

z.line(x, y, line_width=2, color='red')

zr = z.triangle(x, y, size=10,
              fill_color="white", hover_fill_color="firebrick",
              fill_alpha=0.05, hover_alpha=0.8,
              line_color=None, hover_line_color="white")

z.add_tools(HoverTool(tooltips=None, renderers=[zr], mode='hline'))

show(row(p,z))

In [25]:
rnd_search.best_params_

{'n_estimators': 20}

# **TODO**

- Use best params for final pipeline in part 5
