# **Distance Predictor Part 3**
Author: Declan Costello

Date: 8/19/2023

## **Part 3 Description**

Here I create pipelines with imputation, scaling, and one-hot encoding, and then use grid search for hyperparameter tuning, utilizing the new features created in an earlier part.

## **Table of Contents**

1. [Installation](#Installation)
2. [Model Comparisons](#Using-GridSearch-to-Find-Best-Model)
3. [Results](#Results)

# **Installation**

The following cells import the necessary packages.

In [1]:
import pandas as pd
import seaborn as sns
from sklearn import set_config
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import  StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

In [2]:
from sklearn import linear_model
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import LogisticRegression, Ridge, LinearRegression, ElasticNet

In [3]:
from bokeh.io import output_notebook, show
from bokeh.models import ColumnDataSource
from bokeh.models.tools import HoverTool
from bokeh.palettes import Spectral9
from bokeh.plotting import figure
from bokeh.layouts import row
output_notebook()

# **Data Import and Train Test Split**

In [4]:
# Read the modelling table from disk.
data = pd.read_csv('FE_data.csv')

# Columns removed from the working frame; none of them appear in
# feature_cols or target_cols below.
dropped_cols = [
    'Unnamed: 0',
    'hc_x',
    'hc_y',
    'events',
    'woba_value',
    'hit_distance_sc_percentile',
    'launch_speed_percentile',
    'release_speed_percentile',
    'launch_angle_binned',
    'pull_percent_binned',
    'Pop_percentile',
    'pitch_type',
]
for col in dropped_cols:
    # pop() mutates `data` in place and raises KeyError if the column is absent
    data.pop(col)

# Model inputs.
feature_cols = [
    'launch_angle',
    'launch_speed',
    "release_speed",
    "fav_platoon_split_for_batter",
    "grouped_pitch_type",
    "domed",
    "game_elevation",
    "is_barrel",
    "Pop",
    "pull_percent",
    "spray_angle",
]
X = data[feature_cols]

# Regression target, kept as a single-column DataFrame.
target_cols = ['hit_distance_sc']
y = data[target_cols]

# 80/20 split with a fixed seed so the hold-out set is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

# **Using GridSearch to Find Best Model**

# **Possible Models**

In [5]:
#LogisticRegression
# NOTE(review): LogisticRegression is a classifier, whereas the other candidates
# are regressors and the target is hit_distance_sc -- confirm it belongs here.
# max_iter is raised above the default because the fitting output reports
# "TOTAL NO. of ITERATIONS REACHED LIMIT" for this estimator.
pipe_lr = Pipeline([('scl', StandardScaler()),
                    ('LR', LogisticRegression(random_state=42, max_iter=1000))])

#Ridge
# The step name 'PR' is the prefix used by the Ridge param-grid keys.
pipe_ridge = Pipeline([('scl', StandardScaler()),
                       ('PR', Ridge(random_state=42))])

#RandomForestRegressor
pipe_rf = Pipeline([('scl', StandardScaler()),
                    ('RF',RandomForestRegressor(random_state=42))])

#KNeighborsRegressor
# KNeighborsRegressor takes no random_state, so none is passed.
pipe_knn = Pipeline([('scl', StandardScaler()),
                    ('KNN', KNeighborsRegressor())])

#XGBRegressor
pipe_xgb = Pipeline([('scl', StandardScaler()),
                     ('XGB', XGBRegressor(random_state=42))])

#PolynomialRegression
def PolynomialRegression(degree=2, **kwargs):
    """Build a polynomial-regression pipeline.

    Parameters
    ----------
    degree : int, default=2
        Degree handed to ``PolynomialFeatures``.
    **kwargs
        Forwarded unchanged to ``LinearRegression``.

    Returns
    -------
    Pipeline
        Result of ``make_pipeline`` over the feature expansion followed by the
        linear model. Step names are generated by ``make_pipeline``; the
        polynomial param grid addresses them as ``polynomialfeatures`` and
        ``linearregression``.
    """
    feature_expansion = PolynomialFeatures(degree)
    linear_model_step = LinearRegression(**kwargs)
    return make_pipeline(feature_expansion, linear_model_step)

#ElasticNet: standardise the features, then fit an elastic-net model
pipe_en = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('EN', ElasticNet(random_state=42)),
])

#PLSRegression: standardise the features, then fit partial least squares
pipe_pls = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('PLS', PLSRegression()),
])

#Lasso: standardise the features, then fit a lasso model
pipe_lasso = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('LAS', linear_model.Lasso(random_state=42)),
])

# **Param Grids**

In [6]:
# Candidate value lists shared by several of the grids below.
param_range = [1, 2, 3, 4, 5, 6,10]
param_range_fl = [1.0, 0.5, 0.1]
n_estimators = [5,10,15]
learning_rates = [.1,.2,.3]

#LogisticRegression
# The default lbfgs solver does not support the l1 penalty, so the l1 candidate
# is paired with the saga solver; otherwise every l1 fit fails.
lr_param_grid = [{'LR__penalty': ['l2']},
                 {'LR__penalty': ['l1'],
                  'LR__solver': ['saga']}]

#Ridge
# copy_X is omitted: it only controls whether the input array may be
# overwritten and does not change the fitted model.
ridge_param_grid = [{'PR__alpha':param_range,
                     'PR__fit_intercept':[True,False],
                     'PR__solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']}]

#RandomForestRegressor
# NOTE(review): n_estimators reuses param_range (at most 10 trees) rather than
# the n_estimators list defined above -- confirm this is intended.
rf_param_grid = [{'RF__max_depth': param_range,
                  'RF__n_estimators': param_range}]

#KNeighborsRegressor
knn_param_grid = [{'KNN__n_neighbors': param_range,
                   'KNN__weights': ['uniform', 'distance'],
                   'KNN__metric': ['euclidean', 'manhattan']}]

#XGBRegressor
xgb_param_grid = [{'XGB__learning_rate': learning_rates,
                    'XGB__max_depth': param_range,
                    'XGB__min_child_weight': param_range[:2],
                    'XGB__subsample': param_range_fl,
                    'XGB__n_estimators': n_estimators}]

#PolynomialRegression
# Keys use the step names that make_pipeline generates automatically.
pyr_param_grid = {'polynomialfeatures__degree': param_range[:4],
              'linearregression__fit_intercept': [True, False]}

#ElasticNet
# copy_X and warm_start are omitted: grid search fits a fresh clone for every
# candidate, so neither affects the model and they only multiplied the number
# of fits.
en_param_grid = [{'EN__alpha': param_range_fl,
                    'EN__l1_ratio': param_range_fl,
                    'EN__fit_intercept':[True,False],
                    'EN__precompute':[True,False],
                    'EN__selection': ['cyclic', 'random'],}]

#PLSRegression
pls_param_grid = [{'PLS__n_components': param_range,
                    'PLS__scale': [True,False],}]

#Lasso
# copy_X and warm_start are omitted for the same reason as in the ElasticNet grid.
lasso_param_grid = [{'LAS__alpha': param_range_fl,
                     'LAS__fit_intercept':[True, False],
                     'LAS__precompute':[True, False],
                     'LAS__positive': [True,False]}]

# **Grid Search**

In [7]:
# Settings shared by every search so that all candidates are tuned on the same
# metric (negative mean absolute error) with the same 3-fold cross-validation.
search_kwargs = {'scoring': 'neg_mean_absolute_error', 'cv': 3}

#LogisticRegression
lr_grid_search = GridSearchCV(estimator=pipe_lr,
                              param_grid=lr_param_grid,
                              **search_kwargs)

#Ridge
# Tuned on MAE like the other models so the comparison is like-for-like.
ridge_grid_search = GridSearchCV(estimator=pipe_ridge,
                                 param_grid=ridge_param_grid,
                                 **search_kwargs)

#RandomForestRegressor
rf_grid_search = GridSearchCV(estimator=pipe_rf,
                              param_grid=rf_param_grid,
                              **search_kwargs)

#KNeighborsRegressor
knn_grid_search = GridSearchCV(estimator=pipe_knn,
                               param_grid=knn_param_grid,
                               **search_kwargs)

#XGBRegressor
xgb_grid_search = GridSearchCV(estimator=pipe_xgb,
                               param_grid=xgb_param_grid,
                               **search_kwargs)

#PolynomialRegression
# PolynomialRegression() returns a fresh pipeline built with its default degree;
# the grid then overrides the degree per candidate.
pyr_grid_search = GridSearchCV(estimator=PolynomialRegression(),
                               param_grid=pyr_param_grid,
                               **search_kwargs)

#ElasticNet
en_grid_search = GridSearchCV(estimator=pipe_en,
                              param_grid=en_param_grid,
                              **search_kwargs)

#PLSRegression
pls_grid_search = GridSearchCV(estimator=pipe_pls,
                               param_grid=pls_param_grid,
                               **search_kwargs)

#Lasso
lasso_grid_search = GridSearchCV(estimator=pipe_lasso,
                                 param_grid=lasso_param_grid,
                                 **search_kwargs)

# **MODEL FITTING**

In [8]:
# Every search, in the positional order that grid_dict uses to label results.
grids = [
    lr_grid_search,
    ridge_grid_search,
    rf_grid_search,
    knn_grid_search,
    xgb_grid_search,
    pyr_grid_search,
    en_grid_search,
    pls_grid_search,
    lasso_grid_search,
]

# y_train is a single-column frame; flatten it to 1-D once and reuse it.
y_train_flat = y_train.values.ravel()
for search in grids:
    search.fit(X_train, y_train_flat)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

# **The Best Model is XGBoost!**

In [9]:
# Display name for each search, keyed by its position in `grids`.
grid_dict = {0: 'Logistic Regression', 
             1: 'Ridge', 
             2: 'Random Forest', 
             3: 'K-Nearest Neighbors', 
             4: 'XGBoost', 
             5: 'Poly', 
             6: 'Elastic Net',
             7: 'PLS', 
             8: 'Lasso'}

# Collect one record per model and build the frame once, instead of growing it
# row by row. 'Test Accuracy' appears once (it was previously duplicated) and
# the score is computed a single time per model.
records = []
for i, model in enumerate(grids):
    preds = model.predict(X_valid)
    records.append({
        'Model': grid_dict[i],
        # GridSearchCV.score applies the search's own `scoring` metric, so this
        # is that scorer's value on the validation split, not an accuracy.
        'Test Accuracy': model.score(X_valid, y_valid),
        'MAE': mean_absolute_error(y_valid, preds),
        'R2': r2_score(y_valid, preds),
    })

results_df = pd.DataFrame(records, columns=['Model', 'Test Accuracy', 'MAE', 'R2'])

# Worst (highest) MAE first; this order drives the bar order in the charts.
results_df = results_df.sort_values('MAE', ascending=False)

def _metric_bar_chart(frame, metric, title, y_label):
    """Return a vertical bar chart of `metric` for each model in `frame`."""
    # The model names act as the categorical x-range and the bar positions
    chart = figure(x_range=frame.Model, width=700, height=700, title=title)
    chart.vbar(x=frame.Model, top=frame[metric], width=0.9, color=Spectral9)

    # Cosmetics: no vertical grid lines, bars anchored at zero, rotated labels
    chart.xgrid.grid_line_color = None
    chart.y_range.start = 0
    chart.xaxis.major_label_orientation = 1
    chart.xaxis.axis_label = 'Models'
    chart.yaxis.axis_label = y_label
    return chart


# MAE per model
p = _metric_bar_chart(results_df, 'MAE', "MAE (we want lower score)", 'MAE (Feet)')

# R2 per model
z = _metric_bar_chart(results_df, 'R2', "R2 (we want higher score)", 'R^2')

# R2 chart on the left, MAE chart on the right
show(row(z,p))

# **TODO**

- Use XGBoost in Part 4
