# add weights to pipelines

# **Distance Predictor Part 4**
Author: Declan Costello

Date: 8/10/2023

## **Part 4 Description**

Here I create pipelines with imputation, scaling, and one-hot encoding, and then use grid search for hyperparameter tuning, utilizing the new features created in Part 3.

## **Table of Contents**

1. [Installation](#Installation)
2. [Machine Learning](#Machine-Learning)
3. [Grid Search](#Grid-Search)
4. [Random Search](#Random-Search)
5. [Results](#Results)
6. [Future Analysis](#Future-Analysis)

# **Installation**

The following imports the necessary packages.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import set_config
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import  StandardScaler, MinMaxScaler, OneHotEncoder, PolynomialFeatures, Imputer

In [None]:
# Load the feature-engineered dataset that every model in this notebook is
# trained and validated on (presumably the output of Part 3 - confirm).
data = pd.read_csv('feature_engineered_data.csv')

# **Machine Learning**

# **Train Test Split**

In [None]:
# Model inputs for the baseline. "stand" and "p_throws" are left out on
# purpose: they should be included eventually, but need one-hot encoding first.
feature_cols = [
    'launch_angle',
    'launch_speed',
    'spray_angle',
    'Barrel',
    'domed',
    'game_elevation',
    'grouped_pitch_type',
    'fav_platoon_split_for_batter',
    'pull_percent',
]
target_cols = ['hit_distance_sc']

X = data[feature_cols]
y = data[target_cols]

# 80/20 split with a fixed seed so every model sees the same rows.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

**Random Forest Regressor No Scaler**

In [None]:
# Baseline: a random forest fit directly on the unscaled features.
model = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)
preds = model.predict(X_valid)

# Validation metrics; these three names are read again by the results table.
RandomForestRegressor_no_scaler_mean_squared_error = mean_squared_error(y_valid, preds)
RandomForestRegressor_no_scaler_mean_absolute_error = mean_absolute_error(y_valid, preds)
# For a regressor, .score() is the coefficient of determination (R^2).
RandomForestRegressor_no_scaler_model_score = model.score(X_valid, y_valid)

# Last expression of the cell: display the MAE.
RandomForestRegressor_no_scaler_mean_absolute_error

**Random Forest Regressor Standard Scaler**

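StandardScaler standardizes each numeric feature to zero mean and unit variance (it does not confine values to the 0-to-1 range).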

In [None]:
# Render estimators as an HTML diagram when they are displayed.
set_config(display='diagram')

# Columns that are mean-imputed and then standardized.
numeric_features = [
    'launch_angle',
    'launch_speed',
    'spray_angle',
    'Barrel',
    'domed',
    'game_elevation',
    'grouped_pitch_type',
    'fav_platoon_split_for_batter',
    'pull_percent',
]
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# Columns that are one-hot encoded; categories unseen at fit time are ignored
# instead of raising at predict time.
categorical_features = ["stand", "p_throws"]
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# Send each column group through its own transformer.
preprocessor = ColumnTransformer([
    ("num_transform", numeric_transformer, numeric_features),
    ("cat_transform", categorical_transformer, categorical_features),
])

# Preprocessing followed by the model, packaged as a single estimator.
pipeline = Pipeline([
    ("preprocesser", preprocessor),
    ("Random Forest Regressor", RandomForestRegressor()),
])

pipeline

In [None]:
# Rebuild the design matrix, this time including the categorical "stand" and
# "p_throws" columns that the pipeline's one-hot encoder selects by name.
feature_cols = [
    'launch_angle',
    'launch_speed',
    'spray_angle',
    'Barrel',
    'domed',
    'game_elevation',
    'grouped_pitch_type',
    'fav_platoon_split_for_batter',
    'stand',
    'p_throws',
    'pull_percent',
]
target_cols = ['hit_distance_sc']

X, y = data[feature_cols], data[target_cols]

# Same 80/20 split and seed as the baseline.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

pipeline.fit(X_train, y_train)

# Last expression of the cell: display the validation score.
pipeline.score(X_valid, y_valid)

In [None]:
# Validation metrics for the StandardScaler pipeline.
preds = pipeline.predict(X_valid)

# The results table further down reads these three *_standard_scaler_* names,
# so all of them have to be defined here; otherwise that cell raises NameError.
RandomForestRegressor_standard_scaler_mean_squared_error = mean_squared_error(y_valid, preds)
RandomForestRegressor_standard_scaler_mean_absolute_error = mean_absolute_error(y_valid, preds)
RandomForestRegressor_standard_scaler_model_score = pipeline.score(X_valid, y_valid)

# Keep the original name alive as an alias for anything that still uses it.
RandomForestRegressor_mean_absolute_error = RandomForestRegressor_standard_scaler_mean_absolute_error

# Last expression of the cell: display the MAE.
RandomForestRegressor_mean_absolute_error

**Random Forest Regressor MinMax Scaler**

MinMaxScaler rescales each feature to the 0-to-1 range, so the scaled values contain no negative numbers.

In [None]:
# X_train still carries the categorical "stand" / "p_throws" columns from the
# previous split, and MinMaxScaler can only handle numeric input. Route the
# columns through a ColumnTransformer so that only the numeric ones are scaled
# and the categorical ones are one-hot encoded, mirroring the StandardScaler
# pipeline (the scaler is then the only difference between the two).
minmax_preprocessor = ColumnTransformer(transformers=[
    ("num_transform",
     Pipeline(steps=[
         ("imputer", SimpleImputer(strategy="mean")),
         ("scaler", MinMaxScaler()),
     ]),
     numeric_features),
    ("cat_transform", OneHotEncoder(handle_unknown="ignore"), categorical_features),
])

# The model step is a random forest, so it is labelled as one (the previous
# "Linear Regression" label was misleading in the pipeline diagram).
pipeline = Pipeline([
    ("preprocesser", minmax_preprocessor),
    ("Random Forest Regressor", RandomForestRegressor()),
])

pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_valid)

# Validation metrics; these three names are read by the results table.
RandomForestRegressor_minmax_scaler_mean_squared_error = mean_squared_error(y_valid, preds)
RandomForestRegressor_minmax_scaler_mean_absolute_error =  mean_absolute_error(y_valid, preds)
RandomForestRegressor_minmax_scaler_model_score = pipeline.score(X_valid, y_valid)

# **NON Hyper Parameter Results**

In [None]:
# One row per untuned model: [label, MSE, MAE, model score].
metric_rows = [
    [
        'Random Forest Regressor No Scaler',
        RandomForestRegressor_no_scaler_mean_squared_error,
        RandomForestRegressor_no_scaler_mean_absolute_error,
        RandomForestRegressor_no_scaler_model_score,
    ],
    [
        'Random Forest Regressor Standard Scaler',
        RandomForestRegressor_standard_scaler_mean_squared_error,
        RandomForestRegressor_standard_scaler_mean_absolute_error,
        RandomForestRegressor_standard_scaler_model_score,
    ],
    [
        'Random Forest Regressor MinMax Scaler',
        RandomForestRegressor_minmax_scaler_mean_squared_error,
        RandomForestRegressor_minmax_scaler_mean_absolute_error,
        RandomForestRegressor_minmax_scaler_model_score,
    ],
]

# Start from an empty frame and append the rows under integer labels 0, 1, 2.
results_df = pd.DataFrame(columns=['Model','Mean Squared Error','Mean Absolute Error','Model Score'])
for row_label, metric_row in enumerate(metric_rows):
    results_df.loc[row_label] = metric_row

results_df

In [None]:
# Side-by-side bar charts comparing the three untuned models on each metric.
import warnings

metric_columns = ["Mean Squared Error", "Mean Absolute Error", "Model Score"]

# Silence warnings only while drawing the graph; the filter is restored when the
# `with` block exits, so later cells still surface their warnings.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    # One figure with one axes per metric. Drawing on these axes explicitly
    # avoids the stray empty figure and the overlapping 1x4 subplot grid that
    # mixing plt.figure / plt.subplots / plt.subplot(14x) produced.
    fig, axes = plt.subplots(1, len(metric_columns), figsize=(35, 7))

    for ax, metric in zip(axes, metric_columns):
        sns.barplot(data=results_df, x="Model", y=metric, palette='viridis', ax=ax)
        ax.set_title(metric)

    plt.show()

# **HyperParameter Tuning**

# **1st Successful GridSearch**

In [None]:
# Candidate settings for the forest; the search tries every combination
# (3 * 3 * 2 = 18 candidates).
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [2, 4, 6],
    'random_state': [0, 1],
}

# Exhaustive cross-validated search using GridSearchCV's default settings.
grid = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid)

grid.fit(X_train, y_train)

In [None]:
# Display the parameter combination selected by the random-forest grid search.
grid.best_params_


In [None]:
# Predict with the search's refitted best estimator and measure validation MAE.
# NOTE(review): the "lin" in the variable name is historical; at this point
# `grid` wraps a RandomForestRegressor.
preds = grid.predict(X_valid)
grid_lin_mean_absolute_error = mean_absolute_error(y_true=y_valid, y_pred=preds)

# Last expression of the cell: display the MAE.
grid_lin_mean_absolute_error

# **2nd Successful Grid Search**

https://github.com/jakevdp/PythonDataScienceHandbook/blob/master/notebooks_v1/05.03-Hyperparameters-and-Model-Validation.ipynb

same thing:

https://jakevdp.github.io/PythonDataScienceHandbook/05.03-hyperparameters-and-model-validation.html

In [None]:
# Back to the numeric-only feature set for the polynomial regression search.
# "stand" and "p_throws" should be included eventually, but only after one-hot
# encoding them.
target_cols = ['hit_distance_sc']
feature_cols = [
    'launch_angle', 'launch_speed', 'spray_angle',
    'Barrel', 'domed', 'game_elevation',
    'grouped_pitch_type', 'fav_platoon_split_for_batter', 'pull_percent',
]

X = data[feature_cols]
y = data[target_cols]

# Same 80/20 split and seed as the earlier cells.
split_options = dict(train_size=0.8, test_size=0.2, random_state=0)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)

In [None]:
def PolynomialRegression(degree=2, **kwargs):
    """Build a polynomial-regression pipeline.

    The pipeline expands the inputs with ``PolynomialFeatures(degree)`` and then
    fits a ``LinearRegression``. No imputation step is included.

    Parameters
    ----------
    degree : int, default 2
        Degree passed to ``PolynomialFeatures``.
    **kwargs
        Forwarded unchanged to ``LinearRegression``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Built by ``make_pipeline``, so the steps get auto-generated names
        derived from the class names ("polynomialfeatures" and
        "linearregression").
    """
    expansion = PolynomialFeatures(degree)
    regressor = LinearRegression(**kwargs)
    return make_pipeline(expansion, regressor)

In [None]:
# Search polynomial degrees 0..4, each with and without an intercept. Keys are
# addressed as "<step name>__<parameter>" using make_pipeline's step names.
param_grid = {
    'polynomialfeatures__degree': np.arange(5),
    'linearregression__fit_intercept': [True, False],
}

# 7-fold cross-validated exhaustive search over the grid.
grid = GridSearchCV(estimator=PolynomialRegression(), param_grid=param_grid, cv=7)

In [None]:
# Run the polynomial-regression grid search on the training split.
grid.fit(X_train, y_train)

In [None]:
# Display the degree / intercept combination selected by the search.
grid.best_params_


In [None]:
# Validation MAE of the best polynomial-regression model found by the search.
# This rebinds the name used by the earlier random-forest search.
preds = grid.predict(X_valid)
grid_lin_mean_absolute_error = mean_absolute_error(y_true=y_valid, y_pred=preds)

# Last expression of the cell: display the MAE.
grid_lin_mean_absolute_error

# **HYPER PARAM TODO**

grid search for feats
https://github.com/wlongxiang/mlpipeline/blob/main/ml_pipeline_with_grid_search.ipynb

grid search for regression feats again: https://github.com/Andrew-Ng-s-number-one-fan/Hands-on-Machine-Learning-with-Scikit-Learn-Keras-and-TensorFlow/blob/master/Notebooks/C2_N1_Predicting%20Housing%20Price.ipynb

grid search for other models
https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_stats.html#sphx-glr-auto-examples-model-selection-plot-grid-search-stats-py

# SVC 
https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_stats.html#sphx-glr-auto-examples-model-selection-plot-grid-search-stats-py

https://medium.com/all-things-ai/in-depth-parameter-tuning-for-svc-758215394769

# **Random Forest Regressor Pipeline**

In [None]:
# Pipeline whose preprocessing and model parameters are tuned by the grid search
# that follows.
set_config(display='diagram')

numeric_features = [
    'launch_angle', 'launch_speed', 'spray_angle',
    'Barrel', 'domed', 'game_elevation',
    'grouped_pitch_type', 'fav_platoon_split_for_batter', 'pull_percent',
]
categorical_features = ["stand", "p_throws"]

# Numeric columns: fill gaps with the column mean, then standardize.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: one-hot encode, ignoring categories unseen during fit.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num_transform", numeric_transformer, numeric_features),
        ("cat_transform", categorical_transformer, categorical_features),
    ]
)

# The step names below are the prefixes used when addressing nested parameters.
pipeline = Pipeline(
    steps=[
        ("preprocesser", preprocessor),
        ("Random Forest Regressor", RandomForestRegressor()),
    ]
)

pipeline

In [None]:
# Grid for the preprocessing + random forest pipeline. Nested parameters are
# addressed as "<step name>__<parameter>", and the step name must match the one
# used when the pipeline was built. The model step is called
# "Random Forest Regressor" (there is no step named "classifier"), so the
# previous "classifier..." keys made GridSearchCV fail with an invalid-parameter
# error. Swapping in a fresh RandomForestRegressor for that step was redundant,
# since the step already holds one, and has been dropped.
param_grid = [
    {
        "preprocesser__num_transform__imputer__strategy": ["mean", "median"],
        "Random Forest Regressor__n_estimators": [10, 100, 1000],
    }
]

In [None]:
# The pipeline's ColumnTransformer selects "stand" and "p_throws" by name, but
# the most recent split kept only the numeric columns. Rebuild the split with
# both column groups (same seed, so the same rows land in each partition)
# before searching.
feature_cols = numeric_features + categorical_features
X = data.loc[:, feature_cols]
y = data.loc[:, ['hit_distance_sc']]

X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

# 10-fold cross-validated search over the pipeline grid, using all CPU cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=10, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

In [None]:
# Summarize the pipeline grid search.
print("Best params:")
print(grid_search.best_params_)
print("Best score in grid search:")
print(grid_search.best_score_)
# The tuned model is a random forest regressor (not a logistic regression), and
# the hold-out data in this notebook is X_valid / y_valid (there is no X_test).
# Print the score after the label instead of leaving the label dangling.
print("Best random forest regressor from grid search, score on validation data:")
print(grid_search.score(X_valid, y_valid))

In [None]:
# Reference:
# https://stackoverflow.com/questions/60786220/attributeerror-gridsearchcv-object-has-no-attribute-best-params
#
# best_estimator_ only exists when the search refits the winning configuration
# on the full training data, so refit must stay enabled; with refit=False the
# attribute access raises AttributeError.
best_ = GridSearchCV(pipeline, param_grid, refit=True, n_jobs=-1).fit(X_train, y_train).best_estimator_
best_


# **Future Analysis**

In the future, I hope to try hyperparameter tuning on a classification project instead of a regression project.

Grid search for classification:

https://towardsdatascience.com/gridsearchcv-for-beginners-db48a90114ee

https://github.com/BindiChen/machine-learning/blob/main/traditional-machine-learning/005-grid-search-vs-random-search-vs-bayes-search/gridsearch-vs-randomsearch-vs-bayessearch.ipynb

Grid search over hyperparameters to find the best classification model:

https://github.com/tjburch/mlb-hit-classifier/blob/master/notebooks/2-added-variables.ipynb

# **Random**