# Select Best Hyperparameter Tuned Model:

# In this notebook, we will select the best hyperparameter tuned model based on mean absolute error (MAE); the model with the lowest MAE is the best.


In [67]:
# import libraries 
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# import regression algorithms
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
# import grid search CV
from sklearn.model_selection import GridSearchCV
# import preprocessors 
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error



In [68]:
# fetch seaborn's 'tips' example dataset and preview its first five rows
df = sns.load_dataset(name='tips')
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [69]:
# list the column names of the loaded frame
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [70]:
# 'tip' is the value to predict; every other column is used as a feature
y = df['tip']
X = df.drop(columns=['tip'])

In [71]:
# label-encode the categorical variables: each text-valued column is replaced
# by integer codes, re-fitting the same encoder instance for every column in turn
le = LabelEncoder()
for col in ['sex', 'smoker', 'day', 'time']:
    X[col] = le.fit_transform(X[col])

In [72]:
# NOTE(review): the label encoding was applied to X, a separate frame created by df.drop,
# so df still shows the original text labels here; X.head() would display the encoded values
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [73]:
# hold out 20% of the rows as a test set; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# fixed seed so the randomised estimators produce the same scores on every run
# (same value as the train/test split uses)
RANDOM_STATE = 42

# candidate regressors, all left at their default hyperparameters;
# only the estimators that use randomness receive the seed
models = {
    'LinearRegression': LinearRegression(),
    'SVR': SVR(),
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state=RANDOM_STATE),
    'RandomForestRegressor': RandomForestRegressor(random_state=RANDOM_STATE),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=RANDOM_STATE),
    'XGBRegressor': XGBRegressor(random_state=RANDOM_STATE),
}

# train each model on the training split and score it on the held-out test split
model_scores = []  # (model name, test-set MAE) pairs, in the order the models are defined
for name, model in models.items():
    # fit the model
    model.fit(X_train, y_train)
    # predict on the test features
    y_pred = model.predict(X_test)
    # mean absolute error between the true and the predicted tips
    mae = mean_absolute_error(y_test, y_pred)
    model_scores.append((name, mae))
    print(name, 'MAE', mae)

# rank the models from best to worst: MAE is an error, so the smallest value comes first
sorted_models = sorted(model_scores, key=lambda score: score[1])
# separate loop names so the fitted estimator in `model` is not overwritten by a tuple
for model_name, model_mae in sorted_models:
    print('Mean Absolute Error:', f"{model_name} is {model_mae: .2f}")
    
  

LinearRegression MAE 0.6703807496461157
SVR MAE 0.5707097371316318
DecisionTreeRegressor MAE 0.9581632653061225
RandomForestRegressor MAE 0.7622122448979595
KNeighborsRegressor MAE 0.7262448979591837
GradientBoostingRegressor MAE 0.7293273250748341
XGBRegressor MAE 0.6721697168934103
Mean Absolute Error: SVR is  0.57
Mean Absolute Error: LinearRegression is  0.67
Mean Absolute Error: XGBRegressor is  0.67
Mean Absolute Error: KNeighborsRegressor is  0.73
Mean Absolute Error: GradientBoostingRegressor is  0.73
Mean Absolute Error: RandomForestRegressor is  0.76
Mean Absolute Error: DecisionTreeRegressor is  0.96
