# Apply the Hyperparameter tunning selecting the best model of Machine Learning (Supervised Learning) for Regression Technique  

### Scikit Learn as sklearn library used for machine learning process

***Import the libraries***

In [79]:
#Import the libraries:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

***Import sklearn libraries***

In [80]:
#Import the sklearn libraries:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder

***Load the dataset from the Seaborn***

In [81]:
#Load dataset:
df = sns.load_dataset('tips')

In [82]:
#Check the data
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [83]:
#Check the data type:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [84]:
#Changing the categorical data 
for col in df.columns:
    if df[col].dtype == 'object' or df[col].dtype == 'category':
        df[col] = LabelEncoder().fit_transform(df[col])

In [85]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    int64  
 3   smoker      244 non-null    int64  
 4   day         244 non-null    int64  
 5   time        244 non-null    int64  
 6   size        244 non-null    int64  
dtypes: float64(2), int64(5)
memory usage: 13.5 KB


In [86]:
#Split the data:
X = df.drop('tip', axis=1)
y = df['tip']

In [87]:
#Split the data into train_test_split:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [88]:
#Make the a group of models:
models = {
    'LinearRegression': LinearRegression(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor(),
    'Support Vector Regressor': SVR(),
    'XGBRegressor' : XGBRegressor(),
    'KNeighborsRegressor' : KNeighborsRegressor(),
}
#Adding the scores of models:
models_scores = []

In [89]:
# Make a name of model to compare all models
models_scores = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    metric = mean_absolute_error(y_test, y_pred)
    models_scores.append((name, metric))

sorted_models = sorted(models_scores, key=lambda x: x[1])

for model_name, metric in sorted_models:
    print(f"{model_name} - Mean Absolute Error: {metric:.2f}")

Support Vector Regressor - Mean Absolute Error: 0.57
LinearRegression - Mean Absolute Error: 0.67
XGBRegressor - Mean Absolute Error: 0.67
GradientBoostingRegressor - Mean Absolute Error: 0.72
KNeighborsRegressor - Mean Absolute Error: 0.73
RandomForestRegressor - Mean Absolute Error: 0.77
DecisionTreeRegressor - Mean Absolute Error: 0.91


***Add the Hypermater Tunning***

In [106]:
#Make the a group of models:
models = {
    'LinearRegression': (LinearRegression(), {}),
    'DecisionTreeRegressor': (DecisionTreeRegressor(),{'max_depth': [3, 5, 7]}),
    'RandomForestRegressor': (RandomForestRegressor(),{'n_estimators': [100, 200, 300]}),
    'GradientBoostingRegressor': (GradientBoostingRegressor(),{'n_estimators': [100, 200, 300]}),
    'Support Vector Regressor': (SVR(),{'kernel': ['linear', 'rbf', 'sigmoid']}),
    'XGBRegressor' : (XGBRegressor(),{'n_estimators': [10, 50, 100]}),
    'KNeighborsRegressor' : (KNeighborsRegressor(),{'n_neighbors': np.arange(3, 100, 2)}),
}

In [110]:
#Add the hyperparameter Tunning:
for name, (model, params) in models.items():
    pipeline = GridSearchCV(model, params, cv=5)
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    print(name, 'R2 Score', r2_score(y_test, y_pred))
    print(name, 'MAE', mean_absolute_error(y_pred, y_test))
    print(name, 'MSE', mean_squared_error(y_test, y_pred))
    print('\n')

LinearRegression R2 Score 0.4441368826121931
LinearRegression MAE 0.6703807496461157
LinearRegression MSE 0.6948129686287711


DecisionTreeRegressor R2 Score 0.37563818958689676
DecisionTreeRegressor MAE 0.69672056815006
DecisionTreeRegressor MSE 0.7804343720990294


RandomForestRegressor R2 Score 0.2496603133359615
RandomForestRegressor MAE 0.7622183673469388
RandomForestRegressor MSE 0.9379031075510217


GradientBoostingRegressor R2 Score 0.3488303051319587
GradientBoostingRegressor MAE 0.7298148806346737
GradientBoostingRegressor MSE 0.8139434594950864


Support Vector Regressor R2 Score 0.45274050603842664
Support Vector Regressor MAE 0.6689066182722474
Support Vector Regressor MSE 0.6840586858804613


XGBRegressor R2 Score 0.4700592836840687
XGBRegressor MAE 0.6549163442728472
XGBRegressor MSE 0.6624107100882575


KNeighborsRegressor R2 Score 0.4687117753876745
KNeighborsRegressor MAE 0.6203721488595437
KNeighborsRegressor MSE 0.6640950568462677


