# Selecting the Best Model with the Best Hyperparameters

<h3> Importing Libraries </h3>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.model_selection import GridSearchCV
from sklearn.compose import  ColumnTransformer
from sklearn.pipeline import Pipeline

<h3> Load Dataset </h3>

In [3]:
# Fetch seaborn's example "tips" dataset and preview its first five rows
df = sns.load_dataset("tips")
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


<h3> Best Model Selection with Best Hyperparameters </h3>

In [4]:
# Print a summary of the frame: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [5]:
# Target is the tip amount; features are every column except 'total_bill' and 'tip'
y = df['tip']
X = df.drop(columns=['total_bill', 'tip'])

# Integer-encode each categorical feature column in turn.
# The same encoder object is refit for every column, so after the loop
# `le` only remembers the classes of the last column ('time').
le = LabelEncoder()
for categorical_col in ('sex', 'smoker', 'day', 'time'):
    X[categorical_col] = le.fit_transform(X[categorical_col])

In [6]:
# Candidate regressors keyed by their display name.
# Estimators with internal randomness (tree-based models and boosting) are
# seeded with the same value used for the train/test split so that a
# Restart & Run All reproduces identical metrics.
models = {
    "Linear Regression": LinearRegression(),
    "Support Vector Regressor": SVR(),
    "Decision Tree Regressor": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    "Gradient Boosting Regressor": GradientBoostingRegressor(random_state=42),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "XGBoost Regressor": XGBRegressor(random_state=42)
}

In [None]:
# Split the dataset into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Dictionary to store evaluation metrics for each model,
# shaped as {model_name: {"MSE": ..., "MAE": ..., "R2": ...}}
model_metrics = {}

# Evaluate each model with its default hyperparameters
for model_name, model in models.items():
    # Fit the model on the training data (the estimator in `models` is fitted in place)
    model.fit(X_train, y_train)

    # Predict on the held-out test data
    y_pred = model.predict(X_test)

    # Calculate evaluation metrics on the test split
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Store the metrics
    model_metrics[model_name] = {"MSE": mse, "MAE": mae, "R2": r2}

    # Print the metrics
    print(f"Model: {model_name}")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"R-squared (R²): {r2:.4f}")
    print("-" * 30)

# Find the best model based on Mean Absolute Error.
# MAE is an error metric, so the best model is the one with the LOWEST value
# (using max() here would report the worst-performing model).
best_model = min(model_metrics, key=lambda x: model_metrics[x]["MAE"])
print(f"Best Model: {best_model}")
print(f"Metrics: {model_metrics[best_model]}")

Model: Linear Regression
Mean Squared Error (MSE): 1.1055
Mean Absolute Error (MAE): 0.8633
R-squared (R²): 0.1156
------------------------------
Model: Support Vector Regressor
Mean Squared Error (MSE): 1.1170
Mean Absolute Error (MAE): 0.8613
R-squared (R²): 0.1064
------------------------------
Model: Decision Tree Regressor
Mean Squared Error (MSE): 1.7598
Mean Absolute Error (MAE): 1.0440
R-squared (R²): -0.4079
------------------------------
Model: Random Forest Regressor
Mean Squared Error (MSE): 1.6475
Mean Absolute Error (MAE): 1.0212
R-squared (R²): -0.3180
------------------------------
Model: Gradient Boosting Regressor
Mean Squared Error (MSE): 1.5743
Mean Absolute Error (MAE): 1.0290
R-squared (R²): -0.2594
------------------------------
Model: K-Neighbors Regressor
Mean Squared Error (MSE): 1.5051
Mean Absolute Error (MAE): 0.9742
R-squared (R²): -0.2041
------------------------------
Model: XGBoost Regressor
Mean Squared Error (MSE): 1.7150
Mean Absolute Error (MAE): 1.

<h2> Hyperparameter Tuning </h2>

In [8]:
# Search space per candidate regressor, keyed by display name.
# Each entry holds:
#   "model"  - an unfitted estimator instance
#   "params" - the hyperparameter grid to search exhaustively for that estimator
# Estimators with internal randomness are seeded so the grid-search winner
# does not change between runs.
param_grid = {
    "Linear Regression": {
        "model": LinearRegression(),
        "params": {}  # nothing to tune; the search fits the default estimator only
    },
    "Support Vector Regressor": {
        "model": SVR(),
        "params": {
            "kernel": ["linear", "poly", "rbf", "sigmoid"],
            "C": [0.1, 1, 10]
        }
    },
    "Decision Tree Regressor": {
        "model": DecisionTreeRegressor(random_state=42),
        "params": {
            "criterion": ["squared_error", "friedman_mse", "absolute_error"],
            "max_depth": [None, 10, 20, 30],
            "min_samples_split": [2, 5, 10]
        }
    },
    "Random Forest Regressor": {
        "model": RandomForestRegressor(random_state=42),
        "params": {
            "n_estimators": [50, 100, 200],
            "max_depth": [None, 10, 20, 30],
            "min_samples_split": [2, 5, 10]
        }
    },
    "Gradient Boosting Regressor": {
        "model": GradientBoostingRegressor(random_state=42),
        "params": {
            "n_estimators": [50, 100, 200],
            "learning_rate": [0.01, 0.1, 0.2],
            "max_depth": [3, 5, 10]
        }
    },
    "K-Neighbors Regressor": {
        "model": KNeighborsRegressor(),
        "params": {
            "n_neighbors": [3, 5, 7],
            "weights": ["uniform", "distance"],
            "algorithm": ["auto", "kd_tree", "brute"]
        }
    },
    "XGBoost Regressor": {
        "model": XGBRegressor(random_state=42),
        "params": {
            "n_estimators": [50, 100, 200],
            "learning_rate": [0.01, 0.1, 0.2],
            "max_depth": [3, 5, 10]
        }
    }
}

In [None]:
# Dictionary to store evaluation metrics and best parameters for each model,
# shaped as {model_name: {"MSE": ..., "MAE": ..., "R2": ..., "Best Parameters": {...}}}
model_results = {}

# Tune and evaluate each model
for model_name, config in param_grid.items():
    model = config["model"]
    params = config["params"]

    # Perform an exhaustive 5-fold cross-validated grid search on the training data.
    # No `scoring` is given, so candidates are ranked by the estimator's own
    # default score; the best candidate is then refit on the full training set.
    pipeline = GridSearchCV(model, params, cv=5)
    pipeline.fit(X_train, y_train)

    # Predict on the held-out test data using the refit best estimator
    y_pred = pipeline.predict(X_test)

    # Calculate evaluation metrics on the test split
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Store the metrics and best parameters
    model_results[model_name] = {
        "MSE": mse,
        "MAE": mae,
        "R2": r2,
        "Best Parameters": pipeline.best_params_
    }

    # Print the metrics and best parameters
    print(f"Model: {model_name}")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"R-squared (R²): {r2:.4f}")
    print(f"Best Parameters: {pipeline.best_params_}")
    print("-" * 30)

# Find the best model based on Mean Absolute Error.
# MAE is an error metric, so the best model is the one with the LOWEST value
# (using max() here would report the worst-performing model).
best_model_name = min(model_results, key=lambda x: model_results[x]["MAE"])
best_model_info = model_results[best_model_name]

print(f"Best Model: {best_model_name}")
print(f"Metrics: {best_model_info}")

Model: Linear Regression
Mean Squared Error (MSE): 1.1055
Mean Absolute Error (MAE): 0.8633
R-squared (R²): 0.1156
Best Parameters: {}
------------------------------
Model: Support Vector Regressor
Mean Squared Error (MSE): 0.9723
Mean Absolute Error (MAE): 0.8222
R-squared (R²): 0.2222
Best Parameters: {'C': 0.1, 'kernel': 'linear'}
------------------------------
Model: Decision Tree Regressor
Mean Squared Error (MSE): 1.5998
Mean Absolute Error (MAE): 0.9990
R-squared (R²): -0.2798
Best Parameters: {'criterion': 'squared_error', 'max_depth': None, 'min_samples_split': 10}
------------------------------
Model: Random Forest Regressor
Mean Squared Error (MSE): 1.5368
Mean Absolute Error (MAE): 0.9968
R-squared (R²): -0.2295
Best Parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 200}
------------------------------
Model: Gradient Boosting Regressor
Mean Squared Error (MSE): 1.2411
Mean Absolute Error (MAE): 0.9501
R-squared (R²): 0.0071
Best Parameters: {'learning_r