## Selecting the best model with best hyperparameters 

In [1]:
#import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
#import model selection and preprocessing method
from sklearn.model_selection import train_test_split
from sklearn.preprocessing  import LabelEncoder,StandardScaler,MinMaxScaler
from sklearn.compose import   ColumnTransformer
from sklearn.pipeline import Pipeline
#import grid search cv for cross validation
from sklearn.model_selection import GridSearchCV



#import regression algorithms
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score





In [2]:
#load the data
df=sns.load_dataset('tips')


In [3]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


## Regression Tasks

In [4]:
#select features and varaibles
X=df.drop('tip',axis=1)
y=df['tip']

#label encode categorical varaiables
le=LabelEncoder()
X['sex']=le.fit_transform(X['sex'])
X['smoker']=le.fit_transform(X['smoker'])
X['day']=le.fit_transform(X['day'])
X['time']=le.fit_transform(X['time'])

#spilt the data into train and test data sets with 80% training dataset
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42 )


In [5]:
%%time
#create dictionaries of list of models to evaluate performance
models={
    'LinearRegression':LinearRegression(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
    'SVR': SVR(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'GradientBoostingRegressor':GradientBoostingRegressor(),
    'XGBRegressor':XGBRegressor()
}

#train and predict each model with evaluation metrics as well making  a for loop
model_scores=[]
for name,model in models.items():
    #fit each model from models on training data
    model.fit(X_train,y_train)

    #make prediction from each model
    y_pred=model.predict(X_test)
    metric=mean_absolute_error(y_test,y_pred)
    model_scores.append((name,metric))


    #print the performance metrics
    # print(name, 'MAE: ', mean_absolute_error(y_test,y_pred))
    # print(name, 'MSE: ', mean_squared_error(y_test,y_pred))
    # print(name, 'R2: ', r2_score(y_test,y_pred))
    #print('\n)

#selecting the best model from all above models with evaluation metrics  sorted in ascending order
sorted_models=sorted(model_scores, key=lambda x:x[1], reverse=False)
for model in sorted_models:
    print('mean_absolute_error for', f"{model[0]} is {model[1]: .2f}")



mean_absolute_error for SVR is  0.57
mean_absolute_error for LinearRegression is  0.67
mean_absolute_error for XGBRegressor is  0.67
mean_absolute_error for KNeighborsRegressor is  0.73
mean_absolute_error for GradientBoostingRegressor is  0.73
mean_absolute_error for RandomForestRegressor is  0.77
mean_absolute_error for DecisionTreeRegressor is  0.91
CPU times: total: 812 ms
Wall time: 711 ms


In [6]:
%%time
#create dictionaries of list of models to evaluate performance
models={
    'LinearRegression':LinearRegression(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
    'SVR': SVR(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'GradientBoostingRegressor':GradientBoostingRegressor(),
    'XGBRegressor':XGBRegressor()
}

#train and predict each model with evaluation metrics as well making  a for loop
model_scores=[]
for name,model in models.items():
    #fit each model from models on training data
    model.fit(X_train,y_train)

    #make prediction from each model
    y_pred=model.predict(X_test)
    metric=r2_score(y_test,y_pred)
    model_scores.append((name,metric))


    #print the performance metrics
    # print(name, 'MAE: ', mean_absolute_error(y_test,y_pred))
    # print(name, 'MSE: ', mean_squared_error(y_test,y_pred))
    # print(name, 'R2: ', r2_score(y_test,y_pred))
    #print('\n)

#selecting the best model from all above models with evaluation metrics  sorted in desending error
sorted_models=sorted(model_scores, key=lambda x:x[1], reverse=True)
for model in sorted_models:
    print('r2_score for', f"{model[0]} is {model[1]: .2f}")



r2_score for SVR is  0.57
r2_score for LinearRegression is  0.44
r2_score for XGBRegressor is  0.41
r2_score for GradientBoostingRegressor is  0.35
r2_score for KNeighborsRegressor is  0.33
r2_score for RandomForestRegressor is  0.23
r2_score for DecisionTreeRegressor is  0.13
CPU times: total: 812 ms
Wall time: 488 ms


# Hperparameter Tunninng

In [7]:
%%time
# Create a dictionaries of list of models to evaluate performance with hyperparameters
models = { 
          'LinearRegression' : (LinearRegression(), {}),
          'SVR' : (SVR(), {'kernel': ['rbf', 'poly', 'sigmoid']}),
          'DecisionTreeRegressor' : (DecisionTreeRegressor(), {'max_depth': [None, 5, 10]}),
          'RandomForestRegressor' : (RandomForestRegressor(), {'n_estimators': [10, 100]}),
          'KNeighborsRegressor' : (KNeighborsRegressor(), {'n_neighbors': np.arange(3, 100, 2)}),
          'GradientBoostingRegressor' : (GradientBoostingRegressor(), {'n_estimators': [10, 100]}),
          'XGBRegressor' : (XGBRegressor(), {'n_estimators': [10, 100]}),          
          }

# train and predict each model with evaluation metrics as well making a for loop to iterate over the models

for name, (model, params) in models.items():
    # create a pipline
    pipeline = GridSearchCV(model, params, cv=5)
    
    # fit the pipeline
    pipeline.fit(X_train, y_train)
    
    # make prediction from each model
    y_pred = pipeline.predict(X_test)
    
      
    # print the performing metric
    print(name, 'MSE: ', mean_squared_error(y_test, y_pred))
    print(name, 'R2: ', r2_score(y_test, y_pred))
    print(name, 'MAE: ', mean_absolute_error(y_test, y_pred))
    print('\n')

LinearRegression MSE:  0.6948129686287711
LinearRegression R2:  0.4441368826121931
LinearRegression MAE:  0.6703807496461157


SVR MSE:  1.460718141299992
SVR R2:  -0.1686013018011976
SVR MAE:  0.8935334948775431


DecisionTreeRegressor MSE:  0.8774153020453994
DecisionTreeRegressor R2:  0.2980516670532909
DecisionTreeRegressor MAE:  0.7189481629481629


RandomForestRegressor MSE:  0.9149918393877565
RandomForestRegressor R2:  0.26798974804653475
RandomForestRegressor MAE:  0.7682102040816329


KNeighborsRegressor MSE:  0.6640950568462677
KNeighborsRegressor R2:  0.4687117753876745
KNeighborsRegressor MAE:  0.6203721488595437


GradientBoostingRegressor MSE:  0.8106801524004932
GradientBoostingRegressor R2:  0.35144101065487676
GradientBoostingRegressor MAE:  0.7657809818712309


XGBRegressor MSE:  0.6624107100882575
XGBRegressor R2:  0.4700592836840687
XGBRegressor MAE:  0.6549163442728472


CPU times: total: 5.81 s
Wall time: 6.11 s


In [8]:

# Dictionary of models and their respective hyperparameters
models = { 
    'Linear Regression': (LinearRegression(), {}),
    'Support Vector Regressor': (SVR(),
                                  {'kernel': ['rbf', 'poly', 'sigmoid']}),
    'Decision Tree Regressor': (DecisionTreeRegressor(), {'max_depth': [None, 5, 10]}),
    'Random Forest Regressor': (RandomForestRegressor(), {'n_estimators': [10, 100]}),
    'K-Neighbors Regressor': (KNeighborsRegressor(), {'n_neighbors': np.arange(3, 100, 2)}),
    'Gradient Boosting Regressor': (GradientBoostingRegressor(), {'n_estimators': [10, 100]}),
    'XGBoost Regressor': (XGBRegressor(), {'n_estimators': [10, 100]}),          
}

# Initialize a list to store the results
results = []

# Loop through each model
for name, (model, params) in models.items():
    # Create the GridSearchCV pipeline
    pipeline = GridSearchCV(model, params, cv=5)
    
    # Fit the pipeline to the training data
    pipeline.fit(X_train, y_train)
    
    # Make predictions on the test data
    y_pred = pipeline.predict(X_test)
    
    # Calculate evaluation metrics
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    
    # Append the results to the list
    results.append([name, mse, r2, mae])
    
# Convert the list to a DataFrame
results_df = pd.DataFrame(results, columns=['Model', 'MSE', 'R2', 'MAE'])

# Display the DataFrame
print(results_df)


                         Model       MSE        R2       MAE
0            Linear Regression  0.694813  0.444137  0.670381
1     Support Vector Regressor  1.460718 -0.168601  0.893533
2      Decision Tree Regressor  0.877415  0.298052  0.718948
3      Random Forest Regressor  0.980678  0.215440  0.772047
4        K-Neighbors Regressor  0.664095  0.468712  0.620372
5  Gradient Boosting Regressor  0.810680  0.351441  0.765781
6            XGBoost Regressor  0.662411  0.470059  0.654916


In [16]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.compose import ColumnTransformer




# Dictionary of models and their hyperparameters
models = {
    'Linear Regression': (LinearRegression(), {}),
    'Support Vector Regressor': (SVR(), {'kernel': ['rbf', 'poly', 'sigmoid']}),
    'Decision Tree Regressor': (DecisionTreeRegressor(), {'max_depth': [None, 5, 10]}),
    'Random Forest Regressor': (RandomForestRegressor(), {'n_estimators': [10, 100]}),
    'K-Neighbors Regressor': (KNeighborsRegressor(), {'n_neighbors': np.arange(3, 100, 2)}),
    'Gradient Boosting Regressor': (GradientBoostingRegressor(), {'n_estimators': [10, 100]}),
    'XGBoost Regressor': (XGBRegressor(), {'n_estimators': [10, 100]}),
}

# DataFrame to store the results
results=[]

# Iterate over models
for name, (model, params) in models.items():
    # Create a pipeline with StandardScaler and the model
    pipeline = Pipeline([
        ('scale', StandardScaler()),  # Transformer step
        ('model', model)
    ])
    
    # Use GridSearchCV with the pipeline
    grid = GridSearchCV(pipeline,param_grid={'model__' + k: v for k, v in params.items()} ,cv=5)
    
    # Fit the pipeline
    grid.fit(X_train, y_train)
    # Make predictions
    y_pred = grid.predict(X_test)
    
    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)


    # Append the results to the list
    results.append([name, mse, r2, mae])
    
    # Convert the list to a DataFrame
    results_df = pd.DataFrame(results, columns=['Model', 'MSE', 'R2', 'MAE'])

    

# Display the results with lines separating the columns and rows
print(results_df.to_string(index=False))


                      Model      MSE       R2      MAE
          Linear Regression 0.694813 0.444137 0.670381
   Support Vector Regressor 0.717650 0.425867 0.662227
    Decision Tree Regressor 0.877415 0.298052 0.718948
    Random Forest Regressor 0.944654 0.244260 0.765657
      K-Neighbors Regressor 0.780454 0.375623 0.740729
Gradient Boosting Regressor 0.810680 0.351441 0.765781
          XGBoost Regressor 0.662411 0.470059 0.654916
