# **Selecting the best model with with best hyperparameters**

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split

#import required regressor libraries
from sklearn.linear_model import  LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import GridSearchCV
#import pipeline
from sklearn.pipeline import Pipeline

#evaluation metrics
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
#import preprocessing 
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.compose import ColumnTransformer



In [11]:
df=sns.load_dataset('tips') 

df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

## All this is Regerssion tasks

In [4]:
#select feature and variable
x=df.drop('price',axis=1)
y=df['price']


#label encoding(changing categorical data to numerical data)
le=LabelEncoder()
x['cut']=le.fit_transform(x['cut'])
x['color']=le.fit_transform(x['color'])
x['clarity']=le.fit_transform(x['clarity'])


#split data
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)



## Predicting the best model on the base of evaluations

In [9]:
# creating a dictionary of models to valuate
models = {
        'Linear Regression' : LinearRegression(),
        'SVR' : SVR(),
        'Decision Tree' : DecisionTreeRegressor(),
        'Random Forest' : RandomForestRegressor(),
        'KNN' : KNeighborsRegressor(),
        'AdaBoost' : AdaBoostRegressor(),
        'Gradient Boosting' : GradientBoostingRegressor()
        }

        
#train and predict each model with evaluation metric in a for loop and selecting the best model on the basis of each metric
metric_values = {}
for name , model in models.items():
    #fit each model from models on training model
    model.fit(x_train,y_train)

    #make prediction on each model
    y_pred=model.predict(x_test)
    
      # Evaluate each model
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Store metric values for each model
    metric_values[name] = {'mse': mse, 'mae': mae, 'r2_score': r2}

    # Print evaluation metrics
    print(name, "mean squared error:", mse)
    print(name, "mean absolute error:", mae)
    print(name, "r2 score:", r2)
    print()

# Find the best model based on each metric
best_model_mse = min(metric_values, key=lambda x: metric_values[x]['mse'])
best_model_mae = min(metric_values, key=lambda x: metric_values[x]['mae'])
best_model_r2 = max(metric_values, key=lambda x: metric_values[x]['r2_score'])

print("Best Model based on MSE:", best_model_mse)
print("Best Model based on MAE:", best_model_mae)
print("Best Model based on R2 Score:", best_model_r2)


mean squared error:  1825912.991525348


# Hyperparameter tunning and pipline

In [None]:
#creating a dictionary of models to valuate
models = {
        'Linear Regression' : (LinearRegression(),{}),

        'SVR' : (SVR(),{'kerla':['rbf','poly','sigmoid'], 'C':[0.1,1,10],'gamma':[1,0.1,0.01], 'epsilon':[0.1,0.01,0.001]}),

        'Decision Tree' :(DecisionTreeRegressor(),{'splitter':['best','random'],'max_depth':['None',5,10]}),

        'Random Forest' : (RandomForestRegressor(),{'n_estimators':[10,50,100],'max_depth':['None',5,10]}),

        'KNN' : (KNeighborsRegressor(),{'n_neighbors':np.arange(3,100, 3),'weights':['uniform','distance'],'algorithm':['auto','ball_tree','kd_tree','brute']}), 

        'AdaBoost' : (AdaBoostRegressor(),{'n_neighbors': np.arange(3,100, 3),'loss':['linear','square','exponential'],'learning_rate':[0.1,0.01,0.001],'n_estimators':[10,100,200]}),
        
        'Gradient Boosting' : (GradientBoostingRegressor(),{'n_neighbors': np.arange(3,100, 3),'loss':['ls','lad','huber','quantile'],'learning_rate':[0.1,0.01,0.001],'n_estimators':[10,100,200]})
        }

        
#train and predict each model with evaluation metric as well making a for loop
metric_values = {}
for name , (model, params) in models.items():
    #create a pipeline
    pipeline=GridSearchCV(model,params,cv=5)

    #fit pipeline
    pipeline.fit(x_train,y_train)

   
    #make prediction on each model
    y_pred=pipeline.predict(x_test)
    
      # Evaluate each model
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Store metric values for each model
    metric_values[name] = {'mse': mse, 'mae': mae, 'r2_score': r2}

    # Print evaluation metrics
    print(name, "mean squared error:", mse)
    print(name, "mean absolute error:", mae)
    print(name, "r2 score:", r2)
    print()

# Find the best model based on each metric
best_model_mse = min(metric_values, key=lambda x: metric_values[x]['mse'])
best_model_mae = min(metric_values, key=lambda x: metric_values[x]['mae'])
best_model_r2 = max(metric_values, key=lambda x: metric_values[x]['r2_score'])

print("Best Model based on MSE:", best_model_mse)
print("Best Model based on MAE:", best_model_mae)
print("Best Model based on R2 Score:", best_model_r2)

## Add preprocessor inside the pipeline

In [None]:
#make a preprocessor
preprocessor=ColumnTransformer(
    transformers=['numeric_scaling', StandardScaler(),['total_bill', 'size']],remainder='passthrough')


#creating a dictionary of models to valuate
models = {
        'Linear Regression' : (LinearRegression(),{}),

        'SVR' : (SVR(),{'kerla':['rbf','poly','sigmoid'], 'C':[0.1,1,10],'gamma':[1,0.1,0.01], 'epsilon':[0.1,0.01,0.001]}),

        'Decision Tree' :(DecisionTreeRegressor(),{'splitter':['best','random'],'max_depth':['None',5,10]}),

        'Random Forest' : (RandomForestRegressor(),{'n_estimators':[10,50,100],'max_depth':['None',5,10]}),

        'KNN' : (KNeighborsRegressor(),{'n_neighbors':np.arange(3,100, 3),'weights':['uniform','distance'],'algorithm':['auto','ball_tree','kd_tree','brute']}), 

        'AdaBoost' : (AdaBoostRegressor(),{'n_neighbors': np.arange(3,100, 3),'loss':['linear','square','exponential'],'learning_rate':[0.1,0.01,0.001],'n_estimators':[10,100,200]}),
        
        'Gradient Boosting' : (GradientBoostingRegressor(),{'n_neighbors': np.arange(3,100, 3),'loss':['ls','lad','huber','quantile'],'learning_rate':[0.1,0.01,0.001],'n_estimators':[10,100,200]})
        }

        
#train and predict each model with evaluation metric as well making a for loop
metric_values = {}
for name , (model, params) in models.items():
    #create a pipeline  with preprocessor
    pipeline=Pipeline(steps=[('preprocessor', preprocessor),('model',model)])

    #grid search cross validation
    grid_search=GridSearchCV(pipeline,params,cv=5)

    #fit pipeline
    grid_search.fit(x_train,y_train)

   
    #make prediction on each model
    y_pred=grid_search.predict(x_test)
    
      # Evaluate each model
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Store metric values for each model
    metric_values[name] = {'mse': mse, 'mae': mae, 'r2_score': r2}

    # Print evaluation metrics
    print(name, "mean squared error:", mse)
    print(name, "mean absolute error:", mae)
    print(name, "r2 score:", r2)
    print()

# Find the best model based on each metric
best_model_mse = min(metric_values, key=lambda x: metric_values[x]['mse'])
best_model_mae = min(metric_values, key=lambda x: metric_values[x]['mae'])
best_model_r2 = max(metric_values, key=lambda x: metric_values[x]['r2_score'])

print("Best Model based on MSE:", best_model_mse)
print("Best Model based on MAE:", best_model_mae)
print("Best Model based on R2 Score:", best_model_r2)

## k-fold cross validation on classifier problem


In [27]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Define the classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'kNN': KNeighborsClassifier()
}

# Create the KFold object
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Iterate over the classifiers
for name, classifier in classifiers.items():
    # Perform cross-validation
    scores = cross_val_score(classifier, X, y, cv=kfold)
    accuracy = np.mean(scores)
    
    # Print the results
    print("Classifier:", name)
    print("Mean Accuracy:", accuracy)
    print()


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Classifier: Logistic Regression
Mean Accuracy: 0.9733333333333334

Classifier: Decision Tree
Mean Accuracy: 0.9533333333333335

Classifier: Random Forest
Mean Accuracy: 0.9600000000000002

Classifier: SVM
Mean Accuracy: 0.9666666666666668

Classifier: kNN
Mean Accuracy: 0.9733333333333334

