# Selecting the best model with Best hyperparameters

In [4]:
# import libraries 
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 

# train test split the data 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import LabelEncoder 

In [5]:
# Import regression algorithms 
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR 
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import RandomForestRegressor 
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.ensemble import GradientBoostingRegressor 
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score 


In [6]:
# import grid seach cv for cross valication 
from sklearn.model_selection import GridSearchCV 


In [7]:
# import preprocessors 
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline

In [8]:
# Load dataset 
df = sns.load_dataset('tips')


In [9]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [10]:
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

# Regresssion Tasks

In [11]:
# select features and variables 
X = df.drop("tip", axis=1)
y = df['tip']

In [12]:
# label encode categorical variables 
le = LabelEncoder()
X['sex']= le.fit_transform(X['sex'])
X['smoker']= le.fit_transform(X['smoker'])
X['day'] = le.fit_transform(X['day'])
X['time'] = le.fit_transform(X['time'])

In [13]:
%%time 
# split the data into train and test data with 80% training dateset 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a dictionaries of list of model sto evalueate performace 

models ={
          'LinearRegression' : LinearRegression(),
          'SVR' : SVR(),
          'DecisionTreeRegressor' : DecisionTreeRegressor(),
          'RandomForestRegressor' : RandomForestRegressor(),
          'KNeighborsRegressor' : KNeighborsRegressor(),
          'GradientBoostingRegressor' : GradientBoostingRegressor(),
          'XGBRegressor' : XGBRegressor()
    
}

CPU times: total: 0 ns
Wall time: 5.98 ms


In [16]:
# train and predict each model with evaluateion metrics as well making a for loop to iterate over the models 

model_scores =[]
for name, model in models.items():
    # fit each model from modles on taining data 
    model.fit(X_train, y_train)

    # make prediction from each model 
    y_pred = model.predict(X_test)
    metric = mean_absolute_error(y_test, y_pred)
    model_scores.append((name, metric))


    # # print the performing metric 
    # print (name, 'MSE:', mean_squared_error(y_test, y_pred))
    # print(name, 'R2:', r2_score(y_test, y_pred))
    # print(name, 'MAE:', mean_absolute_error(y_test, y_pred))
    # print('\n')


# selecting the best model from all above modles with evaluation metrics sorting method 
    sorted_models = sorted(model_scores, key=lambda x:x[1], reverse=False)
    for model in sorted_models:
        print('Mean Absolute error for', f"{model[0]} is {model[1]: .2f}")

Mean Absolute error for LinearRegression is  0.67
Mean Absolute error for SVR is  0.57
Mean Absolute error for LinearRegression is  0.67
Mean Absolute error for SVR is  0.57
Mean Absolute error for LinearRegression is  0.67
Mean Absolute error for DecisionTreeRegressor is  0.83
Mean Absolute error for SVR is  0.57
Mean Absolute error for LinearRegression is  0.67
Mean Absolute error for RandomForestRegressor is  0.77
Mean Absolute error for DecisionTreeRegressor is  0.83
Mean Absolute error for SVR is  0.57
Mean Absolute error for LinearRegression is  0.67
Mean Absolute error for KNeighborsRegressor is  0.73
Mean Absolute error for RandomForestRegressor is  0.77
Mean Absolute error for DecisionTreeRegressor is  0.83
Mean Absolute error for SVR is  0.57
Mean Absolute error for LinearRegression is  0.67
Mean Absolute error for KNeighborsRegressor is  0.73
Mean Absolute error for GradientBoostingRegressor is  0.73
Mean Absolute error for RandomForestRegressor is  0.77
Mean Absolute error 

In [19]:
%%time 
# split the data into train and test data with 80% training dataset 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# create a dictionaries of list of models to evaluate performance 
models ={
          'LinearRegression' : LinearRegression(),
          'SVR' : SVR(),
          'DecisionTreeRegressor' : DecisionTreeRegressor(),
          'RandomForestRegressor' : RandomForestRegressor(),
          'KNeighborsRegressor' : KNeighborsRegressor(),
          'GradientBoostingRegressor' : GradientBoostingRegressor(),
          'XGBRegressor' : XGBRegressor()  
}

# train and predict each model with evaluation metrics as well making a for looop to iterate over the model
model_scores = []

for name, model in models.items():
    # fit each model from models on training data 
    model.fit(X_train, y_train)


    # make prediction from each model 
    y_pered = model.predict(X_test) 
    metric = mean_squared_error(y_test, y_pred)
    model_scores.append((name, metric))

    # # print the performing metric
    # print(name, 'MSE: ', mean_squared_error(y_test, y_pred))
    # print(name, 'R2: ', r2_score(y_test, y_pred))
    # print(name, 'MAE: ', mean_absolute_error(y_test, y_pred))
    # print('\n')

# selectiong the best model from all above models with evaluation metrics sorting method 

sorted_models = sorted(model_scores, key=lambda x:x[1], reverse=False)

for model in sorted_models:
    print('Mean Squard error for', f"{model[0]} is {model[1]:.2f}")




Mean Squard error for LinearRegression is 0.74
Mean Squard error for SVR is 0.74
Mean Squard error for DecisionTreeRegressor is 0.74
Mean Squard error for RandomForestRegressor is 0.74
Mean Squard error for KNeighborsRegressor is 0.74
Mean Squard error for GradientBoostingRegressor is 0.74
Mean Squard error for XGBRegressor is 0.74
CPU times: total: 625 ms
Wall time: 335 ms


In [20]:
diamonds = sns.load_dataset('diamonds')

# Hyperparameter tuning:

In [22]:
%%time 
# Create a dictionaries of lsit of models to evaluate performance with hyperparametes 

models ={
    'LinearRegression' : (LinearRegression(), {}),
          'SVR' : (SVR(), {'kernel': ['rbf', 'poly', 'sigmoid']}),
          'DecisionTreeRegressor' : (DecisionTreeRegressor(), {'max_depth': [None, 5, 10]}),
          'RandomForestRegressor' : (RandomForestRegressor(), {'n_estimators': [10, 100]}),
          'KNeighborsRegressor' : (KNeighborsRegressor(), {'n_neighbors': np.arange(3, 100, 2)}),
          'GradientBoostingRegressor' : (GradientBoostingRegressor(), {'n_estimators': [10, 100]}),
          'XGBRegressor' : (XGBRegressor(), {'n_estimators': [10, 100]}), 

}


# train and predict ech mdoel jwith evaluation metrics as well makeing a for loop to item iterate over the models 

for name, (model, params) in models.items():
    # crate a pipline 
    pipeline = GridSearchCV(model, params, cv=5)

    # fit the pipeline 
    pipeline.fit(X_train, y_train)

    # make prediction for each model

    y_pred = pipeline.predict(X_test)

    # print the performing metric 
    print(name, 'MSE:', mean_squared_error(y_test, y_pred))
    print(name, 'R2:',  r2_score(y_test, y_pred))
    print(name, 'MAE:', mean_absolute_error(y_test, y_pred))

LinearRegression MSE: 0.6948129686287711
LinearRegression R2: 0.4441368826121931
LinearRegression MAE: 0.6703807496461157
SVR MSE: 1.460718141299992
SVR R2: -0.1686013018011976
SVR MAE: 0.8935334948775431
DecisionTreeRegressor MSE: 0.8774153020453994
DecisionTreeRegressor R2: 0.2980516670532909
DecisionTreeRegressor MAE: 0.718948162948163
RandomForestRegressor MSE: 0.9091948500000013
RandomForestRegressor R2: 0.2726274458703116
RandomForestRegressor MAE: 0.7596020408163265
KNeighborsRegressor MSE: 0.6640950568462677
KNeighborsRegressor R2: 0.4687117753876745
KNeighborsRegressor MAE: 0.6203721488595437
GradientBoostingRegressor MSE: 0.8106801524004932
GradientBoostingRegressor R2: 0.35144101065487676
GradientBoostingRegressor MAE: 0.7657809818712309
XGBRegressor MSE: 0.6624107100882575
XGBRegressor R2: 0.4700592836840687
XGBRegressor MAE: 0.6549163442728472
CPU times: total: 5 s
Wall time: 3.3 s


In [23]:
# Create a dictionaries of list of models to evaluate performance with hyperparameters
models = { 
          'LinearRegression' : (LinearRegression(), {}),
          'SVR' : (SVR(), {'kernel': ['rbf', 'poly', 'sigmoid'], 'C': [0.1, 1, 10], 'gamma': [1, 0.1, 0.01], 'epsilon': [0.1, 0.01, 0.001]}),
          'DecisionTreeRegressor' : (DecisionTreeRegressor(), {'max_depth': [None, 5, 10], 'splitter': ['best', 'random']}),
          'RandomForestRegressor' : (RandomForestRegressor(), {'n_estimators': [10, 100, 1000], 'max_depth': [None, 5, 10]}),
          'KNeighborsRegressor' : (KNeighborsRegressor(), {'n_neighbors': np.arange(3, 100, 2), 'weights': ['uniform', 'distance']}),
          'GradientBoostingRegressor' : (GradientBoostingRegressor(), {'loss': ['ls', 'lad', 'huber', 'quantile'], 'n_estimators': [10, 100, 1000]}),
          'XGBRegressor' : (XGBRegressor(), {'n_estimators': [10, 100, 1000], 'learning_rate': [0.1, 0.01, 0.001]}),          
          }

# train and predict each model with evaluation metrics as well making a for loop to iterate over the models

for name, (model, params) in models.items():
    # create a pipline
    pipeline = GridSearchCV(model, params, cv=5)
    
    # fit the pipeline
    pipeline.fit(X_train, y_train)
    
    # make prediction from each model
    y_pred = pipeline.predict(X_test)
    
      
    # print the performing metric
    print(name, 'MSE: ', mean_squared_error(y_test, y_pred))
    print(name, 'R2: ', r2_score(y_test, y_pred))
    print(name, 'MAE: ', mean_absolute_error(y_test, y_pred))
    print('\n')

LinearRegression MSE:  0.6948129686287711
LinearRegression R2:  0.4441368826121931
LinearRegression MAE:  0.6703807496461157




## Add preprocessor inside the pipeline

In [None]:
# make a preprocessor

preprocessor = ColumnTransformer(
    transformers=['numeric_scaling', StandardScaler(), ['total_bill', 'size']], remainder='passthrough')


# Create a dictionaries of list of models to evaluate performance with hyperparameters
models = { 
          'LinearRegression' : (LinearRegression(), {}),
          'SVR' : (SVR(), {'kernel': ['rbf', 'poly', 'sigmoid'], 'C': [0.1, 1, 10], 'gamma': [1, 0.1, 0.01], 'epsilon': [0.1, 0.01, 0.001]}),
          'DecisionTreeRegressor' : (DecisionTreeRegressor(), {'max_depth': [None, 5, 10], 'splitter': ['best', 'random']}),
          'RandomForestRegressor' : (RandomForestRegressor(), {'n_estimators': [10, 100, 1000], 'max_depth': [None, 5, 10]}),
          'KNeighborsRegressor' : (KNeighborsRegressor(), {'n_neighbors': np.arange(3, 100, 2), 'weights': ['uniform', 'distance']}),
          'GradientBoostingRegressor' : (GradientBoostingRegressor(), {'loss': ['ls', 'lad', 'huber', 'quantile'], 'n_estimators': [10, 100, 1000]}),
          'XGBRegressor' : (XGBRegressor(), {'n_estimators': [10, 100, 1000], 'learning_rate': [0.1, 0.01, 0.001]}),          
          }

# train and predict each model with evaluation metrics as well making a for loop to iterate over the models

for name, (model, params) in models.items():
    # create a pipline with preprocessor
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])   
    
    # make a grid search cv to tune the hyperparameter
    grid_search = GridSearchCV(pipeline, params, cv=5)
    
    
    # fit the pipeline
    grid_search.fit(X_train, y_train)
    
    # make prediction from each model
    y_pred = grid_search.predict(X_test)
    
      
    # print the performing metric
    print(name, 'MSE: ', mean_squared_error(y_test, y_pred))
    print(name, 'R2: ', r2_score(y_test, y_pred))
    print(name, 'MAE: ', mean_absolute_error(y_test, y_pred))
    print('\n')

# Classifiers:

In [None]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# dont show warnings
import warnings
warnings.filterwarnings('ignore')

# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Create a dictionary of classifiers to evaluate
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier()
}

# Perform k-fold cross-validation and calculate the mean accuracy
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

for name, classifier in classifiers.items():
    scores = cross_val_score(classifier, X, y, cv=kfold)
    accuracy = np.mean(scores)
    print("Classifier:", name)
    print("Mean Accuracy:", accuracy)
    print()