# Best Machine Learning Model Evaluation Loop

In [None]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    precision_score,
    recall_score,
    f1_score,
)
from sklearn.model_selection import cross_val_score
from sklearn import tree

# import dataset
titan = sns.load_dataset("titanic")

# categorize x and y
x = titan[["pclass", "sex", "age", "sibsp", "parch", "fare"]]
y = titan["survived"]

# this converts sex into dummy variables here we are converting sex into dummy variables i.e., male=1, and female=0
x = pd.get_dummies(x, columns=["sex"])  # label encoding

# deal with missing values of age
x.fillna({"age": x["age"].mean()}, inplace=True)

# train model
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)
# random state=42 for reproducibility i.e same split every time


# models evaluation
models = [
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    GaussianNB(),
    SVC(),
    KNeighborsClassifier(),
]
model_names = [
    "DecisionTreeClassifier",
    "RandomForestClassifier",
    "LogisticRegression",
    "GaussianNB",
    "SVC",
    "KNeighborsClassifier",
]

models_scores = []
for model, model_name in zip(models, model_names):
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    models_scores.append(
        [model_name, accuracy]
    )  # Changed from model_names to model_name


sorted_models = sorted(models_scores, key=lambda x: x[1], reverse=True)
# lambda function is used to sort the models by their accuracy scores in descending order.
# This is a minimal function that takes a model and its accuracy score as input and returns the accuracy score.

for model in sorted_models:
    print(f"{model[0]} accuracy_score: {model[1]:.2f}")
    # {model[1]:.2f} is used to format the accuracy score to two decimal places.
    # f here is used to format the string.
    # The .2f at the end of the string specifies that the value should be formatted as a floating-point number with two decimal places.
    # The f is used to indicate that the value should be formatted as a floating-point number.

LogisticRegression accuracy_score: 0.81
RandomForestClassifier accuracy_score: 0.80
GaussianNB accuracy_score: 0.79
DecisionTreeClassifier accuracy_score: 0.77
KNeighborsClassifier accuracy_score: 0.69
SVC accuracy_score: 0.66


# Same findings on Different dataset

In [None]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    precision_score,
    recall_score,
    f1_score,
)
from sklearn.model_selection import cross_val_score
from sklearn import tree

# import dataset
tips = sns.load_dataset("tips")
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
tips.isnull().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [None]:
tips["day"].unique()

['Sun', 'Sat', 'Thur', 'Fri']
Categories (4, object): ['Thur', 'Fri', 'Sat', 'Sun']

In [None]:
tips["time"].unique()

['Dinner', 'Lunch']
Categories (2, object): ['Lunch', 'Dinner']

In [None]:
# categorize x and y
x = tips[["total_bill", "sex", "smoker", "time"]]
y = tips["tip"]

# this converts sex into dummy variables here we are converting sex into dummy variables i.e., male=1, and female=0
x = pd.get_dummies(x, columns=["sex"])  # label encoding
x = pd.get_dummies(x, columns=["time"])
x = pd.get_dummies(x, columns=["smoker"])

# turn continuous values into categorical
y = pd.cut(tips["tip"], bins=3, labels=["low", "medium", "high"])

# train model
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)
# random state=42 for reproducibility i.e same split every time


# models evaluation
models = [
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    GaussianNB(),
    SVC(),
    KNeighborsClassifier(),
]
model_names = [
    "DecisionTreeClassifier",
    "RandomForestClassifier",
    "LogisticRegression",
    "GaussianNB",
    "SVC",
    "KNeighborsClassifier",
]

models_scores = []
for model, model_name in zip(models, model_names):
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    models_scores.append(
        [model_name, accuracy]
    )  # Changed from model_names to model_name


sorted_models = sorted(models_scores, key=lambda x: x[1], reverse=True)
# lambda function is used to sort the models by their accuracy scores in descending order.
# This is a minimal function that takes a model and its accuracy score as input and returns the accuracy score.

for model in sorted_models:
    print(f"{model[0]} accuracy_score: {model[1]:.2f}")
    # {model[1]:.2f} is used to format the accuracy score to two decimal places.
    # f here is used to format the string.
    # The .2f at the end of the string specifies that the value should be formatted as a floating-point number with two decimal places.
    # The f is used to indicate that the value should be formatted as a floating-point number.

LogisticRegression accuracy_score: 0.92
KNeighborsClassifier accuracy_score: 0.92
SVC accuracy_score: 0.90
DecisionTreeClassifier accuracy_score: 0.88
RandomForestClassifier accuracy_score: 0.88
GaussianNB accuracy_score: 0.53


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    precision_score,
    recall_score,
    f1_score,
)
from sklearn.model_selection import cross_val_score
from sklearn import tree

# categorize x and y
x = tips[["total_bill", "sex", "smoker", "time"]]
y = tips["tip"]

# this converts sex into dummy variables here we are converting sex into dummy variables i.e., male=1, and female=0
x = pd.get_dummies(x, columns=["sex"])  # label encoding
x = pd.get_dummies(x, columns=["time"])
x = pd.get_dummies(x, columns=["smoker"])

# turn continuous values into categorical
y = pd.cut(tips["tip"], bins=3, labels=["low", "medium", "high"])

# train model
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)
# random state=42 for reproducibility i.e same split every time


# models evaluation
models = [
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    GaussianNB(),
    SVC(),
    KNeighborsClassifier(),
]
model_names = [
    "DecisionTreeClassifier",
    "RandomForestClassifier",
    "LogisticRegression",
    "GaussianNB",
    "SVC",
    "KNeighborsClassifier",
]

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

models_scores = []
for model, model_name in zip(models, model_names):
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # Multiclass evaluation metrics
    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average="weighted"),
        "Recall": recall_score(y_test, y_pred, average="weighted"),
        "F1": f1_score(y_test, y_pred, average="weighted"),
    }
    models_scores.append([model_name, metrics])

# Print results
for model in sorted(models_scores, key=lambda x: x[1]["Accuracy"], reverse=True):
    print(f"{model[0]}:")
    print(f"  Accuracy: {model[1]['Accuracy']:.3f}")
    print(f"  Precision: {model[1]['Precision']:.3f}")
    print(f"  Recall: {model[1]['Recall']:.3f}")
    print(f"  F1: {model[1]['F1']:.3f}")
    print()

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


LogisticRegression:
  Accuracy: 0.918
  Precision: 0.926
  Recall: 0.918
  F1: 0.919

KNeighborsClassifier:
  Accuracy: 0.918
  Precision: 0.943
  Recall: 0.918
  F1: 0.902

SVC:
  Accuracy: 0.898
  Precision: 0.806
  Recall: 0.898
  F1: 0.850

DecisionTreeClassifier:
  Accuracy: 0.878
  Precision: 0.949
  Recall: 0.878
  F1: 0.907

RandomForestClassifier:
  Accuracy: 0.857
  Precision: 0.920
  Recall: 0.857
  F1: 0.881

GaussianNB:
  Accuracy: 0.531
  Precision: 0.964
  Recall: 0.531
  F1: 0.683

