## test1
The results show that different combinations of preprocessor and model yield different metrics.
The KBinsDiscretizer preprocessor combined with GradientBoostingClassifier yields relatively low metrics in general, but on the wine dataset it outperforms the other combinations.

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import warnings

from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier,
)
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import (
    KBinsDiscretizer,
    MinMaxScaler,
    StandardScaler,
    RobustScaler,
    QuantileTransformer,
)
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Ignore every warning raised from here on.
warnings.filterwarnings("ignore")

# Limits for how pandas renders frames.
pd.set_option("display.max_columns", 10)
pd.set_option("display.max_rows", 10)
pd.set_option("display.width", 1000)

# https://www.notion.so/cheaper01flagon/Machine-Learning-final-project-7fbacb61a9c14215bb6ddc641c32cfae?pvs=4

# Start from the "ggplot" style sheet, then override individual settings in
# a single update call.
plt.style.use("ggplot")
plt.rcParams.update(
    {
        "figure.figsize": (16, 9),
        "font.size": 14,
        "axes.labelcolor": "k",
        "axes.edgecolor": "k",
        "xtick.color": "k",
        "ytick.color": "k",
        "axes.facecolor": "white",
        "figure.facecolor": "white",
        "axes.grid": True,
    }
)


# Load the iris table; every column except "species" is a feature and
# "species" itself is the label.
iris = sns.load_dataset("iris")
data = iris.drop(columns="species")
target = iris["species"]

# Hold out 10% of the rows for testing, using a seeded shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.1,
    shuffle=True,
    random_state=0,
)

# Preprocessors to compare. The order must match ``param_grids`` below,
# because the two lists are consumed in lockstep with zip().
pre_processors = [
    StandardScaler(),
    MinMaxScaler(),
    KBinsDiscretizer(),
    RobustScaler(),
    QuantileTransformer(),
]

# One candidate-value grid per preprocessor, keyed by plain constructor
# argument name (no pipeline step prefix).
StandardScaler_params = {
    "with_mean": [True, False],
    "with_std": [True, False],
}
MinMaxScaler_params = {
    "feature_range": [(0, 1), (-1, 1)],
}
KBinsDiscretizer_params = {
    "n_bins": [3, 5, 10],
    "encode": ["ordinal", "onehot", "onehot-dense"],
    "strategy": ["uniform", "quantile", "kmeans"],
}
RobustScaler_params = {
    "with_centering": [True, False],
    "with_scaling": [True, False],
    "quantile_range": [(25.0, 75.0), (10.0, 90.0)],
}
QuantileTransformer_params = {
    "n_quantiles": [10, 100, 1000],
    "output_distribution": ["uniform", "normal"],
    "ignore_implicit_zeros": [True, False],
    # Must be an integer: ``subsample`` is a sample count, and the float
    # literal 1e5 is rejected by scikit-learn's parameter validation.
    "subsample": [100_000],
}

# Same order as ``pre_processors``.
param_grids = [
    StandardScaler_params,
    MinMaxScaler_params,
    KBinsDiscretizer_params,
    RobustScaler_params,
    QuantileTransformer_params,
]


# Classifiers to compare. The order must match ``model_params`` below,
# because the two lists are consumed in lockstep with zip().
models = [
    SVC(),
    RandomForestClassifier(),
    LogisticRegression(),
    GradientBoostingClassifier(),
    AdaBoostClassifier(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
]

# One candidate-value grid per classifier, keyed by plain constructor
# argument name (no pipeline step prefix).
#
# "auto" is deliberately absent from every ``max_features`` list: for these
# classifiers it selected the same value as "sqrt", so it only duplicated
# grid points, and newer scikit-learn releases reject it.
SVC_params = {
    "kernel": ["linear", "poly", "rbf", "sigmoid"],
    "C": [0.1, 1, 10],
    "degree": [1, 2, 3, 4, 5],
    "gamma": ["scale", "auto"],
}
RandomForestClassifier_params = {
    "n_estimators": [10, 100, 1000],
    "criterion": ["gini", "entropy"],
    "max_depth": [None, 3, 5, 10],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 3, 5],
    "max_features": ["sqrt", "log2"],
}
LogisticRegression_params = {
    # NOTE(review): the string "none" is spelled ``None`` in newer
    # scikit-learn releases - verify against the installed version.
    "penalty": ["l1", "l2", "elasticnet", "none"],
    "C": [0.1, 1, 10],
    "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
    "max_iter": [100, 1000, 10000],
}
GradientBoostingClassifier_params = {
    # NOTE(review): "deviance", "mse" and "mae" have been renamed or removed
    # in newer scikit-learn releases - verify against the installed version.
    "loss": ["deviance", "exponential"],
    "learning_rate": [0.1, 1, 10],
    "n_estimators": [10, 100, 1000],
    "criterion": ["friedman_mse", "mse", "mae"],
    "max_depth": [None, 3, 5, 10],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 3, 5],
    "max_features": ["sqrt", "log2"],
}
AdaBoostClassifier_params = {
    "n_estimators": [10, 100, 1000],
    "learning_rate": [0.1, 1, 10],
    # NOTE(review): "SAMME.R" is deprecated in newer scikit-learn releases -
    # verify against the installed version.
    "algorithm": ["SAMME", "SAMME.R"],
}
KNeighborsClassifier_params = {
    "n_neighbors": [3, 5, 10],
    "weights": ["uniform", "distance"],
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
    "leaf_size": [10, 30, 100],
    "p": [1, 2],
}
DecisionTreeClassifier_params = {
    "criterion": ["gini", "entropy"],
    "splitter": ["best", "random"],
    "max_depth": [None, 3, 5, 10],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 3, 5],
    "max_features": ["sqrt", "log2"],
}

# Same order as ``models``.
model_params = [
    SVC_params,
    RandomForestClassifier_params,
    LogisticRegression_params,
    GradientBoostingClassifier_params,
    AdaBoostClassifier_params,
    KNeighborsClassifier_params,
    DecisionTreeClassifier_params,
]

def get_best_preprocessor_and_model(pre_processors, param_grids, models, model_params, X, y):
    """Grid-search every preprocessor/model pairing and report the best one.

    Each preprocessor is combined with each model in a two-step pipeline and a
    5-fold ``GridSearchCV`` is run over the parameter grids of both steps.
    The pairing with the highest ``best_score_`` wins. Note that the number of
    candidates per pairing is the product of both grids' sizes.

    Args:
        pre_processors: Preprocessor estimators, aligned index-by-index with
            ``param_grids``.
        param_grids: One dict per preprocessor mapping plain parameter names
            to lists of candidate values.
        models: Model estimators, aligned index-by-index with
            ``model_params``.
        model_params: One dict per model mapping plain parameter names to
            lists of candidate values.
        X: Training features passed to ``GridSearchCV.fit``.
        y: Training targets passed to ``GridSearchCV.fit``.

    Returns:
        A tuple ``(best_preprocessor, best_model, best_params)``. The first
        two are the estimator objects that were passed in; ``best_params`` is
        the winning search's ``best_params_``, whose keys have the form
        ``<step name>__<parameter name>``. All three are ``None`` when no
        search achieved a score above 0.
    """
    best_score = 0
    best_preprocessor = None
    best_model = None
    best_model_params = None
    for preprocessor, param_grid in zip(pre_processors, param_grids):
        for model, params in zip(models, model_params):
            # Best effort: a pairing that cannot be searched is reported and
            # skipped so the remaining pairings still run.
            try:
                pipeline = make_pipeline(preprocessor, model)

                # A pipeline only accepts parameters addressed as
                # "<step>__<param>". Read the step names back from the
                # pipeline instead of guessing how make_pipeline named them.
                (preprocessor_step, _), (model_step, _) = pipeline.steps
                search_space = {
                    "{}__{}".format(preprocessor_step, name): values
                    for name, values in param_grid.items()
                }
                search_space.update(
                    {
                        "{}__{}".format(model_step, name): values
                        for name, values in params.items()
                    }
                )

                # Invalid parameter combinations score 0 (error_score) rather
                # than aborting the whole search. No separate pipeline.fit is
                # needed beforehand: the search fits its own clones.
                grid = GridSearchCV(
                    pipeline, search_space, error_score=0., n_jobs=-1, cv=5
                )
                grid.fit(X, y)

                # Update best score
                if grid.best_score_ > best_score:
                    best_score = grid.best_score_
                    best_preprocessor = preprocessor
                    best_model = model
                    best_model_params = grid.best_params_
            except Exception as e:
                print(e)
                print("---")

    # Report None rather than the misleading class name "NoneType" when no
    # pairing was selected.
    preprocessor_name = (
        None if best_preprocessor is None else type(best_preprocessor).__name__
    )
    model_name = None if best_model is None else type(best_model).__name__
    print("Best Score: {}".format(best_score))
    print("Best Preprocessor: {}".format(preprocessor_name))
    print("Best Model: {}".format(model_name))
    print("Best Parameters: {}".format(best_model_params))
    print("---")
    return best_preprocessor, best_model, best_model_params

# Empty results table with one column per reported field.
scoreboard = pd.DataFrame(
    columns=[
        "Preprocessor",
        "Model",
        "Accuracy",
        "F1",
        "Precision",
        "Recall",
    ],
)


def evaluate_model(model, X_test, y_test):
    """Score a fitted model on held-out data.

    Predicts on ``X_test`` and compares the predictions with ``y_test``.

    Returns:
        A 4-tuple ``(accuracy, f1, precision, recall)``; the last three use
        ``average="weighted"``.
    """
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    # F1, precision and recall share the same call shape, so compute them in
    # one pass, in the order they are returned.
    f1, precision, recall = (
        metric(y_test, predictions, average="weighted")
        for metric in (f1_score, precision_score, recall_score)
    )
    return accuracy, f1, precision, recall


# NOTE(review): the raise below runs on the very first iteration, so this cell
# aborts with Exception("stop") whenever both lists are non-empty. It looks
# like a deliberate way to disable the iris evaluation - confirm before
# removing it.
for preprocessor in pre_processors:
    for model in models:
        raise Exception("stop")
        # Unreachable: the raise above always executes first.
        pipeline = make_pipeline(preprocessor, model)

# Disabled evaluation code, kept as comments: it fits each pipeline on the
# training split, records the four metrics in ``scoreboard`` and renders one
# crosstab per metric.
#         try:
#             # Create a pipeline with the preprocessor and model
#             pipeline = make_pipeline(preprocessor, model)

#             # Train the model
#             pipeline.fit(X_train, y_train)

#             accuracy, f1, precision, recall = evaluate_model(pipeline, X_test, y_test)
#         except Exception as e:
#             print(e)
#             print("---")
#             accuracy, f1, precision, recall = 0, 0, 0, 0
#         scoreboard = scoreboard.append(
#             {
#                 "Preprocessor": preprocessor.__class__.__name__,
#                 "Model": model.__class__.__name__,
#                 "Accuracy": accuracy,
#                 "F1": f1,
#                 "Precision": precision,
#                 "Recall": recall,
#             },
#             ignore_index=True,
#         )
# for metrics in ["Accuracy", "F1", "Precision", "Recall"]:
#     display(
#         pd.crosstab(
#             scoreboard["Preprocessor"],
#             scoreboard["Model"],
#             values=scoreboard[metrics],
#             aggfunc="mean",
#             margins=True,
#         )
#         .style.background_gradient(cmap="Blues")
#         .format("{:.1%}")
#     )

In [19]:
from sklearn.datasets import load_breast_cancer

# Load Breast Cancer data
cancer = load_breast_cancer()
data = pd.DataFrame(cancer.data, columns=cancer.feature_names)
target = pd.Series(cancer.target)

# Split the data into training and testing sets (30% held out, seeded shuffle)
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.3, random_state=0, shuffle=True
)

# Result rows are gathered in a plain list and converted to a DataFrame once
# at the end: DataFrame.append was removed in pandas 2.0, and it copied the
# whole frame on every call.
scoreboard_rows = []
for preprocessor in pre_processors:
    for model in models:
        try:
            # Create a pipeline with the preprocessor and model
            pipeline = make_pipeline(preprocessor, model)
            # Train the model
            pipeline.fit(X_train, y_train)
            accuracy, f1, precision, recall = evaluate_model(pipeline, X_test, y_test)
        except Exception as e:
            # Best effort: a failing combination is reported and recorded
            # with all-zero metrics so the remaining combinations still run.
            print(e)
            print("---")
            accuracy, f1, precision, recall = 0, 0, 0, 0
        scoreboard_rows.append(
            {
                "Preprocessor": preprocessor.__class__.__name__,
                "Model": model.__class__.__name__,
                "Accuracy": accuracy,
                "F1": f1,
                "Precision": precision,
                "Recall": recall,
            }
        )

scoreboard = pd.DataFrame(
    scoreboard_rows,
    columns=["Preprocessor", "Model", "Accuracy", "F1", "Precision", "Recall"],
)

# One preprocessor-by-model table per metric; margins=True adds the "All"
# row and column holding the means.
for metrics in ["Accuracy", "F1", "Precision", "Recall"]:
    display(
        pd.crosstab(
            scoreboard["Preprocessor"],
            scoreboard["Model"],
            values=scoreboard[metrics],
            aggfunc="mean",
            margins=True,
        )
        .style.background_gradient(cmap="Blues")
        .format("{:.1%}")
    )

Model,AdaBoostClassifier,DecisionTreeClassifier,GradientBoostingClassifier,KNeighborsClassifier,LogisticRegression,RandomForestClassifier,SVC,All
Preprocessor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
KBinsDiscretizer,94.7%,93.0%,96.5%,94.7%,95.3%,94.2%,97.1%,95.1%
MinMaxScaler,96.5%,91.8%,96.5%,95.3%,95.3%,97.7%,97.7%,95.8%
QuantileTransformer,96.5%,90.6%,97.1%,96.5%,97.1%,97.1%,98.2%,96.2%
RobustScaler,96.5%,91.8%,97.1%,96.5%,96.5%,97.7%,98.2%,96.3%
StandardScaler,96.5%,92.4%,96.5%,95.9%,97.7%,97.7%,97.7%,96.3%
All,96.1%,91.9%,96.7%,95.8%,96.4%,96.8%,97.8%,95.9%


Model,AdaBoostClassifier,DecisionTreeClassifier,GradientBoostingClassifier,KNeighborsClassifier,LogisticRegression,RandomForestClassifier,SVC,All
Preprocessor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
KBinsDiscretizer,94.8%,93.0%,96.5%,94.7%,95.3%,94.2%,97.1%,95.1%
MinMaxScaler,96.5%,91.9%,96.5%,95.3%,95.3%,97.7%,97.7%,95.8%
QuantileTransformer,96.5%,90.7%,97.1%,96.5%,97.1%,97.1%,98.2%,96.2%
RobustScaler,96.5%,91.9%,97.1%,96.5%,96.5%,97.7%,98.2%,96.3%
StandardScaler,96.5%,92.5%,96.5%,95.9%,97.7%,97.7%,97.7%,96.3%
All,96.2%,92.0%,96.7%,95.8%,96.4%,96.8%,97.8%,95.9%


Model,AdaBoostClassifier,DecisionTreeClassifier,GradientBoostingClassifier,KNeighborsClassifier,LogisticRegression,RandomForestClassifier,SVC,All
Preprocessor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
KBinsDiscretizer,95.2%,93.3%,96.5%,94.7%,95.3%,94.2%,97.1%,95.2%
MinMaxScaler,96.5%,92.4%,96.5%,95.4%,95.5%,97.7%,97.7%,96.0%
QuantileTransformer,96.5%,91.0%,97.1%,96.5%,97.1%,97.1%,98.3%,96.2%
RobustScaler,96.5%,92.2%,97.1%,96.6%,96.5%,97.7%,98.2%,96.4%
StandardScaler,96.5%,92.9%,96.5%,96.0%,97.7%,97.7%,97.7%,96.4%
All,96.3%,92.4%,96.8%,95.8%,96.4%,96.9%,97.8%,96.0%


Model,AdaBoostClassifier,DecisionTreeClassifier,GradientBoostingClassifier,KNeighborsClassifier,LogisticRegression,RandomForestClassifier,SVC,All
Preprocessor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
KBinsDiscretizer,94.7%,93.0%,96.5%,94.7%,95.3%,94.2%,97.1%,95.1%
MinMaxScaler,96.5%,91.8%,96.5%,95.3%,95.3%,97.7%,97.7%,95.8%
QuantileTransformer,96.5%,90.6%,97.1%,96.5%,97.1%,97.1%,98.2%,96.2%
RobustScaler,96.5%,91.8%,97.1%,96.5%,96.5%,97.7%,98.2%,96.3%
StandardScaler,96.5%,92.4%,96.5%,95.9%,97.7%,97.7%,97.7%,96.3%
All,96.1%,91.9%,96.7%,95.8%,96.4%,96.8%,97.8%,95.9%


In [20]:
from sklearn.datasets import load_wine

# Load Wine data
wine = load_wine()
data = pd.DataFrame(wine.data, columns=wine.feature_names)
target = pd.Series(wine.target)

# Split the data into training and testing sets (10% held out, seeded shuffle)
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.1, random_state=0, shuffle=True
)

# Result rows are gathered in a plain list and converted to a DataFrame once
# at the end: DataFrame.append was removed in pandas 2.0, and it copied the
# whole frame on every call.
scoreboard_rows = []
for preprocessor in pre_processors:
    for model in models:
        try:
            # Create a pipeline with the preprocessor and model
            pipeline = make_pipeline(preprocessor, model)
            # Train the model
            pipeline.fit(X_train, y_train)
            accuracy, f1, precision, recall = evaluate_model(pipeline, X_test, y_test)
        except Exception as e:
            # Best effort: a failing combination is reported and recorded
            # with all-zero metrics so the remaining combinations still run.
            print(e)
            print("---")
            accuracy, f1, precision, recall = 0, 0, 0, 0
        scoreboard_rows.append(
            {
                "Preprocessor": preprocessor.__class__.__name__,
                "Model": model.__class__.__name__,
                "Accuracy": accuracy,
                "F1": f1,
                "Precision": precision,
                "Recall": recall,
            }
        )

scoreboard = pd.DataFrame(
    scoreboard_rows,
    columns=["Preprocessor", "Model", "Accuracy", "F1", "Precision", "Recall"],
)

# One preprocessor-by-model table per metric; margins=True adds the "All"
# row and column holding the means.
for metrics in ["Accuracy", "F1", "Precision", "Recall"]:
    display(
        pd.crosstab(
            scoreboard["Preprocessor"],
            scoreboard["Model"],
            values=scoreboard[metrics],
            aggfunc="mean",
            margins=True,
        )
        .style.background_gradient(cmap="Blues")
        .format("{:.1%}")
    )

Model,AdaBoostClassifier,DecisionTreeClassifier,GradientBoostingClassifier,KNeighborsClassifier,LogisticRegression,RandomForestClassifier,SVC,All
Preprocessor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
KBinsDiscretizer,83.3%,94.4%,100.0%,100.0%,100.0%,100.0%,100.0%,96.8%
MinMaxScaler,83.3%,100.0%,94.4%,100.0%,100.0%,100.0%,100.0%,96.8%
QuantileTransformer,83.3%,94.4%,94.4%,100.0%,100.0%,100.0%,100.0%,96.0%
RobustScaler,83.3%,100.0%,94.4%,100.0%,100.0%,100.0%,100.0%,96.8%
StandardScaler,83.3%,94.4%,94.4%,100.0%,100.0%,100.0%,100.0%,96.0%
All,83.3%,96.7%,95.6%,100.0%,100.0%,100.0%,100.0%,96.5%


Model,AdaBoostClassifier,DecisionTreeClassifier,GradientBoostingClassifier,KNeighborsClassifier,LogisticRegression,RandomForestClassifier,SVC,All
Preprocessor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
KBinsDiscretizer,82.7%,94.4%,100.0%,100.0%,100.0%,100.0%,100.0%,96.7%
MinMaxScaler,82.7%,100.0%,94.4%,100.0%,100.0%,100.0%,100.0%,96.7%
QuantileTransformer,82.7%,94.4%,94.4%,100.0%,100.0%,100.0%,100.0%,95.9%
RobustScaler,82.7%,100.0%,94.4%,100.0%,100.0%,100.0%,100.0%,96.7%
StandardScaler,82.7%,94.4%,94.4%,100.0%,100.0%,100.0%,100.0%,95.9%
All,82.7%,96.7%,95.6%,100.0%,100.0%,100.0%,100.0%,96.4%


Model,AdaBoostClassifier,DecisionTreeClassifier,GradientBoostingClassifier,KNeighborsClassifier,LogisticRegression,RandomForestClassifier,SVC,All
Preprocessor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
KBinsDiscretizer,88.9%,95.2%,100.0%,100.0%,100.0%,100.0%,100.0%,97.7%
MinMaxScaler,88.9%,100.0%,95.2%,100.0%,100.0%,100.0%,100.0%,97.7%
QuantileTransformer,88.9%,95.2%,95.2%,100.0%,100.0%,100.0%,100.0%,97.1%
RobustScaler,88.9%,100.0%,95.2%,100.0%,100.0%,100.0%,100.0%,97.7%
StandardScaler,88.9%,95.2%,95.2%,100.0%,100.0%,100.0%,100.0%,97.1%
All,88.9%,97.1%,96.2%,100.0%,100.0%,100.0%,100.0%,97.5%


Model,AdaBoostClassifier,DecisionTreeClassifier,GradientBoostingClassifier,KNeighborsClassifier,LogisticRegression,RandomForestClassifier,SVC,All
Preprocessor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
KBinsDiscretizer,83.3%,94.4%,100.0%,100.0%,100.0%,100.0%,100.0%,96.8%
MinMaxScaler,83.3%,100.0%,94.4%,100.0%,100.0%,100.0%,100.0%,96.8%
QuantileTransformer,83.3%,94.4%,94.4%,100.0%,100.0%,100.0%,100.0%,96.0%
RobustScaler,83.3%,100.0%,94.4%,100.0%,100.0%,100.0%,100.0%,96.8%
StandardScaler,83.3%,94.4%,94.4%,100.0%,100.0%,100.0%,100.0%,96.0%
All,83.3%,96.7%,95.6%,100.0%,100.0%,100.0%,100.0%,96.5%
