In [46]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

## Separa√ß√£o de dados

In [47]:
dados = pd.read_csv("data/Vehicle.csv")

# transform the last collumn into integer values
le = LabelEncoder()
le.fit(dados["Class"])
dados["Class"] = le.transform(dados["Class"])

df_dados = pd.DataFrame(dados)

# print which class each integer value represents
print("Encoded Classes:")
print(le.classes_)

Encoded Classes:
['bus' 'opel' 'saab' 'van']


In [48]:
# shuffle the data and split it into train, validation and test sets
def ShuffleuSplit(df_dados):
    # shuffle data to avoid bias
    df_dados = shuffle(df_dados)

    # separe the last column in another variable
    df_dados_without_class = df_dados.drop(columns=["Class"])

    x_treino, x_temp, y_treino, y_temp = train_test_split(
        df_dados_without_class,
        df_dados["Class"],
        test_size=0.5,
        stratify=df_dados["Class"],
    )
    x_validacao, x_teste, y_validacao, y_teste = train_test_split(
        x_temp, y_temp, test_size=0.5, stratify=y_temp
    )

    # print("Treino")
    # x_treino.info()
    # y_treino.info()

    # print("\nValida√ß√£o")
    # x_validacao.info()
    # y_validacao.info()

    # print("\nTeste")
    # x_teste.info()
    # y_teste.info()
    return (
        x_treino,
        y_treino,
        x_validacao,
        y_validacao,
        x_teste,
        y_teste,
    )

In [49]:
# Plot Roc Curve
def plot_roc_curve(fper, tper, cor, classsificador):
    plt.clf()
    plt.plot(fper, tper, color=cor, label=classsificador)
    plt.plot([0, 1], [0, 1], color="green", linestyle="--")
    plt.xlabel("Taxa de Falsos Positivos (FPR)")
    plt.ylabel("Taxa de Verdadeiros Positivos (TPR)")
    plt.title("Curva ROC")
    plt.legend()
    plt.show()

# KNN

In [50]:
from sklearn.neighbors import KNeighborsClassifier


def grid_search_KNN(x_treino, y_treino, x_validacao, y_validacao):
    best_accuracy = -1
    best_k = -1
    best_distance_metric = ""
    best_KNN = None

    for k in range(1, 50, 2):
        for distance_metric in ["uniform", "distance"]:
            knn_instance = KNeighborsClassifier(
                n_jobs=-1, n_neighbors=k, weights=distance_metric
            )
            knn_instance.fit(x_treino, y_treino)
            knn_validation_pred = knn_instance.predict(x_validacao)
            accuracy = accuracy_score(y_validacao, knn_validation_pred)

            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_k = k
                best_distance_metric = distance_metric
                best_KNN = knn_instance

    return best_KNN, best_k, best_distance_metric


def KNN(x_treino, y_treino, x_validacao, y_validacao, x_teste, y_teste):
    best_KNN, best_k, best_distance_metric = grid_search_KNN(
        x_treino, y_treino, x_validacao, y_validacao
    )

    knn_test_pred = best_KNN.predict(x_teste)
    test_accuracy = accuracy_score(y_teste, knn_test_pred)

    return test_accuracy, best_KNN, best_k, best_distance_metric

# √Årvore de Decis√£o

In [51]:
from sklearn.model_selection import ParameterGrid
from sklearn.tree import DecisionTreeClassifier


def grid_search_DT(x_treino, y_treino, x_validacao, y_validacao):
    param_grid = {
        "criterion": ["gini", "entropy", "log_loss"],
        "max_depth": range(1, 33, 2),
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
    }

    best_accuracy = 0
    best_params = None
    best_DT = None

    for params in ParameterGrid(param_grid):
        # print(f"Testing parameters: {params}")
        DT = DecisionTreeClassifier(**params)
        DT.fit(x_treino, y_treino)
        dt_validation_pred = DT.predict(x_validacao)
        accuracy = accuracy_score(y_validacao, dt_validation_pred)

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = params
            best_DT = DT

    return best_DT, best_params


def DT(x_treino, y_treino, x_validacao, y_validacao, x_teste, y_teste):
    best_DT, best_params = grid_search_DT(x_treino, y_treino, x_validacao, y_validacao)

    dt_test_pred = best_DT.predict(x_teste)
    test_accuracy = accuracy_score(y_teste, dt_test_pred)

    return test_accuracy, best_DT, best_params

# SVM

In [52]:
from sklearn.svm import SVC


def grid_search_SVM(x_treino, y_treino, x_validacao, y_validacao):
    param_grid = {
        "C": [0.1, 1.0, 10.0],  # Define C values
        "kernel": ["linear", "poly", "rbf", "sigmoid"],
    }

    best_accuracy = 0
    best_params = None
    best_SVM = None

    for params in ParameterGrid(param_grid):
        SVM = SVC(**params, probability=True)
        SVM.fit(x_treino, y_treino)
        svm_validation_pred = SVM.predict(x_validacao)
        accuracy = accuracy_score(y_validacao, svm_validation_pred)

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = params
            best_SVM = SVM

    return best_SVM, best_params


def SVM(x_treino, y_treino, x_validacao, y_validacao, x_teste, y_teste):
    best_SVM, best_params = grid_search_SVM(
        x_treino, y_treino, x_validacao, y_validacao
    )

    svm_test_pred = best_SVM.predict(x_teste)
    test_accuracy = accuracy_score(y_teste, svm_test_pred)

    return test_accuracy, best_SVM, best_params

# Naive Bayes

In [53]:
from sklearn.naive_bayes import GaussianNB


def NB(x_treino, y_treino, x_validacao, y_validacao, x_teste, y_teste):
    NB = GaussianNB()
    NB.fit(x_treino, y_treino)

    nb_predict_test = NB.predict(x_teste)
    test_accuracy = accuracy_score(y_teste, nb_predict_test)

    return test_accuracy, NB

# MLP

In [54]:
from sklearn.neural_network import MLPClassifier


def grid_search_MLP(x_treino, y_treino, x_validacao, y_validacao):
    param_grid = {
        "hidden_layer_sizes": [
            (100,),
            (50, 50),
            (100, 50, 25),
        ],  # Define hidden_layer_sizes
        "activation": ["identity", "logistic", "tanh", "relu"],
        "max_iter": [1000, 2000],  # Define max_iter values
        "learning_rate": ["constant", "invscaling", "adaptive"],
    }

    best_accuracy = 0
    best_params = None
    best_MLP = None

    for params in ParameterGrid(param_grid):
        MLP = MLPClassifier(**params)
        MLP.fit(x_treino, y_treino)
        mlp_validation_pred = MLP.predict(x_validacao)
        accuracy = accuracy_score(y_validacao, mlp_validation_pred)

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = params
            best_MLP = MLP

    return best_MLP, best_params


def MLP(x_treino, y_treino, x_validacao, y_validacao, x_teste, y_teste):
    best_MLP, best_params = grid_search_MLP(
        x_treino, y_treino, x_validacao, y_validacao
    )

    mlp_test_pred = best_MLP.predict(x_teste)
    test_accuracy = accuracy_score(y_teste, mlp_test_pred)

    return test_accuracy, best_MLP, best_params

# Main

In [55]:
# Main
output = pd.DataFrame(columns=["KNN", "DT", "NB", "SVM", "MLP"])

knn_best_params = pd.DataFrame(columns=["K", "Distance Metric"])
dt_best_params = pd.DataFrame(
    columns=["criterion", "max_depth", "min_samples_split", "min_samples_leaf"]
)
svm_best_params = pd.DataFrame(columns=["C", "kernel"])
mlp_best_params = pd.DataFrame(
    columns=["hidden_layer_sizes", "activation", "max_iter", "learning_rate"]
)


for i in range(1):
    shuffled_data = ShuffleuSplit(df_dados)

    # KNN Execution
    knn_accuracy, knn_model, *knn_params = KNN(*shuffled_data)
    knn_best_params.loc[len(knn_best_params.index)] = knn_params

    # DT Execution
    dt_accuracy, dt_model, dt_params = DT(*shuffled_data)
    dt_params = [
        dt_params[key]
        for key in ["criterion", "max_depth", "min_samples_split", "min_samples_leaf"]
    ]
    dt_best_params.loc[len(dt_best_params.index)] = dt_params

    # SVM Execution
    svm_accuracy, svm_model, svm_params = SVM(*shuffled_data)
    svm_params = [svm_params[key] for key in ["C", "kernel"]]
    svm_best_params.loc[len(svm_best_params.index)] = svm_params

    # NB Execution
    nb_accuracy, nb_model = NB(*shuffled_data)

    # MLP Execution
    mlp_accuracy, mlp_model, mlp_params = MLP(*shuffled_data)
    mlp_params = [
        mlp_params[key]
        for key in ["hidden_layer_sizes", "activation", "max_iter", "learning_rate"]
    ]
    mlp_best_params.loc[len(mlp_best_params.index)] = mlp_params

    # add accuracies to output
    output.loc[len(output.index)] = [
        knn_accuracy,
        dt_accuracy,
        svm_accuracy,
        nb_accuracy,
        mlp_accuracy,
    ]

# generate csv from knn best params, ignoring the index columns
knn_best_params.to_csv("best_params/knn.csv", index=False)
dt_best_params.to_csv("best_params/dt.csv", index=False)
svm_best_params.to_csv("best_params/svm.csv", index=False)
mlp_best_params.to_csv("best_params/mlp.csv", index=False)

# generate csv from output, ignoring the index columns
output.to_csv("output.csv", index=False)

# Parallel Main (‚ò†Ô∏èüî¥ Danger Zone)

In [56]:
from concurrent.futures import ThreadPoolExecutor

import pandas as pd


# Define a function to execute a model and return accuracy and params
def execute_model(model_function, shuffled_data, best_params_df, model_name):
    accuracy, model, *params = model_function(*shuffled_data)
    params_dict = {
        "KNN": ["K", "Distance Metric"],
        "DT": ["Criterion", "Max Depth", "Min Samples Split", "Min Samples Leaf"],
        "SVM": ["C", "Kernel"],
        "MLP": ["Hidden Layer Sizes", "Activation", "Max Iter", "Learning Rate"],
    }
    params = [params_dict[model_name][i] for i in range(len(params))]
    best_params_df.loc[len(best_params_df.index)] = params
    return accuracy


# Create DataFrames to store best parameters
knn_best_params = pd.DataFrame(columns=["K", "Distance Metric"])
dt_best_params = pd.DataFrame(
    columns=["Criterion", "Max Depth", "Min Samples Split", "Min Samples Leaf"]
)
svm_best_params = pd.DataFrame(columns=["C", "Kernel"])
mlp_best_params = pd.DataFrame(
    columns=["Hidden Layer Sizes", "Activation", "Max Iter", "Learning Rate"]
)

# Create a DataFrame to store accuracy values
output = pd.DataFrame(columns=["KNN", "DT", "SVM", "NB", "MLP"])

# Shuffle and split data
shuffled_data = [ShuffleuSplit(df_dados) for _ in range(20)]

# Create a ThreadPoolExecutor
with ThreadPoolExecutor(max_workers=8) as executor:  # Adjust max_workers as needed
    futures = []

    for i in range(1):
        futures.append(
            executor.submit(
                execute_model, KNN, shuffled_data[i], knn_best_params, "KNN"
            )
        )
        futures.append(
            executor.submit(execute_model, DT, shuffled_data[i], dt_best_params, "DT")
        )
        futures.append(
            executor.submit(
                execute_model, SVM, shuffled_data[i], svm_best_params, "SVM"
            )
        )
        futures.append(
            executor.submit(
                execute_model, MLP, shuffled_data[i], mlp_best_params, "MLP"
            )
        )

    for future in futures:
        accuracy = future.result()
        output.loc[len(output.index)] = [accuracy] * 5

# Generate CSV files
knn_best_params.to_csv("best_params/parallel_knn.csv", index=False)
dt_best_params.to_csv("best_params/parallel_dt.csv", index=False)
svm_best_params.to_csv("best_params/parallel_svm.csv", index=False)
mlp_best_params.to_csv("best_params/parallel_mlp.csv", index=False)
output.to_csv("parallel_output.csv", index=False)

ValueError: cannot set a row with mismatched columns