In [43]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

## Separação de dados

In [44]:
dados = pd.read_csv("data/Vehicle.csv")

# transform the last collumn into integer values
le = LabelEncoder()
le.fit(dados["Class"])
dados["Class"] = le.transform(dados["Class"])

df_dados = pd.DataFrame(dados)

# print which class each integer value represents
print("Encoded Classes:")
print(le.classes_)

Encoded Classes:
['bus' 'opel' 'saab' 'van']


In [45]:
# shuffle the data and split it into train, validation and test sets
def ShuffleuSplit(df_dados):
    # shuffle data to avoid bias
    df_dados = shuffle(df_dados)

    # separe the last column in another variable
    df_dados_without_class = df_dados.drop(columns=["Class"])

    x_treino, x_temp, y_treino, y_temp = train_test_split(
        df_dados_without_class,
        df_dados["Class"],
        test_size=0.5,
        stratify=df_dados["Class"],
    )
    x_validacao, x_teste, y_validacao, y_teste = train_test_split(
        x_temp, y_temp, test_size=0.5, stratify=y_temp
    )

    # print("Treino")
    # x_treino.info()
    # y_treino.info()

    # print("\nValidação")
    # x_validacao.info()
    # y_validacao.info()

    # print("\nTeste")
    # x_teste.info()
    # y_teste.info()
    return (
        x_treino,
        y_treino,
        x_validacao,
        y_validacao,
        x_teste,
        y_teste,
    )

In [46]:
# Plot Roc Curve
def plot_roc_curve(fper, tper, cor, classsificador):
    plt.clf()
    plt.plot(fper, tper, color=cor, label=classsificador)
    plt.plot([0, 1], [0, 1], color="green", linestyle="--")
    plt.xlabel("Taxa de Falsos Positivos (FPR)")
    plt.ylabel("Taxa de Verdadeiros Positivos (TPR)")
    plt.title("Curva ROC")
    plt.legend()
    plt.show()

# KNN

In [47]:
from sklearn.neighbors import KNeighborsClassifier


def grid_search_KNN(x_treino, y_treino, x_validacao, y_validacao):
    best_accuracy = -1
    best_k = -1
    best_distance_metric = ""
    best_KNN = None

    for k in range(1, 50, 2):
        for distance_metric in ["uniform", "distance"]:
            knn_instance = KNeighborsClassifier(
                n_jobs=-1, n_neighbors=k, weights=distance_metric
            )
            knn_instance.fit(x_treino, y_treino)
            knn_validation_pred = knn_instance.predict(x_validacao)
            accuracy = accuracy_score(y_validacao, knn_validation_pred)

            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_k = k
                best_distance_metric = distance_metric
                best_KNN = knn_instance

    return best_KNN, best_k, best_distance_metric


def KNN(x_treino, y_treino, x_validacao, y_validacao, x_teste, y_teste):
    best_KNN, best_k, best_distance_metric = grid_search_KNN(
        x_treino, y_treino, x_validacao, y_validacao
    )

    knn_test_pred = best_KNN.predict(x_teste)
    test_accuracy = accuracy_score(y_teste, knn_test_pred)

    return test_accuracy, best_KNN, best_k, best_distance_metric

# Árvore de Decisão

In [48]:
from sklearn.model_selection import ParameterGrid
from sklearn.tree import DecisionTreeClassifier


def grid_search_DT(x_treino, y_treino, x_validacao, y_validacao):
    param_grid = {
        "criterion": ["gini", "entropy", "log_loss"],
        "max_depth": range(1, 33, 2),
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
    }

    best_accuracy = 0
    best_params = None
    best_DT = None

    for params in ParameterGrid(param_grid):
        # print(f"Testing parameters: {params}")
        DT = DecisionTreeClassifier(**params)
        DT.fit(x_treino, y_treino)
        dt_validation_pred = DT.predict(x_validacao)
        accuracy = accuracy_score(y_validacao, dt_validation_pred)

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = params
            best_DT = DT

    return best_DT, best_params


def DT(x_treino, y_treino, x_validacao, y_validacao, x_teste, y_teste):
    best_DT, best_params = grid_search_DT(x_treino, y_treino, x_validacao, y_validacao)

    dt_test_pred = best_DT.predict(x_teste)
    test_accuracy = accuracy_score(y_teste, dt_test_pred)

    return test_accuracy, best_DT, best_params

# SVM

In [49]:
from sklearn.svm import SVC


def grid_search_SVM(x_treino, y_treino, x_validacao, y_validacao):
    param_grid = {
        "C": [0.1, 1.0, 10.0],  # Define C values
        "kernel": ["linear", "poly", "rbf", "sigmoid"],
    }

    best_accuracy = 0
    best_params = None
    best_SVM = None

    for params in ParameterGrid(param_grid):
        SVM = SVC(**params, probability=True)
        SVM.fit(x_treino, y_treino)
        svm_validation_pred = SVM.predict(x_validacao)
        accuracy = accuracy_score(y_validacao, svm_validation_pred)

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = params
            best_SVM = SVM

    return best_SVM, best_params


def SVM(x_treino, y_treino, x_validacao, y_validacao, x_teste, y_teste):
    best_SVM, best_params = grid_search_SVM(
        x_treino, y_treino, x_validacao, y_validacao
    )

    svm_test_pred = best_SVM.predict(x_teste)
    test_accuracy = accuracy_score(y_teste, svm_test_pred)

    return test_accuracy, best_SVM, best_params

# Naive Bayes

In [50]:
from sklearn.naive_bayes import GaussianNB


def NB(x_treino, y_treino, x_validacao, y_validacao, x_teste, y_teste):
    NB = GaussianNB()
    NB.fit(x_treino, y_treino)

    nb_predict_test = NB.predict(x_teste)
    test_accuracy = accuracy_score(y_teste, nb_predict_test)

    return test_accuracy, NB

# MLP

In [51]:
from sklearn.neural_network import MLPClassifier


def grid_search_MLP(x_treino, y_treino, x_validacao, y_validacao):
    param_grid = {
        "hidden_layer_sizes": [
            (100,),
            (50, 50),
            (100, 50, 25),
        ],  # Define hidden_layer_sizes
        "activation": ["identity", "logistic", "tanh", "relu"],
        "max_iter": [1000, 2000],  # Define max_iter values
        "learning_rate": ["constant", "invscaling", "adaptive"],
    }

    best_accuracy = 0
    best_params = None
    best_MLP = None

    for params in ParameterGrid(param_grid):
        MLP = MLPClassifier(**params)
        MLP.fit(x_treino, y_treino)
        mlp_validation_pred = MLP.predict(x_validacao)
        accuracy = accuracy_score(y_validacao, mlp_validation_pred)

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = params
            best_MLP = MLP

    return best_MLP, best_params


def MLP(x_treino, y_treino, x_validacao, y_validacao, x_teste, y_teste):
    best_MLP, best_params = grid_search_MLP(
        x_treino, y_treino, x_validacao, y_validacao
    )

    mlp_test_pred = best_MLP.predict(x_teste)
    test_accuracy = accuracy_score(y_teste, mlp_test_pred)

    return test_accuracy, best_MLP, best_params

# Borda Count

In [52]:
class BordaCountClassifier:
    def __init__(self, estimators):
        """
        Initialize the BordaCountClassifier.

        Parameters:
        - estimators: List of classifiers.
        """
        self.estimators = estimators

    def fit(self, X, y):
        """
        Fit each estimator to the data.

        Parameters:
        - X: Features.
        - y: Target labels.
        """
        for _, estimator in self.estimators:
            estimator.fit(X, y)

    def predict(self, X):
        """
        Predict the class labels using the Borda Count approach.

        Parameters:
        - X: Features.

        Returns:
        - Predicted class labels.
        """
        all_probs = [estimator.predict_proba(X) for _, estimator in self.estimators]
        all_probs = np.stack(all_probs)

        # Get rankings for each classifier's predictions
        rankings = np.argsort(-all_probs, axis=-1)

        # Assign points based on rankings
        num_classes = all_probs.shape[2]
        points = np.zeros_like(rankings)
        for rank in range(num_classes):
            points[rankings == rank] = num_classes - 1 - rank

        # Sum points across classifiers
        total_points = points.sum(axis=0)

        # Get the final prediction as the class with the highest total points
        final_predictions = np.argmax(total_points, axis=1)

        return final_predictions

# Main

In [53]:
from sklearn.ensemble import VotingClassifier

output = pd.DataFrame(columns=["KNN", "DT", "NB", "SVM", "MLP", "MV", "SV", "BC"])

knn_best_params = pd.DataFrame(columns=["K", "Distance Metric"])
dt_best_params = pd.DataFrame(
    columns=["criterion", "max_depth", "min_samples_split", "min_samples_leaf"]
)
svm_best_params = pd.DataFrame(columns=["C", "kernel"])
mlp_best_params = pd.DataFrame(
    columns=["hidden_layer_sizes", "activation", "max_iter", "learning_rate"]
)

## find the number of elements of each class in the dataset
print("============================================")
print(le.classes_)
print("============================================")

for i in range(20):
    shuffled_data = ShuffleuSplit(df_dados)

    # KNN Execution
    knn_accuracy, knn_model, *knn_params = KNN(*shuffled_data)
    knn_best_params.loc[len(knn_best_params.index)] = knn_params

    # # show confusion matrix for KNN
    # knn_test_pred = knn_model.predict(shuffled_data[4])
    # print("\nKNN Confusion Matrix:")
    # print("Accuracy: ", knn_accuracy)
    # print(metrics.confusion_matrix(shuffled_data[5], knn_test_pred))

    # DT Execution
    dt_accuracy, dt_model, dt_params = DT(*shuffled_data)
    dt_params = [
        dt_params[key]
        for key in ["criterion", "max_depth", "min_samples_split", "min_samples_leaf"]
    ]
    dt_best_params.loc[len(dt_best_params.index)] = dt_params

    # # show confusion matrix for DT
    # dt_test_pred = dt_model.predict(shuffled_data[4])
    # print("\nDT Confusion Matrix:")
    # print("Accuracy: ", dt_accuracy)
    # print(metrics.confusion_matrix(shuffled_data[5], dt_test_pred))

    # SVM Execution
    svm_accuracy, svm_model, svm_params = SVM(*shuffled_data)
    svm_params = [svm_params[key] for key in ["C", "kernel"]]
    svm_best_params.loc[len(svm_best_params.index)] = svm_params

    # # show confusion matrix for SVM
    # svm_test_pred = svm_model.predict(shuffled_data[4])
    # print("\nSVM Confusion Matrix:")
    # print("Accuracy: ", svm_accuracy)
    # print(metrics.confusion_matrix(shuffled_data[5], svm_test_pred))

    # NB Execution
    nb_accuracy, nb_model = NB(*shuffled_data)

    # # show confusion matrix for NB
    # nb_test_pred = nb_model.predict(shuffled_data[4])
    # print("\nNB Confusion Matrix:")
    # print("Accuracy: ", nb_accuracy)
    # print(metrics.confusion_matrix(shuffled_data[5], nb_test_pred))

    # MLP Execution
    mlp_accuracy, mlp_model, mlp_params = MLP(*shuffled_data)
    mlp_params = [
        mlp_params[key]
        for key in ["hidden_layer_sizes", "activation", "max_iter", "learning_rate"]
    ]
    mlp_best_params.loc[len(mlp_best_params.index)] = mlp_params

    # # show confusion matrix for MLP
    # mlp_test_pred = mlp_model.predict(shuffled_data[4])
    # print("\nMLP Confusion Matrix:")
    # print("Accuracy: ", mlp_accuracy)
    # print(metrics.confusion_matrix(shuffled_data[5], mlp_test_pred))

    # create a multiple classifier approach with VotingClassifier
    estimators = [
        ("knn", knn_model),
        ("dt", dt_model),
        ("nb", nb_model),
        ("svm", svm_model),
        ("mlp", mlp_model),
    ]

    # majority voting
    majority_voting_classifier = VotingClassifier(estimators=estimators, voting="hard")
    majority_voting_classifier.fit(shuffled_data[0], shuffled_data[1])

    voting_test_pred = majority_voting_classifier.predict(shuffled_data[4])
    majority_voting_accuracy = accuracy_score(shuffled_data[5], voting_test_pred)

    # print("\nMajority Rule Voting Confusion Matrix:")
    # print("Accuracy: ", majority_voting_accuracy)
    # print(metrics.confusion_matrix(shuffled_data[5], voting_test_pred))

    # sum voting
    sum_voting_classifier = VotingClassifier(estimators=estimators, voting="soft")
    sum_voting_classifier.fit(shuffled_data[0], shuffled_data[1])

    voting_test_pred = sum_voting_classifier.predict(shuffled_data[4])
    sum_voting_accuracy = accuracy_score(shuffled_data[5], voting_test_pred)

    # print("\nSum Rule Voting Confusion Matrix:")
    # print("Accuracy: ", sum_voting_accuracy)
    # print(metrics.confusion_matrix(shuffled_data[5], voting_test_pred))

    # borda count
    borda_clf = BordaCountClassifier(estimators=estimators)
    borda_clf.fit(shuffled_data[0], shuffled_data[1])
    borda_count_predictions = borda_clf.predict(shuffled_data[4])
    bc_accuracy = accuracy_score(shuffled_data[5], borda_count_predictions)

    # print("\nBorda Count Confusion Matrix:")
    # print("Accuracy: ", bc_accuracy)
    # print(metrics.confusion_matrix(shuffled_data[5], borda_count_predictions))

    # add accuracies to output
    output.loc[len(output.index)] = [
        knn_accuracy,
        dt_accuracy,
        nb_accuracy,
        svm_accuracy,
        mlp_accuracy,
        majority_voting_accuracy,
        sum_voting_accuracy,
        bc_accuracy,
    ]

    # print current index and current line
    print("============================================")
    print(
        i,
        ":",
        [
            knn_accuracy,
            dt_accuracy,
            nb_accuracy,
            svm_accuracy,
            mlp_accuracy,
            majority_voting_accuracy,
            sum_voting_accuracy,
            bc_accuracy,
        ],
    )
    print("============================================")


# generate csv from knn best params, ignoring the index columns
knn_best_params.to_csv("best_params/knn.csv", index=False)
dt_best_params.to_csv("best_params/dt.csv", index=False)
svm_best_params.to_csv("best_params/svm.csv", index=False)
mlp_best_params.to_csv("best_params/mlp.csv", index=False)

# generate csv from output, ignoring the index columns
output.to_csv("output.csv", inde x=False)

['bus' 'opel' 'saab' 'van']
0 : [0.6179245283018868, 0.6462264150943396, 0.419811320754717, 0.7405660377358491, 0.6745283018867925, 0.6839622641509434, 0.6698113207547169, 0.4339622641509434]
1 : [0.5990566037735849, 0.6415094339622641, 0.41037735849056606, 0.7594339622641509, 0.6745283018867925, 0.7028301886792453, 0.7311320754716981, 0.44339622641509435]
2 : [0.5849056603773585, 0.6320754716981132, 0.4858490566037736, 0.7877358490566038, 0.6226415094339622, 0.6792452830188679, 0.6886792452830188, 0.4056603773584906]
3 : [0.6556603773584906, 0.7405660377358491, 0.419811320754717, 0.7924528301886793, 0.7122641509433962, 0.7735849056603774, 0.7641509433962265, 0.49528301886792453]
4 : [0.5849056603773585, 0.7358490566037735, 0.4528301886792453, 0.7971698113207547, 0.6745283018867925, 0.7594339622641509, 0.7122641509433962, 0.4528301886792453]
5 : [0.6415094339622641, 0.6886792452830188, 0.4481132075471698, 0.7877358490566038, 0.6933962264150944, 0.7405660377358491, 0.7169811320754716, 0