# Setup

In [11]:
# Upgrade pip, then install the third-party packages this notebook imports (-q keeps pip quiet).
# NOTE(review): no versions are pinned, so a later re-run may resolve to different releases —
# consider pinning (pkg==x.y.z) for reproducibility.
%pip install --upgrade pip -q
%pip install pandas scikit-learn imbalanced-learn scikit-optimize sklearn-genetic-opt -q

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [12]:
import os
import zipfile
import pandas as pd 
import numpy as np
import time
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from skopt import BayesSearchCV  
from skopt.space import Categorical as SkoptCategorical
from sklearn_genetic import GASearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, cohen_kappa_score
from sklearn_genetic.space import Categorical as GeneticCategorical
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA


In [13]:
# Locations used for the German Credit data: working directory, downloaded archive,
# folder the archive is unpacked into, and the numeric data file inside that folder.
path_dataset = 'dataset'
path_dataset_zip = path_dataset + '/german.zip'
path_dataset_extracted = path_dataset + '/german'
path_dataset_file = path_dataset_extracted + '/german.data-numeric'

In [14]:
os.makedirs(path_dataset, exist_ok=True)
if not os.path.exists(path_dataset_zip):
  !curl https://archive.ics.uci.edu/static/public/144/statlog+german+credit+data.zip -o {path_dataset_zip}
zipfile.ZipFile(path_dataset_zip).extractall(path_dataset_extracted)

In [15]:
# Load the data file: no header row, columns separated by runs of whitespace.
# The separator is a raw string so `\s` reaches pandas as a regex token instead of being an
# invalid Python escape sequence (which triggers a warning on recent Python versions).
df = pd.read_csv(path_dataset_file, header=None, sep=r'\s+')
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,1,6,4,12,5,5,3,4,1,67,...,0,0,1,0,0,1,0,0,1,1
1,2,48,2,60,1,3,2,2,1,22,...,0,0,1,0,0,1,0,0,1,2
2,4,12,4,21,1,4,3,3,1,49,...,0,0,1,0,0,1,0,1,0,1
3,1,42,2,79,1,4,3,4,2,45,...,0,0,0,0,0,0,0,0,1,1
4,1,24,3,49,1,3,3,4,4,53,...,1,0,1,0,0,0,0,0,1,2


# Pré-processamento dos dados

In [16]:
# Features are every column except the last one; the last column is the target.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# RobustScaler is used to normalise the features while reducing the impact of outliers.
# NOTE(review): the scaler is fitted on the whole dataset, before the train/test split done
# later in the notebook — possible leakage into the test split; confirm this is intended.
scaler = RobustScaler()
X_scalered = scaler.fit_transform(X)

# Total number of null cells in the frame (the recorded output is 0: no missing data).
df.isnull().sum().sum()

np.int64(0)

# Configuração dos experimentos

In [17]:
# Candidate hyper-parameter values for each classifier, keyed by the short algorithm name
# that is later used to look the grid up.
param_grids = dict(
    RF=dict(
        n_estimators=[50, 100, 200],
        max_depth=[None, 10, 20, 30],
        min_samples_split=[2, 5, 10],
        criterion=['gini', 'entropy'],
    ),
    KNN=dict(
        n_neighbors=[3, 5, 7, 9, 11],
        weights=['uniform', 'distance'],
        metric=['euclidean', 'manhattan'],
    ),
    SVM=dict(
        C=[0.1, 1, 10, 100],
        kernel=['rbf', 'poly', 'sigmoid'],
        gamma=['scale', 'auto'],
    ),
    MLP=dict(
        hidden_layer_sizes=[(50,), (100,), (50, 50)],
        activation=['tanh', 'relu'],
        alpha=[0.0001, 0.05],
        learning_rate=['constant', 'adaptive'],
    ),
)


def rodar_experimento(algoritmo_nome, modelo_base, params, x_train, x_test, y_train, y_test):
    """Tune one classifier with four search strategies and score each winner on the test split.

    The training split is oversampled with SMOTE (the test split is used as given). Then
    ``modelo_base`` is tuned in turn with GridSearchCV, RandomizedSearchCV, BayesSearchCV and
    GASearchCV, all sharing the same 5-fold stratified CV and ``scoring='f1'``. The best
    estimator found by each search is evaluated on ``x_test`` / ``y_test``.

    Args:
        algoritmo_nome: Short label of the algorithm; written to the log lines and result
            rows. The value ``'MLP'`` additionally makes the Bayesian search wrap every
            candidate list in an explicit skopt ``Categorical``.
        modelo_base: Estimator handed to every search object. Its fitted form must provide
            ``predict_proba``, which is used to compute the AUC.
        params: Mapping of hyper-parameter name to the list of candidate values. It is not
            modified; each strategy that needs a different representation builds its own dict.
        x_train: Training features (resampled internally with SMOTE).
        x_test: Hold-out features, used only for the final metrics.
        y_train: Training labels (resampled internally with SMOTE).
        y_test: Hold-out labels, used only for the final metrics.

    Returns:
        list[dict]: One row per strategy that completed, with the algorithm and optimizer
        names, the best parameters as a string, accuracy, precision, recall, F1, AUC, Cohen's
        kappa and the elapsed seconds (search + prediction). A strategy that raises is
        reported on stdout and left out of the list.
    """
    x_train_resampled, y_train_resampled = SMOTE(random_state=42).fit_resample(x_train, y_train)

    resultados = []
    estrategias = ['GridSearch', 'RandomSearch', 'Bayesiana', 'Genetica']
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    for estrategia in estrategias:
        print(f"\t> Rodando {algoritmo_nome} com {estrategia}...")
        opt = None

        if estrategia == 'GridSearch':
            opt = GridSearchCV(modelo_base, params, cv=cv, scoring='f1', n_jobs=-1)
        elif estrategia == 'RandomSearch':
            opt = RandomizedSearchCV(modelo_base, params, n_iter=10, cv=cv, scoring='f1', n_jobs=-1, random_state=42)
        elif estrategia == 'Bayesiana':
            if algoritmo_nome == 'MLP':
                # NOTE(review): the recorded run of this notebook shows MLP + Bayesiana failing
                # with "can only convert an array of size 1 to a Python scalar"; presumably the
                # tuple-valued hidden_layer_sizes candidates are the trigger — confirm.
                espaco = {key: SkoptCategorical(valores) for key, valores in params.items()}
            else:
                # Shallow copy so the search object never holds the caller's dict itself.
                espaco = dict(params)
            opt = BayesSearchCV(modelo_base, espaco, n_iter=10, cv=cv, scoring='f1', n_jobs=-1, random_state=42)
        elif estrategia == 'Genetica':
            espaco = {key: GeneticCategorical(valores) for key, valores in params.items()}
            opt = GASearchCV(estimator=modelo_base, param_grid=espaco, population_size=10, generations=5, cv=cv, scoring='f1', n_jobs=-1, verbose=0)

        try:
            # perf_counter is monotonic, so the measured duration cannot be distorted by
            # system clock adjustments during a long search.
            start_time_process = time.perf_counter()

            opt.fit(x_train_resampled, y_train_resampled)
            best_model = opt.best_estimator_
            best_params = opt.best_params_

            y_pred = best_model.predict(x_test)
            # Column 1 holds the probability of the second entry of the model's classes_.
            y_proba = best_model.predict_proba(x_test)[:, 1]
            auc = roc_auc_score(y_test, y_proba)

            tempo_treinamento_da_estrategia = time.perf_counter() - start_time_process

            # Computed once and reused for both the log line and the result row.
            f1 = f1_score(y_test, y_pred)

            print(f"\t\t> Tempo de treinamento e avaliação {estrategia}: {round(tempo_treinamento_da_estrategia, 2)}s")
            print(f"\t\t> F1-Score: {round(f1, 4)} | AUC: {round(auc, 4)}")

            resultados.append({
                'Algoritmo': algoritmo_nome,
                'Otimizador': estrategia,
                'Melhores Params': str(best_params),
                'Acuracia': accuracy_score(y_test, y_pred),
                'Precisao': precision_score(y_test, y_pred),
                'Recall': recall_score(y_test, y_pred),
                'F1-Score': f1,
                'AUC': auc,
                'Kappa': cohen_kappa_score(y_test, y_pred),
                'Tempo Treinamento (s)': round(tempo_treinamento_da_estrategia, 2)
            })

        except Exception as e:
            # Deliberate best-effort: one failing strategy must not abort the remaining ones.
            print(f"ERRO em {algoritmo_nome} - {estrategia}: {e}")

    return resultados


# Cenários de Classificação - Com PCA 

In [18]:
# Reduce the scaled features with PCA. n_components is given as the float 0.95 — presumably
# the fraction of variance to retain (see scikit-learn's PCA docs); the recorded output shows
# the 24 scaled columns becoming 17 components.
X_pca = PCA(
    n_components=0.95,
).fit_transform(X_scalered)
(X_pca.shape, X_scalered.shape)

((1000, 17), (1000, 24))

In [19]:
# Hold out 20% of the PCA-reduced rows for testing, stratified on the target and with a
# fixed seed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X_pca,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
(x_train.shape, x_test.shape)

((800, 17), (200, 17))

## Sistemática Experimental 

In [20]:
# Run every classifier through all optimisation strategies for the "PCA + SMOTE" scenario
# and collect the metric rows each run returns.
todos_resultados = []
modelos = [
    ('MLP', MLPClassifier(max_iter=100_000, random_state=42)),
    ('RF', RandomForestClassifier(random_state=42)),
    ('KNN', KNeighborsClassifier()),
    ('SVM', SVC(max_iter=100_000, probability=True, random_state=42)),
]

for nome_algo, modelo in modelos:
    print(f"\n>> Processando {nome_algo}...")
    res_algo = rodar_experimento(
        nome_algo, modelo, param_grids[nome_algo], x_train, x_test, y_train, y_test
    )

    # Tag each row with the scenario label, then collect the whole batch.
    for r in res_algo:
        r['Cenario'] = 'PCA + SMOTE'
    todos_resultados.extend(res_algo)

# Leaderboard: highest F1-Score first, re-indexed from 0.
df_resultados = (
    pd.DataFrame(todos_resultados)
    .sort_values(by='F1-Score', ascending=False)
    .reset_index(drop=True)
)
df_resultados


>> Processando MLP...
	> Rodando MLP com GridSearch...
		> Tempo de treinamento e avaliação GridSearch: 35.92s
		> F1-Score: 0.8159 | AUC: 0.7967
	> Rodando MLP com RandomSearch...
		> Tempo de treinamento e avaliação RandomSearch: 15.3s
		> F1-Score: 0.8293 | AUC: 0.7783
	> Rodando MLP com Bayesiana...
ERRO em MLP - Bayesiana: can only convert an array of size 1 to a Python scalar
	> Rodando MLP com Genetica...
		> Tempo de treinamento e avaliação Genetica: 35.24s
		> F1-Score: 0.8159 | AUC: 0.7967

>> Processando RF...
	> Rodando RF com GridSearch...
		> Tempo de treinamento e avaliação GridSearch: 17.86s
		> F1-Score: 0.8414 | AUC: 0.7981
	> Rodando RF com RandomSearch...
		> Tempo de treinamento e avaliação RandomSearch: 2.4s
		> F1-Score: 0.8304 | AUC: 0.783
	> Rodando RF com Bayesiana...
		> Tempo de treinamento e avaliação Bayesiana: 6.85s
		> F1-Score: 0.838 | AUC: 0.8054
	> Rodando RF com Genetica...
		> Tempo de treinamento e avaliação Genetica: 16.18s
		> F1-Score: 0.8414 |

Unnamed: 0,Algoritmo,Otimizador,Melhores Params,Acuracia,Precisao,Recall,F1-Score,AUC,Kappa,Tempo Treinamento (s),Cenario
0,RF,GridSearch,"{'criterion': 'gini', 'max_depth': None, 'min_...",0.77,0.813333,0.871429,0.841379,0.798095,0.425,17.86,PCA + SMOTE
1,RF,Genetica,"{'n_estimators': 200, 'max_depth': None, 'min_...",0.77,0.813333,0.871429,0.841379,0.798095,0.425,16.18,PCA + SMOTE
2,RF,Bayesiana,"OrderedDict([('criterion', 'entropy'), ('max_d...",0.77,0.826389,0.85,0.838028,0.805417,0.441748,6.85,PCA + SMOTE
3,RF,RandomSearch,"{'n_estimators': 100, 'min_samples_split': 2, ...",0.755,0.805369,0.857143,0.83045,0.783036,0.390547,2.4,PCA + SMOTE
4,MLP,RandomSearch,"{'learning_rate': 'adaptive', 'hidden_layer_si...",0.755,0.809524,0.85,0.829268,0.778333,0.396552,15.3,PCA + SMOTE
5,MLP,Genetica,"{'hidden_layer_sizes': (100,), 'activation': '...",0.745,0.824818,0.807143,0.815884,0.796667,0.401408,35.24,PCA + SMOTE
6,MLP,GridSearch,"{'activation': 'tanh', 'alpha': 0.0001, 'hidde...",0.745,0.824818,0.807143,0.815884,0.796667,0.401408,35.92,PCA + SMOTE
7,SVM,Genetica,"{'C': 10, 'kernel': 'rbf', 'gamma': 'scale'}",0.705,0.791367,0.785714,0.78853,0.766667,0.300948,2.71,PCA + SMOTE
8,SVM,GridSearch,"{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}",0.705,0.791367,0.785714,0.78853,0.766667,0.300948,1.81,PCA + SMOTE
9,SVM,Bayesiana,"OrderedDict([('C', 100), ('gamma', 'auto'), ('...",0.68,0.792308,0.735714,0.762963,0.756548,0.272727,2.17,PCA + SMOTE
