In [53]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn import model_selection
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, ConfusionMatrixDisplay

In [67]:
# Load the feature matrix and the label vector from the .npy files that sit
# next to the notebook.
X = np.load("Cred_features.npy")
y = np.load("Cred_labels.npy")
# Show both shapes, one per line, as a quick sanity check on what was loaded.
print(X.shape, y.shape, sep="\n")

(56961, 30)
(56961,)


### K-nearest neighbors algorithm

- <b>Parâmetros KNN</b> 
    - N_neighbors: nº de vizinhos considerados próximos da amostra 
    - Weights: uniform ( All points in each neighborhood are weighted equally) 
    - Weights: distance (in this case, closer neighbors of a query point will have a greater influence than neighbors which are further away)
    - p: [1,2] (When p = 1, this is equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2)
---
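- <b>Validação</b>: a busca em grade avalia, de maneira sistemática, diversas combinações dos parâmetros e, depois de avaliá-las, armazena os resultados num único objeto. Nesse sentido, é feita uma análise combinatória: o número de combinações testadas é o produto da quantidade de valores de cada parâmetro. Os itens abaixo são os argumentos usados na separação dos dados (`train_test_split`):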
    - Arrays: X, y (Allowed inputs are lists, numpy arrays, scipy-sparse matrices or pandas dataframes)
    - test_size + train_size = 1.0 (should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split)
    - Random_state: 42 (Pass an int for reproducible output across multiple function calls)
    - Stratify: y (Data is split in a stratified fashion, using this as the class labels)  
---
- <b>GridSearchCV</b>
    - Estimator: clf (o classificador ou regressor que estamos utilizando)
    - Param_grid: parameters (é aquele dicionário com valores para serem testados)
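    - scoring: 'accuracy' (métrica usada para comparar as combinações de parâmetros; aqui, a acurácia)
    - cv: 5 (quando cv é um número inteiro e o estimador é um classificador, o GridSearchCV executa um StratifiedKFold; isso quer dizer que o dataset é dividido em 5 partes (ou folds))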

In [57]:
# Hyper-parameter search for the k-nearest-neighbours classifier.
from sklearn.model_selection import train_test_split, GridSearchCV
import pandas as pd

# Reload the full dataset so that re-running this cell always starts from the
# same arrays. The split below rebinds X and y, so without the reload a second
# run of the cell would split an already reduced subset.
X = np.load("Cred_features.npy")
y = np.load("Cred_labels.npy")

# Candidate values explored by the grid search.
parameters = [{'n_neighbors': [1, 3, 5, 7, 9, 11],
               'weights': ['uniform', 'distance'],
               'p': [1, 2]}]

# Hold out a stratified 20% of the data for the grid search. X and y are
# rebound to the remaining 80%.
X, X_val, y, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

clf = KNeighborsClassifier()

# Run the grid search on the held-out part (5-fold CV, all available cores).
gs = GridSearchCV(clf, parameters, scoring='accuracy', cv=5, n_jobs=-1)
gs.fit(X_val, y_val)

# One row per parameter combination.
df = pd.DataFrame(gs.cv_results_)
print("Melhores parâmetros encontrados: ", gs.best_params_)

# Keep the best model found by the search in `clf`, as the other grid-search
# cells do; otherwise `clf` would stay an unfitted classifier with default
# parameters.
clf = gs.best_estimator_
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,param_p,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.00369,0.00076,0.389568,0.021505,1,1,uniform,"{'n_neighbors': 1, 'p': 1, 'weights': 'uniform'}",0.997944,0.996571,0.997942,0.997942,0.996571,0.997394,0.000672,21
1,0.003283,0.000426,0.406003,0.066352,1,1,distance,"{'n_neighbors': 1, 'p': 1, 'weights': 'distance'}",0.997944,0.996571,0.997942,0.997942,0.996571,0.997394,0.000672,21
2,0.003092,0.001261,0.077681,0.015038,1,2,uniform,"{'n_neighbors': 1, 'p': 2, 'weights': 'uniform'}",0.996573,0.995885,0.997257,0.997942,0.996571,0.996845,0.000699,23
3,0.004576,0.001986,0.075963,0.010251,1,2,distance,"{'n_neighbors': 1, 'p': 2, 'weights': 'distance'}",0.996573,0.995885,0.997257,0.997942,0.996571,0.996845,0.000699,23
4,0.00274,0.000528,0.33304,0.020369,3,1,uniform,"{'n_neighbors': 3, 'p': 1, 'weights': 'uniform'}",0.997944,0.998628,0.998628,0.998628,0.997942,0.998354,0.000336,1
5,0.002232,0.000395,0.335869,0.027805,3,1,distance,"{'n_neighbors': 3, 'p': 1, 'weights': 'distance'}",0.997944,0.998628,0.998628,0.998628,0.997942,0.998354,0.000336,1
6,0.003346,0.001408,0.067287,0.008208,3,2,uniform,"{'n_neighbors': 3, 'p': 2, 'weights': 'uniform'}",0.997944,0.998628,0.998628,0.998628,0.997942,0.998354,0.000336,1
7,0.00304,0.000325,0.073016,0.008779,3,2,distance,"{'n_neighbors': 3, 'p': 2, 'weights': 'distance'}",0.997258,0.995885,0.997942,0.998628,0.997257,0.997394,0.00091,20
8,0.002593,0.000468,0.362948,0.028443,5,1,uniform,"{'n_neighbors': 5, 'p': 1, 'weights': 'uniform'}",0.997944,0.998628,0.998628,0.998628,0.997942,0.998354,0.000336,1
9,0.002713,0.00061,0.315142,0.003289,5,1,distance,"{'n_neighbors': 5, 'p': 1, 'weights': 'distance'}",0.997944,0.998628,0.998628,0.998628,0.997942,0.998354,0.000336,1


In [69]:
# Number of candidate values for each hyper-parameter of the first grid,
# in the dictionary's own key order.
dict_ = parameters[0]
parameters_list = list(map(len, dict_.values()))

print(parameters_list)

[3, 3, 2, 2]


In [70]:
import time
import numpy

# Number of loop iterations: (parameter combinations) x (validation samples)
# x (CV folds). int() turns the NumPy scalar returned by numpy.prod into a
# plain Python int, which is what range() expects.
resultVar = int(numpy.prod(parameters_list))
tot = resultVar * len(X_val) * gs.n_splits_

# Time an empty loop with that many iterations. perf_counter() is the clock
# intended for measuring short intervals; time.time() is a wall clock that can
# be coarse or be adjusted while the loop runs.
init = time.perf_counter()
for _ in range(tot):
    pass

final = time.perf_counter() - init
print(final)
print(f"{final/tot} é tempo para executar uma instrução cada")

0.024503231048583984
3.111805910963794e-08 é tempo para executar uma instrução cada


### Naive Bayes classifier

In [64]:
# Hyper-parameter search for Gaussian Naive Bayes.
from sklearn.model_selection import train_test_split, GridSearchCV
import pandas as pd

# Load the dataset and show both shapes, one per line.
X = np.load("Cred_features.npy")
y = np.load("Cred_labels.npy")
print(X.shape, y.shape, sep="\n")

# Hold out a stratified 20% for the grid search; X and y are rebound to the
# remaining 80%.
X, x_val, y, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Candidate values for the Naive Bayes variance smoothing term.
parameters = [{'var_smoothing': [1e-09, 1e-03, 1e-06]}]

# Model to tune.
clf = GaussianNB()

# Grid search with 3-fold CV on the held-out part, using all available cores.
gs = GridSearchCV(estimator=clf, param_grid=parameters, scoring='accuracy', cv=3, n_jobs=-1)
gs.fit(x_val, y_val)

# One row per parameter combination.
df = pd.DataFrame(gs.cv_results_)
print("Melhores parâmetros encontrados: ", gs.best_params_)

# Keep the best model found by the search.
clf = gs.best_estimator_
df

(56961, 30)
(56961,)
Melhores parâmetros encontrados:  {'var_smoothing': 0.001}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_var_smoothing,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.022753,0.004405,0.008875,0.00285,0.0,{'var_smoothing': 1e-09},0.993944,0.993681,0.993679,0.993768,0.000125,3
1,0.017486,0.000473,0.008805,5e-06,0.001,{'var_smoothing': 0.001},0.998157,0.998157,0.99842,0.998245,0.000124,1
2,0.017113,0.005755,0.005009,0.000828,1e-06,{'var_smoothing': 1e-06},0.998157,0.990521,0.99842,0.995699,0.003663,2


### Decision Tree Algorithm

- Parâmetros DecisionTree
    - criterion: ['gini', 'entropy', 'log_loss'] (The function to measure the quality of a split.)
    - max_depth: [None, 5, 10] (The maximum depth of the tree)
    - min_samples_split:[2, 5] (The minimum number of samples required to split an internal node)
    - splitter:['random', 'best'] (The strategy used to choose the split at each node)

In [68]:
# Hyper-parameter search for the decision tree classifier.
from sklearn.model_selection import train_test_split, GridSearchCV
import pandas as pd

# Load the dataset.
X = np.load("Cred_features.npy")
y = np.load("Cred_labels.npy")
print(X.shape)
print(y.shape)

# Hold out a stratified 20% for the grid search; X and y are rebound to the
# remaining 80%.
X, x_val, y, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Model to tune. The seed is fixed because the grid includes
# splitter='random': without it, scores and the selected parameters can change
# from one run of the cell to the next.
clf = DecisionTreeClassifier(random_state=42)

# Candidate values for the tree.
# NOTE(review): scikit-learn documents 'entropy' and 'log_loss' as the same
# criterion (Shannon information gain), so those two settings are expected to
# score the same; the grid is left unchanged here.
parameters = [{'criterion': ['gini', 'entropy', 'log_loss'],
               'max_depth': [None, 5, 10],
               'min_samples_split': [2, 5],
               'splitter': ['random', 'best']}]

# Grid search with 3-fold CV on the held-out part, using all available cores.
gs = GridSearchCV(clf, parameters, scoring='accuracy', cv=3, n_jobs=-1)
gs.fit(x_val, y_val)

# One row per parameter combination.
df = pd.DataFrame(gs.cv_results_)
print("Melhores parâmetros encontrados: ", gs.best_params_)

# Keep the best model found by the search.
clf = gs.best_estimator_
df

(56961, 30)
(56961,)
Melhores parâmetros encontrados:  {'criterion': 'entropy', 'max_depth': None, 'min_samples_split': 2, 'splitter': 'random'}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_split,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.011597,0.002195,0.001333,0.0004714268,gini,,2,random,"{'criterion': 'gini', 'max_depth': None, 'min_...",0.999737,0.99921,0.998683,0.99921,0.00043,16
1,0.177851,0.06751,0.001667,0.0004711456,gini,,2,best,"{'criterion': 'gini', 'max_depth': None, 'min_...",0.99921,0.99921,0.999473,0.999298,0.000124,6
2,0.010809,0.001187,0.001668,0.0004714827,gini,,5,random,"{'criterion': 'gini', 'max_depth': None, 'min_...",0.999473,0.998684,0.998683,0.998947,0.000372,29
3,0.177334,0.070861,0.00143,0.0004277274,gini,,5,best,"{'criterion': 'gini', 'max_depth': None, 'min_...",0.99921,0.998684,0.999473,0.999122,0.000328,17
4,0.008023,0.001414,0.002001,1.362676e-06,gini,5.0,2,random,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.99921,0.999473,0.99921,0.999298,0.000124,8
5,0.178666,0.074336,0.001666,0.0004713142,gini,5.0,2,best,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.998947,0.998947,0.999473,0.999122,0.000248,17
6,0.008857,0.001428,0.001334,0.0004710902,gini,5.0,5,random,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.999737,0.99921,0.99921,0.999386,0.000248,5
7,0.175672,0.070273,0.001667,0.0004715955,gini,5.0,5,best,"{'criterion': 'gini', 'max_depth': 5, 'min_sam...",0.999473,0.998684,0.999473,0.99921,0.000372,11
8,0.008354,0.000472,0.001,1.94668e-07,gini,10.0,2,random,"{'criterion': 'gini', 'max_depth': 10, 'min_sa...",0.999473,0.999737,0.99921,0.999473,0.000215,3
9,0.179228,0.06551,0.001332,0.0002349429,gini,10.0,2,best,"{'criterion': 'gini', 'max_depth': 10, 'min_sa...",0.999473,0.998947,0.999473,0.999298,0.000248,6


### Multilayer Perceptron (MLP)

In [None]:
# Hyper-parameter search for the multilayer perceptron (MLP).
from sklearn.model_selection import train_test_split, GridSearchCV
import pandas as pd

# Load the dataset inside the cell. The split below rebinds X and y, so
# relying on the arrays left behind by an earlier cell would split an already
# reduced subset (or fail on a fresh kernel).
X = np.load("Cred_features.npy")
y = np.load("Cred_labels.npy")
print(X.shape)
print(y.shape)

# Hold out a stratified 20% for the grid search; X and y are rebound to the
# remaining 80%.
X, x_val, y, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Model to tune.
clf = MLPClassifier(max_iter=300, early_stopping=False, solver='adam')

# Candidate values for the network. The seed is part of the grid, so each
# configuration is trained with three different initialisations.
# NOTE(review): scikit-learn documents `learning_rate` as only used with
# solver='sgd'; with solver='adam' both values presumably train the same
# model. Confirm and consider dropping the key to halve the grid.
parameters = [{'hidden_layer_sizes': [16, (16, 8), (16, 8, 4)],
               'learning_rate': ['constant', 'invscaling'],
               'learning_rate_init': [0.01, 0.001, 0.0001],
               'activation': ['relu', 'logistic', 'tanh'],
               'random_state': [10, 46, 37]}]

# Grid search with 3-fold CV on the held-out part, using all available cores.
gs = GridSearchCV(clf, parameters, scoring='accuracy', cv=3, n_jobs=-1)
gs.fit(x_val, y_val)

print("Melhores parâmetros encontrados: ", gs.best_params_)

# Keep the best model found by the search.
clf = gs.best_estimator_

### Metrics to Evaluate your model

In [None]:
# Evaluation metrics for the estimator currently bound to `clf`.

# Accuracy with 5-fold cross-validation on X, y.
result = model_selection.cross_val_score(clf, X, y, cv=5)
print("Acurácia média: %.3f" % result.mean())
print("Desvio padrão: %.3f" % result.std())

# Cross-validated prediction for every sample (5 folds).
y_pred = model_selection.cross_val_predict(clf, X, y, cv=5)

# Macro-averaged precision, recall and F1 over the classes.
precision = precision_score(y, y_pred, average='macro')
recall = recall_score(y, y_pred, average='macro')
f1 = f1_score(y, y_pred, average='macro')

print("Precision: %.3f" % precision)
print("Recall: %.3f" % recall)
print("f1: %.3f" % f1)

# Confusion matrix of the cross-validated predictions. The labels are left to
# confusion_matrix's default (the sorted labels present in y / y_pred) instead
# of being read from clf.classes_: the cross-validation helpers fit clones, so
# `clf` itself may be unfitted and then has no classes_ attribute.
cm = confusion_matrix(y, y_pred)
# NOTE(review): the display names look inherited from a breast-cancer example
# while the data files are named "Cred_*"; confirm what the two classes mean
# and in which order before relying on these captions.
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['benigno', 'maligno'])
disp.plot()