In [None]:
from sklearn import datasets
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.model_selection import train_test_split

In [None]:
# Load the iris dataset through sklearn.datasets; the object exposes the
# .data, .target and .feature_names attributes used in the following cells.
iris = datasets.load_iris()

In [None]:
# Class labels and measurement columns taken from the loaded dataset.
sample = iris.target
features = pd.DataFrame(iris.data, columns=iris.feature_names)

# `df` is a separate frame holding the features plus the label column, so
# `features` itself stays label-free for the preprocessing step.
df = features.assign(sample=sample)

In [None]:
# Sanity check: print column names, non-null counts and dtypes of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   sample             150 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 6.0 KB


In [None]:
# Standardize every float64 column with StandardScaler; columns of any other
# dtype are passed through unchanged.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split below, so held-out rows influence the scaling statistics.
# Consider fitting on the training rows only.
seletor_float = make_column_selector(dtype_include=['float64'])
pre_processador = ColumnTransformer(
    transformers=[('standardization', StandardScaler(), seletor_float)],
    remainder='passthrough',
)

# fit_transform returns a plain array; wrap it back into a labelled frame.
matriz_escalonada = pre_processador.fit_transform(features)
features_escalonadas = pd.DataFrame(matriz_escalonada, columns=iris.feature_names)

In [None]:
# Encode the class labels with LabelEncoder.
# NOTE(review): `sample` is already int64 (see df.info() above), so this is
# presumably a no-op relabelling; confirm before removing.
codificador_rotulos = LabelEncoder()
sample_escalonado = codificador_rotulos.fit_transform(sample)

In [None]:
# Hold out 30% of the scaled data; the remaining 70% trains the models.
F_train_model, F_test, S_train_model, S_test = train_test_split(
    features_escalonadas,
    sample_escalonado,
    test_size=0.30,
    random_state=42,
)

In [None]:
# Split the held-out data again: the *_model part is used to score the models
# in this notebook, while the *_obj part (30%) is kept untouched so it can be
# shipped inside the serialized object.
F_test_model, F_test_obj, S_test_model, S_test_obj = train_test_split(
    F_test,
    S_test,
    test_size=0.30,
    random_state=12,
)

# PCA

In [None]:
from sklearn.decomposition import PCA
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import time

In [None]:
# Benchmark three classifier families (decision tree, k-NN, Gaussian naive
# Bayes) on PCA projections of the training features, one projection per
# possible number of components. For every fitted model we store the model,
# its accuracy on the validation split (F_test_model / S_test_model) and the
# wall-clock time of one projection + prediction pass.
#
# NOTE(review): the validation split is used both to pick ccp_alpha and to
# report accuracy, so the decision-tree accuracies are optimistic.

# Same seed as the first train/test split; makes the tree results repeatable.
RANDOM_STATE = 42

dt_models = []
dt_accuracies = []
dt_times = []

knn_models = []
knn_accuracies = []
knn_times = []

nb_models = []
nb_accuracies = []
nb_times = []

for i in range(1, len(F_train_model.columns) + 1):
  pca = PCA(n_components=i)
  F_train_model_pca = pca.fit_transform(F_train_model)
  # Projected once for model selection; the timed sections below still include
  # the projection so 'tempo' keeps measuring transform + predict.
  F_test_model_pca = pca.transform(F_test_model)

  # ---- decision tree -------------------------------------------------------
  for a in ['gini', 'entropy']:
    # Candidate pruning strengths for a tree grown with this criterion.
    classificador_dt = tree.DecisionTreeClassifier(criterion=a, random_state=RANDOM_STATE)
    path = classificador_dt.cost_complexity_pruning_path(F_train_model_pca, S_train_model)

    # Drop the LAST alpha (it prunes the tree down to its root). Keep the
    # full list when there is only one alpha, so the search below never runs
    # on an empty sequence.
    ccp_alphas = path.ccp_alphas[:-1] if len(path.ccp_alphas) > 1 else path.ccp_alphas

    # One fitted tree per alpha, using the same criterion the path was
    # computed with.
    classificadores_dt_alphas = [
        tree.DecisionTreeClassifier(
            criterion=a, ccp_alpha=ccp_alpha, random_state=RANDOM_STATE
        ).fit(F_train_model_pca, S_train_model)
        for ccp_alpha in ccp_alphas
    ]

    # Validation accuracy of each pruned tree; keep the alpha of the best one
    # (first one on ties).
    acurrancy_teste = [
        candidato.score(F_test_model_pca, S_test_model)
        for candidato in classificadores_dt_alphas
    ]
    max_accuracy_index = acurrancy_teste.index(max(acurrancy_teste))
    best_ccp_alpha = classificadores_dt_alphas[max_accuracy_index].ccp_alpha

    classificador_dt_bestalpha = tree.DecisionTreeClassifier(
        criterion=a, ccp_alpha=best_ccp_alpha, random_state=RANDOM_STATE
    )

    # The criterion is fixed by the outer loop, so the grid only explores the
    # remaining structural hyper-parameters.
    hyperparametros = {
        'criterion': [a],
        'splitter': ['best','random'],
        'max_depth': [None,2,4,6,8,10,12],
        'min_samples_split': [2,5,10],
        'min_samples_leaf': [1,2,5,10],
        'max_features': [None,'sqrt','log2']
    }

    grid = GridSearchCV(estimator=classificador_dt_bestalpha, param_grid=hyperparametros, scoring='accuracy')
    grid.fit(F_train_model_pca, S_train_model)

    # best_estimator_ is refitted on the whole training set with ALL selected
    # settings, including ccp_alpha and splitter, which a hand-built copy of
    # the estimator would silently drop.
    classificador_dt_ccp_grid = grid.best_estimator_

    description = 'decisionTree_pca_' + str(i) + '_criterion_' + str(a)

    # perf_counter is a high-resolution clock; time.time() can report 0.0 for
    # sub-millisecond predictions and produce spurious ties.
    inicio = time.perf_counter()
    answer_ccp_grid = classificador_dt_ccp_grid.predict(pca.transform(F_test_model))
    fim = time.perf_counter()

    dt_accuracy = accuracy_score(S_test_model, answer_ccp_grid)
    dt_models.append({description: classificador_dt_ccp_grid, 'accurancy': dt_accuracy, 'tempo': fim - inicio})
    dt_accuracies.append(dt_accuracy)
    dt_times.append(fim - inicio)

  # ---- k-nearest neighbours --------------------------------------------------
  for b in range(2, 7):
    knn = KNeighborsClassifier()

    # Candidate neighbourhood sizes.
    params = {'n_neighbors': range(1,30)}

    # Cross-validate with b folds, the fold count recorded in the label below.
    grid_knn = GridSearchCV(knn, params, cv=b)
    grid_knn.fit(F_train_model_pca, S_train_model)

    # Tuned classifier: refitted on the full training set with the best k.
    knn_cv = grid_knn.best_estimator_

    inicio = time.perf_counter()
    response_knn = knn_cv.predict(pca.transform(F_test_model))
    fim = time.perf_counter()

    description = 'knn_pca_' + str(i) + '_cv_' + str(b) + '_bestK_' + str(grid_knn.best_params_['n_neighbors'])

    knn_accuracy = accuracy_score(S_test_model, response_knn)
    knn_models.append({description: knn_cv, 'accurancy': knn_accuracy, 'tempo': fim - inicio})
    knn_accuracies.append(knn_accuracy)
    knn_times.append(fim - inicio)

  # ---- Gaussian naive Bayes --------------------------------------------------
  classificador_bayes = GaussianNB()
  classificador_bayes.fit(F_train_model_pca, S_train_model)

  inicio = time.perf_counter()
  response_nb = classificador_bayes.predict(pca.transform(F_test_model))
  fim = time.perf_counter()

  description = 'nb_pca_' + str(i)

  nb_accuracy = accuracy_score(S_test_model, response_nb)
  nb_models.append({description: classificador_bayes, 'accurancy': nb_accuracy, 'tempo': fim - inicio})
  nb_accuracies.append(nb_accuracy)
  nb_times.append(fim - inicio)


## melhor acurácia no melhor tempo

In [None]:
# Pick the decision-tree run with the highest accuracy, breaking ties by the
# shortest prediction time.
dt_best_accuracy = max(dt_accuracies)
dt_better_accuracie_index = [
    idx for idx, acc in enumerate(dt_accuracies) if acc == dt_best_accuracy
]

# Prediction times of the top-accuracy candidates only.
dt_better_time_index = [dt_times[e] for e in dt_better_accuracie_index]

# Search only among the top-accuracy candidates: looking the minimum time up
# in the full dt_times list could return a less accurate run that happens to
# share the same time value.
dt_better_faster_index = min(dt_better_accuracie_index, key=lambda e: dt_times[e])
dt_better_faster_index

5

In [None]:
# Pick the k-NN run with the highest accuracy, breaking ties by the shortest
# prediction time.
knn_best_accuracy = max(knn_accuracies)
knn_better_accuracie_index = [
    idx for idx, acc in enumerate(knn_accuracies) if acc == knn_best_accuracy
]

# Prediction times of the top-accuracy candidates only.
knn_better_time_index = [knn_times[e] for e in knn_better_accuracie_index]

# Search only among the top-accuracy candidates: looking the minimum time up
# in the full knn_times list could return a less accurate run that happens to
# share the same time value.
knn_better_faster_index = min(knn_better_accuracie_index, key=lambda e: knn_times[e])
knn_better_faster_index

12

In [None]:
# Pick the naive-Bayes run with the highest accuracy, breaking ties by the
# shortest prediction time.
nb_best_accuracy = max(nb_accuracies)
nb_better_accuracie_index = [
    idx for idx, acc in enumerate(nb_accuracies) if acc == nb_best_accuracy
]

# Prediction times of the top-accuracy candidates only.
nb_better_time_index = [nb_times[e] for e in nb_better_accuracie_index]

# Search only among the top-accuracy candidates: looking the minimum time up
# in the full nb_times list could return a less accurate run that happens to
# share the same time value.
nb_better_faster_index = min(nb_better_accuracie_index, key=lambda e: nb_times[e])
nb_better_faster_index

2

## pickle

In [None]:
import hashlib
import hmac
import os
import pickle as pk

In [None]:
# Secret used to HMAC-sign the serialized models. It can be supplied through
# the MODELO_ML_HMAC_KEY environment variable so the real secret does not have
# to live in the notebook. The literal fallback (also used when the variable
# is empty) keeps signatures compatible with readers that rely on the
# original key.
key = (os.environ.get('MODELO_ML_HMAC_KEY') or 'GodelIncompleteness').encode('utf-8')

In [None]:
# Best (most accurate, then fastest) entry of each model family, in the order
# decision tree, k-NN, naive Bayes.
melhores_modelos = [
    dt_models[dt_better_faster_index],
    knn_models[knn_better_faster_index],
    nb_models[nb_better_faster_index],
]

# Untouched held-out split shipped alongside the models.
dados_teste = {'f_test': F_test_obj, 's_test': S_test_obj}

# Payload that gets pickled and signed below.
obj_serializacao = {'bestModels': melhores_modelos, 'data_test': dados_teste}

In [None]:
# Serialize and sign BEFORE opening the output file, so a failure while
# pickling or signing cannot leave an empty/truncated artifact on disk.
obj_serializado = pk.dumps(obj_serializacao)

# HMAC-SHA256 tag computed over the pickle bytes with the shared key.
signature = hmac.new(key, obj_serializado, hashlib.sha256).digest()

# File layout: pickle payload immediately followed by the 32-byte digest.
# Readers must verify the trailing digest before calling pickle.loads, since
# unpickling untrusted bytes can execute arbitrary code.
data_with_signature = obj_serializado + signature

# NOTE(review): the file name reads 'picke' (probably meant 'pickle'); kept
# unchanged because readers may depend on this exact path.
with open('modelo_ml.picke','wb') as doc:
  doc.write(data_with_signature)