In [1]:
!pip3 install pandas boto3 mysql-connector-python scikit-learn numpy



In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
import matplotlib as mpl
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import time
import sys
sys.path.append('../..') # Adiciona o diretório superior ao caminho de importação para acessar funções utilitárias
from functions.db_inserts import * # Importa funções para inserção de dados no banco de dados
from functions.aws_functions import * # Importa funções relacionadas à AWS
from functions.select_cols_functions import * # Importa funções para seleção e processamento de colunas
from tensorflow.keras.regularizers import l2
from sklearn.metrics import mean_absolute_error, cohen_kappa_score
from keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras import backend as K

In [3]:
env_mode = 'prd' # Define o modo de ambiente: 'dev' para desenvolvimento, 'prd' para produção

### Lendo arquivo fonte

In [4]:
bucket_name = "adoptai-trusted-prod-tcc" # Nome do bucket S3 de onde os dados serão carregados

In [5]:
if env_mode == 'dev':
    # Modo de desenvolvimento: carrega e processa os dados localmente
    source = 'feature_engineering'
    df_trusted = pd.read_csv(f'../../../data/{source}.csv')
    df_trusted = apply_all_rules(df_trusted)
    s3 = [f'{source}.csv', df_trusted]
elif env_mode == 'prd':
    # Modo de produção: carrega e processa os dados a partir do S3
    s3 = s3_csv_to_df(bucket_name)
    df_trusted = s3[1]

### Utilizando LabelEncoded nas colunas categóricas

In [6]:
df_trusted = adjust_categoric_values(df_trusted)

### Parâmetros de gravação

In [7]:
nome_fonte = s3[0] # Armazena o nome do arquivo de dados utilizado
nome_modelo = 'MLP' # Define o nome do modelo
parameters = {
    'learning_rate': [
        1e-2,
        1e-3,
        1e-4
    ],
}

### Preparativos para executar o modelo

In [8]:
# Listas para armazenar métricas de desempenho e tempos de execução
accuracy = []
precision = []
recall = []
f1 = []
mae = []
qwk = []

start_times = []
end_times = []

In [9]:
# Separar as features (x) e os rótulos (y)
x = df_trusted.iloc[:, :-1] # Seleciona todas as colunas, exceto a última, como features
y = df_trusted.iloc[:, -1] # Seleciona a última coluna como rótulo

In [10]:
# Dividir os dados em conjuntos de treinamento e teste
x_train_full, x_test, y_train_full, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [11]:
x_valid, x_train = x_train_full[:int(len(x_train_full) * 0.1)], x_train_full[int(len(x_train_full) * 0.1):]
y_valid, y_train = y_train_full[:int(len(y_train_full) * 0.1)], y_train_full[int(len(y_train_full) * 0.1):]

In [12]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_valid = scaler.transform(x_valid)
x_test = scaler.transform(x_test)

In [13]:
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)




###  Rodando modelo Sequencial

In [14]:
def mae_classification(y_true, y_pred):
    y_pred_labels = K.argmax(y_pred, axis=-1)
    y_true_labels = K.cast(y_true, y_pred_labels.dtype)
    return K.mean(K.abs(y_true_labels - y_pred_labels))

In [15]:
for learning_rate in parameters['learning_rate']:

    start_time = start_times.append(time.time()) # Registra o tempo de início

    early_stopping = EarlyStopping(patience=10, restore_best_weights=True)

    # Inicializa o modelo com os parâmetros atuais
    model = keras.models.Sequential([
        Dense(32, activation="relu"),
        Dropout(0.3),
        Dense(16, activation="relu"),
        Dropout(0.3),
        Dense(5, activation="softmax")
    ])

    model.compile(loss="sparse_categorical_crossentropy",
                optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
                metrics=["accuracy", mae_classification])

    history = model.fit(
        x_train, y_train,
        epochs=200,
        validation_data=(x_valid, y_valid),
        callbacks=[early_stopping]
    )

    # Fazer previsões no conjunto de teste
    y_pred = model.predict(x_test)
    y_pred_labels = np.argmax(y_pred, axis=1)

    # Calcula e armazena as métricas
    accuracy.append(accuracy_score(y_test, y_pred_labels))
    precision.append(precision_score(y_test, y_pred_labels, average=None))
    recall.append(recall_score(y_test, y_pred_labels, average=None))
    f1.append(f1_score(y_test, y_pred_labels, average=None))
    mae.append(mean_absolute_error(y_test, y_pred_labels))
    qwk.append(cohen_kappa_score(y_test, y_pred_labels, weights='quadratic'))

    end_times.append(time.time()) # Registra o tempo de término

Epoch 1/200
[1m338/338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.2885 - loss: 1.5063 - mae_classification: 0.9671 - val_accuracy: 0.3344 - val_loss: 1.4355 - val_mae_classification: 1.0000
Epoch 2/200
[1m338/338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 777us/step - accuracy: 0.3291 - loss: 1.4373 - mae_classification: 0.9728 - val_accuracy: 0.3353 - val_loss: 1.4329 - val_mae_classification: 0.9737
Epoch 3/200
[1m338/338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 802us/step - accuracy: 0.3372 - loss: 1.4285 - mae_classification: 0.9564 - val_accuracy: 0.3403 - val_loss: 1.4205 - val_mae_classification: 0.9737
Epoch 4/200
[1m338/338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 788us/step - accuracy: 0.3373 - loss: 1.4219 - mae_classification: 0.9889 - val_accuracy: 0.3603 - val_loss: 1.4149 - val_mae_classification: 1.0000
Epoch 5/200
[1m338/338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 786us/step - acc

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[1m338/338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.2401 - loss: 1.7631 - mae_classification: 1.0000 - val_accuracy: 0.3303 - val_loss: 1.4804 - val_mae_classification: 1.0000
Epoch 2/200
[1m338/338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 785us/step - accuracy: 0.2928 - loss: 1.4891 - mae_classification: 1.0000 - val_accuracy: 0.3511 - val_loss: 1.4500 - val_mae_classification: 1.0000
Epoch 3/200
[1m338/338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 783us/step - accuracy: 0.3122 - loss: 1.4542 - mae_classification: 1.0000 - val_accuracy: 0.3595 - val_loss: 1.4402 - val_mae_classification: 1.0000
Epoch 4/200
[1m338/338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 761us/step - accuracy: 0.3139 - loss: 1.4456 - mae_classification: 1.0000 - val_accuracy: 0.3686 - val_loss: 1.4346 - val_mae_classification: 1.0000
Epoch 5/200
[1m338/338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 831us/step - accuracy: 0.323

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[1m338/338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.1513 - loss: 2.0199 - mae_classification: 1.1363 - val_accuracy: 0.1943 - val_loss: 1.7009 - val_mae_classification: 1.0000
Epoch 2/200
[1m338/338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 738us/step - accuracy: 0.2107 - loss: 1.7960 - mae_classification: 1.0000 - val_accuracy: 0.2627 - val_loss: 1.6229 - val_mae_classification: 1.0000
Epoch 3/200
[1m338/338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 849us/step - accuracy: 0.2402 - loss: 1.6924 - mae_classification: 1.0000 - val_accuracy: 0.2744 - val_loss: 1.5872 - val_mae_classification: 1.0000
Epoch 4/200
[1m338/338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 743us/step - accuracy: 0.2533 - loss: 1.6392 - mae_classification: 1.0000 - val_accuracy: 0.2752 - val_loss: 1.5659 - val_mae_classification: 1.0000
Epoch 5/200
[1m338/338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 742us/step - accuracy: 0.256

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Obtendo métricas de desempenho

In [16]:
# Armazena as métricas calculadas
metrics = {
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
    'f1_score': f1
}

### Gravando dados no banco

In [17]:
# Prepara os dados para inserção no banco de dados
model = {'nome_modelo': nome_modelo, 'nome_fonte': nome_fonte}
execution = {
    'accuracy': accuracy, 
    'mae': mae,
    'qwk': qwk,
    'start_time': start_times, 
    'end_time': end_times
    }
hyperparams = parameters
desempenho = {
    'precision_values': metrics['precision'], 
    'recall_values': metrics['recall'], 
    'f1_values': metrics['f1_score']
    }

In [18]:
# Insere os valores calculados nas tabelas do banco de dados
insert_values(model, execution, hyperparams, desempenho, env=env_mode)

Executando em dev
Registros inseridos em dev
