In [20]:
# !pip3 install pandas boto3 mysql-connector-python scikit-learn numpy

In [21]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_absolute_error, cohen_kappa_score
import time
import sys
sys.path.append('../..')
from functions.db_inserts import *
from functions.aws_functions import *
from functions.select_cols_functions import *

In [22]:
# Execution environment selector: 'dev' reads the local CSV, 'prd' reads from
# the S3 bucket. The same value is also forwarded to insert_values(env=...).
env_mode = 'dev'

### Lendo arquivo fonte

In [23]:
# Bucket name handed to s3_csv_to_df when env_mode == 'prd' (unused in 'dev').
bucket_name = "adoptai-trusted-prod-tcc"

In [24]:
# Load the trusted dataset for the selected environment.
# After this cell, `s3[0]` is read as the source name and `df_trusted` holds
# the DataFrame used by the rest of the notebook.
if env_mode == 'dev':
    # Local development: read the CSV from the data folder and run it through
    # the project helper apply_all_rules before use.
    source = 'feature_engineering'
    df_trusted = pd.read_csv(f'../../../data/{source}.csv')
    df_trusted = apply_all_rules(df_trusted)
    s3 = [f'{source}.csv', df_trusted]
elif env_mode == 'prd':
    # Production: the project helper returns an indexable result whose second
    # item is the DataFrame.
    # NOTE(review): apply_all_rules is not called on this branch — presumably
    # the S3 file is already processed; confirm.
    s3 = s3_csv_to_df(bucket_name)
    df_trusted = s3[1]
else:
    # Fail fast: without this, `s3` and `df_trusted` would be undefined (or
    # stale from a previous run) and the error would surface cells later.
    raise ValueError(
        f"Unsupported env_mode {env_mode!r}; expected 'dev' or 'prd'"
    )

### Aplicando Label Encoding nas colunas categóricas

In [25]:
# Re-bind df_trusted to the output of the project helper adjust_categoric_values
# (presumably label-encodes the categorical columns, per the section heading —
# confirm in functions/).
# NOTE(review): this cell overwrites its own input, so running it twice applies
# the helper twice.
df_trusted = adjust_categoric_values(df_trusted)

### Parâmetros de gravação

In [26]:
# Labels stored with the results, plus the hyperparameter values to sweep.
nome_modelo = 'Gradient Boosting'
nome_fonte = s3[0]  # first item of `s3` (the CSV file name on the dev branch)
parameters = {
    # Candidate values tried for n_estimators, one model per value.
    'n_estimators': [50, 100, 200],
}

### Preparativos para executar o modelo

In [27]:

# Features (x) are every column except the last; the target (y) is the last column.
x, y = df_trusted.iloc[:, :-1], df_trusted.iloc[:, -1]

In [28]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# identical across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [29]:
# Result collectors: the training loop appends one entry per n_estimators value.
accuracy, precision, recall, f1, mae, qwk = ([] for _ in range(6))

# Timestamps taken at the start and end of each run.
start_times, end_times = [], []

In [30]:
# Train and evaluate the Gradient Boosting model for each n_estimators value.
# Empty the collectors in place first so that re-running this cell alone does
# not stack a second set of results on top of the previous run (which would
# no longer line up with the values in `parameters`).
for collector in (accuracy, precision, recall, f1, mae, qwk, start_times, end_times):
    collector.clear()

for n_estimator in parameters['n_estimators']:

    start_times.append(time.time())  # Record the start time of this run
    gb = GradientBoostingClassifier(n_estimators=n_estimator, random_state=42)
    gb.fit(x_train, y_train)

    y_pred = gb.predict(x_test)

    # Evaluate the model and store the metrics. average=None keeps one
    # precision/recall/F1 value per class instead of a single aggregate.
    accuracy.append(accuracy_score(y_test, y_pred))
    precision.append(precision_score(y_test, y_pred, average=None))
    recall.append(recall_score(y_test, y_pred, average=None))
    f1.append(f1_score(y_test, y_pred, average=None))
    mae.append(mean_absolute_error(y_test, y_pred))
    qwk.append(cohen_kappa_score(y_test, y_pred, weights='quadratic'))

    # Record the end time; the interval covers fit, predict and scoring.
    end_times.append(time.time())


In [31]:
# Gather the collected metric lists under their reporting names.
metrics = dict(
    accuracy=accuracy,
    precision=precision,
    recall=recall,
    f1_score=f1,
    mae=mae,
    quadratic_weighted_kappa=qwk,
)

In [32]:
# Assemble the four payloads that are later passed to insert_values.

# Which model ran and on which source.
model = dict(nome_modelo=nome_modelo, nome_fonte=nome_fonte)

# Scalar metrics and timestamps, one list entry per run.
execution = dict(
    accuracy=accuracy,
    mae=mae,
    qwk=qwk,
    start_time=start_times,
    end_time=end_times,
)

# The swept hyperparameter values (same object as `parameters`).
hyperparams = parameters

# Array-valued metrics taken from `metrics`, one array per run.
desempenho = dict(
    precision_values=metrics['precision'],
    recall_values=metrics['recall'],
    f1_values=metrics['f1_score'],
)

In [33]:
# Sanity check: show the start timestamps recorded by the training loop.
print(start_times)

[1730595312.1578803, 1730595314.690557, 1730595319.6594167]


In [34]:
# Display the precision/recall/F1 arrays gathered for each run before inserting them.
desempenho


{'precision_values': [array([0.33333333, 0.38223938, 0.35223048, 0.37318841, 0.49332146]),
  array([0.42857143, 0.38817006, 0.35288553, 0.37681159, 0.49910555]),
  array([0.375     , 0.3754386 , 0.34945706, 0.3812709 , 0.50045086])],
 'recall_values': [array([0.02173913, 0.31578947, 0.47022333, 0.16068643, 0.66506603]),
  array([0.0326087 , 0.33492823, 0.46277916, 0.16224649, 0.66986795]),
  array([0.0326087 , 0.34130781, 0.43920596, 0.17784711, 0.66626651])],
 'f1_values': [array([0.04081633, 0.34585153, 0.40276302, 0.22464558, 0.56646217]),
  array([0.06060606, 0.35958904, 0.40042941, 0.22682661, 0.57201435]),
  array([0.06      , 0.35756057, 0.38922485, 0.24255319, 0.5715757 ])]}

In [35]:
# Insert the computed values into the database tables.
# env_mode is forwarded as `env` (presumably selects the target database — confirm
# in functions/db_inserts).
insert_values(model, execution, hyperparams, desempenho, env=env_mode)

Executando em dev
Registros inseridos em dev
