In [1]:
# Imports
import pandas as pd
import h2o
from h2o.automl import H2OAutoML

In [2]:
# Inicialização do H2O
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,3 mins 41 secs
H2O_cluster_timezone:,America/Sao_Paulo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.2
H2O_cluster_version_age:,8 days
H2O_cluster_name:,H2O_from_python_jiran_06bvzc
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.975 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


In [3]:
# Passo 1 - Importar base de dados
clientes = pd.read_csv("clientes.csv") # Importa base de dados

# Convertendo DataFrame Pandas para H2OFrame
clientes_h2o = h2o.H2OFrame(clientes)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [4]:
# Passo 2 - Preparar base de dados
    # 2.1 Verificar dados do Dataset
print(clientes_h2o.shape) # Checando informações, linhas e colunas
print(clientes_h2o.types) # Tipos de dados

(100000, 25)
{'id_cliente': 'int', 'mes': 'int', 'idade': 'int', 'profissao': 'enum', 'salario_anual': 'real', 'num_contas': 'int', 'num_cartoes': 'int', 'juros_emprestimo': 'int', 'num_emprestimos': 'int', 'dias_atraso': 'int', 'num_pagamentos_atrasados': 'int', 'num_verificacoes_credito': 'int', 'mix_credito': 'enum', 'divida_total': 'real', 'taxa_uso_credito': 'real', 'idade_historico_credito': 'int', 'investimento_mensal': 'real', 'comportamento_pagamento': 'enum', 'saldo_final_mes': 'real', 'score_credito': 'enum', 'emprestimo_carro': 'int', 'emprestimo_casa': 'int', 'emprestimo_pessoal': 'int', 'emprestimo_credito': 'int', 'emprestimo_estudantil': 'int'}


In [5]:
# Passo 3 - Criar modelo de score - Bom, Médio, Ruim
    # 3.1 Selecionar as colunas de treino e de teste
x = clientes_h2o.columns[:-2]
y = 'score_credito'

In [10]:
# Passo 4 - Treinar 10 modelos
aml = H2OAutoML(max_models=10, seed=1)  # Tempo máximo de execução de 5 minutos (300 segundos)
aml.train(x=x, y=y, training_frame=clientes_h2o)

AutoML progress: |█
19:53:59.758: AutoML: XGBoost is not available; skipping it.

██████████████████████████████████████████████████████████████| (done) 100%


key,value
Stacking strategy,cross_validation
Number of base models (used / total),10/10
# GBM base models (used / total),6/6
# DRF base models (used / total),2/2
# DeepLearning base models (used / total),1/1
# GLM base models (used / total),1/1
Metalearner algorithm,GLM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5
Metalearner fold_column,

Good,Poor,Standard,Error,Rate
1831.0,0.0,0.0,0.0,0 / 1.831
0.0,2823.0,1.0,0.0003541,1 / 2.824
82.0,182.0,5039.0,0.0497831,264 / 5.303
1913.0,3005.0,5040.0,0.0266118,265 / 9.958

k,hit_ratio
1,0.9733882
2,1.0
3,1.0

Good,Poor,Standard,Error,Rate
15856.0,21.0,1951.0,0.1106125,1.972 / 17.828
15.0,26207.0,2776.0,0.096248,2.791 / 28.998
3811.0,6047.0,43316.0,0.1853914,9.858 / 53.174
19682.0,32275.0,48043.0,0.14621,14.621 / 100.000

k,hit_ratio
1,0.85379
2,0.99636
3,1.0

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.8537831,0.0025873,0.853324,0.8532256,0.8561004,0.8499673,0.8562979
aic,,0.0,,,,,
auc,,0.0,,,,,
err,0.1462169,0.0025873,0.146676,0.1467743,0.1438995,0.1500327,0.1437021
err_count,2924.2,43.47068,2930.0,2944.0,2899.0,2981.0,2867.0
loglikelihood,0.0,0.0,0.0,0.0,0.0,0.0,0.0
logloss,0.364374,0.0048811,0.3658502,0.3662918,0.3575234,0.3703765,0.3618278
max_per_class_error,0.1854023,0.0035769,0.1884389,0.1863418,0.1818945,0.1889535,0.1813827
mean_per_class_accuracy,0.8692308,0.0028292,0.8683327,0.8692318,0.8712199,0.8650339,0.8723355
mean_per_class_error,0.1307692,0.0028292,0.1316673,0.1307682,0.1287801,0.1349661,0.1276644


In [11]:
# Passo 5 - Verificar melhor modelo
lb = aml.leaderboard
print(lb)

model_id                                                   mean_per_class_error    logloss      rmse       mse
StackedEnsemble_AllModels_1_AutoML_2_20240521_195359                   0.130751   0.364358  0.335741  0.112722
StackedEnsemble_BestOfFamily_1_AutoML_2_20240521_195359                0.140602   0.376652  0.341423  0.116569
GBM_1_AutoML_2_20240521_195359                                         0.164195   0.408581  0.358363  0.128424
GBM_4_AutoML_2_20240521_195359                                         0.16473    0.410183  0.357215  0.127603
DRF_1_AutoML_2_20240521_195359                                         0.166764   0.439552  0.375616  0.141087
GBM_grid_1_AutoML_2_20240521_195359_model_1                            0.168257   0.407445  0.354949  0.125988
GBM_3_AutoML_2_20240521_195359                                         0.171715   0.429772  0.369455  0.136497
GBM_2_AutoML_2_20240521_195359                                         0.182708   0.452979  0.381935  0.145874
X

In [12]:
# Passo 6 - Usar modelo em cenário real
novos_clientes = pd.read_csv("novos_clientes.csv")
novos_clientes_h2o = h2o.H2OFrame(novos_clientes)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [13]:
# Realizando previsões
previsoes = aml.predict(novos_clientes_h2o)

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%




In [14]:
# Convertendo resultados para DataFrame Pandas
previsoes_df = previsoes.as_data_frame()


with h2o.utils.threading.local_context(polars_enabled=True, datatable_enabled=True):
    pandas_df = h2o_df.as_data_frame()



In [54]:
# Passo 7 - Resultado
resultado = pd.DataFrame({'Codigo do Cliente': novos_clientes.index.tolist(), 'Previsao Cliente': previsoes_df['predict']})
print(resultado)

   Codigo do Cliente Previsao Cliente
0                  0             Poor
1                  1             Poor
2                  2         Standard
