In [1]:
import requests
import pandas as pd
from pathlib import Path
import numpy as np
import io
import os
from lightkurve import search_targetpixelfile
import  matplotlib.pyplot as plt
import lightkurve as lk
from tqdm import tqdm
import concurrent.futures
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import (
	accuracy_score,
	classification_report,
	roc_auc_score,
	average_precision_score
)
import tensorflow as tf



pd.set_option('display.max_rows', None)

%matplotlib inline

import random, os
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)



In [2]:

BASE_URL = "https://exoplanetarchive.ipac.caltech.edu/TAP/sync?query="
QUERY = 'SELECT * FROM toi'
requestUrl = BASE_URL + QUERY.replace(' ', '+') + '&format=csv'
toiDf = pd.read_csv(requestUrl)

toiDf.to_csv('toi.csv', index=False)
toiDf.head()



Unnamed: 0,tid,toi,toidisplay,toipfx,ctoi_alias,pl_pnum,tfopwg_disp,st_tmag,st_tmagerr1,st_tmagerr2,...,st_logglim,st_rad,st_raderr1,st_raderr2,st_radsymerr,st_radlim,sectors,toi_created,rowupdate,release_date
0,79748331,1064.01,TOI-1064.01,1064,79748330.0,1,CP,10.0059,0.006,-0.006,...,0,0.737189,0.056703,-0.056703,1,0,,2019-08-16 20:20:42,2024-04-25 10:08:01,2025-09-02 21:30:30
1,79748331,1064.02,TOI-1064.02,1064,79748330.0,2,CP,10.0059,0.006,-0.006,...,0,0.737189,0.056703,-0.056703,1,0,,2019-08-16 20:20:42,2024-04-25 10:08:01,2025-09-02 21:30:30
2,7088246,1065.01,TOI-1065.01,1065,7088246.0,1,KP,13.2851,0.006,-0.006,...,0,1.2,,,1,0,,2019-08-16 20:20:41,2021-10-29 12:59:15,2025-09-02 21:30:30
3,201604954,1066.01,TOI-1066.01,1066,201605000.0,1,KP,11.9707,0.006,-0.006,...,0,1.07,,,1,0,,2019-08-16 20:20:44,2021-10-29 12:59:15,2025-09-02 21:30:30
4,201642601,1067.01,TOI-1067.01,1067,201642600.0,1,KP,13.1686,0.006,-0.006,...,0,0.76701,,,1,0,,2019-08-16 20:20:44,2024-03-14 16:54:32,2025-09-02 21:30:30


In [3]:


TRUE_PLANETS = ['CP', 'KP', 'PC']

FALSE_POSITIVES = ['FP', 'FA']


toi_filtered = toiDf[toiDf['tfopwg_disp'].isin(TRUE_PLANETS + FALSE_POSITIVES)].copy()
toi_filtered['isPlanet'] = 0

toi_filtered.loc[toi_filtered['tfopwg_disp'].isin(TRUE_PLANETS), 'isPlanet'] = 1

toi_filtered['isPlanet'].value_counts()

missing_values = toi_filtered.isnull().sum()
missing_percentage = (missing_values / len(toi_filtered)) * 100


# remove all columns that have 100% missing values
toi_filtered = toi_filtered.loc[:, missing_percentage < 100]


missing_values = toi_filtered.isnull().sum()
missing_percentage = (missing_values / len(toi_filtered)) * 100

print(missing_percentage)


tid                   0.000000
toi                   0.000000
toidisplay            0.000000
toipfx                0.000000
ctoi_alias            0.000000
pl_pnum               0.000000
tfopwg_disp           0.000000
st_tmag               0.000000
st_tmagerr1           0.000000
st_tmagerr2           0.000000
st_tmagsymerr         0.000000
st_tmaglim            0.000000
rastr                 0.000000
ra                    0.000000
decstr                0.000000
dec                   0.000000
st_pmra               1.706201
st_pmraerr1           1.706201
st_pmraerr2           1.706201
st_pmrasymerr         1.706201
st_pmralim            1.706201
st_pmdec              1.706201
st_pmdecerr1          1.706201
st_pmdecerr2          1.706201
st_pmdecsymerr        1.706201
st_pmdeclim           1.706201
pl_tranmid            0.000000
pl_tranmiderr1        0.152587
pl_tranmiderr2        0.152587
pl_tranmidsymerr      0.000000
pl_tranmidlim         0.000000
pl_orbper             1.414898
pl_orbpe

In [4]:
cols_to_remove = ['pl_radeerr1', 'pl_radeerr2','st_disterr1', 'st_disterr2' ,'st_loggerr1', 'st_loggerr2', 'st_raderr1', 'st_raderr2']
toi_removed = toi_filtered.drop(columns=cols_to_remove)

missing_values = toi_removed.isnull().sum()
missing_percentage = (missing_values / len(toi_removed)) * 100
print(missing_percentage)

tid                   0.000000
toi                   0.000000
toidisplay            0.000000
toipfx                0.000000
ctoi_alias            0.000000
pl_pnum               0.000000
tfopwg_disp           0.000000
st_tmag               0.000000
st_tmagerr1           0.000000
st_tmagerr2           0.000000
st_tmagsymerr         0.000000
st_tmaglim            0.000000
rastr                 0.000000
ra                    0.000000
decstr                0.000000
dec                   0.000000
st_pmra               1.706201
st_pmraerr1           1.706201
st_pmraerr2           1.706201
st_pmrasymerr         1.706201
st_pmralim            1.706201
st_pmdec              1.706201
st_pmdecerr1          1.706201
st_pmdecerr2          1.706201
st_pmdecsymerr        1.706201
st_pmdeclim           1.706201
pl_tranmid            0.000000
pl_tranmiderr1        0.152587
pl_tranmiderr2        0.152587
pl_tranmidsymerr      0.000000
pl_tranmidlim         0.000000
pl_orbper             1.414898
pl_orbpe

In [5]:

print(f"Catálogo carregado. Total de {len(toi_filtered)} sinais para processar.")
print(f"Exemplos positivos (isPlanet=1): {len(toi_filtered[toi_filtered['isPlanet'] == 1])}")
print(f"Exemplos negativos (isPlanet=0): {len(toi_filtered[toi_filtered['isPlanet'] == 0])}")

Catálogo carregado. Total de 7209 sinais para processar.
Exemplos positivos (isPlanet=1): 5919
Exemplos negativos (isPlanet=0): 1290


In [6]:
toi_filtered['star_id'] = toi_filtered['toi'].apply(lambda x: str(x)[-2:])

print(toi_filtered['toi'].apply(lambda x: str(x)[-2:]).value_counts())
print(f"Número de estrelas diferentes: {toi_filtered['star_id'].nunique()}")

toi
01    6918
02     225
03      50
04      13
05       3
Name: count, dtype: int64
Número de estrelas diferentes: 5


In [1]:
features_for_snr = {
	'pl_orbper': ('pl_orbpererr1', 'pl_orbpererr2'),
	'pl_trandurh': ('pl_trandurherr1', 'pl_trandurherr2'),
	'pl_trandep': ('pl_trandeperr1', 'pl_trandeperr2'),
	'st_tmag': ('st_tmagerr1', 'st_tmagerr2')
}

data_engineered = toi_filtered.copy()

for feature, (err1_col, err2_col) in features_for_snr.items():
	# Calcula a incerteza média. O erro 2 é negativo por convenção.
	avg_error = (data_engineered[err1_col] - data_engineered[err2_col]) / 2
	
	# Calcula a SNR, evitando divisão por zero
	snr_col_name = f'{feature}_snr'
	data_engineered[snr_col_name] = data_engineered[feature] / avg_error
	data_engineered[snr_col_name].replace([np.inf, -np.inf], 0, inplace=True) # Substitui infinitos por 0

print("Novas colunas de SNR criadas:")
print([col for col in data_engineered.columns if '_snr' in col])


# ETAPA 2: Definir a lista final de colunas para remover
# Agora a lista é bem menor e mais precisa.
final_cols_to_drop = [
	# Identificadores e metadados
	'tid', 'toi', 'toidisplay', 'toipfx', 'ctoi_alias', 'pl_pnum', 'toi_created',
	'rowupdate', 'release_date',
	
	# Coluna fonte do target (evitar data leakage)
	'tfopwg_disp',
	
	# Coordenadas redundantes (formato string)
	'rastr', 'decstr',
	
	# Colunas de erro que já foram usadas para criar a SNR
	'pl_orbpererr1', 'pl_orbpererr2', 'pl_trandurherr1', 'pl_trandurherr2',
	'pl_trandeperr1', 'pl_trandeperr2', 'st_tmagerr1', 'st_tmagerr2',

	# Colunas de erro simétrico (para simplificar, já temos a SNR)
	'st_pmrasymerr', 'st_pmdecsymerr', 'pl_tranmidsymerr', 'pl_orbpersymerr',
	'pl_trandurhsymerr', 'pl_trandepsymerr', 'pl_radesymerr', 'st_distsymerr',
	'st_teffsymerr', 'st_loggsymerr', 'st_radsymerr'
]

# Remover apenas as colunas que existem no DataFrame
final_cols_to_drop_existing = [col for col in final_cols_to_drop if col in data_engineered.columns]
data_for_model = data_engineered.drop(columns=final_cols_to_drop_existing)

print("\nNúmero de features final para o modelo:", data_for_model.shape[1] - 1) # -1 para excluir a coluna 'isPlanet'
print("Features finais (amostra):")
print(data_for_model.columns.tolist())




NameError: name 'toi_filtered' is not defined

In [8]:
data_null_removed = data_for_model.dropna().copy()
print(f"Após remoção de nulos, total de {len(data_for_model)} sinais para processar.")

Após remoção de nulos, total de 7209 sinais para processar.


In [9]:
# columsn with only one unique value
unique_value_cols = [col for col in data_null_removed.columns if data_null_removed[col].nunique() <= 1]
print(f"Colunas com valor único (removidas): {unique_value_cols}")
data_null_removed = data_null_removed.drop(columns=unique_value_cols)

data_null_removed.to_csv('data_for_model.csv', index=False)

data_final = data_null_removed.copy()

Colunas com valor único (removidas): ['st_tmagsymerr', 'st_tmaglim', 'st_pmralim', 'st_pmdeclim', 'pl_tranmidlim', 'pl_orbperlim', 'pl_trandurhlim', 'pl_trandeplim', 'pl_radelim', 'st_distlim', 'st_tefflim', 'st_logglim', 'st_radlim']


In [10]:
missing_values = data_final.isnull().sum()
print(missing_values)

st_tmag            0
ra                 0
dec                0
st_pmra            0
st_pmraerr1        0
st_pmraerr2        0
st_pmdec           0
st_pmdecerr1       0
st_pmdecerr2       0
pl_tranmid         0
pl_tranmiderr1     0
pl_tranmiderr2     0
pl_orbper          0
pl_trandurh        0
pl_trandep         0
pl_rade            0
pl_radeerr1        0
pl_radeerr2        0
pl_insol           0
pl_eqt             0
st_dist            0
st_disterr1        0
st_disterr2        0
st_teff            0
st_tefferr1        0
st_tefferr2        0
st_logg            0
st_loggerr1        0
st_loggerr2        0
st_rad             0
st_raderr1         0
st_raderr2         0
isPlanet           0
star_id            0
pl_orbper_snr      0
pl_trandurh_snr    0
pl_trandep_snr     0
st_tmag_snr        0
dtype: int64


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


X = data_final.drop(columns=['isPlanet', 'star_id'])
y = data_final['isPlanet']


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

groups = data_final['star_id']
print(f"Total de sinais: {len(X)}")
print(f"Total de estrelas únicas (grupos): {len(groups.unique())}")

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Tamanho do conjunto de treino: {X_train_scaled.shape[0]} amostras")
print(f"Tamanho do conjunto de teste: {X_test_scaled.shape[0]} amostras")


# Relação entre classes no conjunto de treino
print("Distribuição das classes no conjunto de treino:")
print(y_train.value_counts(normalize=True))

Total de sinais: 4826
Total de estrelas únicas (grupos): 5
Tamanho do conjunto de treino: 3860 amostras
Tamanho do conjunto de teste: 966 amostras
Distribuição das classes no conjunto de treino:
isPlanet
1    0.821244
0    0.178756
Name: proportion, dtype: float64


In [12]:


count_neg = y_train.value_counts()[0]
count_pos = y_train.value_counts()[1]
scale_pos_weight = count_neg / count_pos

models = {
	"Regressão Logística": LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced'),
	"LDA": LinearDiscriminantAnalysis(),

	'KNN': KNeighborsClassifier(n_neighbors=5),
	'Gradient Boosting': GradientBoostingClassifier(random_state=SEED), 
	"Árvore de Decisão": DecisionTreeClassifier(random_state=42, class_weight='balanced'),
	"Random Forest": RandomForestClassifier(random_state=42, class_weight='balanced'),
	"SVM": SVC(random_state=42, probability=True, class_weight='balanced'),
	"XGBoost": XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss', scale_pos_weight=scale_pos_weight),
	"XGBoost (Equilibrado)": XGBClassifier(
		random_state=42,
		use_label_encoder=False,
		eval_metric='logloss',
		scale_pos_weight=scale_pos_weight,
		n_estimators=600,
		learning_rate=0.04,
		max_depth=6,
		subsample=0.8,
		colsample_bytree=0.8
	),
	"LightGBM": LGBMClassifier(random_state=42, class_weight='balanced'),
	"LightGBM (Tunado v3)": LGBMClassifier(
		random_state=42,
		class_weight='balanced',
		n_estimators=500,
		learning_rate=0.05,
	),
}


In [13]:
# %% [markdown]
# ## 🚀 Busca de Hiperparâmetros com Optuna (Holdout + Early Stopping)

# %%
import optuna
from optuna.integration import XGBoostPruningCallback
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score
from optuna.pruners import MedianPruner
# 1. Criar um conjunto de validação DENTRO do conjunto de treino principal
# Este split será usado para cada trial do Optuna para avaliar os hiperparâmetros
X_train_opt, X_val_opt, y_train_opt, y_val_opt = train_test_split(
	X_train_scaled, y_train, 
	test_size=0.25, # Usaremos 25% dos dados de treino para validação em cada trial
	random_state=SEED, 
	stratify=y_train
)

# 2. Definir a função objetivo para o Optuna
def objective(trial, X_train, y_train, X_val, y_val, scale_pos_weight):
	"""
	Função que o Optuna irá otimizar.
	Treina um modelo XGBoost com os hiperparâmetros sugeridos em um split de treino/validação
	e retorna a pontuação AUC PRC no conjunto de validação.
	"""
	# Espaço de busca de hiperparâmetros
	param_grid = {
		'n_estimators': 2000, # Definimos um valor alto, o early stopping encontrará o ideal
		'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
		'max_depth': trial.suggest_int('max_depth', 3, 10),
		'subsample': trial.suggest_float('subsample', 0.5, 1.0),
		'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
		'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
		'lambda': trial.suggest_float('lambda', 1e-8, 5.0, log=True), # L2 Regularization
		'alpha': trial.suggest_float('alpha', 1e-8, 5.0, log=True),   # L1 Regularization
		'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
	}
	pruning_callback = XGBoostPruningCallback(trial, "validation_0-auc")
	model = XGBClassifier(
		**param_grid,
		random_state=SEED,
		scale_pos_weight=scale_pos_weight,
		eval_metric='auc', # Métrica para o early stopping
		n_jobs=-1,
		callbacks=[pruning_callback],
		early_stopping_rounds=100
	)
	
	# Callback para Pruning (poda) e Early Stopping integrados
   

	# Treina o modelo
	model.fit(
		X_train, y_train,
		eval_set=[(X_val, y_val)],
		
		verbose=False # Desativa os logs de cada rodada de treino
	)
	
	preds_proba = model.predict_proba(X_val)[:, 1]

	# MUDANÇA 3: A pontuação final do trial agora é calculada com roc_auc_score.
	roc_auc = roc_auc_score(y_val, preds_proba)

	trial.set_user_attr(key="best_iteration", value=model.best_iteration)

	return roc_auc

# 3. Configurar e executar o estudo do Optuna
print("--- Iniciando busca de hiperparâmetros com Optuna (estratégia holdout) ---")

# Sampler avançado que aprende correlações entre hiperparâmetros
sampler = optuna.samplers.TPESampler(multivariate=True, seed=SEED)


study = optuna.create_study(
	direction="maximize",
	sampler=sampler
)

# Executar a otimização
study.optimize(
	lambda trial: objective(trial, X_train_opt, y_train_opt, X_val_opt, y_val_opt, scale_pos_weight),
	n_trials=150,  # Aumentei um pouco os trials, pois a execução é mais rápida
	timeout=1800, # Tempo máximo em segundos (10 minutos)
	n_jobs=1, # XGBoost já usa todos os cores, então deixamos o Optuna com 1
	show_progress_bar=True
)

# 4. Exibir os resultados e criar o melhor modelo
print("\n--- Busca de Hiperparâmetros Concluída ---")
# MUDANÇA na string do print para refletir a nova métrica
print(f"Melhor pontuação (AUC ROC) na validação: {study.best_value:.4f}")
print("Melhores hiperparâmetros encontrados:")
print(study.best_params)

best_params_final = study.best_params
best_params_final['n_estimators'] = study.best_trial.user_attrs['best_iteration']

print(f"Número ideal de estimadores (n_estimators) encontrado: {best_params_final['n_estimators']}")

xgb_optuna_model = XGBClassifier(
    **best_params_final,
    random_state=SEED,
    eval_metric='logloss',
    scale_pos_weight=scale_pos_weight
)

# Dando um nome novo para não confundir com o anterior
models["XGBoost (Optuna AUC ROC)"] = xgb_optuna_model
print("\nModelo 'XGBoost (Optuna AUC ROC)' adicionado à lista para avaliação final.")


  from .autonotebook import tqdm as notebook_tqdm
[I 2025-09-19 22:17:37,262] A new study created in memory with name: no-name-0173bd9c-a172-49a8-8bc0-900c5b914397


--- Iniciando busca de hiperparâmetros com Optuna (estratégia holdout) ---


Best trial: 0. Best value: 0.886531:   1%|          | 1/150 [00:01<03:49,  1.54s/it, 1.53/1800 seconds]

[I 2025-09-19 22:17:38,798] Trial 0 finished with value: 0.8865314999708063 and parameters: {'learning_rate': 0.008468008575248327, 'max_depth': 10, 'subsample': 0.8659969709057025, 'colsample_bytree': 0.7993292420985183, 'gamma': 1.77071686435378e-07, 'lambda': 2.275053705838343e-07, 'alpha': 3.200866785899844e-08, 'min_child_weight': 9}. Best is trial 0 with value: 0.8865314999708063.


Best trial: 1. Best value: 0.887283:   1%|▏         | 2/150 [00:02<02:32,  1.03s/it, 2.21/1800 seconds]

[I 2025-09-19 22:17:39,469] Trial 1 finished with value: 0.8872832369942196 and parameters: {'learning_rate': 0.030834348179355788, 'max_depth': 8, 'subsample': 0.5102922471479012, 'colsample_bytree': 0.9849549260809971, 'gamma': 0.04566054873446119, 'lambda': 7.032853236588588e-07, 'alpha': 3.8167167932852713e-07, 'min_child_weight': 2}. Best is trial 1 with value: 0.8872832369942196.


Best trial: 1. Best value: 0.887283:   2%|▏         | 3/150 [00:03<03:03,  1.25s/it, 3.72/1800 seconds]

[I 2025-09-19 22:17:40,983] Trial 2 finished with value: 0.8860571028201086 and parameters: {'learning_rate': 0.005670807781371429, 'max_depth': 7, 'subsample': 0.7159725093210578, 'colsample_bytree': 0.645614570099021, 'gamma': 0.000784915956255507, 'lambda': 1.634755885510359e-07, 'alpha': 3.4782238967388167e-06, 'min_child_weight': 4}. Best is trial 1 with value: 0.8872832369942196.


Best trial: 3. Best value: 0.89121:   3%|▎         | 4/150 [00:05<03:04,  1.27s/it, 5.01/1800 seconds] 

[I 2025-09-19 22:17:42,275] Trial 3 finished with value: 0.8912097857184563 and parameters: {'learning_rate': 0.013481575603601416, 'max_depth': 9, 'subsample': 0.5998368910791798, 'colsample_bytree': 0.7571172192068059, 'gamma': 0.0005486767416600901, 'lambda': 2.535541638745669e-08, 'alpha': 0.0019275890163896973, 'min_child_weight': 2}. Best is trial 3 with value: 0.8912097857184563.


Best trial: 3. Best value: 0.89121:   7%|▋         | 11/150 [00:10<01:41,  1.37it/s, 10.01/1800 seconds]

[I 2025-09-19 22:17:47,105] Trial 4 finished with value: 0.8842689904828633 and parameters: {'learning_rate': 0.0014492412389916862, 'max_depth': 10, 'subsample': 0.9828160165372797, 'colsample_bytree': 0.9041986740582306, 'gamma': 2.734702913886802e-06, 'lambda': 7.073702489270826e-08, 'alpha': 0.00895617505524071, 'min_child_weight': 5}. Best is trial 3 with value: 0.8912097857184563.
[I 2025-09-19 22:17:47,127] Trial 5 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:47,153] Trial 6 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:47,172] Trial 7 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:47,195] Trial 8 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:47,231] Trial 9 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:47,278] Trial 10 pruned. Trial was pruned at iteration 0.


Best trial: 3. Best value: 0.89121:  12%|█▏        | 18/150 [00:10<00:37,  3.52it/s, 10.24/1800 seconds]

[I 2025-09-19 22:17:47,312] Trial 11 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:47,349] Trial 12 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:47,386] Trial 13 pruned. Trial was pruned at iteration 2.
[I 2025-09-19 22:17:47,421] Trial 14 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:47,449] Trial 15 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:47,473] Trial 16 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:47,502] Trial 17 pruned. Trial was pruned at iteration 0.


Best trial: 3. Best value: 0.89121:  15%|█▍        | 22/150 [00:10<00:26,  4.85it/s, 10.37/1800 seconds]

[I 2025-09-19 22:17:47,535] Trial 18 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:47,576] Trial 19 pruned. Trial was pruned at iteration 2.
[I 2025-09-19 22:17:47,602] Trial 20 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:47,631] Trial 21 pruned. Trial was pruned at iteration 0.


Best trial: 3. Best value: 0.89121:  18%|█▊        | 27/150 [00:10<00:15,  8.03it/s, 10.65/1800 seconds]

[I 2025-09-19 22:17:47,747] Trial 22 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:47,788] Trial 23 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:47,834] Trial 24 pruned. Trial was pruned at iteration 5.
[I 2025-09-19 22:17:47,877] Trial 25 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:47,916] Trial 26 pruned. Trial was pruned at iteration 0.


Best trial: 3. Best value: 0.89121:  21%|██▏       | 32/150 [00:10<00:11,  9.99it/s, 10.84/1800 seconds]

[I 2025-09-19 22:17:47,949] Trial 27 pruned. Trial was pruned at iteration 1.
[I 2025-09-19 22:17:48,009] Trial 28 pruned. Trial was pruned at iteration 3.
[I 2025-09-19 22:17:48,043] Trial 29 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:48,071] Trial 30 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:48,100] Trial 31 pruned. Trial was pruned at iteration 0.


Best trial: 3. Best value: 0.89121:  26%|██▌       | 39/150 [00:11<00:08, 13.53it/s, 11.29/1800 seconds]

[I 2025-09-19 22:17:48,375] Trial 32 pruned. Trial was pruned at iteration 88.
[I 2025-09-19 22:17:48,403] Trial 33 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:48,440] Trial 34 pruned. Trial was pruned at iteration 3.
[I 2025-09-19 22:17:48,468] Trial 35 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:48,494] Trial 36 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:48,523] Trial 37 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:48,552] Trial 38 pruned. Trial was pruned at iteration 0.


Best trial: 3. Best value: 0.89121:  29%|██▉       | 44/150 [00:11<00:06, 17.15it/s, 11.53/1800 seconds]

[I 2025-09-19 22:17:48,592] Trial 39 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:48,644] Trial 40 pruned. Trial was pruned at iteration 1.
[I 2025-09-19 22:17:48,691] Trial 41 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:48,729] Trial 42 pruned. Trial was pruned at iteration 2.
[I 2025-09-19 22:17:48,789] Trial 43 pruned. Trial was pruned at iteration 1.


Best trial: 3. Best value: 0.89121:  33%|███▎      | 49/150 [00:11<00:05, 17.40it/s, 11.76/1800 seconds]

[I 2025-09-19 22:17:48,857] Trial 44 pruned. Trial was pruned at iteration 5.
[I 2025-09-19 22:17:48,909] Trial 45 pruned. Trial was pruned at iteration 2.
[I 2025-09-19 22:17:48,955] Trial 46 pruned. Trial was pruned at iteration 2.
[I 2025-09-19 22:17:48,984] Trial 47 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:49,028] Trial 48 pruned. Trial was pruned at iteration 0.


Best trial: 3. Best value: 0.89121:  36%|███▌      | 54/150 [00:11<00:04, 19.93it/s, 11.98/1800 seconds]

[I 2025-09-19 22:17:49,068] Trial 49 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:49,112] Trial 50 pruned. Trial was pruned at iteration 2.
[I 2025-09-19 22:17:49,160] Trial 51 pruned. Trial was pruned at iteration 5.
[I 2025-09-19 22:17:49,204] Trial 52 pruned. Trial was pruned at iteration 5.
[I 2025-09-19 22:17:49,247] Trial 53 pruned. Trial was pruned at iteration 1.


Best trial: 3. Best value: 0.89121:  38%|███▊      | 57/150 [00:12<00:04, 20.79it/s, 12.11/1800 seconds]

[I 2025-09-19 22:17:49,285] Trial 54 pruned. Trial was pruned at iteration 1.
[I 2025-09-19 22:17:49,333] Trial 55 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:49,375] Trial 56 pruned. Trial was pruned at iteration 0.


Best trial: 3. Best value: 0.89121:  42%|████▏     | 63/150 [00:12<00:05, 16.21it/s, 12.61/1800 seconds]

[I 2025-09-19 22:17:49,689] Trial 57 finished with value: 0.8682526128335376 and parameters: {'learning_rate': 0.002870981104010888, 'max_depth': 10, 'subsample': 0.7382099683900285, 'colsample_bytree': 0.5447612700486453, 'gamma': 0.0005629206816224073, 'lambda': 1.0716315429112022e-08, 'alpha': 1.012600029854767e-06, 'min_child_weight': 4}. Best is trial 3 with value: 0.8912097857184563.
[I 2025-09-19 22:17:49,710] Trial 58 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:49,754] Trial 59 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:49,802] Trial 60 pruned. Trial was pruned at iteration 1.
[I 2025-09-19 22:17:49,830] Trial 61 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:49,872] Trial 62 pruned. Trial was pruned at iteration 0.


Best trial: 3. Best value: 0.89121:  45%|████▌     | 68/150 [00:12<00:04, 20.11it/s, 12.80/1800 seconds]

[I 2025-09-19 22:17:49,905] Trial 63 pruned. Trial was pruned at iteration 2.
[I 2025-09-19 22:17:49,957] Trial 64 pruned. Trial was pruned at iteration 2.
[I 2025-09-19 22:17:49,983] Trial 65 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:50,019] Trial 66 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:50,062] Trial 67 pruned. Trial was pruned at iteration 0.


Best trial: 3. Best value: 0.89121:  48%|████▊     | 72/150 [00:13<00:04, 18.48it/s, 13.03/1800 seconds]

[I 2025-09-19 22:17:50,107] Trial 68 pruned. Trial was pruned at iteration 2.
[I 2025-09-19 22:17:50,212] Trial 69 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:50,256] Trial 70 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:50,288] Trial 71 pruned. Trial was pruned at iteration 0.


Best trial: 3. Best value: 0.89121:  53%|█████▎    | 79/150 [00:13<00:02, 24.48it/s, 13.24/1800 seconds]

[I 2025-09-19 22:17:50,323] Trial 72 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:50,357] Trial 73 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:50,388] Trial 74 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:50,414] Trial 75 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:50,442] Trial 76 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:50,470] Trial 77 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:50,506] Trial 78 pruned. Trial was pruned at iteration 0.


Best trial: 3. Best value: 0.89121:  57%|█████▋    | 85/150 [00:13<00:02, 26.15it/s, 13.45/1800 seconds]

[I 2025-09-19 22:17:50,536] Trial 79 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:50,564] Trial 80 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:50,600] Trial 81 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:50,632] Trial 82 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:50,681] Trial 83 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:50,716] Trial 84 pruned. Trial was pruned at iteration 0.


Best trial: 3. Best value: 0.89121:  61%|██████    | 91/150 [00:13<00:02, 25.31it/s, 13.70/1800 seconds]

[I 2025-09-19 22:17:50,770] Trial 85 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:50,807] Trial 86 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:50,844] Trial 87 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:50,873] Trial 88 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:50,929] Trial 89 pruned. Trial was pruned at iteration 7.
[I 2025-09-19 22:17:50,964] Trial 90 pruned. Trial was pruned at iteration 0.


Best trial: 3. Best value: 0.89121:  65%|██████▍   | 97/150 [00:13<00:02, 25.54it/s, 13.94/1800 seconds]

[I 2025-09-19 22:17:51,005] Trial 91 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:51,047] Trial 92 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:51,094] Trial 93 pruned. Trial was pruned at iteration 1.
[I 2025-09-19 22:17:51,126] Trial 94 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:51,166] Trial 95 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:51,201] Trial 96 pruned. Trial was pruned at iteration 1.


Best trial: 3. Best value: 0.89121:  69%|██████▊   | 103/150 [00:14<00:01, 25.33it/s, 14.18/1800 seconds]

[I 2025-09-19 22:17:51,259] Trial 97 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:51,299] Trial 98 pruned. Trial was pruned at iteration 2.
[I 2025-09-19 22:17:51,334] Trial 99 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:51,362] Trial 100 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:51,400] Trial 101 pruned. Trial was pruned at iteration 1.
[I 2025-09-19 22:17:51,444] Trial 102 pruned. Trial was pruned at iteration 0.


Best trial: 3. Best value: 0.89121:  73%|███████▎  | 110/150 [00:14<00:01, 27.22it/s, 14.43/1800 seconds]

[I 2025-09-19 22:17:51,495] Trial 103 pruned. Trial was pruned at iteration 5.
[I 2025-09-19 22:17:51,538] Trial 104 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:51,573] Trial 105 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:51,603] Trial 106 pruned. Trial was pruned at iteration 1.
[I 2025-09-19 22:17:51,630] Trial 107 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:51,659] Trial 108 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:51,692] Trial 109 pruned. Trial was pruned at iteration 0.


Best trial: 3. Best value: 0.89121:  77%|███████▋  | 115/150 [00:14<00:01, 23.40it/s, 14.66/1800 seconds]

[I 2025-09-19 22:17:51,736] Trial 110 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:51,780] Trial 111 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:51,868] Trial 112 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:51,897] Trial 113 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:51,927] Trial 114 pruned. Trial was pruned at iteration 0.


Best trial: 3. Best value: 0.89121:  81%|████████  | 121/150 [00:14<00:01, 25.90it/s, 14.89/1800 seconds]

[I 2025-09-19 22:17:51,958] Trial 115 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:52,007] Trial 116 pruned. Trial was pruned at iteration 2.
[I 2025-09-19 22:17:52,041] Trial 117 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:52,074] Trial 118 pruned. Trial was pruned at iteration 1.
[I 2025-09-19 22:17:52,107] Trial 119 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:52,151] Trial 120 pruned. Trial was pruned at iteration 0.


Best trial: 3. Best value: 0.89121:  85%|████████▌ | 128/150 [00:15<00:00, 27.96it/s, 15.13/1800 seconds]

[I 2025-09-19 22:17:52,198] Trial 121 pruned. Trial was pruned at iteration 3.
[I 2025-09-19 22:17:52,238] Trial 122 pruned. Trial was pruned at iteration 1.
[I 2025-09-19 22:17:52,281] Trial 123 pruned. Trial was pruned at iteration 5.
[I 2025-09-19 22:17:52,308] Trial 124 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:52,336] Trial 125 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:52,361] Trial 126 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:52,390] Trial 127 pruned. Trial was pruned at iteration 0.


Best trial: 3. Best value: 0.89121:  88%|████████▊ | 132/150 [00:15<00:00, 23.74it/s, 15.35/1800 seconds]

[I 2025-09-19 22:17:52,443] Trial 128 pruned. Trial was pruned at iteration 5.
[I 2025-09-19 22:17:52,503] Trial 129 pruned. Trial was pruned at iteration 3.
[I 2025-09-19 22:17:52,571] Trial 130 pruned. Trial was pruned at iteration 11.
[I 2025-09-19 22:17:52,610] Trial 131 pruned. Trial was pruned at iteration 2.


                                                                                                         

[I 2025-09-19 22:17:52,645] Trial 132 pruned. Trial was pruned at iteration 3.
[I 2025-09-19 22:17:52,705] Trial 133 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:52,743] Trial 134 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:52,776] Trial 135 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:52,808] Trial 136 pruned. Trial was pruned at iteration 0.


Best trial: 3. Best value: 0.89121:  93%|█████████▎| 139/150 [00:15<00:00, 24.73it/s, 15.62/1800 seconds]

[I 2025-09-19 22:17:52,846] Trial 137 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:52,884] Trial 138 pruned. Trial was pruned at iteration 0.


Best trial: 3. Best value: 0.89121:  97%|█████████▋| 145/150 [00:16<00:00, 17.72it/s, 16.09/1800 seconds]

[I 2025-09-19 22:17:53,169] Trial 139 finished with value: 0.8665849243883925 and parameters: {'learning_rate': 0.002037892829467389, 'max_depth': 10, 'subsample': 0.738112156547138, 'colsample_bytree': 0.5430633103790484, 'gamma': 5.0333246081208945e-05, 'lambda': 5.33391689515292e-08, 'alpha': 1.1942281505847586e-07, 'min_child_weight': 5}. Best is trial 3 with value: 0.8912097857184563.
[I 2025-09-19 22:17:53,214] Trial 140 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:53,246] Trial 141 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:53,288] Trial 142 pruned. Trial was pruned at iteration 2.
[I 2025-09-19 22:17:53,323] Trial 143 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:53,356] Trial 144 pruned. Trial was pruned at iteration 0.


Best trial: 3. Best value: 0.89121: 100%|██████████| 150/150 [00:16<00:00,  9.21it/s, 16.28/1800 seconds]

[I 2025-09-19 22:17:53,396] Trial 145 pruned. Trial was pruned at iteration 3.
[I 2025-09-19 22:17:53,438] Trial 146 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:53,464] Trial 147 pruned. Trial was pruned at iteration 1.
[I 2025-09-19 22:17:53,494] Trial 148 pruned. Trial was pruned at iteration 0.
[I 2025-09-19 22:17:53,539] Trial 149 pruned. Trial was pruned at iteration 5.

--- Busca de Hiperparâmetros Concluída ---
Melhor pontuação (AUC ROC) na validação: 0.8912
Melhores hiperparâmetros encontrados:
{'learning_rate': 0.013481575603601416, 'max_depth': 9, 'subsample': 0.5998368910791798, 'colsample_bytree': 0.7571172192068059, 'gamma': 0.0005486767416600901, 'lambda': 2.535541638745669e-08, 'alpha': 0.0019275890163896973, 'min_child_weight': 2}
Número ideal de estimadores (n_estimators) encontrado: 461

Modelo 'XGBoost (Optuna AUC ROC)' adicionado à lista para avaliação final.





In [14]:
print(f"Melhor pontuação (AUC ROC) na validação: {study.best_value:.4f}")
print("Melhores hiperparâmetros encontrados:")
print(study.best_params)

print(f"Número ideal de estimadores (n_estimators) encontrado: {study.best_trial.user_attrs['best_iteration']}")

Melhor pontuação (AUC ROC) na validação: 0.8912
Melhores hiperparâmetros encontrados:
{'learning_rate': 0.013481575603601416, 'max_depth': 9, 'subsample': 0.5998368910791798, 'colsample_bytree': 0.7571172192068059, 'gamma': 0.0005486767416600901, 'lambda': 2.535541638745669e-08, 'alpha': 0.0019275890163896973, 'min_child_weight': 2}
Número ideal de estimadores (n_estimators) encontrado: 461


In [15]:

# Lista para armazenar os resultados de cada modelo
results = []

# Loop para treinar e avaliar cada modelo
for name, model in models.items():
	print(f"Treinando o modelo: {name}...")

	# Treinar o modelo
	model.fit(X_train_scaled, y_train)

	# Fazer previsões no conjunto de teste
	y_pred = model.predict(X_test_scaled)
	y_pred_proba = model.predict_proba(X_test_scaled)[:, 1] # Probabilidades para a classe positiva (1)

	# Calcular as métricas
	accuracy = accuracy_score(y_test, y_pred)
	report = classification_report(y_test, y_pred, output_dict=True)
	roc_auc = roc_auc_score(y_test, y_pred_proba)
	prc_auc = average_precision_score(y_test, y_pred_proba)

	# Armazenar os resultados
	results.append({
		"Modelo": name,
		"AUC ROC": roc_auc,
		"AUC PRC": prc_auc,
		"Acurácia": accuracy,
		"Precisão (Planeta)": report['1']['precision'],
		"Recall (Planeta)": report['1']['recall'],
		"Precisão (FP)": report['0']['precision'],
		"Recall (FP)": report['0']['recall'],
		"F1-Score (Planeta)": report['1']['f1-score'],
		"F1-Score (FP)": report['0']['f1-score'],
		
	})

print("\nTodos os modelos foram treinados e avaliados.")

Treinando o modelo: Regressão Logística...
Treinando o modelo: LDA...
Treinando o modelo: KNN...
Treinando o modelo: Gradient Boosting...
Treinando o modelo: Árvore de Decisão...
Treinando o modelo: Random Forest...
Treinando o modelo: SVM...
Treinando o modelo: XGBoost...
Treinando o modelo: XGBoost (Equilibrado)...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Treinando o modelo: LightGBM...
[LightGBM] [Info] Number of positive: 3170, number of negative: 690
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000472 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8703
[LightGBM] [Info] Number of data points in the train set: 3860, number of used features: 36
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Treinando o modelo: LightGBM (Tunado v3)...
[LightGBM] [Info] Number of positive: 3170, number of negative: 690
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000365 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8703
[LightGBM] [Info] Number of data points in the train set: 3860, number of used features: 36
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[



Treinando o modelo: XGBoost (Optuna AUC ROC)...

Todos os modelos foram treinados e avaliados.


In [16]:
# %%
# Criar um DataFrame com os resultados
results_df = pd.DataFrame(results)

# Ordenar o DataFrame pelo AUC ROC para ver os melhores modelos primeiro
results_df_sorted = results_df.sort_values(by="AUC ROC", ascending=False)

# Exibir a tabela de resultados formatada
display(results_df_sorted.style.format({
	'Acurácia': '{:.4f}',
	'Acurácia (FP)': '{:.4f}',
	'Precisão (Planeta)': '{:.4f}',
	'Recall (Planeta)': '{:.4f}',
	'Precisão (FP)': '{:.4f}',
	'Recall (FP)': '{:.4f}',
	'F1-Score (Planeta)': '{:.4f}',
	'F1-Score (FP)': '{:.4f}',
	'AUC ROC': '{:.4f}',
	'AUC PRC': '{:.4f}'
}).background_gradient(cmap='viridis', subset=['Acurácia', 'F1-Score (Planeta)', 'AUC ROC', 'AUC PRC']))



Unnamed: 0,Modelo,AUC ROC,AUC PRC,Acurácia,Precisão (Planeta),Recall (Planeta),Precisão (FP),Recall (FP),F1-Score (Planeta),F1-Score (FP)
8,XGBoost (Equilibrado),0.8883,0.97,0.8727,0.9198,0.9256,0.6488,0.6301,0.9227,0.6393
11,XGBoost (Optuna AUC ROC),0.8869,0.9696,0.8571,0.9215,0.9029,0.5926,0.6474,0.9121,0.6188
9,LightGBM,0.8784,0.9677,0.8675,0.9204,0.918,0.6286,0.6358,0.9192,0.6322
5,Random Forest,0.8781,0.966,0.8696,0.8751,0.9811,0.8052,0.3584,0.9251,0.496
10,LightGBM (Tunado v3),0.8766,0.9654,0.8706,0.9103,0.9344,0.6579,0.578,0.9222,0.6154
3,Gradient Boosting,0.8759,0.9674,0.8706,0.8866,0.966,0.7353,0.4335,0.9246,0.5455
7,XGBoost,0.8739,0.9665,0.8613,0.9093,0.9231,0.6211,0.578,0.9161,0.5988
6,SVM,0.8459,0.9534,0.8199,0.9269,0.8474,0.4979,0.6936,0.8854,0.5797
1,LDA,0.7906,0.9349,0.8468,0.8644,0.9647,0.6543,0.3064,0.9118,0.4173
0,Regressão Logística,0.79,0.9351,0.7588,0.9118,0.7818,0.3951,0.6532,0.8418,0.4924
