In [217]:
import requests
import pandas as pd
from pathlib import Path
import numpy as np
import io
import os
from lightkurve import search_targetpixelfile
import  matplotlib.pyplot as plt
import lightkurve as lk
from tqdm import tqdm
import concurrent.futures
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import (
	accuracy_score,
	classification_report,
	roc_auc_score,
	average_precision_score
)
import tensorflow as tf



pd.set_option('display.max_rows', None)

%matplotlib inline

import random, os
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)

In [218]:

BASE_URL = "https://exoplanetarchive.ipac.caltech.edu/TAP/sync?query="
QUERY = 'SELECT * FROM toi'
requestUrl = BASE_URL + QUERY.replace(' ', '+') + '&format=csv'
toiDf = pd.read_csv(requestUrl)

toiDf.to_csv('toi.csv', index=False)
toiDf.head()



Unnamed: 0,tid,toi,toidisplay,toipfx,ctoi_alias,pl_pnum,tfopwg_disp,st_tmag,st_tmagerr1,st_tmagerr2,...,st_logglim,st_rad,st_raderr1,st_raderr2,st_radsymerr,st_radlim,sectors,toi_created,rowupdate,release_date
0,79748331,1064.01,TOI-1064.01,1064,79748330.0,1,CP,10.0059,0.006,-0.006,...,0,0.737189,0.056703,-0.056703,1,0,,2019-08-16 20:20:42,2024-04-25 10:08:01,2025-09-02 21:30:30
1,79748331,1064.02,TOI-1064.02,1064,79748330.0,2,CP,10.0059,0.006,-0.006,...,0,0.737189,0.056703,-0.056703,1,0,,2019-08-16 20:20:42,2024-04-25 10:08:01,2025-09-02 21:30:30
2,7088246,1065.01,TOI-1065.01,1065,7088246.0,1,KP,13.2851,0.006,-0.006,...,0,1.2,,,1,0,,2019-08-16 20:20:41,2021-10-29 12:59:15,2025-09-02 21:30:30
3,201604954,1066.01,TOI-1066.01,1066,201605000.0,1,KP,11.9707,0.006,-0.006,...,0,1.07,,,1,0,,2019-08-16 20:20:44,2021-10-29 12:59:15,2025-09-02 21:30:30
4,201642601,1067.01,TOI-1067.01,1067,201642600.0,1,KP,13.1686,0.006,-0.006,...,0,0.76701,,,1,0,,2019-08-16 20:20:44,2024-03-14 16:54:32,2025-09-02 21:30:30


In [219]:


TRUE_PLANETS = ['CP', 'KP', 'PC']

FALSE_POSITIVES = ['FP', 'FA']


toi_filtered = toiDf[toiDf['tfopwg_disp'].isin(TRUE_PLANETS + FALSE_POSITIVES)].copy()
toi_filtered['isPlanet'] = 0

toi_filtered.loc[toi_filtered['tfopwg_disp'].isin(TRUE_PLANETS), 'isPlanet'] = 1

toi_filtered['isPlanet'].value_counts()

missing_values = toi_filtered.isnull().sum()
missing_percentage = (missing_values / len(toi_filtered)) * 100


# remove all columns that have 100% missing values
toi_filtered = toi_filtered.loc[:, missing_percentage < 100]


missing_values = toi_filtered.isnull().sum()
missing_percentage = (missing_values / len(toi_filtered)) * 100

print(missing_percentage)


tid                   0.000000
toi                   0.000000
toidisplay            0.000000
toipfx                0.000000
ctoi_alias            0.000000
pl_pnum               0.000000
tfopwg_disp           0.000000
st_tmag               0.000000
st_tmagerr1           0.000000
st_tmagerr2           0.000000
st_tmagsymerr         0.000000
st_tmaglim            0.000000
rastr                 0.000000
ra                    0.000000
decstr                0.000000
dec                   0.000000
st_pmra               1.706201
st_pmraerr1           1.706201
st_pmraerr2           1.706201
st_pmrasymerr         1.706201
st_pmralim            1.706201
st_pmdec              1.706201
st_pmdecerr1          1.706201
st_pmdecerr2          1.706201
st_pmdecsymerr        1.706201
st_pmdeclim           1.706201
pl_tranmid            0.000000
pl_tranmiderr1        0.152587
pl_tranmiderr2        0.152587
pl_tranmidsymerr      0.000000
pl_tranmidlim         0.000000
pl_orbper             1.414898
pl_orbpe

In [220]:
cols_to_remove = ['pl_radeerr1', 'pl_radeerr2','st_disterr1', 'st_disterr2' ,'st_loggerr1', 'st_loggerr2', 'st_raderr1', 'st_raderr2']
toi_removed = toi_filtered.drop(columns=cols_to_remove)

missing_values = toi_removed.isnull().sum()
missing_percentage = (missing_values / len(toi_removed)) * 100
print(missing_percentage)

tid                   0.000000
toi                   0.000000
toidisplay            0.000000
toipfx                0.000000
ctoi_alias            0.000000
pl_pnum               0.000000
tfopwg_disp           0.000000
st_tmag               0.000000
st_tmagerr1           0.000000
st_tmagerr2           0.000000
st_tmagsymerr         0.000000
st_tmaglim            0.000000
rastr                 0.000000
ra                    0.000000
decstr                0.000000
dec                   0.000000
st_pmra               1.706201
st_pmraerr1           1.706201
st_pmraerr2           1.706201
st_pmrasymerr         1.706201
st_pmralim            1.706201
st_pmdec              1.706201
st_pmdecerr1          1.706201
st_pmdecerr2          1.706201
st_pmdecsymerr        1.706201
st_pmdeclim           1.706201
pl_tranmid            0.000000
pl_tranmiderr1        0.152587
pl_tranmiderr2        0.152587
pl_tranmidsymerr      0.000000
pl_tranmidlim         0.000000
pl_orbper             1.414898
pl_orbpe

In [221]:

print(f"Catálogo carregado. Total de {len(toi_filtered)} sinais para processar.")
print(f"Exemplos positivos (isPlanet=1): {len(toi_filtered[toi_filtered['isPlanet'] == 1])}")
print(f"Exemplos negativos (isPlanet=0): {len(toi_filtered[toi_filtered['isPlanet'] == 0])}")

Catálogo carregado. Total de 7209 sinais para processar.
Exemplos positivos (isPlanet=1): 5919
Exemplos negativos (isPlanet=0): 1290


In [222]:
toi_filtered['star_id'] = toi_filtered['toi'].apply(lambda x: str(x)[-2:])

print(toi_filtered['toi'].apply(lambda x: str(x)[-2:]).value_counts())
print(f"Número de estrelas diferentes: {toi_filtered['star_id'].nunique()}")

toi
01    6918
02     225
03      50
04      13
05       3
Name: count, dtype: int64
Número de estrelas diferentes: 5


In [223]:
features_for_snr = {
    'pl_orbper': ('pl_orbpererr1', 'pl_orbpererr2'),
    'pl_trandurh': ('pl_trandurherr1', 'pl_trandurherr2'),
    'pl_trandep': ('pl_trandeperr1', 'pl_trandeperr2'),
    'st_tmag': ('st_tmagerr1', 'st_tmagerr2')
}

data_engineered = toi_filtered.copy()

for feature, (err1_col, err2_col) in features_for_snr.items():
    # Calcula a incerteza média. O erro 2 é negativo por convenção.
    avg_error = (data_engineered[err1_col] - data_engineered[err2_col]) / 2
    
    # Calcula a SNR, evitando divisão por zero
    snr_col_name = f'{feature}_snr'
    data_engineered[snr_col_name] = data_engineered[feature] / avg_error
    data_engineered[snr_col_name].replace([np.inf, -np.inf], 0, inplace=True) # Substitui infinitos por 0

print("Novas colunas de SNR criadas:")
print([col for col in data_engineered.columns if '_snr' in col])


# ETAPA 2: Definir a lista final de colunas para remover
# Agora a lista é bem menor e mais precisa.
final_cols_to_drop = [
	# Identificadores e metadados
	'tid', 'toi', 'toidisplay', 'toipfx', 'ctoi_alias', 'pl_pnum', 'toi_created',
	'rowupdate', 'release_date',
	
	# Coluna fonte do target (evitar data leakage)
	'tfopwg_disp',
	
	# Coordenadas redundantes (formato string)
	'rastr', 'decstr',
	
	# Colunas de erro que já foram usadas para criar a SNR
	'pl_orbpererr1', 'pl_orbpererr2', 'pl_trandurherr1', 'pl_trandurherr2',
	'pl_trandeperr1', 'pl_trandeperr2', 'st_tmagerr1', 'st_tmagerr2',

	# Colunas de erro simétrico (para simplificar, já temos a SNR)
	'st_pmrasymerr', 'st_pmdecsymerr', 'pl_tranmidsymerr', 'pl_orbpersymerr',
	'pl_trandurhsymerr', 'pl_trandepsymerr', 'pl_radesymerr', 'st_distsymerr',
	'st_teffsymerr', 'st_loggsymerr', 'st_radsymerr'
]

# Remover apenas as colunas que existem no DataFrame
final_cols_to_drop_existing = [col for col in final_cols_to_drop if col in data_engineered.columns]
data_for_model = data_engineered.drop(columns=final_cols_to_drop_existing)

print("\nNúmero de features final para o modelo:", data_for_model.shape[1] - 1) # -1 para excluir a coluna 'isPlanet'
print("Features finais (amostra):")
print(data_for_model.columns.tolist())




Novas colunas de SNR criadas:
['pl_orbper_snr', 'pl_trandurh_snr', 'pl_trandep_snr', 'st_tmag_snr']

Número de features final para o modelo: 50
Features finais (amostra):
['st_tmag', 'st_tmagsymerr', 'st_tmaglim', 'ra', 'dec', 'st_pmra', 'st_pmraerr1', 'st_pmraerr2', 'st_pmralim', 'st_pmdec', 'st_pmdecerr1', 'st_pmdecerr2', 'st_pmdeclim', 'pl_tranmid', 'pl_tranmiderr1', 'pl_tranmiderr2', 'pl_tranmidlim', 'pl_orbper', 'pl_orbperlim', 'pl_trandurh', 'pl_trandurhlim', 'pl_trandep', 'pl_trandeplim', 'pl_rade', 'pl_radeerr1', 'pl_radeerr2', 'pl_radelim', 'pl_insol', 'pl_eqt', 'st_dist', 'st_disterr1', 'st_disterr2', 'st_distlim', 'st_teff', 'st_tefferr1', 'st_tefferr2', 'st_tefflim', 'st_logg', 'st_loggerr1', 'st_loggerr2', 'st_logglim', 'st_rad', 'st_raderr1', 'st_raderr2', 'st_radlim', 'isPlanet', 'star_id', 'pl_orbper_snr', 'pl_trandurh_snr', 'pl_trandep_snr', 'st_tmag_snr']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_engineered[snr_col_name].replace([np.inf, -np.inf], 0, inplace=True) # Substitui infinitos por 0
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_engineered[snr_col_name].replace([np.inf, -np.inf], 0, inplace=True) # Substitui infinitos por 0
The behavior will change in 

In [224]:
data_null_removed = data_for_model.dropna().copy()
print(f"Após remoção de nulos, total de {len(data_for_model)} sinais para processar.")

Após remoção de nulos, total de 7209 sinais para processar.


In [225]:
data_null_removed.to_csv('data_for_model.csv', index=False)

data_final = data_null_removed.copy()

In [226]:
missing_values = data_final.isnull().sum()
print(missing_values)

st_tmag            0
st_tmagsymerr      0
st_tmaglim         0
ra                 0
dec                0
st_pmra            0
st_pmraerr1        0
st_pmraerr2        0
st_pmralim         0
st_pmdec           0
st_pmdecerr1       0
st_pmdecerr2       0
st_pmdeclim        0
pl_tranmid         0
pl_tranmiderr1     0
pl_tranmiderr2     0
pl_tranmidlim      0
pl_orbper          0
pl_orbperlim       0
pl_trandurh        0
pl_trandurhlim     0
pl_trandep         0
pl_trandeplim      0
pl_rade            0
pl_radeerr1        0
pl_radeerr2        0
pl_radelim         0
pl_insol           0
pl_eqt             0
st_dist            0
st_disterr1        0
st_disterr2        0
st_distlim         0
st_teff            0
st_tefferr1        0
st_tefferr2        0
st_tefflim         0
st_logg            0
st_loggerr1        0
st_loggerr2        0
st_logglim         0
st_rad             0
st_raderr1         0
st_raderr2         0
st_radlim          0
isPlanet           0
star_id            0
pl_orbper_snr

In [227]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


X = data_final.drop(columns=['isPlanet', 'star_id'])
y = data_final['isPlanet']


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

groups = data_final['star_id']
print(f"Total de sinais: {len(X)}")
print(f"Total de estrelas únicas (grupos): {len(groups.unique())}")

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Tamanho do conjunto de treino: {X_train_scaled.shape[0]} amostras")
print(f"Tamanho do conjunto de teste: {X_test_scaled.shape[0]} amostras")


# Relação entre classes no conjunto de treino
print("Distribuição das classes no conjunto de treino:")
print(y_train.value_counts(normalize=True))

Total de sinais: 4826
Total de estrelas únicas (grupos): 5
Tamanho do conjunto de treino: 3860 amostras
Tamanho do conjunto de teste: 966 amostras
Distribuição das classes no conjunto de treino:
isPlanet
1    0.821244
0    0.178756
Name: proportion, dtype: float64


In [228]:


count_neg = y_train.value_counts()[0]
count_pos = y_train.value_counts()[1]
scale_pos_weight = count_neg / count_pos

models = {
	"Regressão Logística": LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced'),
	"LDA": LinearDiscriminantAnalysis(),

	'KNN': KNeighborsClassifier(n_neighbors=5),
	'Gradient Boosting': GradientBoostingClassifier(random_state=SEED), 
	"Árvore de Decisão": DecisionTreeClassifier(random_state=42, class_weight='balanced'),
	"Random Forest": RandomForestClassifier(random_state=42, class_weight='balanced'),
	"SVM": SVC(random_state=42, probability=True, class_weight='balanced'),
	"XGBoost": XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss', scale_pos_weight=scale_pos_weight),
	"XGBoost (Equilibrado)": XGBClassifier(
		random_state=42,
		use_label_encoder=False,
		eval_metric='logloss',
		scale_pos_weight=scale_pos_weight,
		n_estimators=600,
		learning_rate=0.04,
		max_depth=6,
		subsample=0.8,
		colsample_bytree=0.8
	),
	"LightGBM": LGBMClassifier(random_state=42, class_weight='balanced'),
	"LightGBM (Tunado v3)": LGBMClassifier(
		random_state=42,
		class_weight='balanced',
		n_estimators=500,
		learning_rate=0.05,
	),
}


In [229]:

# Lista para armazenar os resultados de cada modelo
results = []

# Loop para treinar e avaliar cada modelo
for name, model in models.items():
	print(f"Treinando o modelo: {name}...")

	# Treinar o modelo
	model.fit(X_train_scaled, y_train)

	# Fazer previsões no conjunto de teste
	y_pred = model.predict(X_test_scaled)
	y_pred_proba = model.predict_proba(X_test_scaled)[:, 1] # Probabilidades para a classe positiva (1)

	# Calcular as métricas
	accuracy = accuracy_score(y_test, y_pred)
	report = classification_report(y_test, y_pred, output_dict=True)
	roc_auc = roc_auc_score(y_test, y_pred_proba)
	prc_auc = average_precision_score(y_test, y_pred_proba)

	# Armazenar os resultados
	results.append({
		"Modelo": name,
		"AUC ROC": roc_auc,
		"AUC PRC": prc_auc,
		"Acurácia": accuracy,
		"Precisão (Planeta)": report['1']['precision'],
		"Recall (Planeta)": report['1']['recall'],
		"Precisão (FP)": report['0']['precision'],
		"Recall (FP)": report['0']['recall'],
		"F1-Score (Planeta)": report['1']['f1-score'],
		"F1-Score (FP)": report['0']['f1-score'],
		
	})

print("\nTodos os modelos foram treinados e avaliados.")

Treinando o modelo: Regressão Logística...
Treinando o modelo: LDA...
Treinando o modelo: KNN...
Treinando o modelo: Gradient Boosting...
Treinando o modelo: Árvore de Decisão...
Treinando o modelo: Random Forest...
Treinando o modelo: SVM...
Treinando o modelo: XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Treinando o modelo: XGBoost (Equilibrado)...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Treinando o modelo: LightGBM...
[LightGBM] [Info] Number of positive: 3170, number of negative: 690
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000489 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8703
[LightGBM] [Info] Number of data points in the train set: 3860, number of used features: 36
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




Treinando o modelo: LightGBM (Tunado v3)...
[LightGBM] [Info] Number of positive: 3170, number of negative: 690
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000775 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8703
[LightGBM] [Info] Number of data points in the train set: 3860, number of used features: 36
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000

Todos os modelos foram treinados e avaliados.




In [230]:
# %%
# Criar um DataFrame com os resultados
results_df = pd.DataFrame(results)

# Ordenar o DataFrame pelo AUC ROC para ver os melhores modelos primeiro
results_df_sorted = results_df.sort_values(by="AUC ROC", ascending=False)

# Exibir a tabela de resultados formatada
display(results_df_sorted.style.format({
	'Acurácia': '{:.4f}',
	'Acurácia (FP)': '{:.4f}',
	'Precisão (Planeta)': '{:.4f}',
	'Recall (Planeta)': '{:.4f}',
	'Precisão (FP)': '{:.4f}',
	'Recall (FP)': '{:.4f}',
	'F1-Score (Planeta)': '{:.4f}',
	'F1-Score (FP)': '{:.4f}',
	'AUC ROC': '{:.4f}',
	'AUC PRC': '{:.4f}'
}).background_gradient(cmap='viridis', subset=['Acurácia', 'F1-Score (Planeta)', 'AUC ROC', 'AUC PRC']))


# "Acurácia Média": np.mean(fold_accuracies),
# 		"Acurácia Desv. Padrão": np.std(fold_accuracies),
# 		"AUC ROC Médio": np.mean(fold_roc_aucs),
# 		"AUC PRC Médio": np.mean(fold_prc_aucs),
# 		"F1 (Planeta) Médio": np.mean(fold_f1_scores)

Unnamed: 0,Modelo,AUC ROC,AUC PRC,Acurácia,Precisão (Planeta),Recall (Planeta),Precisão (FP),Recall (FP),F1-Score (Planeta),F1-Score (FP)
8,XGBoost (Equilibrado),0.8873,0.9695,0.8696,0.9164,0.9256,0.6424,0.6127,0.921,0.6272
5,Random Forest,0.8786,0.9655,0.853,0.8637,0.9748,0.7183,0.2948,0.9159,0.418
9,LightGBM,0.8784,0.9677,0.8675,0.9204,0.918,0.6286,0.6358,0.9192,0.6322
10,LightGBM (Tunado v3),0.8766,0.9654,0.8706,0.9103,0.9344,0.6579,0.578,0.9222,0.6154
3,Gradient Boosting,0.8755,0.9673,0.8716,0.8876,0.966,0.7379,0.4393,0.9251,0.5507
7,XGBoost,0.8739,0.9665,0.8613,0.9093,0.9231,0.6211,0.578,0.9161,0.5988
6,SVM,0.8459,0.9534,0.8199,0.9269,0.8474,0.4979,0.6936,0.8854,0.5797
1,LDA,0.7906,0.9349,0.8468,0.8644,0.9647,0.6543,0.3064,0.9118,0.4173
0,Regressão Logística,0.79,0.9351,0.7588,0.9118,0.7818,0.3951,0.6532,0.8418,0.4924
2,KNN,0.7656,0.9113,0.8427,0.8589,0.9672,0.6438,0.2717,0.9098,0.3821
