In [1]:
!pip install uv
!uv pip install 'autogluon.tabular[mitra]'

[2mUsing Python 3.11.2 environment at: /mnt/raid/.venvs/jupyter[0m
[2mAudited [1m1 package[0m [2min 17ms[0m[0m


In [2]:
# Installs torch / torchvision / torchaudio from the PyTorch `cu121` wheel index given below.
# NOTE(review): versions are unpinned and this runs after the AutoGluon install — confirm the
# resolved torch build stays compatible with the one AutoGluon requires.
!uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

[2mUsing Python 3.11.2 environment at: /mnt/raid/.venvs/jupyter[0m
[2mAudited [1m3 packages[0m [2min 10ms[0m[0m


In [3]:
import pandas as pd
import numpy as np
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_wine
import torch
from sklearn.metrics import (
    accuracy_score, classification_report,
    confusion_matrix, average_precision_score,
    precision_recall_curve, roc_auc_score
)
import matplotlib.pyplot as plt

In [4]:
# Read the cumulative KOI table from disk and report its (rows, columns) shape.
data = '../Data/1_cumulative_2025.csv'
df = pd.read_csv(filepath_or_buffer=data)

print(df.shape)

(9564, 83)


In [5]:
# Binary label: 1 for CONFIRMED dispositions, 0 for every other disposition.
# Rows without a disposition are removed *before* the comparison: `== 'CONFIRMED'` turns a
# missing value into False, so a dropna on the finished 0/1 column can never remove anything
# and unlabeled rows would silently be counted as negatives.
# `.copy()` makes the filtered frame independent so the column assignment below is unambiguous.
df = df.dropna(subset=['koi_disposition']).copy()
df['target'] = (df['koi_disposition'] == 'CONFIRMED').astype(int)

In [6]:
# Inspect the frame: its shape and the complete list of column names.
print(df.shape)
print("Features disponibles:", list(df.columns))


(9564, 84)
Features disponibles: ['rowid', 'kepid', 'kepoi_name', 'kepler_name', 'koi_disposition', 'koi_vet_stat', 'koi_vet_date', 'koi_pdisposition', 'koi_score', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec', 'koi_disp_prov', 'koi_comment', 'koi_period', 'koi_time0bk', 'koi_time0', 'koi_eccen', 'koi_longp', 'koi_impact', 'koi_duration', 'koi_ingress', 'koi_depth', 'koi_ror', 'koi_srho', 'koi_fittype', 'koi_prad', 'koi_sma', 'koi_incl', 'koi_teq', 'koi_insol', 'koi_dor', 'koi_limbdark_mod', 'koi_ldm_coeff4', 'koi_ldm_coeff3', 'koi_ldm_coeff2', 'koi_ldm_coeff1', 'koi_parm_prov', 'koi_max_sngle_ev', 'koi_max_mult_ev', 'koi_model_snr', 'koi_count', 'koi_num_transits', 'koi_tce_plnt_num', 'koi_tce_delivname', 'koi_quarters', 'koi_bin_oedp_sig', 'koi_trans_mod', 'koi_model_dof', 'koi_model_chisq', 'koi_datalink_dvr', 'koi_datalink_dvs', 'koi_steff', 'koi_slogg', 'koi_smet', 'koi_srad', 'koi_smass', 'koi_sage', 'koi_sparprov', 'ra', 'dec', 'koi_kepmag', 'koi_gmag', 'ko

In [7]:
# Columns removed before modelling. The grouping comments record the author's reason
# for excluding each set of columns.
drop_cols = [
    # 100% NaN
    'koi_longp', 'koi_model_chisq', 'koi_model_dof', 'koi_ingress', 'koi_sage',

    # IDs / unique names / paths (not useful as features)
    'rowid', 'kepoi_name',

    # official planet name -> leakage (present mostly for confirmed objects)
    'kepler_name',

    # vetting status / metadata that is constant or leaks the label
    'koi_vet_stat', 'koi_vet_date', 'koi_disp_prov', 'koi_parm_prov',

    # delivery / report paths (different on every row)
    'koi_tce_delivname', 'koi_datalink_dvr', 'koi_datalink_dvs',

    # constant columns according to an earlier inspection (carry no information)
    'koi_limbdark_mod', 'koi_trans_mod', 'koi_ldm_coeff3', 'koi_ldm_coeff4',

    # CRITICAL: label and label-derived columns
    'koi_disposition',      # original target
    'koi_pdisposition',     # pipeline disposition (LEAKAGE!)
    'koi_score',            # classification score (possible LEAKAGE)
    'koi_comment',          # vetting comments (possible LEAKAGE)

    # VETTING FLAGS (possible indirect leakage)
    'koi_fpflag_nt',   # flag: not transit-like
    'koi_fpflag_ss',   # flag: stellar eclipse
    'koi_fpflag_co',   # flag: centroid offset
    'koi_fpflag_ec',   # flag: ephemeris match

    # METRICS DERIVED FROM VETTING
    'koi_max_mult_ev', # multiple-event statistic (computed post-analysis, per the author)
    'koi_dicco_msky',  # centroid difference
    'koi_dikco_msky',  # similar to dicco

    # OTHER SUSPECT COLUMNS
    'koi_bin_oedp_sig', # odd-even depth significance (post-analysis, per the author)
]

# Keep only the names that are still present in the frame. Without this filter,
# `DataFrame.drop` raises KeyError when the cell is re-run (the columns are already gone)
# or when the CSV lacks one of the listed columns.
drop_exist = [c for c in drop_cols if c in df.columns]
df = df.drop(columns=drop_exist)

In [8]:
# Inspect the frame: its shape and the complete list of column names.
print(df.shape)
print("Features disponibles:", list(df.columns))

(9564, 53)
Features disponibles: ['kepid', 'koi_period', 'koi_time0bk', 'koi_time0', 'koi_eccen', 'koi_impact', 'koi_duration', 'koi_depth', 'koi_ror', 'koi_srho', 'koi_fittype', 'koi_prad', 'koi_sma', 'koi_incl', 'koi_teq', 'koi_insol', 'koi_dor', 'koi_ldm_coeff2', 'koi_ldm_coeff1', 'koi_max_sngle_ev', 'koi_model_snr', 'koi_count', 'koi_num_transits', 'koi_tce_plnt_num', 'koi_quarters', 'koi_steff', 'koi_slogg', 'koi_smet', 'koi_srad', 'koi_smass', 'koi_sparprov', 'ra', 'dec', 'koi_kepmag', 'koi_gmag', 'koi_rmag', 'koi_imag', 'koi_zmag', 'koi_jmag', 'koi_hmag', 'koi_kmag', 'koi_fwm_stat_sig', 'koi_fwm_sra', 'koi_fwm_sdec', 'koi_fwm_srao', 'koi_fwm_sdeco', 'koi_fwm_prao', 'koi_fwm_pdeco', 'koi_dicco_mra', 'koi_dicco_mdec', 'koi_dikco_mra', 'koi_dikco_mdec', 'target']


In [9]:

# Engineered physical features, written into `df` as new columns.

# Stellar density times period squared (a Kepler's-law style combination).
df['kepler_ratio'] = df['koi_srho'] * (df['koi_period'] ** 2)

# Equilibrium temperature relative to the stellar effective temperature.
df['temp_ratio'] = df['koi_teq'] / df['koi_steff']

# Absolute gap between the observed transit duration and a rough expected duration.
# NOTE(review): the factor 13 and the division by 365.25 presumably encode a reference
# duration in hours and a days -> years conversion — confirm the units.
expected_duration = 13 * df['koi_srad'] * (df['koi_period'] / 365.25) ** (1/3)
df['duration_anomaly'] = (df['koi_duration'] - expected_duration).abs()

# Flag ultra-deep transits (koi_depth above 5000).
df['ultra_deep'] = (df['koi_depth'] > 5000).astype(int)

# Log10-scaled copies of four raw columns.
# np.log10 maps 0 to -inf (and negative values to NaN) while emitting a RuntimeWarning.
# Non-positive inputs are masked to NaN first, so the new columns contain only finite
# values or missing values; infinities in a feature matrix are rejected by some learners.
for src_col, log_col in (
    ('koi_period', 'log_period'),
    ('koi_depth', 'log_depth'),
    ('koi_prad', 'log_prad'),
    ('koi_teq', 'log_teq'),
):
    positive_only = df[src_col].where(df[src_col] > 0)
    df[log_col] = np.log10(positive_only)

def classify_period(p):
    """Bucket an orbital period into a coarse orbit-length class.

    Args:
        p: Orbital period as a number (the notebook passes `koi_period` values).

    Returns:
        'ultra_short' for p < 1, 'short' for 1 <= p < 10, 'medium' for
        10 <= p < 100, 'long' for p >= 100, and NaN when `p` is NaN.
    """
    # NaN is the only value that is not equal to itself. Every `<` comparison against
    # NaN is False, so without this guard a missing period would be labelled 'long'.
    if p != p:
        return float('nan')
    if p < 1:
        return 'ultra_short'  # extreme hot-Jupiter regime
    if p < 10:
        return 'short'        # hot-Jupiter regime
    if p < 100:
        return 'medium'       # Mercury-like periods
    return 'long'             # longer than the 'medium' bucket (includes Earth-like periods)

# Label every row with the orbit-length class of its period.
df['period_class'] = df['koi_period'].map(classify_period)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [10]:
# Inspect the frame: its shape and the complete list of column names.
print(df.shape)
print("Features disponibles:", list(df.columns))

(9564, 62)
Features disponibles: ['kepid', 'koi_period', 'koi_time0bk', 'koi_time0', 'koi_eccen', 'koi_impact', 'koi_duration', 'koi_depth', 'koi_ror', 'koi_srho', 'koi_fittype', 'koi_prad', 'koi_sma', 'koi_incl', 'koi_teq', 'koi_insol', 'koi_dor', 'koi_ldm_coeff2', 'koi_ldm_coeff1', 'koi_max_sngle_ev', 'koi_model_snr', 'koi_count', 'koi_num_transits', 'koi_tce_plnt_num', 'koi_quarters', 'koi_steff', 'koi_slogg', 'koi_smet', 'koi_srad', 'koi_smass', 'koi_sparprov', 'ra', 'dec', 'koi_kepmag', 'koi_gmag', 'koi_rmag', 'koi_imag', 'koi_zmag', 'koi_jmag', 'koi_hmag', 'koi_kmag', 'koi_fwm_stat_sig', 'koi_fwm_sra', 'koi_fwm_sdec', 'koi_fwm_srao', 'koi_fwm_sdeco', 'koi_fwm_prao', 'koi_fwm_pdeco', 'koi_dicco_mra', 'koi_dicco_mdec', 'koi_dikco_mra', 'koi_dikco_mdec', 'target', 'kepler_ratio', 'temp_ratio', 'duration_anomaly', 'ultra_deep', 'log_period', 'log_depth', 'log_prad', 'log_teq', 'period_class']


In [11]:
# Additional derived features, written into `df` as new columns.

# Relative transit duration: duration divided by the period multiplied by 24
# (the original note describes the denominator as "in hours").
period_times_24 = df['koi_period'] * 24
df['transit_ratio'] = df['koi_duration'] / period_times_24

# Stellar effective temperature divided by the square root of the period.
root_period = np.sqrt(df['koi_period'])
df['temp_period_ratio'] = df['koi_steff'] / root_period

# Squared impact parameter.
impact = df['koi_impact']
df['impact_squared'] = impact ** 2

print(f"Shape con features nuevas: {df.shape}")

Shape con features nuevas: (9564, 65)


In [12]:
# Hold out 30% of the rows, stratified on the target, with a fixed seed.
train, test = train_test_split(
    df,
    test_size=0.3,
    stratify=df['target'],
    random_state=42,
)
# Wrap both partitions for AutoGluon.
train_data, test_data = TabularDataset(train), TabularDataset(test)

In [13]:
# Predictor for the binary `target` column, scored with ROC-AUC.
full_predictor = TabularPredictor(label='target', eval_metric='roc_auc', verbosity=2)

# Model families to train, each mapped to an empty options dict.
hyperparameters = {family: {} for family in ('GBM', 'CAT', 'XGB')}

# Hyperparameter search settings: 10 random-search trials with the local scheduler.
hpo_settings = dict(num_trials=10, scheduler='local', searcher='random')

fit_settings = dict(
    presets='best_quality',
    time_limit=1200,
    hyperparameters=hyperparameters,
    hyperparameter_tune_kwargs=hpo_settings,
    num_bag_folds=10,
    num_stack_levels=2,  # cap on stacking depth
)
full_predictor.fit(train_data, **fit_settings)

print("\nModel Leaderboard:")
full_predictor.leaderboard(test_data)


No path specified. Models will be saved in: "AutogluonModels/ag-20251002_044542"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.11.2
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Debian 6.1.153-1 (2025-09-20)
CPU Count:          12
Memory Avail:       8.76 GB / 15.55 GB (56.3%)
Disk Space Avail:   4182.64 GB / 4619.45 GB (90.5%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=2, num_bag_folds=10, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout valida

  0%|          | 0/10 [00:00<?, ?it/s]

	Fitting 10 child models (S1F1 - S1F10) | Fitting with ParallelLocalFoldFittingStrategy (10 workers, per: cpus=1, gpus=0, memory=0.49%)
	Fitting 10 child models (S1F1 - S1F10) | Fitting with ParallelLocalFoldFittingStrategy (10 workers, per: cpus=1, gpus=0, memory=1.16%)
	Fitting 10 child models (S1F1 - S1F10) | Fitting with ParallelLocalFoldFittingStrategy (10 workers, per: cpus=1, gpus=0, memory=0.70%)
	Fitting 10 child models (S1F1 - S1F10) | Fitting with ParallelLocalFoldFittingStrategy (10 workers, per: cpus=1, gpus=0, memory=0.89%)
	Fitting 10 child models (S1F1 - S1F10) | Fitting with ParallelLocalFoldFittingStrategy (10 workers, per: cpus=1, gpus=0, memory=0.55%)
	Fitting 10 child models (S1F1 - S1F10) | Fitting with ParallelLocalFoldFittingStrategy (10 workers, per: cpus=1, gpus=0, memory=0.95%)
	Fitting 10 child models (S1F1 - S1F10) | Fitting with ParallelLocalFoldFittingStrategy (10 workers, per: cpus=1, gpus=0, memory=0.70%)
	Fitting 10 child models (S1F1 - S1F10) | Fittin

  0%|          | 0/10 [00:00<?, ?it/s]

	Fitting 10 child models (S1F1 - S1F10) | Fitting with ParallelLocalFoldFittingStrategy (10 workers, per: cpus=1, gpus=0, memory=6.70%)
	Fitting 10 child models (S1F1 - S1F10) | Fitting with ParallelLocalFoldFittingStrategy (10 workers, per: cpus=1, gpus=0, memory=5.47%)
	Fitting 10 child models (S1F1 - S1F10) | Fitting with ParallelLocalFoldFittingStrategy (10 workers, per: cpus=1, gpus=0, memory=6.34%)
	Stopping HPO to satisfy time limit...
Fitted model: CatBoost_BAG_L1/T1 ...
	0.9813	 = Validation score   (roc_auc)
	39.67s	 = Training   runtime
	0.08s	 = Validation runtime
Fitted model: CatBoost_BAG_L1/T2 ...
	0.9812	 = Validation score   (roc_auc)
	22.08s	 = Training   runtime
	0.08s	 = Validation runtime
Fitted model: CatBoost_BAG_L1/T3 ...
	0.9813	 = Validation score   (roc_auc)
	147.37s	 = Training   runtime
	0.09s	 = Validation runtime
Hyperparameter tuning model: XGBoost_BAG_L1 ... Tuning model for up to 274.62s of the 534.41s of remaining time.


  0%|          | 0/10 [00:00<?, ?it/s]

	Fitting 10 child models (S1F1 - S1F10) | Fitting with ParallelLocalFoldFittingStrategy (10 workers, per: cpus=1, gpus=0, memory=0.73%)
[36mray::_ray_fit()[39m (pid=87661, ip=192.168.1.75)
  File "/mnt/raid/.venvs/jupyter/lib/python3.11/site-packages/autogluon/core/models/ensemble/fold_fitting_strategy.py", line 446, in _ray_fit
    fold_model.fit(X=X_fold, y=y_fold, X_val=X_val_fold, y_val=y_val_fold, time_limit=time_limit_fold, **resources, **kwargs_fold)
  File "/mnt/raid/.venvs/jupyter/lib/python3.11/site-packages/autogluon/core/models/abstract/abstract_model.py", line 1068, in fit
    out = self._fit(**kwargs)
          ^^^^^^^^^^^^^^^^^^^
  File "/mnt/raid/.venvs/jupyter/lib/python3.11/site-packages/autogluon/tabular/models/xgboost/xgboost_model.py", line 191, in _fit
    self.model.fit(X=X, y=y, eval_set=eval_set, verbose=False, sample_weight=sample_weight)
  File "/mnt/raid/.venvs/jupyter/lib/python3.11/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kw


Model Leaderboard:


Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.985373,0.982809,roc_auc,0.433121,0.553943,243.465892,0.002522,0.001121,0.150444,2,True,14
1,LightGBM_BAG_L1/T1,0.985225,0.981457,roc_auc,0.180367,0.125191,13.083639,0.180367,0.125191,13.083639,1,True,1
2,LightGBM_BAG_L1/T5,0.984951,0.981888,roc_auc,0.063238,0.093872,10.514207,0.063238,0.093872,10.514207,1,True,5
3,CatBoost_BAG_L1/T1,0.984935,0.981264,roc_auc,0.081225,0.076012,39.670042,0.081225,0.076012,39.670042,1,True,11
4,LightGBM_BAG_L1/T10,0.984895,0.980755,roc_auc,0.047914,0.057473,9.224936,0.047914,0.057473,9.224936,1,True,10
5,CatBoost_BAG_L1/T2,0.984891,0.981176,roc_auc,0.044259,0.0753,22.07733,0.044259,0.0753,22.07733,1,True,12
6,LightGBM_BAG_L1/T8,0.984853,0.981476,roc_auc,0.067914,0.095459,10.598997,0.067914,0.095459,10.598997,1,True,8
7,CatBoost_BAG_L1/T3,0.984746,0.981309,roc_auc,0.076775,0.087435,147.371008,0.076775,0.087435,147.371008,1,True,13
8,LightGBM_BAG_L1/T4,0.984706,0.980468,roc_auc,0.518651,0.560855,32.586603,0.518651,0.560855,32.586603,1,True,4
9,LightGBM_BAG_L1/T3,0.984677,0.98158,roc_auc,0.097187,0.124744,13.083863,0.097187,0.124744,13.083863,1,True,3


In [14]:
# Hold-out evaluation of the fitted predictor: ranking metrics, confusion matrix, report.
# pandas and the sklearn metric functions are already imported in the notebook's import
# cell, so the duplicate mid-notebook imports were removed.
y_true = test_data['target'].values
y_pred = full_predictor.predict(test_data)
y_proba = full_predictor.predict_proba(test_data)

# Positive-class scores, whichever container predict_proba returned:
# a DataFrame (column 1 if present, otherwise the last column) or a 2-D array.
if isinstance(y_proba, pd.DataFrame):
    y_scores = y_proba[1].values if 1 in y_proba.columns else y_proba.iloc[:, -1].values
else:
    y_scores = y_proba[:, 1]

print(f"ROC-AUC: {roc_auc_score(y_true, y_scores):.4f}")
print(f"PR-AUC:  {average_precision_score(y_true, y_scores):.4f}")

# Pin the label order. Without `labels`, a prediction vector that contains a single class
# yields a 1x1 matrix (the cm[0,1] / cm[1,*] lookups below would fail) and the two
# `target_names` would no longer match the number of reported classes.
class_labels = [0, 1]

# Confusion matrix (rows: true class, columns: predicted class).
cm = confusion_matrix(y_true, y_pred, labels=class_labels)
print("\nConfusion Matrix:")
print(f"TN: {cm[0,0]:4d}  FP: {cm[0,1]:4d}")
print(f"FN: {cm[1,0]:4d}  TP: {cm[1,1]:4d}")

# Per-class precision / recall / F1.
print("\n" + classification_report(y_true, y_pred, labels=class_labels,
      target_names=['Not Confirmed', 'Confirmed']))

ROC-AUC: 0.9854
PR-AUC:  0.9648

Confusion Matrix:
TN: 1979  FP:   67
FN:   90  TP:  734

               precision    recall  f1-score   support

Not Confirmed       0.96      0.97      0.96      2046
    Confirmed       0.92      0.89      0.90       824

     accuracy                           0.95      2870
    macro avg       0.94      0.93      0.93      2870
 weighted avg       0.94      0.95      0.95      2870



In [15]:
# Attach predictions to a copy of the hold-out rows and flag every misclassified row.
test_with_preds = test.copy()
test_with_preds['pred'] = y_pred
test_with_preds['pred_proba'] = y_scores
test_with_preds['error'] = test_with_preds['pred'] != test_with_preds['target']

preview_cols = ['kepid', 'koi_period', 'koi_depth', 'pred_proba']
truth, guess = test_with_preds['target'], test_with_preds['pred']

# False positives: true class 0, predicted 1.
fp = test_with_preds.loc[(truth == 0) & (guess == 1)]
print(f"\nFalsos Positivos: {len(fp)}")
if not fp.empty:
    print(fp.loc[:, preview_cols].head())

# False negatives: true class 1, predicted 0.
fn = test_with_preds.loc[(truth == 1) & (guess == 0)]
print(f"\nFalsos Negativos: {len(fn)}")
if not fn.empty:
    print(fn.loc[:, preview_cols].head())


Falsos Positivos: 67
         kepid  koi_period  koi_depth  pred_proba
1794   7449554    2.420883      223.3    0.899205
1189   9850893    8.480398      259.7    0.796177
2366  12602314    1.407020      181.9    0.907932
5143   5621333   27.096515      208.4    0.948784
4177   6223324    1.975074      130.5    0.560758

Falsos Negativos: 90
        kepid  koi_period  koi_depth  pred_proba
5715  6803855   71.452317      216.9    0.439567
5873  4947556   13.026796      356.6    0.143835
778   7532973    2.144632    19837.0    0.001231
4927  6705026    9.508487       97.9    0.249275
6810  8037038    4.095139      181.7    0.499297


In [16]:
# Feature importance computed by the predictor on the hold-out data; show the first 15 rows.
fi = full_predictor.feature_importance(data=test_data, silent=True)
print("\nTop 15 Features:")
print(fi.iloc[:15])


Top 15 Features:
                  importance    stddev   p_value  n  p99_high   p99_low
koi_model_snr       0.034780  0.002179  0.000002  5  0.039267  0.030293
koi_count           0.004761  0.000504  0.000015  5  0.005798  0.003724
koi_prad            0.004091  0.001241  0.000903  5  0.006646  0.001535
duration_anomaly    0.003424  0.000740  0.000246  5  0.004946  0.001901
koi_dicco_mdec      0.002649  0.000332  0.000029  5  0.003333  0.001966
log_prad            0.002318  0.000933  0.002572  5  0.004239  0.000396
koi_dicco_mra       0.002310  0.000441  0.000152  5  0.003219  0.001402
koi_ror             0.002073  0.000630  0.000908  5  0.003371  0.000776
koi_fwm_srao        0.001738  0.000162  0.000009  5  0.002071  0.001405
koi_smet            0.001171  0.000160  0.000041  5  0.001501  0.000840
koi_dikco_mra       0.001010  0.000235  0.000328  5  0.001494  0.000526
transit_ratio       0.000985  0.000405  0.002768  5  0.001819  0.000152
koi_impact          0.000959  0.000333  0.0014

In [17]:
# Sanity check: correlation of a few raw features with the binary target
# (DataFrame.corr with its default method).
# The unused mid-notebook `import pandas as pd` was removed; pandas is already imported
# in the notebook's import cell and is not referenced here.
candidate_cols = ['koi_period', 'koi_depth', 'koi_duration', 'koi_prad', 'koi_model_snr']

# (column, correlation) pairs for the candidate columns that exist in the frame.
correlations = [
    (col, df[[col, 'target']].corr().iloc[0, 1])
    for col in candidate_cols
    if col in df.columns
]

print("Correlación directa con target:")
# Strongest absolute correlation first. A NaN correlation (e.g. from a constant column)
# compares False against everything, which would make a plain abs() sort unreliable,
# so NaN entries are explicitly ordered last.
ranked = sorted(correlations, key=lambda item: (item[1] != item[1], -abs(item[1])))
for feat, corr in ranked:
    print(f"  {feat:20s}: {corr:+.4f}")

Correlación directa con target:
  koi_depth           : -0.1803
  koi_model_snr       : -0.1485
  koi_duration        : -0.1349
  koi_period          : -0.0227
  koi_prad            : -0.0212
