In [1]:
!pip install uv
!uv pip install 'autogluon.tabular[mitra]'

[2mUsing Python 3.11.2 environment at: /home/jsr-mario/.venvs/jupyter_env[0m
[2mAudited [1m1 package[0m [2min 38ms[0m[0m


In [2]:
# !uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

In [3]:
import pandas as pd
import numpy as np
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_wine
import torch
from sklearn.metrics import (
    accuracy_score, classification_report,
    confusion_matrix, average_precision_score,
    precision_recall_curve, roc_auc_score
)
import matplotlib.pyplot as plt

In [4]:
data = '../Data/1_cumulative_2025.csv'
df = pd.read_csv(data)

print(df.shape)

(9564, 83)


In [5]:
df['target'] = (df['koi_disposition'] == 'CONFIRMED').astype(int)
df = df.dropna(subset=['target'])

In [6]:
print(df.shape)
print("Features disponibles:", df.columns.tolist())


(9564, 84)
Features disponibles: ['rowid', 'kepid', 'kepoi_name', 'kepler_name', 'koi_disposition', 'koi_vet_stat', 'koi_vet_date', 'koi_pdisposition', 'koi_score', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec', 'koi_disp_prov', 'koi_comment', 'koi_period', 'koi_time0bk', 'koi_time0', 'koi_eccen', 'koi_longp', 'koi_impact', 'koi_duration', 'koi_ingress', 'koi_depth', 'koi_ror', 'koi_srho', 'koi_fittype', 'koi_prad', 'koi_sma', 'koi_incl', 'koi_teq', 'koi_insol', 'koi_dor', 'koi_limbdark_mod', 'koi_ldm_coeff4', 'koi_ldm_coeff3', 'koi_ldm_coeff2', 'koi_ldm_coeff1', 'koi_parm_prov', 'koi_max_sngle_ev', 'koi_max_mult_ev', 'koi_model_snr', 'koi_count', 'koi_num_transits', 'koi_tce_plnt_num', 'koi_tce_delivname', 'koi_quarters', 'koi_bin_oedp_sig', 'koi_trans_mod', 'koi_model_dof', 'koi_model_chisq', 'koi_datalink_dvr', 'koi_datalink_dvs', 'koi_steff', 'koi_slogg', 'koi_smet', 'koi_srad', 'koi_smass', 'koi_sage', 'koi_sparprov', 'ra', 'dec', 'koi_kepmag', 'koi_gmag', 'ko

In [7]:
drop_cols = [
    # 100% NaN
    'koi_longp', 'koi_model_chisq', 'koi_model_dof', 'koi_ingress', 'koi_sage',
    
    # IDs / nombres únicos / paths (no útiles como features)
    'rowid', 'kepoi_name', 
    
    # nombre oficial -> leakage (presente mayormente para confirmed)
    'kepler_name',
    
    # vetado / metadatos constantes o que filtran la etiqueta
    'koi_vet_stat', 'koi_vet_date', 'koi_disp_prov', 'koi_parm_prov',
    
    # delivery / report paths (cada fila distinto)
    'koi_tce_delivname', 'koi_datalink_dvr', 'koi_datalink_dvs',
    
    # columnas constantes según tu inspección (no aportan)
    'koi_limbdark_mod', 'koi_trans_mod', 'koi_ldm_coeff3', 'koi_ldm_coeff4',
    
    # CRÍTICO: Agregar estas
    'koi_disposition',      # target original
    'koi_pdisposition',     # disposición planetaria (LEAKAGE!)
    'koi_score',           # score de clasificación (posible LEAKAGE)
    'koi_comment',         # comentarios (posible LEAKAGE)

    # FLAGS DE VETTING (posible leakage indirecto)
    'koi_fpflag_nt',   # Flag: not transit-like
    'koi_fpflag_ss',   # Flag: stellar eclipse
    'koi_fpflag_co',   # Flag: centroid offset
    'koi_fpflag_ec',   # Flag: ephemeris match
    
    # MÉTRICAS DERIVADAS DEL VETTING
    'koi_max_mult_ev', # Eventos múltiples (calculado post-análisis)
    'koi_dicco_msky',  # Diferencia en centroide
    'koi_dikco_msky',  # Similar a dicco
    
    # OTROS SOSPECHOSOS
    'koi_bin_oedp_sig', # Significancia de odd-even depth (post-análisis)
]

drop_exist = [c for c in drop_cols]
df = df.drop(columns=drop_exist)

In [8]:
print(df.shape)
print("Features disponibles:", df.columns.tolist())

(9564, 53)
Features disponibles: ['kepid', 'koi_period', 'koi_time0bk', 'koi_time0', 'koi_eccen', 'koi_impact', 'koi_duration', 'koi_depth', 'koi_ror', 'koi_srho', 'koi_fittype', 'koi_prad', 'koi_sma', 'koi_incl', 'koi_teq', 'koi_insol', 'koi_dor', 'koi_ldm_coeff2', 'koi_ldm_coeff1', 'koi_max_sngle_ev', 'koi_model_snr', 'koi_count', 'koi_num_transits', 'koi_tce_plnt_num', 'koi_quarters', 'koi_steff', 'koi_slogg', 'koi_smet', 'koi_srad', 'koi_smass', 'koi_sparprov', 'ra', 'dec', 'koi_kepmag', 'koi_gmag', 'koi_rmag', 'koi_imag', 'koi_zmag', 'koi_jmag', 'koi_hmag', 'koi_kmag', 'koi_fwm_stat_sig', 'koi_fwm_sra', 'koi_fwm_sdec', 'koi_fwm_srao', 'koi_fwm_sdeco', 'koi_fwm_prao', 'koi_fwm_pdeco', 'koi_dicco_mra', 'koi_dicco_mdec', 'koi_dikco_mra', 'koi_dikco_mdec', 'target']


In [9]:

# 2. Densidad estelar vs período (ley de Kepler)
df['kepler_ratio'] = df['koi_srho'] * (df['koi_period'] ** 2)

# 3. Temperatura de equilibrio vs temperatura estelar
df['temp_ratio'] = df['koi_teq'] / df['koi_steff']

# 4. Duración esperada vs observada
expected_duration = 13 * df['koi_srad'] * (df['koi_period'] / 365.25) ** (1/3)
df['duration_anomaly'] = (df['koi_duration'] - expected_duration).abs()

# 5. Marca tránsitos ultra-profundos
df['ultra_deep'] = (df['koi_depth'] > 5000).astype(int)

df['log_period'] = np.log10(df['koi_period'])
df['log_depth'] = np.log10(df['koi_depth'])
df['log_prad'] = np.log10(df['koi_prad'])
df['log_teq'] = np.log10(df['koi_teq'])

# Clasificar por tipo de órbita
def classify_period(p):
    if p < 1:
        return 'ultra_short'  # Júpiter caliente extremo
    elif p < 10:
        return 'short'        # Júpiter caliente
    elif p < 100:
        return 'medium'       # Similar a Venus/Mercurio
    else:
        return 'long'         # Similar a Tierra o más

df['period_class'] = df['koi_period'].apply(classify_period)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [10]:
print(df.shape)
print("Features disponibles:", df.columns.tolist())

(9564, 62)
Features disponibles: ['kepid', 'koi_period', 'koi_time0bk', 'koi_time0', 'koi_eccen', 'koi_impact', 'koi_duration', 'koi_depth', 'koi_ror', 'koi_srho', 'koi_fittype', 'koi_prad', 'koi_sma', 'koi_incl', 'koi_teq', 'koi_insol', 'koi_dor', 'koi_ldm_coeff2', 'koi_ldm_coeff1', 'koi_max_sngle_ev', 'koi_model_snr', 'koi_count', 'koi_num_transits', 'koi_tce_plnt_num', 'koi_quarters', 'koi_steff', 'koi_slogg', 'koi_smet', 'koi_srad', 'koi_smass', 'koi_sparprov', 'ra', 'dec', 'koi_kepmag', 'koi_gmag', 'koi_rmag', 'koi_imag', 'koi_zmag', 'koi_jmag', 'koi_hmag', 'koi_kmag', 'koi_fwm_stat_sig', 'koi_fwm_sra', 'koi_fwm_sdec', 'koi_fwm_srao', 'koi_fwm_sdeco', 'koi_fwm_prao', 'koi_fwm_pdeco', 'koi_dicco_mra', 'koi_dicco_mdec', 'koi_dikco_mra', 'koi_dikco_mdec', 'target', 'kepler_ratio', 'temp_ratio', 'duration_anomaly', 'ultra_deep', 'log_period', 'log_depth', 'log_prad', 'log_teq', 'period_class']


In [11]:
#  agregar features derivadas

# Duración relativa del tránsito
df['transit_ratio'] = df['koi_duration'] / (df['koi_period'] * 24)  # en horas

# Temperatura vs período (ley de Kepler simplificada)
df['temp_period_ratio'] = df['koi_steff'] / np.sqrt(df['koi_period'])

# Impacto del tránsito (centralidad)
df['impact_squared'] = df['koi_impact'] ** 2

print(f"Shape con features nuevas: {df.shape}")

Shape con features nuevas: (9564, 65)


In [12]:
# df2 = pd.DataFrame()

# import math

df['transit_morgan'] = ( (df['koi_period'].astype(float) * df['koi_srad'].astype(float))
                          / (np.pi * df['koi_sma'].astype(float)) ) \
                        * np.sqrt(((1 + df['koi_prad'].astype(float) / df['koi_srad'].astype(float))**2)
                                  - df['koi_impact'].astype(float)**2)

df['caida_brillo']= (df['koi_ror'])**2


# Clasificar por número de tránsitos
def classify_transits(n):
    if n >= 3:
        return 1   # Candidato a planeta
    elif 1 <= n <= 2:
        return -1     # Falso positivo

df['transit_class'] = df['koi_num_transits'].apply(classify_transits)
# cols = [
#     'target',
#     "koi_period",
#     "koi_duration",
#     # "koi_ingress",
#     "koi_depth",
#     "koi_ror",
#     "koi_srho",
#     "koi_fittype",
#     "koi_prad",
#     "koi_insol",
#     "koi_srad",
#     "koi_impact",
#     "koi_sma"
# ]


# for c in cols:
#     df2[c] = df[c]

# df = df2
# print(df.shape)

In [13]:
train, test = train_test_split(df, test_size=0.3, random_state=42, stratify=df['target'])
train_data = TabularDataset(train)
test_data = TabularDataset(test)

In [14]:
full_predictor = TabularPredictor(label='target', eval_metric='roc_auc', verbosity=1)

hyperparameters = {
    'GBM': {},
    'CAT': {},
    'XGB': {},
}

full_predictor.fit(
    train_data,
    presets='best_quality',
    time_limit=400,
    hyperparameters=hyperparameters,
    hyperparameter_tune_kwargs={
      'num_trials': 4,
      'scheduler': 'local',
      'searcher': 'random',
    },
    num_bag_folds=2,
    num_stack_levels=2,  # Limitar stacking
)


print("\nModel Leaderboard:")
full_predictor.leaderboard(test_data)


No path specified. Models will be saved in: "AutogluonModels/ag-20251004_235938"
2025-10-04 17:59:41,111	INFO worker.py:1843 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m
  0%|          | 0/4 [00:00<?, ?it/s]
 25%|██▌       | 1/4 [00:02<00:08,  3.00s/it]
 50%|█████     | 2/4 [00:05<00:05,  2.94s/it]
 75%|███████▌  | 3/4 [00:08<00:02,  2.74s/it]
 75%|███████▌  | 3/4 [00:12<00:04,  4.09s/it]
  0%|          | 0/4 [00:00<?, ?it/s]
 25%|██▌       | 1/4 [00:05<00:15,  5.16s/it]
 50%|█████     | 2/4 [00:08<00:08,  4.21s/it]
 50%|█████     | 2/4 [00:12<00:12,  6.03s/it]
  0%|          | 0/4 [00:00<?, ?it/s]
[36m(_dystack pid=3391)[0m [36mray::_ray_fit()[39m (pid=5103, ip=10.71.12.11)
[36m(_dystack pid=3391)[0m   File "/home/jsr-mario/.venvs/jupyter_env/lib/python3.11/site-packages/autogluon/core/models/ensemble/fold_fitting_strategy.py", line 446, in _ray_fit
[36m(_dystack pid=3391)[0m     fold_model.fit(X=X_fold, y=y_fold, X_val=X_val

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

[36mray::_ray_fit()[39m (pid=10645, ip=10.71.12.11)
  File "/home/jsr-mario/.venvs/jupyter_env/lib/python3.11/site-packages/autogluon/core/models/ensemble/fold_fitting_strategy.py", line 446, in _ray_fit
    fold_model.fit(X=X_fold, y=y_fold, X_val=X_val_fold, y_val=y_val_fold, time_limit=time_limit_fold, **resources, **kwargs_fold)
  File "/home/jsr-mario/.venvs/jupyter_env/lib/python3.11/site-packages/autogluon/core/models/abstract/abstract_model.py", line 1068, in fit
    out = self._fit(**kwargs)
          ^^^^^^^^^^^^^^^^^^^
  File "/home/jsr-mario/.venvs/jupyter_env/lib/python3.11/site-packages/autogluon/tabular/models/xgboost/xgboost_model.py", line 191, in _fit
    self.model.fit(X=X, y=y, eval_set=eval_set, verbose=False, sample_weight=sample_weight)
  File "/home/jsr-mario/.venvs/jupyter_env/lib/python3.11/site-packages/xgboost/core.py", line 729, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "/home/jsr-mario/.venvs/jupyter_env/lib/python3.11/site-pa


Model Leaderboard:


Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.98527,0.979885,roc_auc,0.191667,0.167619,46.082008,0.001816,0.000828,0.100952,2,True,9
1,CatBoost_BAG_L1/T4,0.984847,0.978945,roc_auc,0.013168,0.012453,14.649281,0.013168,0.012453,14.649281,1,True,8
2,CatBoost_BAG_L1/T1,0.984673,0.978919,roc_auc,0.029926,0.012028,5.968385,0.029926,0.012028,5.968385,1,True,5
3,CatBoost_BAG_L1/T2,0.984653,0.978428,roc_auc,0.008473,0.00935,3.980594,0.008473,0.00935,3.980594,1,True,6
4,LightGBM_BAG_L1/T3,0.984562,0.977873,roc_auc,0.016292,0.021739,3.199028,0.016292,0.021739,3.199028,1,True,3
5,LightGBM_BAG_L1/T1,0.984497,0.9781,roc_auc,0.050537,0.01664,2.644613,0.050537,0.01664,2.644613,1,True,1
6,CatBoost_BAG_L1/T3,0.984402,0.979151,roc_auc,0.012247,0.012349,16.604584,0.012247,0.012349,16.604584,1,True,7
7,LightGBM_BAG_L1/T4,0.98411,0.977292,roc_auc,0.083973,0.113321,6.114192,0.083973,0.113321,6.114192,1,True,4
8,LightGBM_BAG_L1/T2,0.981827,0.974998,roc_auc,0.010481,0.012385,2.911817,0.010481,0.012385,2.911817,1,True,2


In [15]:
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, average_precision_score
import pandas as pd

y_true = test_data['target'].values
y_pred = full_predictor.predict(test_data)  
y_proba = full_predictor.predict_proba(test_data)  

# ROC-AUC (complementa PR-AUC)
if isinstance(y_proba, pd.DataFrame):
    y_scores = y_proba[1].values if 1 in y_proba.columns else y_proba.iloc[:, -1].values
else:
    y_scores = y_proba[:, 1]

print(f"ROC-AUC: {roc_auc_score(y_true, y_scores):.4f}")
print(f"PR-AUC:  {average_precision_score(y_true, y_scores):.4f}")

# Matriz de confusión
cm = confusion_matrix(y_true, y_pred)
print("\nConfusion Matrix:")
print(f"TN: {cm[0,0]:4d}  FP: {cm[0,1]:4d}")
print(f"FN: {cm[1,0]:4d}  TP: {cm[1,1]:4d}")

# Reporte detallado
print("\n" + classification_report(y_true, y_pred, 
      target_names=['Not Confirmed', 'Confirmed']))

ROC-AUC: 0.9853
PR-AUC:  0.9652

Confusion Matrix:
TN: 1981  FP:   65
FN:   94  TP:  730

               precision    recall  f1-score   support

Not Confirmed       0.95      0.97      0.96      2046
    Confirmed       0.92      0.89      0.90       824

     accuracy                           0.94      2870
    macro avg       0.94      0.93      0.93      2870
 weighted avg       0.94      0.94      0.94      2870



In [16]:
# Identificar casos donde el modelo falla
test_with_preds = test.copy()
test_with_preds['pred'] = y_pred
test_with_preds['pred_proba'] = y_scores
test_with_preds['error'] = test_with_preds['target'] != test_with_preds['pred']

# Falsos Positivos
fp = test_with_preds[(test_with_preds['target']==0) & (test_with_preds['pred']==1)]
print(f"\nFalsos Positivos: {len(fp)}")
if len(fp) > 0:
    print(fp[['kepid', 'koi_period', 'koi_depth', 'pred_proba']].head())

# Falsos Negativos
fn = test_with_preds[(test_with_preds['target']==1) & (test_with_preds['pred']==0)]
print(f"\nFalsos Negativos: {len(fn)}")
if len(fn) > 0:
    print(fn[['kepid', 'koi_period', 'koi_depth', 'pred_proba']].head())


Falsos Positivos: 65
         kepid  koi_period  koi_depth  pred_proba
1794   7449554    2.420883      223.3    0.886058
1189   9850893    8.480398      259.7    0.666760
2366  12602314    1.407020      181.9    0.815590
5143   5621333   27.096515      208.4    0.930753
2288  10336951   38.229204      605.8    0.948893

Falsos Negativos: 94
         kepid  koi_period  koi_depth  pred_proba
5873   4947556   13.026796      356.6    0.212275
778    7532973    2.144632    19837.0    0.002033
4927   6705026    9.508487       97.9    0.339672
6810   8037038    4.095139      181.7    0.460672
953   11517719    2.495780    25070.0    0.033481


In [17]:
# Ver qué features usa el modelo
fi = full_predictor.feature_importance(test_data, silent=True)  # ← Cambio aquí
print("\nTop 15 Features:")
print(fi.head(15))


Top 15 Features:
                  importance    stddev   p_value  n  p99_high   p99_low
koi_model_snr       0.039021  0.002536  0.000002  5  0.044242  0.033800
koi_count           0.004922  0.000538  0.000017  5  0.006029  0.003815
koi_prad            0.004273  0.001169  0.000610  5  0.006680  0.001866
duration_anomaly    0.003089  0.000693  0.000284  5  0.004515  0.001663
koi_dicco_mdec      0.002589  0.000449  0.000104  5  0.003514  0.001665
koi_dicco_mra       0.002524  0.000460  0.000127  5  0.003472  0.001577
koi_fwm_srao        0.001229  0.000135  0.000017  5  0.001507  0.000951
koi_dikco_mra       0.001125  0.000300  0.000552  5  0.001743  0.000508
koi_smet            0.001093  0.000140  0.000032  5  0.001382  0.000804
koi_dikco_mdec      0.000950  0.000251  0.000532  5  0.001466  0.000434
transit_morgan      0.000913  0.000387  0.003087  5  0.001709  0.000117
koi_ror             0.000766  0.000419  0.007515  5  0.001630 -0.000097
log_prad            0.000721  0.000508  0.0169

In [18]:
print(full_predictor.path)

/home/jsr-mario/Documents/otros/cursos/NASA/Back/AutogluonModels/ag-20251004_235938


In [19]:
predictor = TabularPredictor.load('AutogluonModels/ag-20251004_235938')  # usa tu ruta real
predicciones = predictor.predict(df)

df['prediction'] = predicciones
df.to_csv('predicciones.csv', index=False)