In [1]:
# Environment bootstrap: install the `uv` installer with pip, then use `uv`
# to install AutoGluon Tabular together with its `mitra` extra.
# NOTE(review): neither install is version-pinned, so a fresh run may resolve
# different package versions than the ones that produced the saved outputs.
!pip install uv
!uv pip install 'autogluon.tabular[mitra]'

[2mUsing Python 3.11.2 environment at: /mnt/raid/.venvs/jupyter[0m
[2mAudited [1m1 package[0m [2min 20ms[0m[0m


In [2]:
# Install torch, torchvision and torchaudio from the PyTorch `cu121` wheel index.
# NOTE(review): this runs after the AutoGluon install and is unpinned; presumably
# it is meant to swap in a CUDA-enabled torch build — confirm the resulting
# torch version is still one AutoGluon supports.
!uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

[2mUsing Python 3.11.2 environment at: /mnt/raid/.venvs/jupyter[0m
[2mAudited [1m3 packages[0m [2min 10ms[0m[0m


In [3]:
import pandas as pd
import numpy as np
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_wine
import torch
from sklearn.metrics import (
    accuracy_score, classification_report,
    confusion_matrix, average_precision_score,
    precision_recall_curve, roc_auc_score
)
import matplotlib.pyplot as plt

In [4]:
# Read the cumulative CSV (path is relative to the notebook's working
# directory) and report its (rows, columns) shape.
data = '../Data/1_cumulative_2025.csv'
df = pd.read_csv(filepath_or_buffer=data)

print(df.shape)

(9564, 83)


In [5]:
# Binary label: 1 when the disposition is CONFIRMED, 0 for every other value.
#
# Rows with a missing disposition are removed *before* labelling. The equality
# test below maps NaN to False (-> 0), so dropping NaNs on `target` afterwards
# can never remove anything and unlabeled rows would silently become negatives.
# `.assign` works on the fresh frame returned by `dropna`, so the new column is
# appended last without a chained-assignment warning.
df = (
    df.dropna(subset=['koi_disposition'])
      .assign(target=lambda frame: (frame['koi_disposition'] == 'CONFIRMED').astype(int))
)

In [6]:
# Show the frame's shape and the full list of column names after labelling.
frame_shape = df.shape
print(frame_shape)
print("Features disponibles:", list(df.columns))


(9564, 84)
Features disponibles: ['rowid', 'kepid', 'kepoi_name', 'kepler_name', 'koi_disposition', 'koi_vet_stat', 'koi_vet_date', 'koi_pdisposition', 'koi_score', 'koi_fpflag_nt', 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec', 'koi_disp_prov', 'koi_comment', 'koi_period', 'koi_time0bk', 'koi_time0', 'koi_eccen', 'koi_longp', 'koi_impact', 'koi_duration', 'koi_ingress', 'koi_depth', 'koi_ror', 'koi_srho', 'koi_fittype', 'koi_prad', 'koi_sma', 'koi_incl', 'koi_teq', 'koi_insol', 'koi_dor', 'koi_limbdark_mod', 'koi_ldm_coeff4', 'koi_ldm_coeff3', 'koi_ldm_coeff2', 'koi_ldm_coeff1', 'koi_parm_prov', 'koi_max_sngle_ev', 'koi_max_mult_ev', 'koi_model_snr', 'koi_count', 'koi_num_transits', 'koi_tce_plnt_num', 'koi_tce_delivname', 'koi_quarters', 'koi_bin_oedp_sig', 'koi_trans_mod', 'koi_model_dof', 'koi_model_chisq', 'koi_datalink_dvr', 'koi_datalink_dvs', 'koi_steff', 'koi_slogg', 'koi_smet', 'koi_srad', 'koi_smass', 'koi_sage', 'koi_sparprov', 'ra', 'dec', 'koi_kepmag', 'koi_gmag', 'ko

In [7]:
# Columns removed before modelling, grouped by the reason they are excluded.
drop_cols = [
    # 100% NaN
    'koi_longp', 'koi_model_chisq', 'koi_model_dof', 'koi_ingress', 'koi_sage',

    # IDs / unique names (not useful as features)
    # NOTE(review): `kepid` is also an identifier but is kept in the frame;
    # a later cell prints it, yet it also ends up among the training features.
    'rowid', 'kepoi_name',

    # official planet name -> leakage (mostly present for confirmed objects)
    'kepler_name',

    # vetting status / metadata that is constant or leaks the label
    'koi_vet_stat', 'koi_vet_date', 'koi_disp_prov', 'koi_parm_prov',

    # delivery name / report paths (different on every row)
    'koi_tce_delivname', 'koi_datalink_dvr', 'koi_datalink_dvs',

    # constant columns according to the earlier inspection (carry no signal)
    'koi_limbdark_mod', 'koi_trans_mod', 'koi_ldm_coeff3', 'koi_ldm_coeff4',

    # CRITICAL: label and direct label proxies
    'koi_disposition',      # original target
    'koi_pdisposition',     # pipeline disposition (LEAKAGE!)
    'koi_score',            # classification score (possible LEAKAGE)
    'koi_comment',          # free-text comments (possible LEAKAGE)

    # VETTING FLAGS (possible indirect leakage)
    'koi_fpflag_nt',   # flag: not transit-like
    'koi_fpflag_ss',   # flag: stellar eclipse
    'koi_fpflag_co',   # flag: centroid offset
    'koi_fpflag_ec',   # flag: ephemeris match

    # METRICS DERIVED FROM VETTING
    'koi_max_mult_ev', # multiple-event statistic (computed post-analysis)
    'koi_dicco_msky',  # centroid difference
    'koi_dikco_msky',  # similar to dicco

    # OTHER SUSPECTS
    'koi_bin_oedp_sig', # odd-even depth significance (post-analysis)

    'koi_model_snr', # may not be a valid feature
]

# Keep only the names that are still present, so the cell can be re-run (or
# fed a CSV that lacks some of these columns) without raising KeyError.
drop_exist = [c for c in drop_cols if c in df.columns]
df = df.drop(columns=drop_exist)

In [8]:
# Show the frame's shape and remaining column names after the column drop.
frame_shape = df.shape
print(frame_shape)
print("Features disponibles:", list(df.columns))

(9564, 52)
Features disponibles: ['kepid', 'koi_period', 'koi_time0bk', 'koi_time0', 'koi_eccen', 'koi_impact', 'koi_duration', 'koi_depth', 'koi_ror', 'koi_srho', 'koi_fittype', 'koi_prad', 'koi_sma', 'koi_incl', 'koi_teq', 'koi_insol', 'koi_dor', 'koi_ldm_coeff2', 'koi_ldm_coeff1', 'koi_max_sngle_ev', 'koi_count', 'koi_num_transits', 'koi_tce_plnt_num', 'koi_quarters', 'koi_steff', 'koi_slogg', 'koi_smet', 'koi_srad', 'koi_smass', 'koi_sparprov', 'ra', 'dec', 'koi_kepmag', 'koi_gmag', 'koi_rmag', 'koi_imag', 'koi_zmag', 'koi_jmag', 'koi_hmag', 'koi_kmag', 'koi_fwm_stat_sig', 'koi_fwm_sra', 'koi_fwm_sdec', 'koi_fwm_srao', 'koi_fwm_sdeco', 'koi_fwm_prao', 'koi_fwm_pdeco', 'koi_dicco_mra', 'koi_dicco_mdec', 'koi_dikco_mra', 'koi_dikco_mdec', 'target']


In [9]:

# Stellar density multiplied by the squared period (Kepler's-law style ratio).
df['kepler_ratio'] = df['koi_srho'] * (df['koi_period'] ** 2)

# Equilibrium temperature relative to the stellar effective temperature.
df['temp_ratio'] = df['koi_teq'] / df['koi_steff']

# Absolute gap between the observed duration and a rough expected duration
# built from the stellar radius and the period expressed in years.
# NOTE(review): the factor 13 presumably gives hours for a Sun-like star —
# confirm it matches the units of `koi_duration`.
expected_duration = 13 * df['koi_srad'] * (df['koi_period'] / 365.25) ** (1/3)
df['duration_anomaly'] = (df['koi_duration'] - expected_duration).abs()

# Flag ultra-deep transits (depth above 5000; a missing depth yields 0).
df['ultra_deep'] = (df['koi_depth'] > 5000).astype(int)

# log10 transforms. Non-positive inputs are masked to NaN first: log10(0) is
# -inf and log10 of a negative value is NaN with a RuntimeWarning, and the
# infinities would otherwise flow into the model as real feature values.
log_sources = {
    'log_period': 'koi_period',
    'log_depth': 'koi_depth',
    'log_prad': 'koi_prad',
    'log_teq': 'koi_teq',
}
for log_col, raw_col in log_sources.items():
    df[log_col] = np.log10(df[raw_col].where(df[raw_col] > 0))

# Classify by orbit type
def classify_period(p):
    """Bucket an orbital period into a coarse class label.

    Parameters
    ----------
    p : float
        Orbital period, compared directly against the thresholds 1, 10 and
        100 (presumably days — confirm against the source table).

    Returns
    -------
    str or None
        'ultra_short' when p < 1, 'short' when p < 10, 'medium' when p < 100,
        'long' otherwise. None when p is missing (NaN/None): every comparison
        with NaN is False, so without this guard a missing period would fall
        through to 'long'.
    """
    if pd.isna(p):
        return None
    if p < 1:
        return 'ultra_short'  # extreme hot Jupiter
    if p < 10:
        return 'short'        # hot Jupiter
    if p < 100:
        return 'medium'       # similar to Venus/Mercury
    return 'long'             # similar to Earth or longer

# Label every row with its period bucket, one element at a time.
df['period_class'] = df['koi_period'].map(classify_period)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [10]:
# Show the frame's shape and column names after the first batch of derived features.
frame_shape = df.shape
print(frame_shape)
print("Features disponibles:", list(df.columns))

(9564, 61)
Features disponibles: ['kepid', 'koi_period', 'koi_time0bk', 'koi_time0', 'koi_eccen', 'koi_impact', 'koi_duration', 'koi_depth', 'koi_ror', 'koi_srho', 'koi_fittype', 'koi_prad', 'koi_sma', 'koi_incl', 'koi_teq', 'koi_insol', 'koi_dor', 'koi_ldm_coeff2', 'koi_ldm_coeff1', 'koi_max_sngle_ev', 'koi_count', 'koi_num_transits', 'koi_tce_plnt_num', 'koi_quarters', 'koi_steff', 'koi_slogg', 'koi_smet', 'koi_srad', 'koi_smass', 'koi_sparprov', 'ra', 'dec', 'koi_kepmag', 'koi_gmag', 'koi_rmag', 'koi_imag', 'koi_zmag', 'koi_jmag', 'koi_hmag', 'koi_kmag', 'koi_fwm_stat_sig', 'koi_fwm_sra', 'koi_fwm_sdec', 'koi_fwm_srao', 'koi_fwm_sdeco', 'koi_fwm_prao', 'koi_fwm_pdeco', 'koi_dicco_mra', 'koi_dicco_mdec', 'koi_dikco_mra', 'koi_dikco_mdec', 'target', 'kepler_ratio', 'temp_ratio', 'duration_anomaly', 'ultra_deep', 'log_period', 'log_depth', 'log_prad', 'log_teq', 'period_class']


In [11]:
# More derived features; these must be added BEFORE train_test_split so that
# both splits carry them.
import numpy as np  # redundant when cells run in order (already imported above)

# Transit duration relative to the period; the period is multiplied by 24 so
# both terms share a unit (assumes duration in hours, period in days — confirm).
period_in_hours = df['koi_period'].mul(24)
df['transit_ratio'] = df['koi_duration'].div(period_in_hours)

# Stellar effective temperature scaled by the square root of the period.
sqrt_period = np.sqrt(df['koi_period'])
df['temp_period_ratio'] = df['koi_steff'].div(sqrt_period)

# Squared impact parameter (how central the transit is).
df['impact_squared'] = df['koi_impact'].pow(2)

print(f"Shape con features nuevas: {df.shape}")

Shape con features nuevas: (9564, 64)


In [12]:
# Stratified 70/30 hold-out split with a fixed seed, then wrap both parts as
# AutoGluon datasets.
# NOTE(review): rows are split individually; if several rows share a `kepid`,
# the same star can land in both train and test — consider a grouped split.
train, test = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
    stratify=df['target'],
)
train_data, test_data = TabularDataset(train), TabularDataset(test)

In [13]:
# Train an AutoGluon ensemble on the binary `target` label, scored by ROC-AUC.
full_predictor = TabularPredictor(label='target', eval_metric='roc_auc', verbosity=2)

# One list of configurations per AutoGluon model family; an empty dict means
# "that family's defaults".
hyperparameters = {
    'GBM': [
        {'extra_trees': True, 'num_leaves': 64, 'learning_rate': 0.05, 'ag_args': {'name_suffix': '_XT'}},
        {'extra_trees': False, 'num_leaves': 128, 'learning_rate': 0.03, 'ag_args': {'name_suffix': '_Large'}},
    ],
    'CAT': [
        {'iterations': 300, 'learning_rate': 0.05, 'ag_args': {'name_suffix': '_300it'}},
        {},  # default
    ],
    'XGB': [
        # The number of boosting rounds is `n_estimators` for AutoGluon's
        # XGBoost model; the previous `num_round` key was not a recognised
        # hyperparameter, so the 200-round cap never applied.
        {'nthread': 4, 'n_estimators': 200, 'learning_rate': 0.05, 'max_depth': 6},
        {},  # default
    ],
    'RF': [
        {'n_estimators': 300, 'criterion': 'gini'},
    ],
    'XT': [
        {'n_estimators': 300, 'criterion': 'gini'},
    ],
    'NN_TORCH': [
        # The epoch budget is `num_epochs` for NN_TORCH; the previous
        # `max_epochs` key was not a recognised hyperparameter.
        {'num_epochs': 30, 'hidden_size': 128, 'num_layers': 2, 'dropout_prob': 0.1, 'ag_args': {'name_suffix': '_nn'}},
    ],
}

# NOTE(review): in the recorded run only the LightGBM and CatBoost
# configurations reached the leaderboard and the log shows the tuner hitting
# its timeout, so the 120 s budget looks too small for best_quality bagging
# plus HPO over every family listed above — consider raising `time_limit`.
full_predictor.fit(
    train_data,
    presets='best_quality',
    time_limit=120,
    # Request a GPU only when one is actually visible, so the cell also runs
    # on CPU-only machines.
    num_gpus=1 if torch.cuda.is_available() else 0,
    hyperparameters=hyperparameters,
    hyperparameter_tune_kwargs={
        'scheduler': 'local',
        'searcher': 'random',
        'num_trials': 5,       # 5 trials per model
    }
)

print("\nModel Leaderboard:")
full_predictor.leaderboard(test_data)


2025-10-01 02:05:23,538	INFO timeout.py:54 -- Reached timeout of 1.9679526090621948 seconds. Stopping all trials.
2025-10-01 02:05:23,547	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/mnt/raid/Documents/Other/NASA/NASA-Hackathon/Back/AutogluonModels/ag-20251001_080318/models/NeuralNetTorch_nn_BAG_L2' in 0.0040s.
- 6072b_00000: FileNotFoundError('Could not fetch metrics for 6072b_00000: both result.json and progress.csv were not found at /mnt/raid/Documents/Other/NASA/NASA-Hackathon/Back/AutogluonModels/ag-20251001_080318/models/NeuralNetTorch_nn_BAG_L2/6072b_00000')
- 6072b_00001: FileNotFoundError('Could not fetch metrics for 6072b_00001: both result.json and progress.csv were not found at /mnt/raid/Documents/Other/NASA/NASA-Hackathon/Back/AutogluonModels/ag-20251001_080318/models/NeuralNetTorch_nn_BAG_L2/6072b_00001')
- 6072b_00002: FileNotFoundError('Could not fetch metrics for 6072b_00002: both result.json and progress.csv were not foun


Model Leaderboard:


Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L3,0.982712,0.97902,roc_auc,0.266915,0.286836,17.399515,0.001457,0.000989,0.109575,3,True,8
1,WeightedEnsemble_L2,0.982712,0.97902,roc_auc,0.267017,0.286977,17.382099,0.001559,0.001129,0.092159,2,True,5
2,LightGBM_XT_BAG_L1/T1,0.982632,0.978967,roc_auc,0.162619,0.166081,8.385677,0.162619,0.166081,8.385677,1,True,1
3,LightGBM_XT_BAG_L2/T1,0.98209,0.976804,roc_auc,0.414255,0.477694,52.516219,0.076939,0.109579,5.962197,2,True,6
4,LightGBM_Large_BAG_L2/T1,0.981432,0.972446,roc_auc,0.389282,0.417992,52.760384,0.051967,0.049877,6.206362,2,True,7
5,LightGBM_Large_BAG_L1/T1,0.980435,0.974319,roc_auc,0.102839,0.119766,8.904263,0.102839,0.119766,8.904263,1,True,2
6,CatBoost_300it_BAG_L1/T1,0.975662,0.969346,roc_auc,0.04751,0.040883,14.710468,0.04751,0.040883,14.710468,1,True,3
7,CatBoost_BAG_L1/T1,0.975508,0.969127,roc_auc,0.024347,0.041385,14.553614,0.024347,0.041385,14.553614,1,True,4


In [14]:
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, average_precision_score
import pandas as pd

# Ground truth, hard predictions and class probabilities on the hold-out split.
y_true = test_data['target'].values
y_pred = full_predictor.predict(test_data)
y_proba = full_predictor.predict_proba(test_data)

# Pick the positive-class score: column 1 of the array when the result is not
# a DataFrame; otherwise the column labelled 1, or the last column if no
# column carries that label.
if not isinstance(y_proba, pd.DataFrame):
    y_scores = y_proba[:, 1]
elif 1 in y_proba.columns:
    y_scores = y_proba[1].values
else:
    y_scores = y_proba.iloc[:, -1].values

# Threshold-free ranking metrics (ROC-AUC alongside PR-AUC).
roc_auc = roc_auc_score(y_true, y_scores)
pr_auc = average_precision_score(y_true, y_scores)
print(f"ROC-AUC: {roc_auc:.4f}")
print(f"PR-AUC:  {pr_auc:.4f}")

# Confusion matrix: rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_true, y_pred)
print("\nConfusion Matrix:")
print(f"TN: {cm[0, 0]:4d}  FP: {cm[0, 1]:4d}")
print(f"FN: {cm[1, 0]:4d}  TP: {cm[1, 1]:4d}")

# Per-class precision / recall / F1 report.
class_names = ['Not Confirmed', 'Confirmed']
report = classification_report(y_true, y_pred, target_names=class_names)
print("\n" + report)

ROC-AUC: 0.9827
PR-AUC:  0.9586

Confusion Matrix:
TN: 1966  FP:   80
FN:  111  TP:  713

               precision    recall  f1-score   support

Not Confirmed       0.95      0.96      0.95      2046
    Confirmed       0.90      0.87      0.88       824

     accuracy                           0.93      2870
    macro avg       0.92      0.91      0.92      2870
 weighted avg       0.93      0.93      0.93      2870



In [15]:
# Collect the hold-out rows together with the model's prediction, score and an
# error flag, so misclassified cases can be inspected.
test_with_preds = test.assign(
    pred=y_pred,
    pred_proba=y_scores,
    error=lambda frame: frame['target'] != frame['pred'],
)

preview_cols = ['kepid', 'koi_period', 'koi_depth', 'pred_proba']

# False positives: true label 0, predicted 1.
fp = test_with_preds.loc[(test_with_preds['target'] == 0) & (test_with_preds['pred'] == 1)]
# False negatives: true label 1, predicted 0.
fn = test_with_preds.loc[(test_with_preds['target'] == 1) & (test_with_preds['pred'] == 0)]

# Print the count and, when there are any, the first rows of each error group.
for title, subset in (("Falsos Positivos", fp), ("Falsos Negativos", fn)):
    print(f"\n{title}: {len(subset)}")
    if not subset.empty:
        print(subset[preview_cols].head())


Falsos Positivos: 80
        kepid  koi_period  koi_depth  pred_proba
4216  8098212    3.487541       85.1    0.685562
1794  7449554    2.420883      223.3    0.665459
1189  9850893    8.480398      259.7    0.508897
2120  8803882   89.465468       67.9    0.565087
5143  5621333   27.096515      208.4    0.772619

Falsos Negativos: 111
        kepid  koi_period  koi_depth  pred_proba
5715  6803855   71.452317      216.9    0.389670
5873  4947556   13.026796      356.6    0.189490
778   7532973    2.144632    19837.0    0.001088
2858  5978170    5.688791      241.5    0.460132
4927  6705026    9.508487       97.9    0.112578


In [16]:
# Inspect which features the model relies on, computed on the hold-out split
# with progress logging silenced.
fi = full_predictor.feature_importance(test_data, silent=True)

print("\nTop 15 Features:")
top_features = fi.iloc[:15]
print(top_features)


Top 15 Features:
                  importance    stddev   p_value  n  p99_high   p99_low
koi_max_sngle_ev    0.011362  0.001082  0.000010  5  0.013589  0.009135
koi_count           0.007812  0.000661  0.000006  5  0.009172  0.006451
duration_anomaly    0.006622  0.001032  0.000069  5  0.008747  0.004497
koi_fittype         0.005136  0.000779  0.000062  5  0.006741  0.003531
koi_fwm_srao        0.003752  0.000500  0.000037  5  0.004781  0.002723
koi_dicco_mra       0.003562  0.000626  0.000110  5  0.004851  0.002272
koi_dicco_mdec      0.003509  0.000834  0.000355  5  0.005226  0.001792
koi_prad            0.003400  0.001148  0.001350  5  0.005764  0.001035
log_prad            0.002626  0.000991  0.002030  5  0.004667  0.000586
koi_dikco_mra       0.002574  0.000500  0.000163  5  0.003604  0.001544
koi_fwm_sdeco       0.002105  0.000387  0.000131  5  0.002901  0.001309
koi_num_transits    0.001348  0.000245  0.000126  5  0.001854  0.000843
koi_ror             0.001295  0.000574  0.0036

In [17]:
# Sanity check: correlation of a few basic columns with the target.
import pandas as pd

# Columns that are no longer in the frame are skipped silently.
candidate_cols = ['koi_period', 'koi_depth', 'koi_duration', 'koi_prad', 'koi_model_snr']
correlations = [
    (name, df[[name, 'target']].corr().loc[name, 'target'])
    for name in candidate_cols
    if name in df.columns
]

# Print strongest absolute correlation first.
print("Correlación directa con target:")
ranked = sorted(correlations, key=lambda pair: abs(pair[1]), reverse=True)
for feat, corr in ranked:
    print(f"  {feat:20s}: {corr:+.4f}")

Correlación directa con target:
  koi_depth           : -0.1803
  koi_duration        : -0.1349
  koi_period          : -0.0227
  koi_prad            : -0.0212
