In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import lightkurve as lk
import os



In [2]:
# Fetch the cumulative KOI table (target id, KOI name, orbital period,
# transit epoch, disposition) from the NASA Exoplanet Archive as CSV.
url = "https://exoplanetarchive.ipac.caltech.edu/cgi-bin/nstedAPI/nph-nstedAPI"
params = "table=cumulative&select=kepid,kepoi_name,koi_period,koi_time0bk,koi_disposition&format=csv"

df = pd.read_csv(f"{url}?{params}")

# Keep only the two classes we model. An explicit whitelist (instead of
# "everything that is not CANDIDATE") guarantees that a missing or otherwise
# unexpected disposition is dropped rather than silently labelled 0.
df = df[df['koi_disposition'].isin(['CONFIRMED', 'FALSE POSITIVE'])].copy()

# Binary target: 1 = confirmed planet, 0 = false positive.
df['label'] = (df['koi_disposition'] == 'CONFIRMED').astype(int)

# Period and epoch are both passed to the phase-folding step, so rows
# missing either one are unusable.
df = df.dropna(subset=['koi_period', 'koi_time0bk'])

print(f"Total samples: {len(df)}")
print(f"Planets: {int((df['label'] == 1).sum())}, False Positives: {int((df['label'] == 0).sum())}")
print(df.head())

Total samples: 7585
Planets: 2746, False Positives: 4839
      kepid kepoi_name  koi_period  koi_time0bk koi_disposition  label
0  10797460  K00752.01    9.488036   170.538750       CONFIRMED      1
1  10797460  K00752.02   54.418383   162.513840       CONFIRMED      1
3  10848459  K00754.01    1.736952   170.307565  FALSE POSITIVE      0
4  10854555  K00755.01    2.525592   171.595550       CONFIRMED      1
5  10872983  K00756.01   11.094321   171.201160       CONFIRMED      1


In [3]:
# astroquery's MAST client. NOTE(review): presumably lightkurve's searches and
# downloads go through this client, which is why it is configured here — confirm.
from astroquery.mast import Observations
# Set the client's TIMEOUT attribute to 600 (presumably seconds — confirm
# against the astroquery docs) so slow archive responses are less likely to abort.
Observations.TIMEOUT = 600

def download_lightcurve(kepid, save_dir='../data/raw', max_retries=3):
    """Download the stitched long-cadence Kepler light curve for one target.

    The result is cached on disk as ``{save_dir}/kic_{kepid}.fits``. If that
    file already exists it is returned immediately without any search.

    Args:
        kepid: Target identifier, interpolated into the ``"KIC {kepid}"``
            search string and into the cache file name.
        save_dir: Directory holding the cached FITS files. It is created on
            demand if it does not exist yet.
        max_retries: Maximum number of search + download attempts.

    Returns:
        The path of the cached FITS file, or ``None`` when the search returns
        no results or every attempt raised an exception.
    """

    filepath = f"{save_dir}/kic_{kepid}.fits"

    # Skip if exists
    if os.path.exists(filepath):
        return filepath

    # Write to a sibling temp file and move it into place afterwards, so an
    # interrupted write can never leave a truncated file that the cache
    # check above would treat as a finished download.
    tmp_path = f"{filepath}.part"

    for attempt in range(max_retries):
        try:
            search = lk.search_lightcurve(f"KIC {kepid}", mission="Kepler", cadence="long")
            if len(search) == 0:
                return None

            lc = search.download_all().stitch()

            # A fresh checkout has no data directory yet; without this every
            # write fails. Kept inside the try so failures still yield None.
            if save_dir:
                os.makedirs(save_dir, exist_ok=True)

            lc.to_fits(tmp_path, overwrite=True)
            os.replace(tmp_path, filepath)
            return filepath

        except Exception as e:
            # Drop any half-written temp file before retrying or giving up.
            try:
                os.remove(tmp_path)
            except OSError:
                pass

            if attempt == max_retries - 1:
                print(f"Failed {kepid}: {e}")
                return None

    return None

In [4]:
def _standardize_flux(flux, n_points):
    """Return binned flux as a finite, length-``n_points``, zero-mean/unit-variance vector."""
    # Unwrap a numpy masked array (if that is what arrives) so masked bins
    # are treated the same way as NaN bins.
    flux = np.asarray(np.ma.filled(flux, np.nan), dtype=float).ravel()

    finite = np.isfinite(flux)
    if not finite.any():
        # Nothing usable: standardising would yield an all-zero "sample".
        raise ValueError("no finite flux values after binning")

    # Fill NaN *and* +/-inf with 1.0, the baseline level of the normalized curve.
    flux = np.where(finite, flux, 1.0)

    # Enforce the fixed length the caller asked for.
    if flux.size != n_points:
        flux = np.interp(
            np.linspace(0.0, 1.0, n_points),
            np.linspace(0.0, 1.0, flux.size),
            flux,
        )

    # Normalize to zero mean, unit variance (epsilon guards a flat curve).
    return (flux - np.mean(flux)) / (np.std(flux) + 1e-8)


def process_lightcurve(filepath, period, t0, n_points=201):
    """
    Load, clean, phase-fold, and bin a light curve into a fixed-length vector.

    Args:
        filepath: Path of the light-curve file passed to ``lk.read``.
        period: Folding period, passed to ``fold(period=...)``.
        t0: Reference epoch, passed to ``fold(epoch_time=...)``.
        n_points: Number of phase bins, and the length of the returned array.

    Returns:
        1-D float array of length ``n_points`` with zero mean and unit
        variance, or ``None`` if any step fails or no finite flux remains
        (the error is printed).
    """

    try:
        # Load
        lc = lk.read(filepath)

        # Clean
        lc = lc.remove_nans()
        # NOTE(review): a symmetric 5-sigma clip may also discard deep
        # in-transit points — confirm this is intended.
        lc = lc.remove_outliers(sigma=5)
        lc = lc.flatten(window_length=401)
        lc = lc.normalize()

        # Phase fold at orbital period
        folded = lc.fold(period=period, epoch_time=t0)

        # Bin to fixed length
        binned = folded.bin(bins=n_points)

        return _standardize_flux(binned.flux.value, n_points)

    except Exception as e:
        print(f"Error processing {filepath}: {e}")
        return None

In [6]:
# Smoke-test the download step on a handful of targets before scaling up:
# the first 5 confirmed planets (label 1) followed by the first 5 false
# positives (label 0).
test_df = pd.concat(
    [df[df['label'] == label_value].head(5) for label_value in (1, 0)]
)

print(f"Testing with {len(test_df)} samples...")

for idx, row in test_df.iterrows():
    kepid = row['kepid']
    filepath = download_lightcurve(kepid)
    # download_lightcurve hands back a path on success and None otherwise.
    if filepath:
        status = "Yes"
    else:
        status = "No"
    print(f"  KIC {kepid}: {status}")

Testing with 10 samples...
  KIC 10797460: Yes
  KIC 10797460: Yes
  KIC 10854555: Yes
  KIC 10872983: Yes
  KIC 10872983: Yes
  KIC 10848459: Yes
  KIC 6721123: Yes
  KIC 10419211: Yes
  KIC 10464078: Yes
  KIC 10480982: Yes
