In [1]:
import os, warnings, numpy as np, pandas as pd, matplotlib.pyplot as plt
from tqdm.auto import tqdm
import lightkurve as lk
from astroquery.ipac.nexsci.nasa_exoplanet_archive import NasaExoplanetArchive
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

# --- Tunable pipeline parameters ---------------------------------------------
TAB_N_PER_CLASS = 200      # max rows sampled per disposition for the tabular dataset
LC_N_PER_CLASS = 400       # highest-SNR KOIs kept per disposition for light-curve work
LC_TARGET_WINDOWS = 1200   # stop building light-curve windows once this many exist
PER_TARGET_LIMIT = 3       # max light-curve search results fetched per target
PHASE_HALF_WIDTH = 0.12    # initial half-width of the window sliced around phase 0
MAX_WIDTH = 0.25           # cap on the half-width when the window is widened
MIN_POINTS = 60            # minimum number of points for a window to be accepted
N_BINS = 200               # number of grid points each window is resampled onto

# --- Global notebook state ---------------------------------------------------
# NOTE: silencing every warning also hides library deprecation notices.
warnings.filterwarnings("ignore")
plt.rcParams.update({"figure.figsize": (7, 4)})
np.random.seed(42)
os.makedirs("artifacts", exist_ok=True)

print("Setup complete.")

  from .autonotebook import tqdm as notebook_tqdm


Setup complete.




In [4]:
# Directory that receives every artifact written by this notebook.
save_dir = "artifacts"
os.makedirs(name=save_dir, exist_ok=True)

In [18]:
import pandas as pd

# Path of the KOI table, resolved relative to the notebook's working directory
file_path = 'koi.csv'

try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    # Report the missing file instead of raising; this branch does not assign `df`.
    print(f"Error: The file '{file_path}' was not found in the current environment.")
else:
    # Preview of the first rows
    print("First 5 rows of the dataset:")
    print(df.head())

    # Column names, dtypes and non-null counts
    print("\nDataset Info:")
    df.info()

    # (rows, columns)
    print(f"\nDataset Shape: {df.shape}")


First 5 rows of the dataset:
      kepid kepoi_name   kepler_name koi_disposition koi_pdisposition  \
0  10797460  K00752.01  Kepler-227 b       CONFIRMED        CANDIDATE   
1  10797460  K00752.02  Kepler-227 c       CONFIRMED        CANDIDATE   
2  10811496  K00753.01           NaN       CANDIDATE        CANDIDATE   
3  10848459  K00754.01           NaN  FALSE POSITIVE   FALSE POSITIVE   
4  10854555  K00755.01  Kepler-664 b       CONFIRMED        CANDIDATE   

   koi_score  koi_fpflag_nt  koi_fpflag_ss  koi_fpflag_co  koi_fpflag_ec  ...  \
0      1.000              0              0              0              0  ...   
1      0.969              0              0              0              0  ...   
2      0.000              0              0              0              0  ...   
3      0.000              0              1              0              0  ...   
4      1.000              0              0              0              0  ...   

   koi_steff_err2  koi_slogg  koi_slogg_err1 

In [19]:
# Build the two working tables from the loaded KOI frame `df`:
#   * tabular_df - at most TAB_N_PER_CLASS rows per disposition (seeded sample)
#   * prior_df   - the LC_N_PER_CLASS highest-SNR rows per disposition
# Both are written to `save_dir` as CSV.

# 1. Coerce the numeric columns first so the filters below always compare
#    numbers; entries that cannot be parsed become NaN.
num_cols = ["koi_period", "koi_time0bk", "koi_duration", "koi_depth",
            "koi_model_snr", "koi_insol", "koi_teq"]
df = df.assign(**{c: pd.to_numeric(df[c], errors="coerce")
                  for c in num_cols if c in df.columns})

# 2. Keep rows with a recognised disposition, a positive period and a known epoch
valid_rows = (
    df["koi_disposition"].isin(["CONFIRMED", "CANDIDATE", "FALSE POSITIVE"])
    & (df["koi_period"] > 0)
    & df["koi_time0bk"].notna()
)

# 3. One row per kepid: after sorting, the first row per kepid is its
#    shortest-period KOI
df = (df[valid_rows]
      .sort_values(["kepid", "koi_period"])
      .drop_duplicates(subset=["kepid"], keep="first")
      .reset_index(drop=True))

# 4. Create the 'habitable_zone' flag: 1 when koi_insol lies strictly between
#    0.25 and 1.5, else 0 (NaN insolation also maps to 0)
if "koi_insol" in df.columns and df["koi_insol"].notna().any():
    df["habitable_zone"] = (
        df["koi_insol"].between(0.25, 1.5, inclusive="neither").astype("int64")
    )
    print("\nNumber of planets in habitable zone:")
    print(df["habitable_zone"].value_counts())
else:
    df["habitable_zone"] = 0 # Default to 0 if no insolation data
    print("\n'koi_insol' column not found or is empty. 'habitable_zone' set to 0.")

# 5. Create the balanced tabular dataset for modeling.
#    Sampling each group explicitly (instead of groupby.apply) keeps the
#    'koi_disposition' column on every pandas version: groupby.apply on the
#    grouping column is deprecated and drops that column in newer releases.
class_samples = [
    group.sample(n=min(TAB_N_PER_CLASS, len(group)), random_state=42)
    for _, group in df.groupby("koi_disposition")
]
if class_samples:
    tabular_df = pd.concat(class_samples, ignore_index=True)
else:
    # Nothing survived the filters: keep an empty frame with the same columns
    tabular_df = df.iloc[0:0].copy()

# 6. Create the prioritized list for light curve processing:
#    highest koi_model_snr first within each disposition
prior_df = (df.sort_values(["koi_disposition", "koi_model_snr"], ascending=[True, False])
            .groupby("koi_disposition")
            .head(LC_N_PER_CLASS)
            .reset_index(drop=True))

# 7. Save the processed dataframes to disk
tabular_df.to_csv(f"{save_dir}/tabular_df.csv", index=False)
prior_df.to_csv(f"{save_dir}/prior_df.csv", index=False)

print("\nFinal dataset sizes:")
print("Tabular DF:", tabular_df["koi_disposition"].value_counts().to_dict())
print("Prior LC DF:", prior_df["koi_disposition"].value_counts().to_dict())

print(f"Files saved to: {save_dir}")


Number of planets in habitable zone:
habitable_zone
0    7680
1     534
Name: count, dtype: int64

Final dataset sizes:
Tabular DF: {'CANDIDATE': 200, 'CONFIRMED': 200, 'FALSE POSITIVE': 200}
Prior LC DF: {'CANDIDATE': 400, 'CONFIRMED': 400, 'FALSE POSITIVE': 400}
Files saved to: artifacts


In [8]:
import cupy as cp

In [20]:
# Light-curve helpers: load cached Kepler light curves, detrend/clean them, and slice an adaptive transit-centered window

from lightkurve import KeplerQualityFlags as KQ

def download_kepler_lc(kepid, limit_per_target=PER_TARGET_LIMIT, quarter=None):
    """
    Load and stitch cached Kepler long-cadence light curves for a KIC (kepid).

    Despite the name, nothing is downloaded here: only FITS files already
    present under ``artifacts/lc_cache`` are read.

    Parameters
    ----------
    kepid : int-like
        Kepler Input Catalog id; it is cast with ``int()``.
    limit_per_target : int or None
        Maximum number of cached files to read (taken in sorted file-name
        order). ``None`` reads every matching file.
    quarter : optional
        Accepted for interface compatibility; currently not used.

    Returns
    -------
    The stitched, cleaned light curve, or ``None`` when no cached file for the
    target could be read.

    Notes
    -----
    The flux column is whatever ``lk.read`` selects by default
    (NOTE(review): PDCSAP for Kepler files per the lightkurve docs - confirm
    for the installed version).
    """
    import glob  # local import keeps this helper self-contained

    lc_cache_dir = os.path.join("artifacts", "lc_cache")

    # Kepler long-cadence file names start with the zero-padded 9-digit KIC id
    # and end in "_llc.fits". Searching recursively makes the lookup independent
    # of the directory layout the download step created inside the cache.
    pattern = os.path.join(lc_cache_dir, "**", f"kplr{int(kepid):09d}-*_llc.fits")
    fits_paths = sorted(glob.glob(pattern, recursive=True))
    if limit_per_target is not None:
        fits_paths = fits_paths[:int(limit_per_target)]
    if not fits_paths:
        return None

    # lightkurve reads one file at a time (it does not expand glob patterns),
    # so read each file and assemble the collection explicitly.
    light_curves = []
    for path in fits_paths:
        try:
            light_curves.append(lk.read(path))
        except Exception:
            # Best effort: skip a missing/corrupted file, keep the others
            continue
    if not light_curves:
        return None

    # Stitch the light curves from the collection
    lc = lk.LightCurveCollection(light_curves).stitch()

    # Clean -> detrend -> normalize -> conservative outlier removal
    lc = (
        lc.remove_nans()
          .flatten(window_length=401)
          .normalize()
          .remove_outliers(sigma=7)
    )
    return lc


def fold_and_slice(lc, period_days, epoch_bkjd,
                   phase_half_width=PHASE_HALF_WIDTH,
                   min_points=MIN_POINTS, max_width=MAX_WIDTH):
    """
    Fold a light curve and return the points in a window centered on phase 0.

    The window starts at ``±phase_half_width``. If it holds fewer than
    ``min_points`` points, the half-width is multiplied by 1.5 (capped at
    ``max_width``) and the check is repeated, for three attempts in total.

    Returns ``(phase, flux)`` as CuPy arrays, or two empty CuPy arrays when no
    attempt reaches ``min_points``.

    NOTE(review): the widths are compared against ``folded.phase.value`` as-is.
    lightkurve 2.x appears to return phase in days unless ``normalize_phase=True``
    is passed to ``fold`` - confirm the intended units of the width constants.
    """
    folded_lc = lc.fold(period_days, epoch_time=epoch_bkjd)

    # Copy the folded series to the GPU
    phase_gpu = cp.asarray(folded_lc.phase.value)
    flux_gpu = cp.asarray(folded_lc.flux.value)

    half_width = float(phase_half_width)
    attempts_left = 3
    while attempts_left > 0:
        in_window = cp.abs(phase_gpu) < half_width
        if in_window.sum() >= min_points:
            return phase_gpu[in_window], flux_gpu[in_window]
        # Too sparse: widen, but never beyond max_width
        half_width = min(max_width, half_width * 1.5)
        attempts_left -= 1

    return cp.array([]), cp.array([])


In [21]:
def to_fixed_bins(phase, flux, n_bins=N_BINS):
    """
    Resample (phase, flux) onto ``n_bins`` evenly spaced phase values.

    The points are sorted by phase and the flux is linearly interpolated
    (``cp.interp``) onto a grid spanning the minimum to the maximum phase.

    Returns ``(grid, resampled_flux)`` as float32 CuPy arrays, or two empty
    CuPy arrays when ``phase`` is empty.
    """
    if len(phase) == 0:
        return cp.array([]), cp.array([])

    # cp.interp expects increasing sample positions, so sort by phase first
    sort_idx = cp.argsort(phase)
    xs, ys = phase[sort_idx], flux[sort_idx]

    uniform_grid = cp.linspace(xs.min(), xs.max(), n_bins)
    resampled = cp.interp(uniform_grid, xs, ys)

    # Results stay on the GPU; the caller moves them to the CPU
    return uniform_grid.astype(cp.float32), resampled.astype(cp.float32)

In [22]:
# Local cache directory for the downloaded FITS files
lc_cache_dir = os.path.join(save_dir, "lc_cache")
os.makedirs(lc_cache_dir, exist_ok=True)
print(f"Light curve cache directory: {lc_cache_dir}")

# Fetch up to PER_TARGET_LIMIT long-cadence Kepler light curves for every
# target in prior_df; a failure for one target is reported and skipped.
for raw_kepid in tqdm(prior_df["kepid"], total=len(prior_df), desc="Downloading FITS files"):
    kepid = int(raw_kepid)
    target = f"KIC {kepid}"

    try:
        srch = lk.search_lightcurve(
            target,
            mission="Kepler",
            author="Kepler",
            exptime="long",
            limit=PER_TARGET_LIMIT,
        )
        if len(srch) == 0:
            continue  # nothing available for this target
        # Files are written below lc_cache_dir
        srch.download_all(download_dir=lc_cache_dir)
    except Exception as e:
        print(f"Could not download for kepid {kepid}: {e}")

print("All available light curves have been downloaded/cached.")

Light curve cache directory: artifacts\lc_cache


Downloading FITS files:   0%|          | 5/1200 [00:54<3:37:23, 10.92s/it]


KeyboardInterrupt: 

In [13]:
# Integer class label for each disposition
label_map = {"FALSE POSITIVE": 0, "CANDIDATE": 1, "CONFIRMED": 2}

X_lc_list, y_lc_list = [], []
lc_records = []
lc_failures = []   # (kepid, "ErrorType: message") for targets that raised
built = 0

# Iterate through prioritized list; early-stop when we hit LC_TARGET_WINDOWS
for _, row in tqdm(prior_df.iterrows(), total=len(prior_df), desc="Building LC windows"):
    kepid = int(row["kepid"])
    disp  = str(row["koi_disposition"])

    per   = float(row["koi_period"]) if pd.notnull(row["koi_period"]) else np.nan
    epoch = float(row["koi_time0bk"]) if pd.notnull(row["koi_time0bk"]) else np.nan
    if not np.isfinite(per) or not np.isfinite(epoch):
        continue

    try:
        lc = download_kepler_lc(kepid, limit_per_target=PER_TARGET_LIMIT)
        if lc is None:
            continue

        ph, fx = fold_and_slice(lc, per, epoch,
                                phase_half_width=PHASE_HALF_WIDTH,
                                min_points=MIN_POINTS, max_width=MAX_WIDTH)
        if len(ph) == 0:
            continue

        grid, fx_bin = to_fixed_bins(ph, fx, n_bins=N_BINS)
        if len(fx_bin) == 0:
            continue

        # Build the label and metadata BEFORE touching the output lists, so an
        # exception here cannot leave X, y and the records out of step.
        label = label_map[disp]
        record = {
            "kepid": kepid,
            "label": disp,
            "y": label,
            "koi_period": per,
            "koi_time0bk": epoch,
            "koi_duration": float(row.get("koi_duration", np.nan)),
            "koi_depth": float(row.get("koi_depth", np.nan)),
            "koi_model_snr": float(row.get("koi_model_snr", np.nan))
        }

        # Keep data on GPU; append the CuPy array directly
        X_lc_list.append(fx_bin)
        y_lc_list.append(label)
        lc_records.append(record)

        built += 1
        if built % 100 == 0:
            print(f"...built {built} windows")
        if built >= LC_TARGET_WINDOWS:
            break

    except Exception as e:
        # Keep going on loading or parsing errors, but remember what failed so
        # a systematic problem is visible in the summary below.
        lc_failures.append((kepid, f"{type(e).__name__}: {e}"))

if lc_failures:
    print(f"Skipped {len(lc_failures)} targets because of errors; first few:")
    for failed_kepid, message in lc_failures[:5]:
        print(f"  kepid {failed_kepid}: {message}")

if built == 0:
    raise RuntimeError("No LC windows built. Relax filters or increase PER_TARGET_LIMIT / pool size.")

# Stack arrays on the GPU first, then move the final result to CPU
X_lc_gpu = cp.vstack(X_lc_list)
X_lc = cp.asnumpy(X_lc_gpu)

y_lc = np.array(y_lc_list, dtype=np.int64)  # one label per window
features_lc = pd.DataFrame.from_records(lc_records)

print("LC windows:", X_lc.shape, "labels:", y_lc.shape)
print(features_lc["label"].value_counts())

Building LC windows:   5%|▌         | 61/1200 [04:31<1:24:29,  4.45s/it]


KeyboardInterrupt: 

In [None]:
# Build the standardized tabular feature matrix from the balanced sample
# `tabular_df` created in the dataset-preparation cell.
tab_cols = ["koi_period","koi_duration","koi_depth","koi_model_snr",
            "koi_fpflag_nt","koi_fpflag_ss","koi_fpflag_co","koi_fpflag_ec"]

# `tabular_df` is the balanced per-class sample; the previously referenced
# `sample_tab_df` is never defined in this notebook.
features_tab = tabular_df.copy().reset_index(drop=True)
y_tab = features_tab["koi_disposition"].map(label_map).to_numpy(dtype=np.int64)

# Clean NaNs and type cast: unparseable values become NaN, then NaN -> 0.0
features_tab[tab_cols] = (
    features_tab[tab_cols]
    .apply(pd.to_numeric, errors="coerce")
    .fillna(0.0)
    .astype(np.float32)
)

# Standardize only tabular features (not LC flux vectors)
scaler = StandardScaler()
X_tab = scaler.fit_transform(features_tab[tab_cols].to_numpy(dtype=np.float32))

print("Tabular X:", X_tab.shape, "y:", y_tab.shape)
print(features_tab["koi_disposition"].value_counts())

# Quick class balance visuals (fixed class order so both charts line up)
class_order = ["CONFIRMED", "CANDIDATE", "FALSE POSITIVE"]

ax = features_tab["koi_disposition"].value_counts().reindex(class_order).plot(kind="bar")
ax.set_title("Tabular — class counts"); ax.set_ylabel("n"); plt.xticks(rotation=15); plt.show()

# Map the integer LC labels back to disposition names before counting
inverse_label_map = {code: name for name, code in label_map.items()}
ax2 = (pd.Series(y_lc).map(inverse_label_map).value_counts()
         .reindex(class_order).plot(kind="bar"))
ax2.set_title("LC windows — class counts"); ax2.set_ylabel("n"); plt.xticks(rotation=15); plt.show()