# Baseline
Based on https://www.kaggle.com/code/ahsuna123/neurips-adc-25-intro-training

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import numpy as np
import pandas as pd
import kego.plotting
import polars as pl
import os
import pandas as pd
from glob import glob
from collections import defaultdict
import numpy as np

In [None]:
# Root directory of the competition data. The original hard-coded location
# stays the default; set the ARIEL_DATA_DIR environment variable to point the
# notebook at another copy of the data without editing this cell.
PATH_DATA = os.environ.get(
    "ARIEL_DATA_DIR", "/mnt/e/ariel-data-challenge-2025"
)
# Folder that holds one sub-directory per training planet.
PATH_DATA_TRAIN = os.path.join(PATH_DATA, "train")

In [None]:
# Planet whose raw detector files are inspected throughout this notebook.
PLANET_ID = 1010375142

# NOTE: an earlier variant of this cell read flat copies from ~/Downloads
# (file names carried the planet id as a suffix). The reads below use the
# competition folder layout rooted at PATH_DATA instead.
planet_dir = f"{PATH_DATA_TRAIN}/{PLANET_ID}"

# Signal tables of the selected planet (file index 0 of each instrument).
signal_f = pl.read_parquet(f"{planet_dir}/FGS1_signal_0.parquet")
signal_c = pl.read_parquet(f"{planet_dir}/AIRS-CH0_signal_0.parquet")

# Dataset-level CSV tables.
train = pl.read_csv(f"{PATH_DATA}/train.csv")
wavelengths = pl.read_csv(f"{PATH_DATA}/wavelengths.csv")
train_info = pl.read_csv(f"{PATH_DATA}/train_star_info.csv")

In [None]:
# --- 1. Basic Train Set Stats ---
n_planets, n_train_columns = train.shape
print("🪐 Number of training planets:", n_planets)
# Every column except "planet_id" counts as a target label.
print("📈 Number of target labels (wavelengths):", n_train_columns - 1)
print("🔬 Length of wavelength grid:", wavelengths.shape[0])

# --- 2. Target Stats (per flux column) ---
target_cols = [col for col in train.columns if col != "planet_id"]
flux_summary = train[target_cols].describe()
# The complete describe() table is printed, so the label no longer claims
# that only the first five rows are shown.
print("\n📊 Flux value summary (statistics per target column):")
print(flux_summary)

In [None]:
# --- 3. Unique Stars ---
# Rows are compared without the planet identifier, so planets that share
# identical star parameters are counted once.
if "planet_id" in train_info.columns:
    num_stars = train_info.drop("planet_id").unique().shape[0]
else:
    num_stars = train_info.unique().shape[0]
print("\n🌟 Number of unique stars in training:", num_stars)

# --- 4. Planets with Multiple Observations ---
obs_counts = defaultdict(int)
train_planets = os.listdir(PATH_DATA_TRAIN)

for pid in train_planets:
    # Search below PATH_DATA_TRAIN (the directory listed above) instead of a
    # "train/" folder relative to the current working directory; the
    # relative form only matched files when the notebook was started from
    # inside the data root and otherwise counted 0 for every planet.
    air_obs = glob(
        os.path.join(PATH_DATA_TRAIN, pid, "AIRS-CH0_signal_*.parquet")
    )
    obs_counts[pid] = len(air_obs)

multi_obs = {pid: count for pid, count in obs_counts.items() if count > 1}
print("\n🔁 Planets with multiple observations:", len(multi_obs))

# --- 5. Check Calibration File Coverage ---
missing_calibs = []
expected = {"dark", "dead", "flat", "linear_corr", "read"}

for pid in train_planets:
    for band in ["AIRS-CH0", "FGS1"]:
        # Same root as the planet listing (see section 4).
        # NOTE(review): the folder name "<band>_calibration" is kept from
        # the original code; confirm it matches the on-disk layout, since a
        # folder that is not found reports every expected file as missing.
        calib_path = os.path.join(
            PATH_DATA_TRAIN, pid, f"{band}_calibration"
        )
        # File names are compared without their extension.
        calib_files = (
            {os.path.splitext(f)[0] for f in os.listdir(calib_path)}
            if os.path.isdir(calib_path)
            else set()
        )
        missing = expected - calib_files
        if missing:
            missing_calibs.append((pid, band, missing))

print("\n🧪 Planets missing calibration files:", len(missing_calibs))
if missing_calibs:
    print("   Example:", missing_calibs[0])

# --- 6. Optional: Distribution of Observations Per Planet ---
obs_distribution = (
    pd.Series(list(obs_counts.values())).value_counts().sort_index()
)
print("\n🗂 Observation count distribution per planet (AIR-CH0):")
print(obs_distribution)

# --- 7. Planet-Star Uniqueness Check ---
# Left-join the star table onto the planet ids of `train`, then count the
# distinct (planet_id, star parameters) rows.
merged = train[["planet_id"]].join(train_info, on="planet_id", how="left")
unique_links = merged[
    ["planet_id"] + [col for col in train_info.columns if col != "planet_id"]
].unique()
print("\n🔗 Unique planet-star mappings:", unique_links.shape[0])

In [None]:
# Plot the `train` row that belongs to PLANET_ID; the slice [1:] leaves out
# the first value of that row.
planet_rows = train.filter(pl.col("planet_id") == PLANET_ID)
kego.plotting.plot_line(y=planet_rows.row(0)[1:])

In [None]:
def signal_c_to_image(signal_c):
    """Reshape an AIRS-CH0 signal table into a stack of 32 x 356 frames.

    Parameters:
    signal_c: table exposing ``to_numpy()``; each row is assumed to hold one
        flattened 32 x 356 frame (11392 values)

    Return value:
    ndarray of shape (n_frames, 32, 356). The frame count is inferred from
    the input instead of being fixed to 11250, so tables with a different
    number of frames are accepted; a size that is not a multiple of
    32 * 356 still raises ``ValueError``.
    """
    return signal_c.to_numpy().reshape(-1, 32, 356)


def signal_f_to_image(signal_f):
    """Reshape an FGS1 signal table into a stack of 32 x 32 frames.

    Parameters:
    signal_f: table exposing ``to_numpy()``; each row is assumed to hold one
        flattened 32 x 32 frame (1024 values)

    Return value:
    ndarray of shape (n_frames, 32, 32). The frame count is inferred from
    the input instead of being fixed to 135000, so tables with a different
    number of frames are accepted; a size that is not a multiple of 1024
    still raises ``ValueError``.
    """
    return signal_f.to_numpy().reshape(-1, 32, 32)


def smooth_signal_f(signal_f: "pl.DataFrame", window=800):
    """Smooth the FGS1 net signal with a running mean.

    Parameters:
    signal_f: polars DataFrame accepted by ``net_signal_f``
    window: number of consecutive net-signal samples averaged per output
        value; must be at least 1

    Return value:
    array with ``window`` fewer entries than the net signal; entry ``i`` is
    the mean of net-signal samples ``i + 1`` .. ``i + window``.

    Raises:
    ValueError: if ``window`` is smaller than 1
    """
    if window < 1:
        raise ValueError(f"window must be a positive integer, got {window}")
    n_signal = net_signal_f(signal_f)
    # Two cumulative sums taken `window` samples apart differ by exactly the
    # sum over that window, which avoids an explicit sliding loop.
    cum_signal = np.cumsum(n_signal)
    smooth_signal = (cum_signal[window:] - cum_signal[:-window]) / window
    return smooth_signal


def net_signal_f(signal_f: "pl.DataFrame"):
    """Return the FGS1 net signal of consecutive row pairs.

    Parameters:
    signal_f: polars DataFrame (``mean_horizontal`` is a polars method, so a
        pandas frame does not work here); each row is assumed to be one
        detector read

    Return value:
    polars Series with one value per pair of consecutive rows: the row mean
    of the odd-indexed row minus the row mean of the even-indexed row.
    """
    mean_signal = signal_f.mean_horizontal()
    # Rows 1, 3, 5, ... minus rows 0, 2, 4, ...
    n_signal = mean_signal[1::2] - mean_signal[0::2]
    return n_signal


def smooth_signal_c(signal: "pl.DataFrame", window=80):
    """Smooth the AIRS-CH0 net signal with a running mean.

    Parameters:
    signal: signal table accepted by ``net_signal_c``
    window: number of consecutive net-signal samples averaged per output
        value; must be at least 1

    Return value:
    array with ``window`` fewer entries than the net signal; entry ``i`` is
    the mean of net-signal samples ``i + 1`` .. ``i + window``.

    Raises:
    ValueError: if ``window`` is smaller than 1
    """
    if window < 1:
        raise ValueError(f"window must be a positive integer, got {window}")
    net_signal = net_signal_c(signal)
    # Two cumulative sums taken `window` samples apart differ by exactly the
    # sum over that window, which avoids an explicit sliding loop.
    cum_signal = net_signal.cumsum()
    smooth_signal = (cum_signal[window:] - cum_signal[:-window]) / window
    return smooth_signal


def net_signal_c(signal_c: "pl.DataFrame"):
    """Return the AIRS-CH0 net signal of consecutive frame pairs.

    Parameters:
    signal_c: 2-D table or array convertible with ``np.asarray``; each row
        is assumed to hold one flattened 32 x 356 frame

    Return value:
    ndarray with one value per pair of consecutive frames: the mean of the
    odd-indexed frame minus the mean of the even-indexed frame.
    """
    # The shape is passed positionally because the ``shape=`` keyword of
    # np.reshape only exists in recent NumPy releases, and the frame count
    # is inferred (-1) instead of being fixed to 11250.
    frames = np.asarray(signal_c).reshape(-1, 32, 356)
    mean_signal = frames.mean(axis=2).mean(axis=1)
    # Frames 1, 3, 5, ... minus frames 0, 2, 4, ...
    net_signal = mean_signal[1::2] - mean_signal[0::2]
    return net_signal

In [None]:
figures, axes_grid, axes_colorbar = kego.plotting.create_axes_grid(
    2,
    1,
    unravel=True,
    figure_size=(9, 4),
    title=f"Planet: {PLANET_ID}",
    top=0.14,
)
# Convert the FGS1 table once: every call to signal_f_to_image turns the
# whole table into a NumPy array, which the original cell did per panel.
fgs_frames = signal_f_to_image(signal_f)
# Frames 0 and 1 are drawn with a shared colour range, one panel each.
for frame_index, frame_title in enumerate(("low", "high")):
    kego.plotting.plot_colormesh(
        fgs_frames[frame_index],
        colormap="plasma",
        axes=axes_grid[frame_index],
        vmin=0,
        vmax=2.5e4,
        title=frame_title,
    )

In [None]:
font_size = 12

figures, axes_grid, axes_colorbar = kego.plotting.create_axes_grid(
    1,
    2,
    title=f"Planet: {PLANET_ID}",
    figure_size=(12, 5),
    spacing_y=0.1,
    top=0.07,
    unravel=True,
)

# Left panel: running mean of the FGS1 net signal with the default window.
axes = kego.plotting.plot_line(
    y=smooth_signal_f(signal_f),
    axes=axes_grid[0],
    linewidth=1,
    font_size=font_size,
)

# Right panel: the unsmoothed net signal drawn on the left panel's y-range,
# overlaid in red with a running mean over a shorter window (200).
right_panel = {"axes": axes_grid[1], "font_size": font_size}
kego.plotting.plot_line(
    y=net_signal_f(signal_f),
    ylim=axes.get_ylim(),
    linewidth=0.02,
    **right_panel,
)
kego.plotting.plot_line(
    y=smooth_signal_f(signal_f, window=200),
    color="red",
    linewidth=0.5,
    **right_panel,
)

In [None]:
font_size = 12
figures, axes_grid, axes_colorbar = kego.plotting.create_axes_grid(
    1,
    2,
    title=f"Planet: {PLANET_ID}",
    figure_size=(12, 5),
    spacing_y=0.1,
    top=0.07,
    unravel=True,
)

# Left panel: running mean of the AIRS-CH0 net signal (default window).
axes = kego.plotting.plot_line(
    y=smooth_signal_c(signal_c),
    axes=axes_grid[0],
    linewidth=1,
    font_size=font_size,
)

# Right panel: the unsmoothed net signal drawn on the left panel's y-range,
# overlaid in red with a running mean over a shorter window (40).
right_panel = {"axes": axes_grid[1], "font_size": font_size}
kego.plotting.plot_line(
    y=net_signal_c(signal_c),
    ylim=axes.get_ylim(),
    linewidth=0.1,
    **right_panel,
)
kego.plotting.plot_line(
    y=smooth_signal_c(signal_c, window=40),
    color="red",
    linewidth=0.5,
    **right_panel,
)

In [None]:
# Draw frame 5000 of the AIRS-CH0 image stack with the colour range clipped
# to 450-800.
airs_frame = signal_c_to_image(signal_c)[5000]
kego.plotting.plot_colormesh(
    airs_frame, colormap="plasma", vmin=450, vmax=800
)

In [None]:
# Row sums of the FGS1 table, taken after casting every column to Int32
# (presumably so the sum does not overflow a narrower integer type; confirm
# against the source dtype).
row_totals = signal_f.cast(pl.Int32).sum_horizontal()
# Dividing by 1024 turns each sum into the mean over the 32*32 pixels.
mean_signal = row_totals.cast(pl.Float32).to_numpy() / 1024
# Odd-indexed rows minus even-indexed rows.
net_signal = mean_signal[1::2] - mean_signal[0::2]
net_signal

In [None]:
def feature_engineering(f_raw, a_raw):
    """Build the two relative-reduction features from the raw signals.

    For each input, the mean over a fixed central window ("obscured") is
    compared with the average of the means over a leading and a trailing
    window ("unobscured"); the feature is
    ``(unobscured - obscured) / unobscured``.

    Parameters:
    f_raw: ndarray of shape (n_planets, 67500)
    a_raw: ndarray of shape (n_planets, 5625)

    Return value:
    df: DataFrame of shape (n_planets, 2) with the columns
        "a_relative_reduction" and "f_relative_reduction" (in that order)
    """

    def relative_reduction(
        signal, lead_stop, dip_start, dip_stop, tail_start
    ):
        # Mean level inside the central window.
        inside = signal[:, dip_start:dip_stop].mean(axis=1)
        # Reference level: average of the leading and trailing window means.
        reference = (
            signal[:, :lead_stop].mean(axis=1)
            + signal[:, tail_start:].mean(axis=1)
        ) / 2
        return (reference - inside) / reference

    features = {
        "a_relative_reduction": relative_reduction(
            a_raw, 1708, 1958, 3666, 3916
        ),
        "f_relative_reduction": relative_reduction(
            f_raw, 20500, 23500, 44000, 47000
        ),
    }
    return pd.DataFrame(features)