In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pywt   # pip install pywavelets on local; Kaggle already has it
from typing import Tuple
import pandas as pd

# --- 1. Load the training targets and submission file ---
print("Loading data...")
train_df = pd.read_csv("/kaggle/input/ariel-data-challenge-2025/train.csv")
sample_submission = pd.read_csv("/kaggle/input/ariel-data-challenge-2025/sample_submission.csv")

# Isolate the target columns (wavelengths)
wl_cols = [f'wl_{i}' for i in range(1, 284)]
train_targets = train_df[wl_cols]

# --- 2. Calculate the mean and standard deviation for each wavelength ---
print("Calculating mean and std dev for each wavelength...")
mean_spectrum = train_targets.mean(axis=0)
std_spectrum = train_targets.std(axis=0)


Loading data...
Calculating mean and std dev for each wavelength...


In [2]:
display(train_df)

Unnamed: 0,planet_id,wl_1,wl_2,wl_3,wl_4,wl_5,wl_6,wl_7,wl_8,wl_9,...,wl_274,wl_275,wl_276,wl_277,wl_278,wl_279,wl_280,wl_281,wl_282,wl_283
0,34983,0.018291,0.018088,0.018087,0.018085,0.018084,0.018084,0.018084,0.018084,0.018085,...,0.018109,0.018112,0.018118,0.018123,0.018125,0.018127,0.018130,0.018134,0.018138,0.018142
1,1873185,0.006347,0.006343,0.006343,0.006343,0.006343,0.006343,0.006343,0.006342,0.006342,...,0.006340,0.006340,0.006339,0.006339,0.006339,0.006339,0.006339,0.006339,0.006339,0.006339
2,3849793,0.046061,0.046139,0.046130,0.046117,0.046107,0.046105,0.046109,0.046112,0.046111,...,0.046144,0.046133,0.046131,0.046138,0.046141,0.046147,0.046147,0.046139,0.046134,0.046133
3,8456603,0.015363,0.015387,0.015385,0.015385,0.015385,0.015385,0.015384,0.015383,0.015383,...,0.015471,0.015471,0.015467,0.015465,0.015465,0.015464,0.015461,0.015460,0.015460,0.015460
4,23615382,0.014474,0.014636,0.014628,0.014635,0.014643,0.014642,0.014637,0.014635,0.014639,...,0.014473,0.014467,0.014433,0.014426,0.014435,0.014422,0.014399,0.014429,0.014444,0.014418
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,4290810553,0.023917,0.023831,0.023825,0.023816,0.023806,0.023794,0.023782,0.023771,0.023760,...,0.023839,0.023835,0.023834,0.023830,0.023830,0.023832,0.023828,0.023822,0.023821,0.023821
1096,4291452525,0.020838,0.020657,0.020662,0.020660,0.020643,0.020596,0.020521,0.020438,0.020372,...,0.019771,0.019760,0.019766,0.019769,0.019766,0.019752,0.019781,0.019825,0.019794,0.019793
1097,4291977477,0.006497,0.006500,0.006500,0.006501,0.006501,0.006500,0.006500,0.006500,0.006500,...,0.006499,0.006498,0.006498,0.006498,0.006498,0.006498,0.006498,0.006498,0.006498,0.006498
1098,4293108648,0.014412,0.014514,0.014513,0.014511,0.014510,0.014509,0.014508,0.014509,0.014511,...,0.014572,0.014572,0.014569,0.014567,0.014567,0.014566,0.014565,0.014567,0.014568,0.014567


period/scale mapping, COI, denois

In [3]:
def pick_scales(min_period_s: float, max_period_s: float, dt_s: float, n=64):
    """
    Returns PyWavelets scales spanning [min_period, max_period] (seconds) for 'morl'.
    PyWavelets uses pseudo-frequency: f = center_frequency / (scale * dt).
    """
    cf = pywt.central_frequency('morl')  # center frequency for Morlet in PyWavelets
    f_min = 1.0 / max_period_s
    f_max = 1.0 / max(min_period_s, 1e-12)
    # Use logspace in frequency, then convert to scale
    freqs = np.geomspace(f_min, f_max, num=n)
    scales = cf / (freqs * dt_s)
    return scales, freqs

def morlet_coi_times(freqs_hz: np.ndarray):
    """
    Approximate Cone-of-Influence half-width (seconds) for Morlet per Torrence & Compo.
    MATLAB doc gives: pred_times = sqrt(2) * cf / f  (cf≈6/(2π) for analytic Morlet there).
    We'll use that same form as a guide; treat as advisory bounds, not exact.
    """
    cf_tc = 6.0 / (2.0 * np.pi)               # MATLAB/T&C convention
    return np.sqrt(2.0) * cf_tc / np.maximum(freqs_hz, 1e-12)  # seconds


Compute & plot scalogram

In [4]:
def cwt_scalogram(y: np.ndarray, dt_s: float,
                  min_period_s=60.0, max_period_s=3600.0,
                  n_scales=72, vmin=None, vmax=None, title="AIRS white — CWT (Morlet)"):
    """
    y      : 1-D signal (white curve after CDS+bin)
    dt_s   : sampling period (seconds per point)
    """
    y = np.asarray(y, float)
    y = y - np.nanmedian(y)  # detrend mean; you can also high-pass if needed

    scales, freqs = pick_scales(min_period_s, max_period_s, dt_s, n=n_scales)
    coef, freqs_pywt = pywt.cwt(y, scales, 'morl', sampling_period=dt_s)  # coef: (n_scales, n_time)
    power = np.abs(coef)**2

    # Period axis (seconds) from PyWavelets frequencies
    periods_s = 1.0 / np.maximum(freqs_pywt, 1e-12)

    # Plot
    t = np.arange(len(y)) * dt_s
    plt.figure(figsize=(9, 4.5))
    im = plt.imshow(power, extent=[t[0], t[-1], periods_s[-1], periods_s[0]],
                    aspect='auto', origin='upper', vmin=vmin, vmax=vmax)
    cb = plt.colorbar(im); cb.set_label("Wavelet power")

    # COI guide (optional)
    coi_half = morlet_coi_times(freqs_pywt)  # seconds
    # draw top/bottom COI masks as shaded areas
    plt.fill_between(t, periods_s[-1], coi_half[0], color='k', alpha=0.05, step='mid')
    plt.fill_between(t, coi_half[0], periods_s[0], color='k', alpha=0.05, step='mid')

    plt.xlabel("Time (s)")
    plt.ylabel("Period (s)")
    plt.title(title)
    plt.tight_layout()
    plt.show()

    return power, periods_s, t


Ridge & window suggestion

In [5]:
def ridge_and_window(power: np.ndarray, periods_s: np.ndarray, t: np.ndarray,
                     expected_period_range=(120, 3600), smooth_k=5) -> Tuple[int, int]:
    """
    Very simple ridge: within an expected period band, pick max-power scale per time,
    then threshold to get a contiguous transit-like window.
    Returns indices (start, end) on t (integer indices).
    """
    lo, hi = expected_period_range
    band = (periods_s >= lo) & (periods_s <= hi)
    if band.sum() < 3:  # fallback
        band[:] = True

    band_power = power[band, :]                       # (n_scales_band, n_time)
    scale_argmax = np.argmax(band_power, axis=0)      # per-time best scale
    ridge_strength = band_power[scale_argmax, np.arange(band_power.shape[1])]

    # smooth strength
    from numpy.lib.stride_tricks import sliding_window_view
    k = max(1, smooth_k)
    if ridge_strength.size >= k:
        sv = sliding_window_view(ridge_strength, k).mean(-1)
        rs = np.pad(sv, (k//2, k - 1 - k//2), mode='edge')
    else:
        rs = ridge_strength

    # threshold at, say, 70th percentile to propose a window
    thr = np.nanpercentile(rs, 70.0)
    mask = rs >= thr

    # pick the longest contiguous True run
    if not mask.any():
        return 0, len(t) - 1
    # find runs
    diff = np.diff(mask.astype(int))
    starts = np.where(diff == 1)[0] + 1
    ends   = np.where(diff == -1)[0] + 1
    if mask[0]:  starts = np.r_[0, starts]
    if mask[-1]: ends   = np.r_[ends, len(mask)]
    if len(starts) == 0:
        return 0, len(t) - 1
    lengths = ends - starts
    i = np.argmax(lengths)
    return int(starts[i]), int(ends[i])


In [6]:

# --- 3. Create the submission DataFrame ---
print("Building submission file...")
# Get the planet_id from the sample submission (for the test set)
submission_df = sample_submission[['planet_id']].copy()

# Create columns for all the wl and sigma predictions
# This is a bit of pandas magic to create the columns in the right order
wl_sigma_cols = []
for i in range(1, 284):
    wl_sigma_cols.append(f'wl_{i}')
    wl_sigma_cols.append(f'sigma_{i}')

# Recreate the submission DataFrame with the correct columns, initialized to zero
final_submission = pd.DataFrame(columns=['planet_id'] + wl_sigma_cols)
final_submission['planet_id'] = submission_df['planet_id']


# --- 4. Populate the submission file ---
# Assign the calculated mean to all the 'wl_' columns
for i, col in enumerate(wl_cols):
    final_submission[col] = mean_spectrum[i]

# Assign the calculated standard deviation to all the 'sigma_' columns
sigma_cols = [f'sigma_{i}' for i in range(1, 284)]
for i, col in enumerate(sigma_cols):
    # We use the std dev of the wl columns as our sigma estimate
    final_submission[col] = std_spectrum[i]


# --- 5. Save the submission file ---
final_submission.to_csv("submission.csv", index=False)
print("submission.csv created successfully!")
print("First 5 columns of submission file:")
print(final_submission.head().iloc[:, :5])

Building submission file...
submission.csv created successfully!
First 5 columns of submission file:
   planet_id      wl_1   sigma_1      wl_2   sigma_2
0    1103775  0.014609  0.010652  0.014588  0.010579


  final_submission[col] = mean_spectrum[i]
  final_submission[col] = std_spectrum[i]
