In [1]:
# 📦 Imports and Environment Setup
import os
import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from glob import glob
from tqdm import tqdm

from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [2]:
# 📂 Load Metadata and Labels
# Labels table: indexed by planet_id so its index can later drive the
# per-planet file reads and its values can serve as regression targets.
train_df = pd.read_csv(
    '/kaggle/input/ariel-data-challenge-2025/train.csv',
    index_col='planet_id',
)

# Auxiliary tables, read with pandas' default integer index, in the same
# order as before (wavelength grid, star info, ADC info — contents are not
# inspected in this notebook; verify schemas before relying on them).
wavelengths, train_star_info, train_adc_info = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/ariel-data-challenge-2025/wavelengths.csv',
        '/kaggle/input/ariel-data-challenge-2025/train_star_info.csv',
        '/kaggle/input/ariel-data-challenge-2025/adc_info.csv',
    )
)

In [3]:
# 🧠 Functions to Read and Preprocess Raw Instrument Data

def f_read_and_preprocess(dataset, planet_ids):
    """Load each planet's FGS1 signal file and reduce it to a net time series.

    For every planet, the parquet file ``FGS1_signal_0.parquet`` is read, each
    row is summed across all of its columns and divided by 1024, and the
    even-indexed rows are subtracted from the odd-indexed rows that follow them.

    Args:
        dataset: Sub-directory of the competition input folder (the notebook
            passes ``'train'`` or ``'test'``).
        planet_ids: Sized iterable of planet identifiers; each one names a
            sub-directory of ``dataset``.

    Returns:
        ``np.float32`` array of shape ``(len(planet_ids), 67500)``; row ``i``
        holds the net signal of ``planet_ids[i]``.
    """
    n_planets = len(planet_ids)
    # NaN pre-fill: any row that was somehow not written stays recognisable.
    net_signals = np.full((n_planets, 67500), np.nan, dtype=np.float32)

    for i, planet_id in tqdm(list(enumerate(planet_ids))):
        frames = pl.read_parquet(f'/kaggle/input/ariel-data-challenge-2025/{dataset}/{planet_id}/FGS1_signal_0.parquet')
        # Widen to Int32 before the row-wise sum (presumably the stored dtype
        # is narrower and could overflow — confirm against the data files).
        row_totals = frames.cast(pl.Int32).sum_horizontal()
        # 1024 is presumably the pixel count per frame (32 x 32) — confirm.
        per_row_mean = row_totals.cast(pl.Float32).to_numpy() / 1024
        # Pairwise difference: odd-indexed row minus the preceding even-indexed
        # row (looks like a paired-read subtraction — confirm).
        net_signals[i] = per_row_mean[1::2] - per_row_mean[0::2]

    return net_signals

def a_read_and_preprocess(dataset, planet_ids):
    """Load each planet's AIRS-CH0 signal file and reduce it to a net time series.

    For every planet, the parquet file ``AIRS-CH0_signal_0.parquet`` is read,
    each row is summed across all of its columns and divided by ``32 * 356``,
    and the even-indexed rows are subtracted from the odd-indexed rows that
    follow them.

    Args:
        dataset: Sub-directory of the competition input folder (the notebook
            passes ``'train'`` or ``'test'``).
        planet_ids: Sized iterable of planet identifiers; each one names a
            sub-directory of ``dataset``.

    Returns:
        ``np.float32`` array of shape ``(len(planet_ids), 5625)``; row ``i``
        holds the net signal of ``planet_ids[i]``.
    """
    output_shape = (len(planet_ids), 5625)
    # NaN pre-fill: any row that was somehow not written stays recognisable.
    net_signals = np.full(output_shape, np.nan, dtype=np.float32)

    for i, planet_id in tqdm(list(enumerate(planet_ids))):
        frames = pl.read_parquet(f'/kaggle/input/ariel-data-challenge-2025/{dataset}/{planet_id}/AIRS-CH0_signal_0.parquet')
        # Widen to Int32 before the row-wise sum (presumably the stored dtype
        # is narrower and could overflow — confirm against the data files).
        widened = frames.cast(pl.Int32)
        row_totals = widened.sum_horizontal().cast(pl.Float32).to_numpy()
        # 32 * 356 is presumably the pixel count per frame — confirm.
        per_row_mean = row_totals / (32 * 356)
        # Pairwise difference: odd-indexed row minus the preceding even-indexed
        # row (looks like a paired-read subtraction — confirm).
        later_reads = per_row_mean[1::2]
        earlier_reads = per_row_mean[0::2]
        net_signals[i] = later_reads - earlier_reads

    return net_signals

In [4]:
# 🔧 Feature Engineering

def feature_engineering(f_raw, a_raw, n_bins=75):
    """Bin-average both raw signal matrices along axis 1 and join them side by side.

    Each input row is cut into ``n_bins`` equal, contiguous chunks and every
    chunk is replaced by its mean, so each input contributes ``n_bins`` columns.

    Args:
        f_raw: 2-D array ``(n_samples, length_f)``; ``length_f`` must be a
            multiple of ``n_bins``.
        a_raw: 2-D array ``(n_samples, length_a)``; ``length_a`` must be a
            multiple of ``n_bins``.
        n_bins: Number of bins per input signal.

    Returns:
        Array of shape ``(n_samples, 2 * n_bins)`` with the binned ``f_raw``
        columns first, followed by the binned ``a_raw`` columns.

    Raises:
        ValueError: Propagated from ``reshape`` when a row length is not
            divisible by ``n_bins``.
    """
    binned_blocks = [
        # (samples, n_bins, points_per_bin) -> average over points_per_bin
        raw.reshape(raw.shape[0], n_bins, -1).mean(axis=2)
        for raw in (f_raw, a_raw)
    ]
    return np.concatenate(binned_blocks, axis=1)

In [5]:
# 🧪 Load and Preprocess Training Data
planet_ids = train_df.index
# Run both instrument readers over the same planet list (FGS1 first, then
# AIRS-CH0), each producing one row of net signal per planet.
f_raw_train, a_raw_train = (
    reader('train', planet_ids)
    for reader in (f_read_and_preprocess, a_read_and_preprocess)
)

# ➕ Feature Engineering
# Default binning (n_bins=75) is applied to each instrument's signal.
X = feature_engineering(f_raw=f_raw_train, a_raw=a_raw_train)
# Targets: every column of the labels table, in row order matching planet_ids.
y = train_df.values

# 🔄 Normalize Features
# The fitted scaler is kept so the test features can reuse the same statistics.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1100/1100 [13:15<00:00,  1.38it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1100/1100 [13:24<00:00,  1.37it/s]


In [6]:
# 🤖 Model Training
# Hold-out check first: in-sample error alone says nothing about how the model
# generalises, so a throwaway Ridge with the same alpha is fitted on 80% of the
# planets and scored on the remaining 20%. The split is seeded so the reported
# numbers are reproducible.
# NOTE: X_scaled was standardised with statistics from all training rows, so
# the hold-out score is very slightly optimistic.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)
val_model = Ridge(alpha=0.1)
val_model.fit(X_fit, y_fit)
y_val_pred = val_model.predict(X_val)
val_mse = mean_squared_error(y_val, y_val_pred)
val_r2 = r2_score(y_val, y_val_pred)

# Final model: trained on every available planet; this is the object that is
# saved and used for the test predictions.
model = Ridge(alpha=0.1)
model.fit(X_scaled, y)

# 📊 Evaluate Training Performance
# In-sample metrics of the final model, kept for reference next to the
# hold-out metrics.
y_pred = model.predict(X_scaled)
mse = mean_squared_error(y, y_pred)
r2 = r2_score(y, y_pred)
print(f"Train MSE: {mse:.6f} | R2: {r2:.6f}")
print(f"Hold-out MSE: {val_mse:.6f} | R2: {val_r2:.6f}")

Train MSE: 0.000009 | R2: 0.918323


In [7]:
# 💾 Save Artifacts
sigma_pred = 0.01  # fixed uncertainty estimate

# Pickle each artifact to its own file in the working directory, in the order
# model -> scaler -> sigma.
for artifact_path, artifact in (
    ('model.pickle', model),
    ('scaler.pickle', scaler),
    ('sigma_pred.pickle', sigma_pred),
):
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

In [8]:
# 📥 Load Test Data and Generate Features
# This table is read from test_star_info.csv, so it is named test_star_info,
# mirroring train_star_info in the metadata cell. The previous name,
# test_adc_info, wrongly suggested ADC calibration data; it is kept as an alias
# so any code still referring to it keeps working.
test_star_info = pd.read_csv('/kaggle/input/ariel-data-challenge-2025/test_star_info.csv', index_col='planet_id')
test_adc_info = test_star_info  # deprecated alias for the old, misleading name
# The sample submission supplies the list of test planet ids via its index.
sample_submission = pd.read_csv('/kaggle/input/ariel-data-challenge-2025/sample_submission.csv', index_col='planet_id')

f_raw_test = f_read_and_preprocess('test', sample_submission.index)
a_raw_test = a_read_and_preprocess('test', sample_submission.index)

# Same binning and the already-fitted scaler as for the training features
# (transform only — the scaler must not be re-fitted on test data).
X_test = feature_engineering(f_raw_test, a_raw_test)
X_test_scaled = scaler.transform(X_test)

# 🧠 Make Predictions
y_test_pred = model.predict(X_test_scaled)

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:00<00:00,  1.30it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:00<00:00,  1.30it/s]


In [9]:
# 📤 Post-processing and Prepare Submission

def postprocessing(pred_array, index, sigma_pred):
    """Format predictions and their uncertainties as one submission table.

    Predictions are clipped to be non-negative and stored in columns
    ``wl_1 .. wl_N``; the uncertainties follow in columns
    ``wl_1_uncertainty .. wl_N_uncertainty``.

    Args:
        pred_array: 2-D array-like of shape ``(n_rows, n_wavelengths)``.
        index: Row labels for the resulting frame (one per row of ``pred_array``).
        sigma_pred: Uncertainty specification. Either a scalar (applied to
            every cell), or an array-like that broadcasts against
            ``pred_array`` — e.g. one value per wavelength, or a full
            ``(n_rows, n_wavelengths)`` array.

    Returns:
        ``pd.DataFrame`` with ``2 * n_wavelengths`` columns: predictions first,
        then uncertainties.

    Raises:
        ValueError: If a non-scalar ``sigma_pred`` cannot be broadcast to the
            shape of ``pred_array``.
    """
    pred_array = np.asarray(pred_array)
    columns = [f"wl_{i+1}" for i in range(pred_array.shape[1])]
    df_pred = pd.DataFrame(pred_array.clip(0, None), index=index, columns=columns)

    if np.isscalar(sigma_pred):
        # Fill with an explicit float dtype: np.full_like would inherit the
        # dtype of pred_array and silently truncate e.g. 0.01 to 0 whenever
        # the predictions are integer-typed.
        sigma_array = np.full(pred_array.shape, sigma_pred, dtype=np.float64)
    else:
        # Accept per-wavelength vectors as well as full arrays; copy because
        # broadcast_to returns a read-only view.
        sigma_array = np.broadcast_to(np.asarray(sigma_pred), pred_array.shape).copy()

    df_sigma = pd.DataFrame(sigma_array, index=index, columns=[f"{c}_uncertainty" for c in columns])
    return pd.concat([df_pred, df_sigma], axis=1)

# Assemble the submission table (rows follow the sample submission's index),
# write it to the working directory, then preview the first rows.
submission = postprocessing(
    pred_array=y_test_pred,
    index=sample_submission.index,
    sigma_pred=sigma_pred,
)
submission.to_csv('submission.csv')
submission.head()

Unnamed: 0_level_0,wl_1,wl_2,wl_3,wl_4,wl_5,wl_6,wl_7,wl_8,wl_9,wl_10,...,wl_274_uncertainty,wl_275_uncertainty,wl_276_uncertainty,wl_277_uncertainty,wl_278_uncertainty,wl_279_uncertainty,wl_280_uncertainty,wl_281_uncertainty,wl_282_uncertainty,wl_283_uncertainty
planet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1103775,0.016282,0.016137,0.016138,0.016133,0.016124,0.016111,0.016095,0.01608,0.016068,0.016059,...,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01
