In [1]:
import numpy as np
import pandas as pd
import os

# Relative path to the folder containing the competition data files.
# The loading cells below build each file path by joining a file name onto it
# with os.path.join, so it must point at the directory itself.
path_folder = "../dataset/ariel-data-challenge-2024/"

## axis_info


From Kaggle [discussion](https://www.kaggle.com/competitions/ariel-data-challenge-2024/discussion/540555):

The AIRS_CH0 integration time indeed alternates between 0.1 and 4.5, as it stands for the sequence of exposures (ultra short, long, ultra short, etc.). It is used in the calibration notebook.

In [2]:
# Read the axis metadata table from the data folder.
axis_info = pd.read_parquet(os.path.join(path_folder, "axis_info.parquet"))

print(axis_info.columns)


def _axis_column(column_name):
    """Return the non-null entries of one axis_info column as a NumPy array."""
    return np.array(axis_info[column_name].dropna())


# Index bounds used to crop the AIRS-CH0 wavelength axis.
cut_inf = 39
cut_sup = 321

# The columns hold different numbers of valid entries (see the printed shapes),
# so the missing values are dropped from each column independently.
AIRS_CH0_axis0_h = _axis_column("AIRS-CH0-axis0-h")
print("AIRS_CH0_axis0_h: ", AIRS_CH0_axis0_h.shape)

AIRS_CH0_axis2_um = _axis_column("AIRS-CH0-axis2-um")
AIRS_CH0_axis2_um_cut = AIRS_CH0_axis2_um[cut_inf:cut_sup]
print("AIRS_CH0_axis2_um: ", AIRS_CH0_axis2_um.shape, "->" , AIRS_CH0_axis2_um_cut.shape)

AIRS_CH0_integration_time = _axis_column("AIRS-CH0-integration_time")
print("AIRS_CH0_integration_time: ", AIRS_CH0_integration_time.shape)

FGS1_axis0_h = _axis_column("FGS1-axis0-h")
print("FGS1_axis0_h: ", FGS1_axis0_h.shape)

Index(['AIRS-CH0-axis0-h', 'AIRS-CH0-axis2-um', 'AIRS-CH0-integration_time',
       'FGS1-axis0-h'],
      dtype='object')
AIRS_CH0_axis0_h:  (11250,)
AIRS_CH0_axis2_um:  (356,) -> (282,)
AIRS_CH0_integration_time:  (11250,)
FGS1_axis0_h:  (135000,)


In [3]:
# Gaps between consecutive AIRS-CH0 time stamps. The axis is presumably in
# hours (column suffix "-h"), which is why the means are multiplied by 3600
# to report seconds.
time_gaps = np.diff(AIRS_CH0_axis0_h)

# Even-indexed gaps and odd-indexed gaps are summarised separately.
diff_within_exposure = time_gaps[::2]
diff_between_exposure = time_gaps[1::2]

for gap_label, gap_values in (
    ("diff_within_exposure", diff_within_exposure),
    ("diff_between_exposure", diff_between_exposure),
):
    print(gap_label)
    mean_diff = np.mean(gap_values)
    std_diff = np.std(gap_values)
    print("mean_diff: ", mean_diff, "std_diff: ", std_diff)
    print("mean (unit in seconds): ", mean_diff * 3600)

diff_within_exposure
mean_diff:  2.777777777777837e-05 std_diff:  4.019715289383054e-16
mean (unit in seconds):  0.10000000000000213
diff_between_exposure
mean_diff:  0.001305555555555555 std_diff:  3.542347252760878e-16
mean (unit in seconds):  4.699999999999998


## sample_submission

total 567 columns: (planet_id * 1) + (wl_id * 283) + (sigma_id * 283)

Note: sigma_id looks like the (un)certainty we hold about the model's output.

In [4]:
# Read the sample submission file and show its dimensions and column labels.
sample_submission_csv = os.path.join(path_folder, "sample_submission.csv")
sample_submission = pd.read_csv(sample_submission_csv)

print(sample_submission.shape)
print(sample_submission.columns)

(1, 567)
Index(['planet_id', 'wl_1', 'wl_2', 'wl_3', 'wl_4', 'wl_5', 'wl_6', 'wl_7',
       'wl_8', 'wl_9',
       ...
       'sigma_274', 'sigma_275', 'sigma_276', 'sigma_277', 'sigma_278',
       'sigma_279', 'sigma_280', 'sigma_281', 'sigma_282', 'sigma_283'],
      dtype='object', length=567)


## train_adc_info/test_adc_info

Used to reverse the ADC process.

In [5]:
# Read the ADC information tables for the train and test splits.
train_adc_csv = os.path.join(path_folder, "train_adc_info.csv")
test_adc_csv = os.path.join(path_folder, "test_adc_info.csv")
train_adc_info = pd.read_csv(train_adc_csv)
test_adc_info = pd.read_csv(test_adc_csv)

# Train split: dimensions and column labels.
print("train_adc_info")
print(train_adc_info.shape)
print(train_adc_info.columns)

# Test split: dimensions only.
print("\n\ntest_adc_info")
print(test_adc_info.shape)

train_adc_info
(673, 6)
Index(['planet_id', 'FGS1_adc_offset', 'FGS1_adc_gain', 'AIRS-CH0_adc_offset',
       'AIRS-CH0_adc_gain', 'star'],
      dtype='object')


test_adc_info
(1, 6)


## train_labels

total 284 columns: (planet_id * 1) + (wl_id * 283)

In [6]:
# Read the training labels and show their dimensions and column labels.
train_labels_csv = os.path.join(path_folder, "train_labels.csv")
train_labels = pd.read_csv(train_labels_csv)

print(train_labels.shape)
print(train_labels.columns)

(673, 284)
Index(['planet_id', 'wl_1', 'wl_2', 'wl_3', 'wl_4', 'wl_5', 'wl_6', 'wl_7',
       'wl_8', 'wl_9',
       ...
       'wl_274', 'wl_275', 'wl_276', 'wl_277', 'wl_278', 'wl_279', 'wl_280',
       'wl_281', 'wl_282', 'wl_283'],
      dtype='object', length=284)


## Wavelengths

![Wavelengths](../img/wavelengths.png "Wavelengths")

In [7]:
# wavelengths.csv has 283 columns in total:
#   - wl_1 = 0.705 um represents the FGS1 wavelength
#   - wl_2 - wl_283 (1.9 - 3.9 um) represent the spectral range of the AIRS data
wavelengths_csv = os.path.join(path_folder, "wavelengths.csv")
wavelengths = pd.read_csv(wavelengths_csv)

## AIRS-CH0_signal

Original data shape: (training examples, time steps, wavelengths, spatial)

The shape of each image is wavelengths $\times$ spatial ($356 \times 32$), where the x-axis represents the frequency dimension (spectrum) and the y-axis represents the spatial dimension of the detector.

## FGS1_signal

Original data shape: (training examples, time steps, spatial, spatial)

The shape of each image is spatial $\times$ spatial ($32 \times 32$).