In [None]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# reloaded before each cell executes (edits to local packages are picked up
# without restarting the kernel).
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import os
import multiprocessing
import concurrent

from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import RidgeCV
from sklearn.metrics import r2_score, mean_squared_error

import sklearn.model_selection
import itertools
import sklearn.linear_model
import sklearn.metrics

import mlframework.plotting

In [None]:
FOLDER_COMPETITION = os.environ["PATH_EFOLDER"] + "ariel-data-challenge-2024/"
# FOLDER_COMPETITION = "/kaggle/input/"
!ls $FOLDER_COMPETITION

In [None]:
# NOTE(review): absolute path into one user's home directory; this listing only
# works on that machine. Consider deriving it from a configurable data folder.
!ls /home/kristian/Projects/mlframework/data/ariel

In [None]:
# Load the four competition CSV tables, each indexed by planet_id.
# The files are read in the order the target names are listed.
train_adc_info, train_labels, test_adc_info, sample_submission = (
    pd.read_csv(f"{FOLDER_COMPETITION}{stem}.csv", index_col="planet_id")
    for stem in (
        "train_adc_info",
        "train_labels",
        "test_adc_info",
        "sample_submission",
    )
)

In [None]:
# Display the ADC metadata table loaded from test_adc_info.csv.
test_adc_info

In [None]:
# Density histogram of every value in train_labels, flattened across all columns.
fig, ax = plt.subplots(figsize=(6, 2))
ax.hist(train_labels.values.ravel(), bins=20, density=True, color='olive')
ax.set_title("Histogram of the planets' sizes (regression targets)", fontsize=18)
ax.set_xlabel(r"Planet's size $(\frac{r}{R})^2$", fontsize=14)
ax.set_ylabel('Density', fontsize=14)
ax.set_xlim(0, 0.008)
plt.show()

In [None]:
# Read the FGS1 signal file of one training planet and display it.
planet_id = 14485303
f_signal = pd.read_parquet(f"{FOLDER_COMPETITION}train/{planet_id}/FGS1_signal.parquet")
f_signal

In [None]:
# Per-row mean over all columns of the signal table, then the difference
# between each odd-indexed row and the even-indexed row before it.
mean_signal = f_signal.values.mean(axis=1)
net_signal = mean_signal[1::2] - mean_signal[0::2]

# Moving average of width `window`, computed as a difference of cumulative sums.
window = 800
cum_signal = net_signal.cumsum()
smooth_signal = (cum_signal[window:] - cum_signal[:-window]) / window

# Two stacked panels sharing the x axis: raw on top, smoothed below.
_, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True)
ax1.plot(net_signal, label='raw net signal')
ax2.plot(smooth_signal, color='c', label='smoothened net signal')
ax2.set_xlabel('time')
ax1.legend()
ax2.legend()
plt.suptitle('FGS1 light curve', y=0.96)

plt.show()

In [None]:
# Notebook-level state; functions defined in later cells may read these names.
dataset = 'train'
adc_info = train_adc_info
planet_ids = adc_info.index

# NOTE(review): `i` is a fixed row position in adc_info, while `planet_id` was
# set in an earlier cell — confirm that row 1 really belongs to that planet.
i = 1
gain = adc_info["FGS1_adc_gain"].values[i]

f_signal = pd.read_parquet(f"{FOLDER_COMPETITION}{dataset}/{planet_id}/FGS1_signal.parquet")
mean_signal = f_signal.values.mean(axis=1)  # mean over the 32*32 pixels
net_signal = mean_signal[1::2] - mean_signal[0::2]

In [None]:
def get_phase(i, planet_id, n_steps, dataset=None, gain=None):
    """Return the gain-scaled mean FGS1 net signal of one planet per time chunk.

    Parameters
    i: row position of the planet in the ADC metadata; only used to look up the
        gain in the notebook-level ``adc_info`` when ``gain`` is not given
    planet_id: planet whose ``FGS1_signal.parquet`` file is read
    n_steps: number of net-signal samples averaged into one chunk (must be > 0)
    dataset: sub-folder of FOLDER_COMPETITION to read from; when None the
        notebook-level ``dataset`` variable is used
    gain: factor every chunk mean is multiplied with; when None it is taken
        from ``adc_info.FGS1_adc_gain`` at row ``i``

    Returns
    list with one value per chunk; the last chunk may cover fewer than
    n_steps samples

    """
    if n_steps <= 0:
        raise ValueError(f"n_steps must be positive, got {n_steps}")
    # Fall back to the notebook globals so existing three-argument calls keep working.
    if dataset is None:
        dataset = globals()["dataset"]
    if gain is None:
        gain = globals()["adc_info"].FGS1_adc_gain.values[i]

    f_signal = pd.read_parquet(FOLDER_COMPETITION + f'{dataset}/{planet_id}/FGS1_signal.parquet')
    mean_signal = f_signal.values.mean(axis=1)  # mean over the 32*32 pixels
    net_signal = mean_signal[1::2] - mean_signal[0::2]

    # Ceiling division: a plain "// n_steps + 1" adds an empty final chunk (whose
    # mean is NaN) whenever the signal length is an exact multiple of n_steps.
    n_chunks = -(-len(net_signal) // n_steps)
    return [
        net_signal[k * n_steps:(k + 1) * n_steps].mean() * gain
        for k in range(n_chunks)
    ]


def f_read_and_preprocess(dataset, adc_info, n_steps=8000):
    """Read the FGS1 files for all planet_ids and extract the signal.

    The per-planet work is done by get_phase in a pool of worker processes.
    The phases of the first planet are plotted as a quick visual check and the
    resulting table is also written to a CSV file in the working directory.

    Parameters
    dataset: 'train' or 'test'
    adc_info: metadata dataframe, either train_adc_info or test_adc_info
    n_steps: number of net-signal samples averaged into one phase column

    Returns
    dataframe with one row per planet_id (rows in adc_info order, positional
    index) and one ``phase_<k>`` column per chunk

    Raises
    ValueError: if adc_info contains no rows

    """
    planet_ids = adc_info.index

    ids = planet_ids#[:40]
    indices = range(len(ids))
    if len(ids) == 0:
        raise ValueError("adc_info contains no planets to process")
    print(ids)

    # Hand every worker the dataset name and the planet's own gain explicitly,
    # so the arguments of this function are honoured instead of notebook globals.
    gains = adc_info.FGS1_adc_gain.values[:len(ids)]
    tasks = zip(
        indices,
        ids,
        itertools.repeat(n_steps),
        itertools.repeat(dataset),
        gains,
    )
    # Leave two cores free, but always start at least one worker.
    n_workers = max(1, multiprocessing.cpu_count() - 2)
    with multiprocessing.Pool(processes=n_workers) as pool:
        # starmap blocks until all results are in, i.e. before the pool is
        # terminated on leaving the with-block.
        phases = pool.starmap(get_phase, tasks)

    # Preview of the first planet's phases.
    plt.figure()
    plt.plot(phases[0])
    plt.show()

    df = pd.DataFrame(
                phases,
                columns=[f"phase_{i}" for i in range(len(phases[0]))],
                index=indices
    )
    filepath = f"phases_step{n_steps}_nids{len(indices)}.csv"
    print(f"... saving {filepath}")
    df.to_csv(filepath)
    return df


if __name__ == "__main__":
    # Largest chunk size first; after the loop `train` holds the n_steps=100 table.
    for n_steps in (8000, 2000, 1000, 400, 100):
        train = f_read_and_preprocess('train', train_adc_info, n_steps=n_steps)

In [None]:
# List the working directory; the preprocessing cell writes its
# phases_step*_nids*.csv files there via a relative path.
!ls

In [None]:
import mlframework.plotting.axes_utils
import mlframework.plotting.utils_plotting

# One row of axes with one column per sample, built by the project helper
# create_axes_grid. NOTE(review): `axes.plot()` is called without data, so
# nothing is drawn yet — this looks like an unfinished placeholder.
samples = range(20)
figure, axes_grid, axes_colorbar = mlframework.plotting.utils_plotting.create_axes_grid(1, len(samples))
for n_samples in range(len(samples)):
    sample = samples[n_samples]
    axes = axes_grid[0, n_samples]
    axes.plot()

In [None]:
# train_labels.to_numpy()[:,0]

In [None]:
# NOTE(review): in the cells visible here, `oof_pred` is first assigned by the
# cross_val_predict cell further down, so on a fresh kernel run top-to-bottom
# this cell raises NameError. Move it below that cell.
oof_pred.shape

In [None]:
# oof_pred[:,6]

In [None]:
# Out-of-fold ridge predictions for the planets that have phase features.
model = RidgeCV()
train_labels_sel = train_labels.iloc[:train.shape[0]]  # positional match to the rows of `train`
oof_pred = cross_val_predict(model, train, train_labels_sel)

print(f"# R2 score: {r2_score(train_labels_sel, oof_pred):.3f}")
# RMSE per target column, averaged over the columns. Written without the
# `squared=False` argument, which newer scikit-learn releases deprecated and
# then removed, so this runs on old and new versions alike.
per_target_mse = mean_squared_error(train_labels_sel, oof_pred, multioutput="raw_values")
sigma_pred = (per_target_mse ** 0.5).mean()
print(f"# Root mean squared error: {sigma_pred:.6f}")

# Predicted vs. true values for one target column, coloured by the `star` column.
col = 1
plt.scatter(oof_pred[:,col], train_labels_sel.iloc[:,col], s=15, c=train_adc_info.iloc[:train.shape[0]]["star"])
plt.gca().set_aspect('equal')
plt.xlabel('y_pred')
plt.ylabel('y_true')
plt.title('Comparing y_true and y_pred')
plt.show()