In [5]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

# Locations of the raw inputs and of the generated artefacts, relative to the notebook.
DATA_PATH = '../data/'
OUTPUT_PATH = '../'

# Training log: one row per object; later cells read its 'object_id' and 'split' columns.
train_log_path = os.path.join(DATA_PATH, 'train_log.csv')
train_log_df = pd.read_csv(train_log_path)
print("Training log loaded successfully")

Training log loaded successfully


In [6]:
# Maps a lightcurve CSV path to the DataFrame parsed from it, so each file is read once.
dataframe_cache = {}
def load_lightcurve(object_id: str, split: str) -> pd.DataFrame:
    """Return the lightcurve rows that belong to one object.

    The file ``<DATA_PATH>/<split>/train_full_lightcurves.csv`` is parsed on
    first use and kept in ``dataframe_cache``; later calls for the same split
    reuse the cached table.

    Args:
        object_id: Value to match against the table's 'object_id' column.
        split: Sub-directory of ``DATA_PATH`` that holds the lightcurve file.

    Returns:
        A copy of the matching rows, or an empty DataFrame (no columns) when
        the file for the split does not exist.
    """
    csv_path = os.path.join(DATA_PATH, split, "train_full_lightcurves.csv")

    split_table = dataframe_cache.get(csv_path)
    if split_table is None:
        try:
            split_table = pd.read_csv(csv_path)
        except FileNotFoundError:
            # Best effort: a missing split yields an empty frame, not an error.
            return pd.DataFrame()
        dataframe_cache[csv_path] = split_table

    belongs_to_object = split_table['object_id'] == object_id
    # Copy so callers can modify the result without touching the cached table.
    return split_table[belongs_to_object].copy()

def engineer_features(df: pd.DataFrame, object_id: str) -> dict:
    """Summarise one object's lightcurve as a flat dictionary of features.

    Args:
        df: Observations for a single object. Must provide the columns
            'Flux', 'Flux_err' and 'Filter'. The frame is not modified.
        object_id: Identifier stored under the 'object_id' key of the result.

    Returns:
        An empty dict when ``df`` is empty. Otherwise a dict holding, in
        order: ten whole-lightcurve statistics of 'Flux', 'Flux_err' and
        their ratio; mean/std/max/min of 'Flux' for each of the filters
        u, g, r, i, z, y (NaN where a filter has no observations); and
        'object_id'.
    """
    if df.empty:
        return {}

    flux = df['Flux']
    flux_err = df['Flux_err']

    # Flux divided by its error. A zero error would give +/-inf, which turns
    # the ratio's mean into inf and its skew into NaN, so those rows are
    # treated as missing instead.
    flux_ratio = flux / flux_err.where(flux_err != 0)

    agg_features = {
        'Flux_mean': flux.mean(),
        'Flux_std': flux.std(),
        'Flux_min': flux.min(),
        'Flux_max': flux.max(),
        'Flux_median': flux.median(),
        'Flux_skew': flux.skew(),
        'Flux_err_mean': flux_err.mean(),
        'Flux_err_std': flux_err.std(),
        'Flux_Ratio_mean': flux_ratio.mean(),
        'Flux_Ratio_skew': flux_ratio.skew(),
    }

    # Per-filter statistics, grouped directly on the filter label. Pivoting on
    # (time, filter) would first average observations that share a timestamp
    # (pivot_table defaults to aggfunc='mean'), distorting std/max/min.
    # reindex() adds an all-NaN row for every filter that has no observations.
    per_filter_stats = (
        df.groupby('Filter')['Flux']
        .agg(['mean', 'std', 'max', 'min'])
        .reindex(['u', 'g', 'r', 'i', 'z', 'y'])
    )
    for f, stats in per_filter_stats.iterrows():
        for stat_name in ('mean', 'std', 'max', 'min'):
            agg_features[f'{f}_Flux_{stat_name}'] = stats[stat_name]

    agg_features['object_id'] = object_id

    return agg_features

In [7]:
print("Starting feature engineering process for all training objects...")

# Walk the training log one object at a time. Objects whose lightcurve yields
# no features (an empty dict) are left out of the feature table.
all_features = []
id_split_pairs = zip(train_log_df['object_id'], train_log_df['split'])
for obj_id, split_name in tqdm(id_split_pairs, total=len(train_log_df)):
    lightcurve = load_lightcurve(obj_id, split_name)
    features = engineer_features(lightcurve, obj_id)
    if features:
        all_features.append(features)

train_features_df = pd.DataFrame(all_features)

# Left join: every training-log row is kept, and objects without features get
# NaN in the feature columns.
log_columns_df = train_log_df.drop(columns=['Z_err', 'SpecType'])
final_train_df = log_columns_df.merge(
    train_features_df,
    on='object_id',
    how='left',
)

Starting feature engineering process for all training objects...


100%|██████████| 3043/3043 [00:11<00:00, 272.69it/s]


In [8]:
# Write the merged feature table to disk, then report what was saved.
output_file_path = os.path.join(OUTPUT_PATH, 'train_features.csv')
final_train_df.to_csv(output_file_path, index=False)

for message in (
    "Feature Engineering Complete",
    f"New feature dataframe shape: {final_train_df.shape}",
    f"Saved features to: {output_file_path}",
    "\n--- New Training DataFrame Head ---",
):
    print(message)
print(final_train_df.head())

print("\n --- New Training DataFrame Info ---")
final_train_df.info()

Feature Engineering Complete
New feature dataframe shape: (3043, 40)
Saved features to: ../train_features.csv

--- New Training DataFrame Head ---
                  object_id       Z    EBV  \
0  Dornhoth_fervain_onodrim  3.0490  0.110   
1       Dornhoth_galadh_ylf  0.4324  0.058   
2      Elrim_melethril_thul  0.4673  0.577   
3        Ithil_tobas_rodwen  0.6946  0.012   
4       Mirion_adar_Druadan  0.4161  0.058   

                               English Translation     split  target  \
0  Trawn Folk (Dwarfs) + northern + Ents (people)   split_01       0   
1    Trawn Folk (Dwarfs) + tree + drinking vessel   split_01       0   
2                  Elves +  lover (fem.)  + breath  split_01       0   
3                    moon +  roof  +  noble maiden  split_01       0   
4            jewel, Silmaril  + father + Wild Man   split_01       0   

   Flux_mean  Flux_std  Flux_min   Flux_max  ...  i_Flux_max  i_Flux_min  \
0   0.928483  4.803445 -2.756285  25.047343  ...   22.951323   -2.2

I have now transformed the complex, variable-length time-series data into a clean, fixed-size tabular format (shape `(3043, 40)`). Each row represents one astronomical object, and the columns are the engineered features. This format is ideal for ML models.

Now I can start implementing the base model.