In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
import gc
import re

def _discover_planet_files(data_path):
    """Return ``(planet_id, path)`` pairs for the spectral files in *data_path*, sorted by ID."""
    name_pattern = re.compile(r'Planet_(\d+)_data\.csv')
    discovered = []
    for file_path in data_path.glob("Planet_*_data.csv"):
        match = name_pattern.fullmatch(file_path.name)
        if match:
            # Keep the real path next to the parsed ID: rebuilding the name
            # from the integer would lose zero padding ("Planet_007_data.csv").
            discovered.append((int(match.group(1)), file_path))
    # Secondary key on the file name keeps the order deterministic when two
    # differently padded names map to the same numeric ID.
    discovered.sort(key=lambda pair: (pair[0], pair[1].name))
    return discovered


def _read_sorted_spectrum(spectral_file, n_points):
    """Load one spectral CSV and return its spectrum values ordered by ascending wavelength."""
    spectrum_df = pd.read_csv(spectral_file)

    required_cols = ('instrument_wlgrid', 'instrument_spectrum')
    if any(col not in spectrum_df.columns for col in required_cols):
        raise ValueError(f"Missing required columns in {spectral_file}")

    # CRUCIAL STEP: order by wavelength so feature k means the same thing for
    # every planet. A stable sort keeps tied wavelengths in file order.
    ordered = spectrum_df.sort_values(
        by='instrument_wlgrid', ascending=True, kind='mergesort'
    )
    values = ordered['instrument_spectrum'].tolist()

    if len(values) != n_points:
        raise ValueError(f"Expected {n_points} spectrum values, got {len(values)}")
    return values


def _rows_to_frame(planet_ids, rows, feature_names):
    """Build a DataFrame with ``planet_ID`` followed by one column per spectral feature."""
    frame = pd.DataFrame(rows, columns=feature_names)
    frame.insert(0, 'planet_ID', planet_ids)
    return frame


def flatten_exoplanet_spectra(
    data_dir="planet_data_random",
    target_file="FM_Parameter_Table.csv",
    output_file="ABC_Feature_Matrix.csv",
    batch_size=500,
    n_spectral_points=52,
):
    """
    Flatten and vectorize exoplanet spectral data for randomly selected planets.

    Every ``Planet_<id>_data.csv`` file in ``data_dir`` is read, sorted by
    ``instrument_wlgrid`` and turned into one row of ``spec_0 .. spec_{n-1}``
    features. The rows are inner-joined with the target table on ``planet_ID``
    and the result is written to ``output_file``.

    Files that cannot be read or fail validation are reported and skipped.

    Parameters:
    -----------
    data_dir : str
        Directory containing the spectral CSV files
    target_file : str
        Path to the target parameter file (must contain a ``planet_ID`` column)
    output_file : str
        Path for the output feature matrix
    batch_size : int
        Number of planets to accumulate before they are consolidated into one
        DataFrame chunk. Values below 1 (or None) mean a single chunk.
    n_spectral_points : int
        Number of spectrum values every file must contain (default 52)

    Returns:
    --------
    pandas.DataFrame
        The merged feature matrix that was written to ``output_file``

    Raises:
    -------
    KeyError
        If the target table has no ``planet_ID`` column
    ValueError
        If no spectral file is found, or none could be processed
    """

    # Load the target parameter table
    print("Loading target parameter table...")
    target_df = pd.read_csv(target_file)
    if 'planet_ID' not in target_df.columns:
        # Fail before the expensive file scan instead of inside merge().
        raise KeyError(f"'planet_ID' column not found in {target_file}")
    print(f"Target file loaded: {target_df.shape}")

    # Extract planet IDs from filenames in the data directory
    print("Extracting planet IDs from filenames...")
    planet_files = _discover_planet_files(Path(data_dir))
    num_planets = len(planet_files)
    print(f"Found {num_planets} planet files to process")
    if not planet_files:
        raise ValueError(f"No Planet_<id>_data.csv files found in {data_dir!r}")
    print(f"Planet ID range: {planet_files[0][0]} to {planet_files[-1][0]}")

    feature_names = [f'spec_{k}' for k in range(n_spectral_points)]
    chunk_size = batch_size if batch_size and batch_size > 0 else num_planets

    batch_list = []      # consolidated DataFrame chunks
    pending_ids = []     # planet IDs waiting to be consolidated
    pending_rows = []    # matching spectrum rows
    skipped = 0

    print(f"Processing {num_planets} spectral files...")
    for position, (planet_id, spectral_file) in enumerate(planet_files, start=1):
        try:
            row = _read_sorted_spectrum(spectral_file, n_spectral_points)
        except FileNotFoundError:
            print(f"WARNING: File not found: {spectral_file}")
            skipped += 1
        except Exception as e:
            # Deliberate best-effort: one bad file must not abort the run.
            print(f"ERROR processing planet {planet_id}: {e}")
            skipped += 1
        else:
            pending_ids.append(planet_id)
            pending_rows.append(row)
            if len(pending_rows) >= chunk_size:
                batch_list.append(
                    _rows_to_frame(pending_ids, pending_rows, feature_names)
                )
                pending_ids, pending_rows = [], []

        # Print progress
        if position % 1000 == 0:
            print(f"  Processed {position} / {num_planets} planets")

    if pending_rows:
        batch_list.append(_rows_to_frame(pending_ids, pending_rows, feature_names))
    if skipped:
        print(f"Skipped {skipped} of {num_planets} files")
    if not batch_list:
        raise ValueError(
            f"None of the {num_planets} spectral files in {data_dir!r} could be processed"
        )

    # Concatenate all batches into a single DataFrame
    print("Concatenating all spectral features...")
    spectral_matrix = pd.concat(batch_list, ignore_index=True)
    print(f"Spectral matrix shape: {spectral_matrix.shape}")
    del batch_list

    # Merge with target parameters using planet_ID as key
    print("Merging with target parameters...")
    unmatched = int((~spectral_matrix['planet_ID'].isin(target_df['planet_ID'])).sum())
    if unmatched:
        # The inner join drops these rows; report it rather than lose them silently.
        print(f"WARNING: {unmatched} planets have no entry in the target table")
    final_matrix = spectral_matrix.merge(
        target_df,
        on='planet_ID',
        how='inner'
    )
    print(f"Final merged matrix shape: {final_matrix.shape}")

    # Save to output file
    print(f"Saving final feature matrix to {output_file}...")
    final_matrix.to_csv(output_file, index=False)
    print(f"SUCCESS: Feature matrix saved to {output_file}")
    print(f"Final dimensions: {final_matrix.shape[0]} rows × {final_matrix.shape[1]} columns")

    return final_matrix


if __name__ == "__main__":
    # Script entry point: run the pipeline against the randomly sampled planet
    # files and write the merged matrix to a dedicated output file.
    pipeline_options = {
        "data_dir": "planet_data_random",               # folder holding the Planet_*.csv spectra
        "target_file": "FM_Parameter_Table.csv",        # table of target parameters
        "output_file": "ABC_Feature_Matrix_Random.csv", # destination of the feature matrix
        "batch_size": 500,                              # batch size passed to the pipeline
    }
    result = flatten_exoplanet_spectra(**pipeline_options)

Loading target parameter table...
Target file loaded: (91392, 7)
Extracting planet IDs from filenames...
Found 20000 planet files to process
Planet ID range: 1 to 91389
Processing 20000 spectral files...
  Processed 1000 / 20000 planets
  Processed 2000 / 20000 planets
  Processed 3000 / 20000 planets
  Processed 4000 / 20000 planets
  Processed 5000 / 20000 planets
  Processed 6000 / 20000 planets
  Processed 7000 / 20000 planets
  Processed 8000 / 20000 planets
  Processed 9000 / 20000 planets
  Processed 10000 / 20000 planets
  Processed 11000 / 20000 planets
  Processed 12000 / 20000 planets
  Processed 13000 / 20000 planets
  Processed 14000 / 20000 planets
  Processed 15000 / 20000 planets
  Processed 16000 / 20000 planets
  Processed 17000 / 20000 planets
  Processed 18000 / 20000 planets
  Processed 19000 / 20000 planets
  Processed 20000 / 20000 planets
Concatenating all spectral features...
Spectral matrix shape: (20000, 53)
Merging with target parameters...
Final merged matr