In [None]:
import h5py
import pandas as pd
import numpy as np
import os
import random

def explore_planet_structure(filename):
    """Print the layout of the first top-level object in an HDF5 file.

    Opens ``filename`` read-only, takes the first key the file yields and
    prints what is stored beneath it: for each dataset its name, shape and
    dtype (plus the values when it holds at most 10 elements), and for each
    group its name followed by its own contents, indented two spaces per
    nesting level.

    Args:
        filename: Path of the HDF5 file to inspect.

    Returns:
        None. Everything is written to stdout.
    """
    print(f"Exploring structure of {filename}...\n")

    def print_structure(name, obj, indent=0):
        # Describe one object; groups recurse so nested datasets are listed too.
        prefix = "  " * indent
        if isinstance(obj, h5py.Dataset):
            print(f"{prefix}Dataset: {name}")
            print(f"{prefix}  Shape: {obj.shape} | dtype: {obj.dtype}")
            if obj.size <= 10:
                print(f"{prefix}  Values: {np.array(obj)}")
        elif isinstance(obj, h5py.Group):
            print(f"{prefix}Group: {name}/")
            for child_name in obj.keys():
                print_structure(child_name, obj[child_name], indent + 1)

    with h5py.File(filename, 'r') as f:
        # Only the first key is needed, so avoid building the full key list.
        sample_key = next(iter(f.keys()), None)
        if sample_key is None:
            # An empty file used to surface as an IndexError here.
            print("File contains no top-level objects; nothing to explore.\n")
            return

        print(f"Sample planet group: {sample_key}")
        print(f"Contents of {sample_key}:\n")

        sample = f[sample_key]
        if isinstance(sample, h5py.Group):
            for key in sample.keys():
                print_structure(key, sample[key])
        else:
            # The first object is not a group (e.g. a bare dataset):
            # describe it directly instead of asking it for child keys.
            print_structure(sample_key, sample)
        print()

def extract_planet_data(planet_group):
    """
    Collect every dataset found under a planet group into one flat dictionary.

    Datasets stored directly in ``planet_group`` are keyed by their own name;
    datasets inside nested groups are keyed by their slash-joined path relative
    to ``planet_group`` (for example ``"sub/name"``). Each value is a NumPy
    array: one-dimensional data is kept as read, any other shape is flattened
    to one dimension. Objects that are neither datasets nor groups are skipped.
    """
    def iter_datasets(node, path):
        # Depth-first walk in key order, yielding (flat_key, 1-D array) pairs.
        for name in node.keys():
            child = node[name]
            label = name if not path else f"{path}{name}"

            if isinstance(child, h5py.Dataset):
                values = np.array(child)
                yield label, (values if values.ndim == 1 else values.flatten())
            elif isinstance(child, h5py.Group):
                yield from iter_datasets(child, f"{label}/")

    return dict(iter_datasets(planet_group, ''))

def _build_planet_frame(planet_data):
    """Turn one planet's extracted values into a DataFrame with equal-length columns."""
    max_len = max(len(v) if isinstance(v, np.ndarray) else 1
                  for v in planet_data.values())

    columns = {}
    for key, value in planet_data.items():
        if not isinstance(value, np.ndarray):
            # Plain scalar: repeat it on every row.
            columns[key] = [value] * max_len
        elif len(value) == 1:
            # Single-element array: broadcast its value to every row.
            columns[key] = [value[0]] * max_len
        elif len(value) == max_len:
            columns[key] = value
        else:
            # Shorter than the longest column: pad the tail with NaN so the
            # DataFrame constructor does not reject the planet for having
            # arrays of different lengths.
            columns[key] = pd.Series(value).reindex(range(max_len)).to_numpy()

    return pd.DataFrame(columns)


def hdf5_random_planets_to_individual_csv(hdf5_filename, n_planets=20000, output_dir='planet_data_random'):
    """
    Extract random planets from HDF5 and save each planet as a separate CSV file.

    The top-level keys of the file are sorted, ``n_planets`` of them are drawn
    with ``random.sample`` (seed the ``random`` module beforehand for a
    reproducible pick), and each selected group is written to
    ``<output_dir>/<key>_data.csv``. Within one CSV, single-element values are
    repeated on every row and columns shorter than the longest one are padded
    with NaN. Progress is reported every 1000 planets and a summary is printed
    at the end.

    Args:
        hdf5_filename: Path of the HDF5 file to read.
        n_planets: Number of planets to draw. Reduced to the number available
            (with a warning) when the file holds fewer.
        output_dir: Directory that receives the CSV files; created if missing.

    Returns:
        None. A planet that cannot be processed is reported and counted as
        failed; any other error inside the run is printed with its traceback
        rather than raised.
    """
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    try:
        with h5py.File(hdf5_filename, 'r') as f:
            all_planet_keys = sorted(f.keys())
            total_planets = len(all_planet_keys)

            print(f"Total planets available: {total_planets}")
            print(f"Requesting {n_planets} random planets")
            print()

            # Randomly select n_planets
            if n_planets > total_planets:
                print(f"Warning: Requested {n_planets} but only {total_planets} available")
                print(f"Extracting all {total_planets} planets instead\n")
                n_planets = total_planets

            random_indices = random.sample(range(total_planets), n_planets)
            random_indices.sort()  # Visit the selected keys in sorted order
            selected_keys = [all_planet_keys[i] for i in random_indices]

            print(f"Extracting {len(selected_keys)} randomly selected planets...")
            print(f"Output directory: {output_dir}/\n")

            # First, explore structure (nothing to show for an empty file)
            if total_planets:
                explore_planet_structure(hdf5_filename)

            # Extract each planet individually
            successful_extractions = 0
            failed_extractions = 0

            for processed, planet_key in enumerate(selected_keys, start=1):
                try:
                    planet_data = extract_planet_data(f[planet_key])

                    if planet_data:
                        csv_filename = os.path.join(output_dir, f'{planet_key}_data.csv')
                        _build_planet_frame(planet_data).to_csv(csv_filename, index=False)
                        successful_extractions += 1
                    else:
                        print(f"    Warning: No data found for {planet_key}")
                        failed_extractions += 1

                except Exception as e:
                    # Best effort: one bad planet must not stop the whole run.
                    print(f"    Error processing {planet_key}: {e}")
                    failed_extractions += 1

                # Report after the planet is handled so the counters add up
                # to the "Processed" figure.
                if processed % 1000 == 0:
                    print(f"  Processed {processed}/{len(selected_keys)} planets...")
                    print(f"    Successfully saved: {successful_extractions}")
                    print(f"    Failed: {failed_extractions}\n")

            print("\n" + "=" * 70)
            print("Extraction Complete!")
            print("=" * 70)
            print(f"Successfully saved: {successful_extractions} planet CSV files")
            print(f"Failed: {failed_extractions} planets")
            print(f"Output directory: {output_dir}/")
            print(f"Total size of directory: {get_directory_size(output_dir) / (1024**3):.2f} GB")

    except Exception as e:
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()

def get_directory_size(directory):
    """Return the combined size in bytes of every file beneath ``directory``.

    The tree is walked recursively; entries whose path no longer exists when
    checked are left out of the total.
    """
    return sum(
        os.path.getsize(path)
        for root, _subdirs, names in os.walk(directory)
        for path in (os.path.join(root, name) for name in names)
        if os.path.exists(path)
    )

# Main execution
if __name__ == "__main__":
    # Seed both generators up front so every run picks the same planets
    # (remove these two calls to get a different selection each time).
    np.random.seed(42)
    random.seed(42)

    # Run configuration
    output_directory = 'planet_data_random'
    n_planets_to_extract = 20000
    hdf5_file = 'SpectralData.hdf5'

    # Banner: rule, title, rule followed by a blank line
    print(
        "=" * 70,
        "ABC Dataset HDF5 to CSV Converter - Individual Planet Files",
        "=" * 70 + "\n",
        sep="\n",
    )

    # Write one CSV file per selected planet
    hdf5_random_planets_to_individual_csv(
        hdf5_file,
        output_dir=output_directory,
        n_planets=n_planets_to_extract,
    )