In [1]:
import os
import h5py
from astropy.io import fits
import logging
from concurrent.futures import ThreadPoolExecutor
import random

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def convert_fits_to_h5(fits_file, h5_file, target_length=3748):
    """Converts a single FITS file to HDF5 format."""
    try:
        with fits.open(fits_file) as hdul:
            if len(hdul) > 0:  # Check if the Primary HDU exists
                spectra_data = hdul[0].data  # Assuming the spectra data is in the Primary HDU
                if spectra_data is not None:
                    spectra_data = spectra_data[:target_length]  # Trim to target length if necessary
                    
                    # Save to HDF5
                    with h5py.File(h5_file, 'w') as hf:
                        hf.create_dataset('spectra', data=spectra_data)
                else:
                    logging.error(f"{fits_file} does not contain data in the Primary HDU")
            else:
                logging.error(f"{fits_file} does not contain the expected HDU")
    except Exception as e:
        logging.error(f"Error converting {fits_file} to {h5_file}: {e}")

def batch_convert_fits_to_h5(file_list, target_dir, target_length=3748):
    """Convert a batch of FITS files to HDF5 format."""
    os.makedirs(target_dir, exist_ok=True)  # Create target directory if it doesn't exist
    
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(convert_fits_to_h5, fits_file, os.path.join(target_dir, os.path.splitext(os.path.basename(fits_file))[0] + ".h5"), target_length) for fits_file in file_list]
        for future in futures:
            future.result()  # Wait for all threads to complete

    logging.info(f"All FITS files converted to HDF5 and saved in {target_dir}")

def generate_file_list_from_directories(base_dirs, limit_per_dir=10000):
    """Generates a list of files and labels from the pre-separated directories."""
    spectra_dirs = {
        "gal_spectra": 0,  # Label 0 for galaxies
        "star_spectra": 1,  # Label 1 for stars
        "agn_spectra": 2,   # Label 2 for AGNs
        "bin_spectra": 3    # Label 3 for binary stars
    }

    file_list = []
    labels = []

    logging.info("Gathering FITS files from pre-separated directories...")
    for dir_name, label in spectra_dirs.items():
        for base_dir in base_dirs:
            dir_path = os.path.join(base_dir, dir_name)
            if os.path.exists(dir_path):
                logging.info(f"Checking directory: {dir_path}")
                dir_files = os.listdir(dir_path)
                logging.info(f"Found {len(dir_files)} files in {dir_path}")

                # Collect all files in the directory
                for file in dir_files:
                    file_path = os.path.join(dir_path, file)
                    file_list.append(file_path)

                # Randomly select files up to the limit
                if len(file_list) > limit_per_dir:
                    selected_files = random.sample(file_list, limit_per_dir)
                else:
                    selected_files = file_list

                # Append selected files and their labels
                labels.extend([label] * len(selected_files))

    logging.info(f"Total spectra files collected: {len(file_list)}")
    return file_list, labels

# Convert all FITS files to HDF5
train_files, train_labels = generate_file_list_from_directories(["training_set"], limit_per_dir=10000)
val_files, val_labels = generate_file_list_from_directories(["validation_set"], limit_per_dir=10000)

batch_convert_fits_to_h5(train_files, "training_h5")
batch_convert_fits_to_h5(val_files, "validation_h5")


2024-09-30 18:22:31,524 - INFO - Gathering FITS files from pre-separated directories...
2024-09-30 18:22:31,525 - INFO - Checking directory: training_set/gal_spectra
2024-09-30 18:22:31,527 - INFO - Found 1699 files in training_set/gal_spectra
2024-09-30 18:22:31,528 - INFO - Checking directory: training_set/star_spectra
2024-09-30 18:22:31,567 - INFO - Found 86037 files in training_set/star_spectra
2024-09-30 18:22:31,643 - INFO - Checking directory: training_set/agn_spectra
2024-09-30 18:22:31,659 - INFO - Found 35936 files in training_set/agn_spectra
2024-09-30 18:22:31,690 - INFO - Checking directory: training_set/bin_spectra
2024-09-30 18:22:31,708 - INFO - Found 40676 files in training_set/bin_spectra
2024-09-30 18:22:31,740 - INFO - Total spectra files collected: 164348
2024-09-30 18:22:31,742 - INFO - Gathering FITS files from pre-separated directories...
2024-09-30 18:22:31,743 - INFO - Checking directory: validation_set/gal_spectra
2024-09-30 18:22:31,745 - INFO - Found 400 f

In [1]:
import os
import numpy as np
from astropy.io import fits
import logging
from concurrent.futures import ThreadPoolExecutor
import random

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def convert_fits_to_npy(fits_file, npy_file, target_length=3748):
    """Converts a single FITS file to NumPy format."""
    try:
        with fits.open(fits_file) as hdul:
            if len(hdul) > 0:  # Check if the Primary HDU exists
                spectra_data = hdul[0].data  # Assuming the spectra data is in the Primary HDU
                if spectra_data is not None:
                    spectra_data = spectra_data[:target_length]  # Trim to target length if necessary
                    
                    # Save to NumPy array
                    np.save(npy_file, spectra_data)
                else:
                    logging.error(f"{fits_file} does not contain data in the Primary HDU")
            else:
                logging.error(f"{fits_file} does not contain the expected HDU")
    except Exception as e:
        logging.error(f"Error converting {fits_file} to {npy_file}: {e}")

def batch_convert_fits_to_npy(file_list, target_dir, target_length=3748):
    """Convert a batch of FITS files to NumPy format."""
    os.makedirs(target_dir, exist_ok=True)  # Create target directory if it doesn't exist
    
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(convert_fits_to_npy, fits_file, os.path.join(target_dir, os.path.splitext(os.path.basename(fits_file))[0] + ".npy"), target_length) for fits_file in file_list]
        for future in futures:
            future.result()  # Wait for all threads to complete

    logging.info(f"All FITS files converted to NumPy arrays and saved in {target_dir}")

def generate_file_list_from_directories(base_dirs, limit_per_dir=10000):
    """Generates a list of files and labels from the pre-separated directories."""
    spectra_dirs = {
        "gal_spectra": 0,  # Label 0 for galaxies
        "star_spectra": 1,  # Label 1 for stars
        "agn_spectra": 2,   # Label 2 for AGNs
        "bin_spectra": 3    # Label 3 for binary stars
    }

    file_list = []
    labels = []

    logging.info("Gathering FITS files from pre-separated directories...")
    for dir_name, label in spectra_dirs.items():
        for base_dir in base_dirs:
            dir_path = os.path.join(base_dir, dir_name)
            if os.path.exists(dir_path):
                logging.info(f"Checking directory: {dir_path}")
                dir_files = os.listdir(dir_path)
                logging.info(f"Found {len(dir_files)} files in {dir_path}")

                # Collect all files in the directory
                for file in dir_files:
                    file_path = os.path.join(dir_path, file)
                    file_list.append(file_path)

                # Randomly select files up to the limit
                if len(file_list) > limit_per_dir:
                    selected_files = random.sample(file_list, limit_per_dir)
                else:
                    selected_files = file_list

                # Append selected files and their labels
                labels.extend([label] * len(selected_files))

    logging.info(f"Total spectra files collected: {len(file_list)}")
    return file_list, labels

# Convert all FITS files to NumPy arrays
train_files, train_labels = generate_file_list_from_directories(["training_set"], limit_per_dir=10000)
val_files, val_labels = generate_file_list_from_directories(["validation_set"], limit_per_dir=10000)

batch_convert_fits_to_npy(train_files, "training_npy")
batch_convert_fits_to_npy(val_files, "validation_npy")


2024-09-30 18:46:17,600 - INFO - Gathering FITS files from pre-separated directories...
2024-09-30 18:46:17,601 - INFO - Checking directory: training_set/gal_spectra
2024-09-30 18:46:17,603 - INFO - Found 1699 files in training_set/gal_spectra
2024-09-30 18:46:17,604 - INFO - Checking directory: training_set/star_spectra
2024-09-30 18:46:17,648 - INFO - Found 86037 files in training_set/star_spectra
2024-09-30 18:46:17,710 - INFO - Checking directory: training_set/agn_spectra
2024-09-30 18:46:17,726 - INFO - Found 35936 files in training_set/agn_spectra
2024-09-30 18:46:17,754 - INFO - Checking directory: training_set/bin_spectra
2024-09-30 18:46:17,770 - INFO - Found 40676 files in training_set/bin_spectra
2024-09-30 18:46:17,806 - INFO - Total spectra files collected: 164348
2024-09-30 18:46:17,808 - INFO - Gathering FITS files from pre-separated directories...
2024-09-30 18:46:17,808 - INFO - Checking directory: validation_set/gal_spectra
2024-09-30 18:46:17,810 - INFO - Found 400 f