In [2]:
import os
import pandas as pd
from tqdm import tqdm

# --- Configuration Section ---
# Check that every path below matches your environment before running.

USER_HOME = "/content/drive/MyDrive/Bren_code/My_work/"
BASE_PATH = os.path.join(USER_HOME, "JADES_Analysis/")

# Parent folder holding one sub-folder per project (e.g., '1180', '1181');
# the search for FITS files starts here.
DATA_SEARCH_PATH = os.path.join(BASE_PATH, "JADES/")

# CSV file that will be updated.
TARGET_CSV_PATH = os.path.join(BASE_PATH, "jades.csv")

# Project IDs to search; edit the list so it matches the folders found in
# DATA_SEARCH_PATH. 'ultra_deep' is an example from the original config.
PROJECT_IDS = ['1180', '1181', '1210', '1286', 'ultra_deep']


def build_galaxy_to_project_map(search_path, project_ids):
    """
    Scans project subdirectories to find all .spec.fits files and maps
    each galaxy ID to its parent project ID.

    The galaxy ID is taken from the filename: the text after the last '_'
    with the '.spec.fits' suffix removed, converted to int. Filenames that
    do not yield an integer are reported and skipped.

    If the same galaxy ID is found under more than one project, the project
    scanned last is kept and a summary warning is printed, so ambiguous
    assignments are visible instead of silently overwritten.

    Args:
        search_path: Folder that contains one sub-folder per project ID.
        project_ids: Iterable of project folder names (str) to scan, in order.

    Returns:
        A dictionary mapping {galaxy_id (int): project_id (str)}.
    """
    print("Starting search for spectrum files. This may take several minutes...")
    suffix = '.spec.fits'
    galaxy_map = {}
    # galaxy_id -> every project it was seen in (only for IDs seen in 2+ projects)
    conflicts = {}

    # Use tqdm for a progress bar during the slow search
    for project_id in tqdm(project_ids, desc="Scanning Projects"):
        project_folder = os.path.join(search_path, project_id)

        if not os.path.isdir(project_folder):
            print(f"\nWarning: Project folder not found, skipping: {project_folder}")
            continue

        for _root, _dirs, files in os.walk(project_folder):
            for filename in files:
                if not filename.endswith(suffix):
                    continue

                # The original comment attributes this parsing to the
                # 'combine_gratings_specutils' function (not visible here).
                galaxy_id_str = filename.split('_')[-1].replace(suffix, '')
                try:
                    galaxy_id_int = int(galaxy_id_str)
                except ValueError:
                    # Filenames that don't match the expected format.
                    print(f"\nWarning: Could not parse galaxy ID from filename: {filename}")
                    continue

                # Several files for one galaxy within a project are expected;
                # only a *different* project for the same ID is a conflict.
                previous_project = galaxy_map.get(galaxy_id_int)
                if previous_project is not None and previous_project != project_id:
                    conflicts.setdefault(galaxy_id_int, {previous_project}).add(project_id)

                # Store the mapping (last project scanned wins)
                galaxy_map[galaxy_id_int] = project_id

    if conflicts:
        print(f"\nWarning: {len(conflicts)} galaxy IDs were found in more than one project; "
              "the last project scanned was kept for each.")
        # Show only a few examples so the warning stays readable.
        for galaxy_id in sorted(conflicts)[:10]:
            print(f"  - {galaxy_id}: found in {sorted(conflicts[galaxy_id])}, "
                  f"kept '{galaxy_map[galaxy_id]}'")

    print(f"\nFile search complete. Found {len(galaxy_map)} unique galaxy spectra.")
    return galaxy_map


def update_master_csv(csv_path, galaxy_map):
    """
    Loads the master CSV, adds the 'project_id' column based on the
    provided map, and saves it back to the same location.

    Rows whose 'galaxy_id' is missing, non-numeric or infinite no longer
    abort the update: they are left untouched, get no project, and are
    counted in a warning. When every ID is usable the column is converted
    to integers, exactly as before.

    Args:
        csv_path: Path of the CSV file to read and overwrite. It must
            contain a 'galaxy_id' column.
        galaxy_map: Dictionary mapping {galaxy_id (int): project_id (str)}.

    Returns:
        None. Progress, warnings and errors are reported with print().
    """
    print(f"\nUpdating master CSV file at: {csv_path}")

    try:
        df = pd.read_csv(csv_path)
    except FileNotFoundError:
        print(f"✗ ERROR: Master CSV file not found. Please check the path: {csv_path}")
        return
    except pd.errors.EmptyDataError:
        print(f"✗ ERROR: Master CSV file is empty: {csv_path}")
        return

    if 'galaxy_id' not in df.columns:
        print("✗ ERROR: 'galaxy_id' column not found in the CSV file.")
        return

    # Coerce instead of astype(int) directly: a single blank or malformed ID
    # would otherwise raise and abort the whole update.
    numeric_ids = pd.to_numeric(df['galaxy_id'], errors='coerce')
    usable = numeric_ids.notna() & (numeric_ids.abs() != float('inf'))

    if usable.all():
        # Ensure the 'galaxy_id' column is of integer type for correct mapping
        df['galaxy_id'] = numeric_ids.astype(int)
        lookup_ids = df['galaxy_id']
    else:
        bad_count = int((~usable).sum())
        print(f"  - Warning: {bad_count} rows have a missing or non-numeric 'galaxy_id'; "
              "they are kept unchanged and left without a project.")
        # Only the usable rows take part in the lookup; the assignment below
        # aligns on the index, so the other rows get an empty project_id.
        lookup_ids = numeric_ids[usable].astype(int)

    print(f"  - Mapping project IDs to {len(df)} records...")
    # Use the .map() function to create the new column from our dictionary
    df['project_id'] = lookup_ids.map(galaxy_map)

    # Report any galaxies in the CSV that weren't found during the file search
    unmapped_count = df['project_id'].isnull().sum()
    if unmapped_count > 0:
        print(f"  - Warning: {unmapped_count} galaxies in the CSV could not be matched to a project.")

    # Overwrite the original CSV file with the updated data
    try:
        df.to_csv(csv_path, index=False)
        print(f"✓ Success: The file '{os.path.basename(csv_path)}' has been updated with the 'project_id' column.")
    except Exception as e:
        print(f"✗ ERROR: Could not save the updated CSV file. Reason: {e}")


if __name__ == '__main__':
    print("--- Starting One-Time Update of Master JADES Catalog ---")

    # The target CSV is overwritten in place, so tell the user up front.
    print(f"IMPORTANT: This script will OVERWRITE '{TARGET_CSV_PATH}'.")
    print("Please ensure you have a backup before proceeding.")
    # Optional manual confirmation; uncomment to require it.
    # confirmation = input("Type 'yes' to continue: ")
    # if confirmation.lower() != 'yes':
    #     print("Operation cancelled by user.")
    #     exit()

    # Step 1: slow part, scan the project folders and build the ID map
    galaxy_project_map = build_galaxy_to_project_map(DATA_SEARCH_PATH, PROJECT_IDS)

    # Step 2: update the CSV, but only if the scan found anything
    if not galaxy_project_map:
        print("No galaxy spectra were found, so the CSV file was not updated.")
    else:
        update_master_csv(TARGET_CSV_PATH, galaxy_project_map)

    print("\n--- Update Process Complete ---")

--- Starting One-Time Update of Master JADES Catalog ---
IMPORTANT: This script will OVERWRITE '/content/drive/MyDrive/Bren_code/My_work/JADES_Analysis/jades.csv'.
Please ensure you have a backup before proceeding.
Starting search for spectrum files. This may take several minutes...


Scanning Projects:  20%|██        | 1/5 [00:16<01:06, 16.73s/it]























































































































































Scanning Projects:  40%|████      | 2/5 [00:41<01:05, 21.74s/it]





Scanning Projects: 100%|██████████| 5/5 [00:42<00:00,  8.47s/it]







































































































































































































































































































































































































































File search complete. Found 4820 unique galaxy spectra.

Updating master CSV file at: /content/drive/MyDrive/Bren_code/My_work/JADES_Analysis/jades.csv





  - Mapping project IDs to 4424 records...
✓ Success: The file 'jades.csv' has been updated with the 'project_id' column.

--- Update Process Complete ---
