In [1]:
# =========================================
# Script: JADES_download.ipynb
# Purpose: Simple JADES data download and spectral line analysis
# Author: Joseph Havens
# Date: 19-07-2025
# =========================================

import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import find_peaks, savgol_filter
from scipy.optimize import curve_fit
import astropy
from astropy.io import fits
from astropy.table import Table
import pandas as pd
from astropy.io import ascii
import requests
import os
import os.path
import urllib3.contrib.pyopenssl
from requests.exceptions import ConnectionError
from astropy.utils.data import download_file
import shutil
from collections import defaultdict
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import logging
import sys
from datetime import datetime

In [2]:
# === CONFIGURATION ===

# Input your base path
USER_PATH = "/content/drive/MyDrive/Bren_code/My_work/JADES_Analysis/"
BASE_SAVE_PATH = os.path.join(USER_PATH, "JADES_SHARED/")
CSV_PARENT_PATH = USER_PATH

# --- NEW: Define JADES projects with their multiple pointings ---
# The script will loop through each program, and then through each pointing within it.

JADES_PROJECTS = {'shared':{
    '10011141':'https://s3.amazonaws.com/msaexp-nirspec/extractions/10011141',
    '10012102':'https://s3.amazonaws.com/msaexp-nirspec/extractions/10012102',
    '10013169':'https://s3.amazonaws.com/msaexp-nirspec/extractions/10013169'
    }

}
"""JADES_PROJECTS = {
    '1180': {
        'jades-gds-wide-v3': 'https://s3.amazonaws.com/msaexp-nirspec/extractions/jades-gds-wide-v3/',
        'jades-gds-wide2-v3': 'https://s3.amazonaws.com/msaexp-nirspec/extractions/jades-gds-wide2-v3/',
        'jades-gds-wide3-v3': 'https://s3.amazonaws.com/msaexp-nirspec/extractions/jades-gds-wide3-v3/'
    },

    '1181': {
        'jades-gdn-v3': 'https://s3.amazonaws.com/msaexp-nirspec/extractions/jades-gdn-v3/',
        'jades-gdn09-v3': 'https://s3.amazonaws.com/msaexp-nirspec/extractions/jades-gdn09-v3/',
        'jades-gdn10-v3': 'https://s3.amazonaws.com/msaexp-nirspec/extractions/jades-gdn10-v3/',
        'jades-gdn11-v3': 'https://s3.amazonaws.com/msaexp-nirspec/extractions/jades-gdn11-v3/',
        'jades-gdn2-blue-v3': 'https://s3.amazonaws.com/msaexp-nirspec/extractions/jades-gdn2-blue-v3/',
        'jades-gdn2-v3': 'https://s3.amazonaws.com/msaexp-nirspec/extractions/jades-gdn2-v3/',
    },

    '1210': {
        'gds-deep-v3':'https://s3.amazonaws.com/msaexp-nirspec/extractions/gds-deep-v3/'
        },

    '1286': {
        'jades-gds02-v3': 'https://s3.amazonaws.com/msaexp-nirspec/extractions/jades-gds02-v3/',
        'jades-gds03-v3': 'https://s3.amazonaws.com/msaexp-nirspec/extractions/jades-gds03-v3/',
        'jades-gds04-v3': 'https://s3.amazonaws.com/msaexp-nirspec/extractions/jades-gds04-v3/',
        'jades-gds05-v3': 'https://s3.amazonaws.com/msaexp-nirspec/extractions/jades-gds05-v3/',
        'jades-gds06-v3': 'https://s3.amazonaws.com/msaexp-nirspec/extractions/jades-gds06-v3/',
        'jades-gds07-v3': 'https://s3.amazonaws.com/msaexp-nirspec/extractions/jades-gds07-v3/',
        'jades-gds08-v3': 'https://s3.amazonaws.com/msaexp-nirspec/extractions/jades-gds08-v3/',
        'jades-gds1-v3': 'https://s3.amazonaws.com/msaexp-nirspec/extractions/jades-gds1-v3/',
    },

    'ultra_deep': {
        'gds-udeep-v3':'https://s3.amazonaws.com/msaexp-nirspec/extractions/gds-udeep-v3/'
    }
}"""

"JADES_PROJECTS = {\n    '1180': {\n        'jades-gds-wide-v3': 'https://s3.amazonaws.com/msaexp-nirspec/extractions/jades-gds-wide-v3/',\n        'jades-gds-wide2-v3': 'https://s3.amazonaws.com/msaexp-nirspec/extractions/jades-gds-wide2-v3/',\n        'jades-gds-wide3-v3': 'https://s3.amazonaws.com/msaexp-nirspec/extractions/jades-gds-wide3-v3/'\n    },\n\n    '1181': {\n        'jades-gdn-v3': 'https://s3.amazonaws.com/msaexp-nirspec/extractions/jades-gdn-v3/',\n        'jades-gdn09-v3': 'https://s3.amazonaws.com/msaexp-nirspec/extractions/jades-gdn09-v3/',\n        'jades-gdn10-v3': 'https://s3.amazonaws.com/msaexp-nirspec/extractions/jades-gdn10-v3/',\n        'jades-gdn11-v3': 'https://s3.amazonaws.com/msaexp-nirspec/extractions/jades-gdn11-v3/',\n        'jades-gdn2-blue-v3': 'https://s3.amazonaws.com/msaexp-nirspec/extractions/jades-gdn2-blue-v3/',\n        'jades-gdn2-v3': 'https://s3.amazonaws.com/msaexp-nirspec/extractions/jades-gdn2-v3/',\n    },\n\n    '1210': {\n        '

In [3]:
# --- Logger Setup ---
# This block replaces the need for a global 'debug' variable.

# 1. Create a filename for the detailed log file
# Example: 'log_2025-06-08_19-38.log'
log_filename = f"log_{datetime.now().strftime('%Y-%m-%d_%H-%M')}.log"
# In Colab/Drive, you might want to specify the full path:
log_filepath = "/content/drive/MyDrive/Bren_code/My_work/JADES_Analysis/logs/" + log_filename

# 2. Get the root logger
logger = logging.getLogger()
logger.setLevel(logging.DEBUG) # Set the lowest level to capture ALL messages

# 3. Create a handler to write to the CONSOLE (for high-level info)
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO) # Only shows INFO, WARNING, ERROR, CRITICAL
console_formatter = logging.Formatter('%(message)s') # Keep console output clean
console_handler.setFormatter(console_formatter)

# 4. Create a handler to write to the FILE (for all the details)
file_handler = logging.FileHandler(log_filepath)
file_handler.setLevel(logging.DEBUG) # Captures EVERY level, including DEBUG
file_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
file_handler.setFormatter(file_formatter)

# 5. Add the handlers to the logger
# Important: Clear existing handlers to prevent duplicate logs in notebooks
if (logger.hasHandlers()):
    logger.handlers.clear()
logger.addHandler(console_handler)
logger.addHandler(file_handler)

# --- End Logger Setup ---
# Now, you can use logger.info(), logger.debug(), etc. throughout your code.
logger.info(f"Logger initialized. Detailed debug output will be saved to: {log_filepath}")

Logger initialized. Detailed debug output will be saved to: /content/drive/MyDrive/Bren_code/My_work/JADES_Analysis/logs/log_2025-10-07_19-32.log


In [None]:
def download_jades_data(projects, csv_parent_path, base_save_path):
    """
    Downloads JADES FITS files, using a separate CSV file for each
    individual pointing.
    """
    logger.info("====================" * 6)
    logger.info("--- Beginning JADES Data Download (Pointing-Specific CSVs) ---")
    count = 0

    # --- Outer Loop (by Program ID) ---
    for program_id, pointings_dict in projects.items():
        logger.info("-------------------------------------------------")
        logger.info(f"Processing Program ID: {program_id}")

        # Define and create the save directory for the entire program
        program_save_path = os.path.join(base_save_path, program_id)
        os.makedirs(program_save_path, exist_ok=True)

        # --- Inner Loop (by Pointing/URL) ---
        for pointing_name, data_url in pointings_dict.items():
            logger.info(f"  --> Processing pointing: {pointing_name}")

            # --- Load the CSV file specific to THIS pointing ---
            csv_filename = f"{program_id}-{pointing_name}.csv"
            target_list_path = os.path.join(csv_parent_path, csv_filename)
            try:
                target_list = pd.read_csv(target_list_path)
                logger.info(f"    Loaded {len(target_list)} targets from '{csv_filename}'.")
            except FileNotFoundError:
                logger.warning(f"    Target list '{csv_filename}' not found. Skipping this pointing.")
                continue # Move to the next pointing in the inner loop

            # Loop through each target file in this pointing's CSV
            for i, row in target_list.iterrows():
                target_file = row['file']
                file_path = os.path.join(program_save_path, target_file)

                # Skip if file already exists (it might be in multiple pointing CSVs)
                if os.path.exists(file_path):
                    logger.debug(f"    Skipping '{target_file}', already downloaded.")
                    continue

                file_url = data_url + target_file
                logger.info(f"    Downloading: {target_file}")
                try:
                    r = requests.get(file_url, allow_redirects=True, timeout=120)
                    r.raise_for_status()

                    with open(file_path, 'wb') as f:
                        f.write(r.content)
                        count += 1
                    try:
                        with fits.open(file_path) as hdul:
                            logger.debug(f"      ✓ FITS verification successful.")
                    except Exception:
                        logger.error(f"      ✗ CORRUPTED FILE: '{target_file}' is not valid.")
                        os.remove(file_path)

                except requests.exceptions.HTTPError as e:
                    if e.response.status_code == 404:
                        logger.debug(f"      - '{target_file}' not found at this pointing.")
                    else:
                        logger.error(f"      ✗ HTTP Error for '{target_file}': {e}")
                except Exception as e:
                    logger.error(f"      ✗ An unknown error occurred for '{target_file}': {e}")

    logger.info("====================" * 6)
    logger.info("!!!! JADES DOWNLOAD COMPLETE !!!!")
    logger.info(f"Total files downloaded: {count}")
    logger.info("====================" * 6)

In [None]:
download_jades_data(JADES_PROJECTS, CSV_PARENT_PATH, BASE_SAVE_PATH)

--- Beginning JADES Data Download (Pointing-Specific CSVs) ---


AttributeError: 'str' object has no attribute 'items'

In [None]:
from astropy.io import fits
# Path to one of your downloaded JADES files
jades_file_path = '/content/drive/MyDrive/Bren_code/My_work/JADES_Analysis/JADES/1180/jades-gds-wide3-v3_prism-clear_b70.spec.fits' # Replace with a real file name
# Print the structure of the FITS file
with fits.open(jades_file_path) as hdul:
    print(hdul[1].header)

In [5]:
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from collections import defaultdict
import logging

# Astropy and Specutils imports
import astropy.units as u
from astropy.io import fits
from astropy.table import Table
!pip install specutils
from specutils import Spectrum # Use the modern Spectrum class
from astropy.nddata import StdDevUncertainty
from astropy import constants as const

# =========================================
# 1. CONFIGURATION
# =========================================
BASE_PATH = "/content/drive/MyDrive/Bren_code/My_work/SMACS_Analysis/"
SMACS_PROJECT_PATH = os.path.join(BASE_PATH, "GLEAM_SMACS/")
MASTER_SMACS_CSV = os.path.join(BASE_PATH, "smacs.csv")
RAW_SPECTRA_PATH = os.path.join(SMACS_PROJECT_PATH, "SMACS/")
CLEAN_SPECTRA_PATH = os.path.join(SMACS_PROJECT_PATH, "SMACS/")

logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
logger = logging.getLogger()

# =========================================
# 2. HELPER FUNCTIONS
# =========================================
def combine_gratings_robust(base_data_folder):
    """
    Combines multi-grating spectra, robustly reading units from FITS headers.
    """
    logger.info(f"--- Combining raw spectra from {base_data_folder} ---")
    all_fits_files = [os.path.join(root, file) for root, dirs, files in os.walk(base_data_folder) for file in files if file.endswith('.spec.fits')]

    grouped_spectra = defaultdict(list)
    for fits_path in all_fits_files:
        filename = os.path.basename(fits_path)
        try:
            galaxy_id = filename.split('_')[-1].replace('.spec.fits', '')
            int(galaxy_id)
            grouped_spectra[galaxy_id].append(fits_path)
        except (ValueError, IndexError):
            continue

    combined_spectra = {}
    for galaxy_id, file_paths in tqdm(sorted(grouped_spectra.items()), desc="Combining Gratings"):
        all_wave, all_flux, all_err = [], [], []
        try:
            with fits.open(file_paths[0]) as hdul:
                wave_unit = u.Unit(hdul[1].header.get('TUNIT1', 'um'))
                flux_unit = u.Unit(hdul[1].header.get('TUNIT2', 'uJy'))
        except Exception:
            wave_unit, flux_unit = u.um, u.uJy

        for f_path in file_paths:
            try:
                with fits.open(f_path) as hdul:
                    data = hdul[1].data
                    all_wave.append(data['wave'])
                    all_flux.append(data['flux'])
                    all_err.append(data['err'])
            except Exception as e:
                logger.error(f"GID {galaxy_id}: Failed to process raw file '{os.path.basename(f_path)}'. Reason: {e}")
                continue

        if not all_wave: continue
        wave, flux, err = np.concatenate(all_wave), np.concatenate(all_flux), np.concatenate(all_err)
        sort_idx = np.argsort(wave)
        spectrum_object = Spectrum(
            flux=flux[sort_idx] * flux_unit,
            spectral_axis=wave[sort_idx] * wave_unit,
            uncertainty=StdDevUncertainty(err[sort_idx] * flux_unit))
        combined_spectra[galaxy_id] = {'spectrum': spectrum_object}
    return combined_spectra

def save_correct_spectrum_format(spectrum, output_path, filename):
    """
    Saves a Spectrum1D object to the correct FITS Table format that GLEAM needs.
    """
    os.makedirs(output_path, exist_ok=True)
    valid_mask = np.isfinite(spectrum.flux) & np.isfinite(spectrum.uncertainty.array)
    if not np.any(valid_mask):
        logger.warning(f"Spectrum for {filename} contains no valid data. Skipping save.")
        return
    output_table = Table({'wl': spectrum.spectral_axis[valid_mask], 'flux': spectrum.flux[valid_mask],
                          'stdev': spectrum.uncertainty.array[valid_mask] * spectrum.flux.unit})
    output_table.write(os.path.join(output_path, filename), format='fits', overwrite=True)


# =========================================
# 3. MAIN PREPARATION SCRIPT
# =========================================
logger.info("### STARTING SMACS SPECTRA PREPARATION ###")
os.makedirs(CLEAN_SPECTRA_PATH, exist_ok=True)

# Load master list and prepare it
df_master_list = pd.read_csv(MASTER_SMACS_CSV)
def extract_gid_from_filename(filename):
    try: return int(filename.split('_')[-1].replace('.spec.fits', ''))
    except (ValueError, IndexError): return None
df_master_list['galaxy_id'] = df_master_list['file'].apply(extract_gid_from_filename)
df_master_list.dropna(subset=['galaxy_id'], inplace=True)
df_master_list['galaxy_id'] = df_master_list['galaxy_id'].astype(int)
df_master_list = df_master_list.rename(columns={'Redshift': 'z'})

# Combine raw spectra
combined_spectra = combine_gratings_robust(RAW_SPECTRA_PATH)
logger.info(f"\n--- Processing and Saving {len(combined_spectra)} Spectra ---")

for galaxy_id_str, spec_info in tqdm(combined_spectra.items(), desc="Processing & Saving Spectra"):
    try:
        galaxy_id = int(galaxy_id_str)
        z_rows = df_master_list[df_master_list['galaxy_id'] == galaxy_id]
        if z_rows.empty: continue

        z = z_rows['z'].iloc[0]
        spectrum_object = spec_info['spectrum']

        # --- UNIT CONVERSION CORRECTION ---
        # 1. Convert wavelength to rest-frame Angstroms
        wave_rest = (spectrum_object.spectral_axis).to(u.AA)

        # 2. Redshift the flux and uncertainty (in F_nu units)
        flux_rest_fnu = spectrum_object.flux
        err_rest_fnu = spectrum_object.uncertainty.quantity

        # 3. Convert flux and uncertainty from F_nu (uJy) to F_lambda (erg/s/cm2/AA)
        # This is the direct and correct way to do the conversion.
        target_flux_unit = u.erg / u.s / u.cm**2 / u.AA
        equivalency = u.spectral_density(wave_rest)

        flux_in_flambda = flux_rest_fnu.to(target_flux_unit, equivalencies=equivalency)
        err_in_flambda = err_rest_fnu.to(target_flux_unit, equivalencies=equivalency)

        spec_final = Spectrum(
            flux=flux_in_flambda,
            spectral_axis=wave_rest,
            uncertainty=StdDevUncertainty(err_in_flambda))
        # --- END CORRECTION ---

        gleam_filename = f"spec1d.SMACS.JWST_NIRSpec.SMACS0723.{galaxy_id}.fits"
        save_correct_spectrum_format(spec_final, CLEAN_SPECTRA_PATH, gleam_filename)

    except Exception as e:
        logger.error(f"An unexpected error occurred for GID {galaxy_id_str}: {e}")
        continue

logger.info(f"### SMACS PREPARATION COMPLETE. Clean files are in: {CLEAN_SPECTRA_PATH} ###")

Collecting specutils
  Downloading specutils-2.1.0-py3-none-any.whl.metadata (6.2 kB)
Collecting gwcs>=0.22 (from specutils)
  Downloading gwcs-0.26.0-py3-none-any.whl.metadata (4.5 kB)
Collecting asdf-astropy>=0.5 (from specutils)
  Downloading asdf_astropy-0.8.0-py3-none-any.whl.metadata (5.5 kB)
Collecting asdf>=3.3.0 (from specutils)
  Downloading asdf-5.0.0-py3-none-any.whl.metadata (12 kB)
Collecting ndcube>=2.0 (from specutils)
  Downloading ndcube-2.3.4-py3-none-any.whl.metadata (7.5 kB)
Collecting asdf-standard>=1.1.0 (from asdf>=3.3.0->specutils)
  Downloading asdf_standard-1.4.0-py3-none-any.whl.metadata (3.3 kB)
Collecting asdf-transform-schemas>=0.3 (from asdf>=3.3.0->specutils)
  Downloading asdf_transform_schemas-0.6.0-py3-none-any.whl.metadata (2.2 kB)
Collecting jmespath>=0.6.2 (from asdf>=3.3.0->specutils)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting asdf-coordinates-schemas>=0.4 (from asdf-astropy>=0.5->specutils)
  Downloading asdf_coor

### STARTING SMACS SPECTRA PREPARATION ###
--- Combining raw spectra from /content/drive/MyDrive/Bren_code/My_work/SMACS_Analysis/GLEAM_SMACS/SMACS/ ---


Combining Gratings:   0%|          | 0/1 [00:00<?, ?it/s]


--- Processing and Saving 1 Spectra ---


Processing & Saving Spectra:   0%|          | 0/1 [00:00<?, ?it/s]

### SMACS PREPARATION COMPLETE. Clean files are in: /content/drive/MyDrive/Bren_code/My_work/SMACS_Analysis/GLEAM_SMACS/SMACS/ ###


In [None]:
# Path to the file you shared the info for
file_to_test = '/content/drive/MyDrive/Bren_code/My_work/JADES_Analysis/GLEAM_JADES/JADES_clean/spec1d.JADES.JWST_NIRSpec.1180.10004141.fits'
spectrum = load_jades_spectrum(file_to_test)
if spectrum:
    print("Successfully loaded spectrum:")
    print(spectrum)

    median_flux_value = np.median(spectrum.flux[np.isfinite(spectrum.flux.value)])
    #print(median_flux_value)
    if median_flux_value == 0: # Handle edge case of all-zero flux
        exponent = 0
    else:
        # Get the exponent, e.g., -19 for a value of 1.5e-19
        exponent = int(np.floor(np.log10(abs(median_flux_value.value))))
    # Our scaling factor will be 10 to the power of that exponent
    scaling_factor = 10**exponent
    # 3. SCALE THE DATA FOR PLOTTING
    # Divide the flux by the scaling factor. It now becomes a simple NumPy array.
    scale_flux = spectrum.flux / scaling_factor
    scale_err = spectrum.uncertainty.quantity / scaling_factor

    # Plot flux and the uncertainty as a shaded region
    plt.figure(figsize=(12, 6))
    quantity_support()
    plt.plot(spectrum.spectral_axis, scale_flux, label='Flux')
    plt.fill_between(spectrum.spectral_axis.value,
                     (scale_flux - scale_err).value,
                     (scale_flux + scale_err).value,
                     alpha=0.3, label='Uncertainty')

    plt.xlabel(r'$\lambda$ [$\AA$]', fontsize=14)
    y_label_string = rf'$F_{{\lambda}}$ [$10^{{{exponent}}}$ erg s$^{{-1}}$ cm$^{{-2}}$ $\AA^{{-1}}$]'
    plt.ylabel(y_label_string, fontsize=14)
    plt.title("Test Load of JADES Spectrum (PID 1180)", fontsize=16)
    plt.legend()
    plt.grid(True, alpha=0.5)
    plt.show()

An unexpected error occurred while loading /content/drive/MyDrive/Bren_code/My_work/JADES_Analysis/GLEAM_JADES/JADES_clean/spec1d.JADES.JWST_NIRSpec.1180.10004141.fits: 'uncert'


In [None]:
# ===== Ensure All GIDs are accounted for in csv =====

# 1. Set the base path for your JADES project.
USER_PATH = "/content/drive/MyDrive/Bren_code/My_work/JADES_Analysis/"

# 2. Set the path to the JADES data folder that contains your program subdirectories.
JADES_DATA_FOLDER = os.path.join(USER_PATH, "JADES/")

# 3. Set the path to the CSV provided by Bren and to my CSV
BREN_JADES_CSV = os.path.join(USER_PATH, "JADES_D_full_cat_data.csv")
MANUAL_JADES_CSV = os.path.join(USER_PATH, "jades_filtered_redshift_catalog.csv")

# 4. Define the output file paths.
OUTPUT_MATCHED_CATALOG = os.path.join(USER_PATH, "jades.csv")
OUTPUT_MISSING_LIST = os.path.join(USER_PATH, "files_missing_from_master_catalog.csv")


# === SCRIPT ===
print("--- Starting cross-match script (matching by Galaxy ID) ---")

# 1. Get a list of all '.spec.fits' files you have downloaded.
downloaded_files = []
for root, dirs, files in os.walk(JADES_DATA_FOLDER):
    for file in files:
        if file.endswith('.spec.fits'):
            downloaded_files.append(file)

print(f"Found {len(downloaded_files)} downloaded '.spec.fits' files in your JADES folders.")
df_local = pd.DataFrame(downloaded_files, columns=['file'])

# 2. Extract the Galaxy ID from your local filenames.
df_local['galaxy_id'] = pd.to_numeric(df_local['file'].str.extract(r'(\d+)\.spec\.fits$')[0], errors='coerce')
df_local = df_local.dropna(subset=['galaxy_id'])
df_local['galaxy_id'] = df_local['galaxy_id'].astype(int)

# Cut out all duplicate galaxy IDs in df_local
df_local = df_local.drop_duplicates(subset=['galaxy_id'])
print(f"Removed {len(downloaded_files) - len(df_local)} duplicate galaxy IDs from local files.")
print(f"Successfully extracted {len(df_local)} unique galaxy IDs from local files.")

# 3. Load the master JADES catalog.
try:
    df_manual = pd.read_csv(MANUAL_JADES_CSV)
    df_bren = pd.read_csv(BREN_JADES_CSV)
    df_master = pd.concat([df_manual, df_bren], ignore_index=True).drop_duplicates(subset=['srcid'])
    df_master = df_master.rename(columns={'srcid': 'galaxy_id', 'z_spec': 'z'})
    print(f"Loaded {len(df_master)} entries from the master JADES catalog.")
except FileNotFoundError:
    print(f"❌ ERROR: Master CSV not found at '{BREN_JADES_CSV}'. Please check the path.")
    exit()

# 4. Ensure the galaxy_id column in the master catalog is a numeric integer.
df_master = df_master.dropna(subset=['galaxy_id'])
df_master['galaxy_id'] = df_master['galaxy_id'].astype(int)


# 5. Find the matches by merging on the 'galaxy_id' column.
# --- MODIFIED LINE 1: Added 'grade', 'zfit', and 'comment' to the list of columns to merge. ---
df_matched = pd.merge(df_local, df_master[['galaxy_id', 'z', 'grade', 'zfit', 'comment']], on='galaxy_id', how='inner')
print(f"\nFound {len(df_matched)} matches between your files and the master catalog.")


# 6. Save the clean, matched catalog with redshifts and the new columns.
# --- MODIFIED LINE 2: Added the new columns to the list of columns to save. ---
df_matched[['galaxy_id', 'z', 'grade', 'zfit', 'comment']].drop_duplicates(subset=['galaxy_id']).to_csv(OUTPUT_MATCHED_CATALOG, index=False)
print(f"✅ SUCCESS: Matched catalog saved to:\n{OUTPUT_MATCHED_CATALOG}")


# 7. Find the files you have that are MISSING from the master catalog.
is_in_matched = df_local['galaxy_id'].isin(df_matched['galaxy_id'])
df_missing = df_local[~is_in_matched].drop_duplicates(subset=['galaxy_id'])

if not df_missing.empty:
    print(f"\nFound {len(df_missing)} downloaded files that are NOT in the master catalog.")
    df_missing.to_csv(OUTPUT_MISSING_LIST, index=False)
    print(f"✅ SUCCESS: List of missing files saved to:\n{OUTPUT_MISSING_LIST}")
else:
    print("\nAll of your downloaded files were found in the master catalog. No 'missing' file was created.")

print("\n--- Script complete ---")

In [None]:
# read in the table on the following webpage as a csv
# http://astronomy.nmsu.edu/drewski/tableofemissionlines.html

