In [5]:
import os
import glob
from astropy.io import fits
from datetime import datetime
import re
import pandas as pd
from astropy.time import Time

def parse_filename(filename):
    """Extract the mosaic prefix from an intensity-mosaic file name.

    The whole name must look like ``19AugICORE_0001_W1_mosaic-int.fits``:
    two digits, three letters, ``ICORE_``, four digits, ``_W`` plus one
    digit, followed by the literal ``_mosaic-int.fits`` suffix.

    Args:
        filename: Base name of the file (no directory component).

    Returns:
        The part before ``_mosaic-int.fits`` (e.g. ``'19AugICORE_0001_W1'``),
        or ``None`` when the name does not follow the convention.
    """
    # fullmatch (rather than match) so that trailing text such as
    # '..._mosaic-int.fits.bak' is rejected instead of silently accepted.
    match = re.fullmatch(
        r'(\d{2}[A-Za-z]{3}ICORE_\d{4}_W\d)_mosaic-int[.]fits', filename
    )
    return match.group(1) if match else None

def get_mjd(date_str):
    """Convert a UTC date/time string to a Modified Julian Date.

    Accepts both the ``T``-separated form found in FITS headers
    (``2025-05-07T01:51:44``) and the space-separated form
    (``2025-05-07 01:51:44``).

    Args:
        date_str: Date/time string, or ``None``.

    Returns:
        The ``Time.mjd`` value for the parsed date, or ``None`` when the
        input is missing, blank, or cannot be parsed.
    """
    if date_str is None or (isinstance(date_str, str) and not date_str.strip()):
        return None
    # astropy's 'iso' format only matches 'YYYY-MM-DD HH:MM:SS'; header values
    # use the 'T' separator, which is the 'isot' format. Try 'isot' first and
    # keep 'iso' as a fallback so previously accepted strings still parse.
    for fmt in ('isot', 'iso'):
        try:
            return Time(date_str, format=fmt, scale='utc').mjd
        except (ValueError, TypeError):
            continue
    return None

def check_header_dates(int_path, unc_path):
    """Compare the primary-header date of an INT/UNC FITS file pair.

    For each file the ``DATE`` card of HDU 0 is used, falling back to
    ``DATE-OBS`` when ``DATE`` is absent or empty. Both values are printed,
    then compared as raw strings (no parsing, no tolerance).

    Args:
        int_path: Path to the file reported as the "INT file".
        unc_path: Path to the file reported as the "UNC file".

    Returns:
        ``True`` when both dates are present and identical; ``False``
        otherwise, after printing a diagnostic line.
    """
    def _header_date(hdul):
        # Primary HDU only; DATE takes precedence over DATE-OBS.
        header = hdul[0].header
        return header.get('DATE') or header.get('DATE-OBS')

    with fits.open(int_path) as int_hdul, fits.open(unc_path) as unc_hdul:
        int_date_str = _header_date(int_hdul)
        unc_date_str = _header_date(unc_hdul)

    int_name = os.path.basename(int_path)
    unc_name = os.path.basename(unc_path)
    print(f'INT file: {int_name} | Header date: {int_date_str}')
    print(f'UNC file: {unc_name} | Header date: {unc_date_str}')

    # Guard clauses: missing date first, then mismatch.
    if not (int_date_str and unc_date_str):
        print(f'ERROR: Missing header date in {int_name} or {unc_name}')
        return False
    if int_date_str != unc_date_str:
        print(f'**ERROR**: Header date mismatch for {int_name} and {unc_name}. int date: {int_date_str}, unc date: {unc_date_str}')
        return False
    return True

def check_subframe_count(fits_path, txt_path):
    """Check the header ``NUMFRMS`` value against the rows of a table file.

    A line of the text file counts as a data row when its first non-blank
    character is a digit; every other line is ignored. No column-count
    validation is performed.

    Args:
        fits_path: Path to the FITS file whose HDU 0 header holds ``NUMFRMS``.
        txt_path: Path to the text table listing the subframes.

    Returns:
        ``True`` when ``NUMFRMS`` equals the number of counted rows;
        ``False`` otherwise, after printing a diagnostic line.
    """
    with fits.open(fits_path) as hdul:
        numfrms = hdul[0].header.get('NUMFRMS')

    # Stream the file and tally rows whose first non-blank character is a
    # digit (an empty slice is never a digit, so blank lines drop out).
    with open(txt_path) as table:
        expected_count = sum(1 for row in table if row.strip()[:1].isdigit())

    fits_name = os.path.basename(fits_path)
    print(f'{fits_name} | NUMFRMS (header): {numfrms} | TXT subframes: {expected_count}')

    if numfrms == expected_count:
        return True
    txt_name = os.path.basename(txt_path)
    print(f'ERROR: Subframe count mismatch for {fits_name} and {txt_name}. FITS header NUMFRMS: {numfrms}, TXT subframes: {expected_count}')
    return False

def sanity_check_folder(folder_path):
    """Run the mosaic sanity checks on every ``*_mosaic-int.fits`` in a folder.

    For each matching file whose name follows the expected convention, the
    companion ``_mosaic-unc.fits`` and ``_query_used.tbl.txt`` files must
    exist next to it; the files are then passed to ``check_header_dates``
    and ``check_subframe_count``. Problems are reported with ``print``.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        ``True`` when no problem was reported for any file in the folder
        (including when the folder holds no matching files), ``False``
        otherwise.
    """
    int_suffix = '_mosaic-int.fits'
    all_ok = True

    # glob yields entries in arbitrary order; sort so reports are reproducible.
    for fits_file in sorted(glob.glob(os.path.join(folder_path, '*' + int_suffix))):
        base_name = os.path.basename(fits_file)
        if not parse_filename(base_name):
            print(f'ABNORMAL FILE: {base_name} does not match expected naming convention. Skipping.')
            all_ok = False
            continue

        # Swap only the trailing suffix: str.replace on the full path would
        # also rewrite the same text if it appeared in a directory name.
        stem = fits_file[:-len(int_suffix)]
        unc_file = stem + '_mosaic-unc.fits'
        txt_file = stem + '_query_used.tbl.txt'
        if not (os.path.exists(unc_file) and os.path.exists(txt_file)):
            print(f'ERROR: Missing .unc.fits or .tbl.txt for {base_name}')
            all_ok = False
            continue

        # Additional per-file checks can be added here as needed.

        # Run both checks unconditionally so every problem gets reported,
        # then fold their results into the folder-level verdict.
        dates_ok = check_header_dates(fits_file, unc_file)
        count_ok = check_subframe_count(fits_file, txt_file)
        all_ok = all_ok and dates_ok and count_ok

    return all_ok

# Root directory that holds one sub-folder per data set.
base_dir = '/Users/tmzyece25/Desktop/Academia/Physics Data'

# Full paths of the immediate sub-directories of base_dir; plain files are
# filtered out.
folders = list(filter(
    os.path.isdir,
    (os.path.join(base_dir, entry) for entry in os.listdir(base_dir)),
))

# Run the sanity checks on each sub-folder in turn.
for folder in folders:
    sanity_check_folder(folder)

INT file: 23AugICORE_0001_W2_mosaic-int.fits | Header date: 2025-05-07T01:51:44
UNC file: 23AugICORE_0001_W2_mosaic-unc.fits | Header date: 2025-05-07T01:51:44
23AugICORE_0001_W2_mosaic-int.fits | NUMFRMS (header): 51 | TXT subframes: 51
INT file: 17AugICORE_0001_W1_mosaic-int.fits | Header date: 2025-05-07T01:00:56
UNC file: 17AugICORE_0001_W1_mosaic-unc.fits | Header date: 2025-05-07T01:00:56
17AugICORE_0001_W1_mosaic-int.fits | NUMFRMS (header): 41 | TXT subframes: 41
INT file: 20JanICORE_0001_W1_mosaic-int.fits | Header date: 2025-05-07T01:12:50
UNC file: 20JanICORE_0001_W1_mosaic-unc.fits | Header date: 2025-05-07T01:12:51
**ERROR**: Header date mismatch for 20JanICORE_0001_W1_mosaic-int.fits and 20JanICORE_0001_W1_mosaic-unc.fits. int date: 2025-05-07T01:12:50, unc date: 2025-05-07T01:12:51
20JanICORE_0001_W1_mosaic-int.fits | NUMFRMS (header): 30 | TXT subframes: 30
INT file: 21AugICORE_0001_W1_mosaic-int.fits | Header date: 2025-05-07T01:48:19
UNC file: 21AugICORE_0001_W1_mosai