In [10]:
import os
import glob
import pandas as pd
import re
from astropy.io import fits

def extract_columns_from_txt(target_path):
    """
    Read every .txt table directly inside a target folder and pull out selected columns.

    Each file is parsed as whitespace-delimited text after skipping its first
    31 lines. The columns at positions 1, 2, 6, 12 and 14 are returned under
    the keys 'ra', 'dec', 'band', 'date' and 'name'.

    Args:
        target_path (str): Full path to the target folder.
    Returns:
        results (list): One dict per readable file that has at least 15
            columns. Each dict holds the file path under 'file' and one pandas
            Series for each of 'ra', 'dec', 'band', 'date' and 'name'. The list
            is empty when the folder is missing or holds no usable files.
    """
    if not os.path.isdir(target_path):
        print(f'Target directory not found: {target_path}')
        return []
    results = []
    # Sorted so the result order does not depend on filesystem listing order.
    for txt_path in sorted(glob.glob(os.path.join(target_path, '*.txt'))):
        try:
            # sep=r'\s+' is the supported spelling of the deprecated
            # delim_whitespace=True option.
            # NOTE(review): the 31 skipped lines are presumably the table header - confirm.
            df = pd.read_csv(txt_path, sep=r'\s+', skiprows=31, header=None)
        except Exception as e:
            # Best effort: report the unreadable file and keep going with the rest.
            print(f'Error reading {txt_path}: {e}')
            continue
        if df.shape[1] < 15:
            # Column 14 is the highest position read below, so 15 columns are required.
            print(f'File {txt_path} does not have expected number of columns.')
            continue
        results.append({
            'file': txt_path,
            'ra': df.iloc[:, 1],     # float
            'dec': df.iloc[:, 2],    # float
            'band': df.iloc[:, 6],   # float
            'date': df.iloc[:, 12],  # string
            'name': df.iloc[:, 14],  # string
        })
    return results

def get_fits_files(target_path, file_type='int'):
    """Glob the target folder for FITS files whose name carries the requested tag.

    file_type='int' selects names containing '-int'; every other value selects
    names containing '-unc'. Returns the list of matching paths.
    """
    if file_type == 'int':
        tag = '-int'
    else:
        tag = '-unc'
    wildcard = '*' + tag + '*.fits'
    return glob.glob(os.path.join(target_path, wildcard))

def parse_filename(filename):
    """Split a '-int' filename into its date prefix, day number and filter token.

    Returns:
        tuple: (year_month, day, filter_name) where year_month is two digits
        followed by three letters, day is the one or two digits right after it,
        and filter_name is the alphanumeric run directly before '-int' (None
        when there is none). (None, None, None) when the name does not match.
    """
    # Example pattern, adjust as needed
    found = re.search(r'(\d{2}[A-Za-z]{3})(\d{1,2})_.*?([A-Za-z0-9]+)?-int', filename)
    if found is None:
        return None, None, None
    year_month, day, filter_token = found.groups()
    return year_month, day, filter_token or None

def validate_fits_header(fits_path, txt_record):
    """Compare the primary FITS header against a .txt record.

    Args:
        fits_path (str): Path of the FITS file to open.
        txt_record (dict): Record with a 'date' entry and, optionally, a
            'numfrms' entry.
    Returns:
        bool: True when the header's DATE card equals txt_record['date'] and,
        if the record carries 'numfrms', the NUMFRMS card equals it as well.
    Raises:
        KeyError: If txt_record has no 'date' entry.
    """
    with fits.open(fits_path) as hdul:
        hdr = hdul[0].header
        # NOTE(review): the original also read DATE-OBS but never used it. In the
        # FITS standard DATE is the HDU creation date and DATE-OBS the observation
        # date - confirm which one the .txt date column corresponds to.
        header_date = hdr.get('DATE')
        header_numfrms = hdr.get('NUMFRMS')
    if not bool(header_date == txt_record['date']):
        return False
    # Records parsed from the .txt tables do not always carry 'numfrms'; only
    # compare it when present instead of failing with KeyError.
    if 'numfrms' in txt_record:
        return bool(header_numfrms == txt_record['numfrms'])
    return True

def process_target_folder(target_path):
    """Placeholder for the per-folder validation: lists the inputs but checks nothing yet.

    Collects the .txt files and the '-int' FITS files of the folder and walks
    over the .txt files without doing any work. A later cell of this notebook
    defines a function with the same name that supersedes this one.
    """
    txt_listing = glob.glob(os.path.join(target_path, '*.txt'))
    int_fits_listing = get_fits_files(target_path, file_type='int')
    for _txt_path in txt_listing:
        # TODO: parse the .txt file for expected filenames, dates, filters, numfrms,
        # then parse each FITS filename and header and compare to the .txt record.
        continue  # Implement logic here

# Main logic to process all targets in Physics Data.
# The root folder can be overridden with the PHYSICS_DATA_DIR environment variable
# so the notebook also runs on machines where the default absolute path is absent.
base_dir = os.environ.get("PHYSICS_DATA_DIR", "/Users/tmzyece25/Desktop/Academia/Physics Data")

if os.path.isdir(base_dir):
    # Every sub-directory of the root is one target; sorted for a reproducible order.
    target_folders = sorted(
        f for f in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, f))
    )
else:
    # os.listdir would raise on a missing root; report it and process nothing.
    print(f'Base directory not found: {base_dir}')
    target_folders = []

# Main loop
# NOTE(review): on a fresh top-to-bottom run, process_target_folder is still the
# placeholder defined in this cell; the working version is defined in a later
# cell, so this loop has to be re-run after that cell to validate anything.
for target in target_folders:
    target_path = os.path.join(base_dir, target)
    process_target_folder(target_path)

In [11]:
import numpy as np
def parse_txt_file(txt_file):
    """Parse a .txt table into one record per data row.

    The file is read as whitespace-delimited text after skipping its first 31
    lines. Columns 1, 2, 6, 12 and 14 become the record keys 'ra', 'dec',
    'band', 'date' and 'name'.

    Args:
        txt_file (str): Path of the .txt file to parse.
    Returns:
        list: One dict per data row with the keys listed above. Empty when the
        file cannot be read or has fewer than 15 columns.
    """
    try:
        # sep=r'\s+' is the supported spelling of the deprecated
        # delim_whitespace=True option.
        df = pd.read_csv(txt_file, sep=r'\s+', skiprows=31, header=None)
    except Exception as e:
        # Best effort: report the problem and hand back no records.
        print(f'Error parsing {txt_file}: {e}')
        return []
    if df.shape[1] < 15:
        # Column 14 is required below; say so explicitly instead of surfacing a
        # bare KeyError from the row lookup.
        print(f'File {txt_file} does not have expected number of columns.')
        return []
    # Example: columns 1,2,6,12,14 (adjust as needed)
    selected = df.iloc[:, [1, 2, 6, 12, 14]].copy()
    selected.columns = ['ra', 'dec', 'band', 'date', 'name']
    # Add more fields here if needed, e.g. numfrms if the table provides it.
    return selected.to_dict('records')

def process_target_folder(target_path):
    """Process one target folder for all validation steps.

    Every record parsed from the folder's .txt files is matched against the
    '-int' FITS files whose base name contains the record's name, and each
    candidate's header is validated against the record. The outcome of every
    comparison is printed.

    Args:
        target_path (str): Full path to the target folder.
    Returns:
        list: One dict per validated match: a copy of the record plus the path
        of the matching FITS file under 'fits_file'. Empty when nothing matched.
    """
    txt_files = sorted(glob.glob(os.path.join(target_path, '*.txt')))
    fits_files = get_fits_files(target_path, file_type='int')
    fits_files_dict = {os.path.basename(f): f for f in fits_files}
    matches = []
    for txt_file in txt_files:
        for rec in parse_txt_file(txt_file):
            raw_name = rec.get('name')
            if raw_name is None or pd.isna(raw_name):
                print(f"Record without a name in {txt_file}; skipped")
                continue
            # The parser may hand back a number when a name looks numeric; the
            # substring test below needs a string on both sides.
            rec_name = str(raw_name)
            # Attempt to match FITS file by name (or partial name)
            fits_name_candidates = [name for name in fits_files_dict if rec_name in name]
            if not fits_name_candidates:
                print(f"No FITS file found for record name '{rec_name}' in {txt_file}")
                continue
            for fits_name in fits_name_candidates:
                fits_path = fits_files_dict[fits_name]
                # Validate FITS header against .txt record; one unreadable file or
                # incomplete record must not abort the rest of the folder.
                try:
                    valid = validate_fits_header(fits_path, rec)
                except (OSError, KeyError) as e:
                    print(f"Could not validate FITS file {fits_name} against record in {txt_file}: {e!r}")
                    continue
                if not valid:
                    print(f"Header mismatch for FITS file {fits_name} and record in {txt_file}")
                else:
                    print(f"Match: {fits_name} <-> {rec_name} (date: {rec['date']})")
                    matches.append({**rec, 'fits_file': fits_path})
    return matches

In [12]:
import matplotlib.pyplot as plt
def visualize_matches(matches):
    """Draw two histograms for the matched records: their dates, then their bands.

    Every element of matches is indexed with 'date' and 'band'. When matches is
    empty a notice is printed and nothing is plotted.
    """
    if not matches:
        print("No matches to visualize.")
        return

    def _show_histogram(values, size, fill, heading, x_label, bin_count=None):
        # One figure per call, displayed as soon as it is complete.
        plt.figure(figsize=size)
        if bin_count is None:
            # One bin per distinct value.
            bin_count = len(set(values))
        plt.hist(values, bins=bin_count, color=fill, edgecolor='black')
        plt.title(heading)
        plt.xlabel(x_label)
        plt.ylabel('Count')
        plt.show()

    date_values = [entry['date'] for entry in matches]
    band_values = [entry['band'] for entry in matches]
    _show_histogram(date_values, (10, 5), 'skyblue',
                    'Distribution of FITS File Dates', 'Date', bin_count=20)
    _show_histogram(band_values, (8, 4), 'salmon',
                    'Distribution of Bands', 'Band')

# Example usage after collecting matches:
# visualize_matches(list_of_matched_records)

Matplotlib is building the font cache; this may take a moment.
