In [1]:
import csv
import io
from multiprocessing.pool import ThreadPool

from astropy.table import Table
from astropy.io import fits
import requests

In [2]:
def process_fits_file(url: str, timeout: float = 60.0) -> tuple[int, int]:
    """Download FITS file from URL and remove unnecessary tables/columns.

    Only the ``flux`` and ``model`` columns of the ``COADD`` table are kept;
    nothing is written to disk, the sizes are only measured.

    Parameters
    ----------
        url
            Location of the FITS file to download.
        timeout
            Seconds handed to ``requests.get`` as its ``timeout`` so that a
            stalled server cannot block the calling thread forever.

    Returns
    -------
        2-tuple containing the original and final file sizes in bytes

    Raises
    ------
        requests.HTTPError
            If the server answers with a 4xx/5xx status code.
        requests.Timeout
            If the server does not respond within ``timeout`` seconds.
    """
    # We could just pass the `url` directly to `fits.open`, but we're also
    # curious to see the original file size which isn't exposed by astropy
    with requests.get(url, timeout=timeout) as resp:
        # Fail here instead of handing an error page to `fits.open`
        resp.raise_for_status()
        payload = resp.content

        # Content-Length is optional (chunked responses omit it), so fall back
        # to the number of bytes actually received when it is absent
        declared_size = resp.headers.get("content-length")
        if declared_size is not None:
            original_size = int(declared_size)
        else:
            original_size = len(payload)

    with fits.open(io.BytesIO(payload)) as hdul:
        table = Table.read(hdul["COADD"])
        # Remove all other unnecessary columns
        table.keep_columns(["flux", "model"])

    # Number of bytes the trimmed table's HDU would write to a file
    final_size = fits.table_to_hdu(table).filebytes()

    return original_size, final_size


In [3]:
with open("./skyserver-dump.csv", newline="") as f:
    urls = [row["url"] for row in csv.DictReader(f)]

# Without any URLs there is nothing to total; say so explicitly rather than
# failing later with an opaque tuple-unpacking error
if not urls:
    raise ValueError("./skyserver-dump.csv contains no URLs to process")

# Create a thread pool so we aren't waiting for every single file to download.
# Leaving the `with` block terminates the pool, so the lazy `imap_unordered`
# iterator has to be drained into a list before the block ends.
with ThreadPool(10) as pool:
    sizes = list(pool.imap_unordered(process_fits_file, urls))

# Every result is an (original, final) pair; total each position across files
original_size, final_size = (sum(column) for column in zip(*sizes))

In [4]:
# Totals are in bytes; scale to units of 1e6 bytes for display
original_mb = original_size / 1e6
final_mb = final_size / 1e6
# How many times larger the downloaded files are than the trimmed tables
shrink_ratio = original_size / final_size

print(f"  Original: {original_mb:>5.2f} mb")
print(f"Compressed: {final_mb:>5.2f} mb")
print(f"     Ratio: {shrink_ratio:>5.2f} x")

  Original: 21.89 mb
Compressed:  4.03 mb
     Ratio:  5.43 x
