In [55]:
import requests
import xml.etree.ElementTree as ET
from pathlib import Path
from tarfile import open as taropen
from tqdm import tqdm
import gzip
import shutil

In [56]:
def get_file_url_from_thredds(
        catalog_url: str,
        model_tarname: str
    )-> str:
    """Fetch the correct tar file URL from the THREDDS catalog."""
    response: requests.Response = requests.get(catalog_url)
    if response.status_code != 200:
        raise ValueError(f"Failed to access catalog: {catalog_url}")

    root: ET.Element = ET.fromstring(response.content)

    # Define the namespace
    namespace: dict[str, str] = {"ns": "http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0"}

    # Find all dataset elements
    datasets: list[ET.Element] = root.findall(".//ns:dataset", namespaces=namespace)

    # Extract the URL path for each dataset
    available_files: list[str] = [dataset.attrib.get("urlPath", "") for dataset in datasets]

    for url_path in available_files:
        if model_tarname in url_path:  # Match the expected tar filename
            return f"https://thredds.nci.org.au/thredds/fileServer/{url_path}"

    raise ValueError(f"File {model_tarname} not found in THREDDS catalog.")

In [57]:
def download_tar(
        tar_url: str,
        download_path: Path,
        overwrite: bool = False,
    ) -> Path | None:  
    """Download a tar file."""
    print(f"Downloading tar file from {tar_url}")

    # Create the download path if it does not exist
    try:
        download_path.mkdir(parents=True, exist_ok=False)
    except FileExistsError:
        if not overwrite:
            print(f"Directory {download_path} already exists. To overwrite it, make sure to set the overwrite flag to True. If you don't want to overwrite, please provide a different path.")
            return None
        else:
            download_path.mkdir(parents=True, exist_ok=True)
            print(f"Successfully created directory {download_path}")

    # Download the tar file by chunks (large file)
    response: requests.Response = requests.get(tar_url, stream=True)
    response.raise_for_status()
    
    # Downloaded file name
    tar_filename: Path = download_path / Path(tar_url).name

    total_size = int(response.headers.get("content-length", 0))

    print(f"Downloading to {tar_filename}")

    # Download the tar file by chunks of chunk_size bytes
    with open(tar_filename, "wb") as file, tqdm(total=total_size, unit="B", unit_scale=True, desc="Downloading") as progress_bar:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:
                file.write(chunk)
                progress_bar.update(len(chunk))
                

    return tar_filename

In [None]:
def extract_tar(
        tar_filename: Path,
) -> None:
    # Extract contents
    print(f"Extracting tar file {tar_filename}")
    with taropen(tar_filename, "r") as tar:
        members = tar.getmembers()
        for member in tqdm(tar, desc="Extraction", unit="file", total=len(members)):
            tar.extract(member, path=tar_filename.parent)


In [65]:
def get_subset(
    tar_name: str,
    extract_path: str,
    catalog_url: str="https://thredds.nci.org.au/thredds/catalog/tm64/noddyverse/bulk_models/catalog.xml",
    overwrite: bool=False,
) -> Path | None:
    # 1. Get the tar file URL
    tar_url: str = get_file_url_from_thredds(catalog_url, tar_name)
    
    # 2. Download and extract the tar file
    tar_filename: Path | None = download_tar(tar_url, extract_path, overwrite=overwrite)

    if tar_filename is None:
        return None
    
    # 3. Extract the contents
    print(f"Extracting tar file {tar_filename}")
    extract_tar(tar_filename)
    print(f"Successfully extracted tar file {tar_filename}")
    return tar_filename

In [66]:
EVENTS: dict[int, str] = {
    1: "FOLD",
    2: "FAULT",
    3: "UNCONFORMITY",
    4: "SHEAR-ZONE",
    5: "DYKE",
    6: "PLUG",
    7: "TILT",
}
events_list: list[int] = [3, 6, 7]
events: str = f"{EVENTS[events_list[0]]}_{EVENTS[events_list[1]]}_{EVENTS[events_list[2]]}"
tar_name: str = f"{events}.tar"

dataset_folder: Path = Path("../../../dataset/test")
tar_filename: Path = get_subset(tar_name, dataset_folder, overwrite=True)  # Be careful, double check what you're doing if you use overwrite !!!

Downloading tar file from https://thredds.nci.org.au/thredds/fileServer/tm64/noddyverse/bulk_models/UNCONFORMITY_PLUG_TILT.tar
Successfully created directory ..\..\..\dataset\test
Downloading to ..\..\..\dataset\test\UNCONFORMITY_PLUG_TILT.tar


Downloading: 100%|██████████| 550M/550M [00:36<00:00, 15.1MB/s]   


Extracting tar file ..\..\..\dataset\test\UNCONFORMITY_PLUG_TILT.tar
Extracting tar file ..\..\..\dataset\test\UNCONFORMITY_PLUG_TILT.tar


Extraction: 100%|██████████| 5066/5066 [00:59<00:00, 85.20file/s]

Successfully extracted tar file ..\..\..\dataset\test\UNCONFORMITY_PLUG_TILT.tar





In [71]:
print(tar_filename.parent)
with gzip.open(tar_filename.parent / "models_by_code/models" / tar_filename.name.split('.')[0] / "20-09-04-15-14-12-970750888.g00.gz", 'rb') as f_in:
    with open(".", 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

..\..\..\dataset\test


PermissionError: [Errno 13] Permission denied: '.'