# Download a HIF Dataset from GITHUB

In [37]:
import requests
import os
import re
from pathlib import Path
import fastjsonschema

In [39]:
def download_hif_dataset(dataset_name, download_path):
    # Step 1: get list of all HIF dataset names
    hif_dataset_files = get_hif_datasets()
    
    # Step 2: Check if dataset exists and determine file pattern
    dataset_files = find_dataset_files(hif_dataset_files, dataset_name)
    
    # Step 3: Download each file separately
    for file_name in dataset_files:
        download_hif_file(file_name, download_path=download_path)
    
    # Step 4: Join the files together
    if len(dataset_files) > 1:
        rejoin_files(dataset_name, directory=download_path)
    dataset_file = os.path.join(download_path, dataset_name + ".hif")
    
    # Step 5: Validate the final file to the HIF schma
    validate_hif_schema(dataset_file)

def get_file_size_mb(filepath):
    """Get file size in MB"""
    return os.path.getsize(filepath) / (1024 * 1024)

def get_hif_datasets():
    api_url = "https://api.github.com/repos/Jpickard1/HIF-datasets/contents/datasets"
    response = requests.get(api_url)
    
    if response.status_code == 200:
        files = response.json()
        file_list = [f['name'] for f in files if f['type'] == 'file']
    else:
        file_list = []
    return file_list

def find_dataset_files(hif_dataset_files, dataset_name):
    # Pattern 1: Single file (dataset_name.hif)
    single_file_pattern = f"{dataset_name}.hif"
    
    # Pattern 2: Multi-file (dataset_name_1of5.hif, dataset_name_2of5.hif, etc.)
    multi_file_pattern = re.compile(rf"{re.escape(dataset_name)}_(\d+)of(\d+)\.hif")
    
    # Check for single file
    single_file_found = None
    for file in hif_dataset_files:
        if file == single_file_pattern:
            return [file]
    
    # Check for multi-file pattern
    multi_files = []
    for file in hif_dataset_files:
        match = multi_file_pattern.match(file)
        if match:
            multi_files.append(file)

    if len(multi_files) > 0:
        return multi_files
    else:
        print(f"No HIF dataset found matching {dataset_name}. Please check the dataset name and spelling.")
        return None

def download_hif_file(file, download_path):
    url = "https://raw.githubusercontent.com/Jpickard1/HIF-datasets/main/datasets/" + file
    response = requests.get(url)
    
    if response.status_code == 200:
        with open(os.path.join(download_path, file), "wb") as file:
            file.write(response.content)
        print(f"File {file} downloaded successfully")
    else:
        print(f"Failed to download file: {response.status_code}")

def rejoin_files(base_name, directory="."):
    """Rejoin split files back into original file"""
    directory = Path(directory)
    
    # Find all chunks for this base name
    pattern = f"{base_name}_*of*.hif"
    chunk_files = list(directory.glob(pattern))
    
    if not chunk_files:
        print(f"No chunk files found for {base_name}")
        return
    
    # Sort chunks by number
    def get_chunk_number(filename):
        # Extract number from filename like "name_1of3.hif"
        stem = filename.stem
        if "_" in stem and "of" in stem:
            try:
                number_part = stem.split("_")[-1]  # Get "1of3"
                return int(number_part.split("of")[0])  # Get "1"
            except:
                return 0
        return 0
    
    chunk_files.sort(key=get_chunk_number)
    
    print(f"Rejoining {len(chunk_files)} chunks for {base_name}")
    
    # Rejoin files
    output_file = directory / f"{base_name}.hif"
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for chunk_file in chunk_files:
            print(f"  Adding: {chunk_file.name}")
            with open(chunk_file, 'r', encoding='utf-8') as infile:
                outfile.write(infile.read())
    
    print(f"  Rejoined file created: {output_file}")
    final_size = get_file_size_mb(output_file)
    print(f"  Final size: {final_size:.2f}MB")

    # Delete the individual chunk files
    print("  Cleaning up chunk files:")
    for chunk_file in chunk_files:
        try:
            chunk_file.unlink()
            print(f"    Deleted: {chunk_file.name}")
        except Exception as e:
            print(f"    Error deleting {chunk_file.name}: {e}")
    
    print(f"  Cleanup complete - removed {len(chunk_files)} chunk files")

def validate_hif_schema(file_path):
    url = "https://raw.githubusercontent.com/pszufe/HIF-standard/main/schemas/hif_schema.json"
    schema = requests.get(url).json()
    validator = fastjsonschema.compile(schema)
    hiftext = json.load(open(file_path,'r'))
    try:
      validator(hiftext)
      print("HIF-Compliant JSON.")
    except Exception as e:
       print(f"Invalid JSON: {e}")


In [40]:
dataset_name = "Allen_Brain_Atlas_down"
download_path = "/nfs/turbo/umms-indikar/Joshua/HIF-datasets/test_download"
download_hif_dataset(dataset_name, download_path)

File <_io.BufferedWriter name='/nfs/turbo/umms-indikar/Joshua/HIF-datasets/test_download/Allen_Brain_Atlas_down_1of5.hif'> downloaded successfully
File <_io.BufferedWriter name='/nfs/turbo/umms-indikar/Joshua/HIF-datasets/test_download/Allen_Brain_Atlas_down_2of5.hif'> downloaded successfully
File <_io.BufferedWriter name='/nfs/turbo/umms-indikar/Joshua/HIF-datasets/test_download/Allen_Brain_Atlas_down_3of5.hif'> downloaded successfully
File <_io.BufferedWriter name='/nfs/turbo/umms-indikar/Joshua/HIF-datasets/test_download/Allen_Brain_Atlas_down_4of5.hif'> downloaded successfully
File <_io.BufferedWriter name='/nfs/turbo/umms-indikar/Joshua/HIF-datasets/test_download/Allen_Brain_Atlas_down_5of5.hif'> downloaded successfully
Rejoining 5 chunks for Allen_Brain_Atlas_down
  Adding: Allen_Brain_Atlas_down_1of5.hif
  Adding: Allen_Brain_Atlas_down_2of5.hif
  Adding: Allen_Brain_Atlas_down_3of5.hif
  Adding: Allen_Brain_Atlas_down_4of5.hif
  Adding: Allen_Brain_Atlas_down_5of5.hif
  Rejoine