# URL data parser and parallel download

Given a URL, every link found on that page is parsed and the linked files are downloaded in parallel. Useful when the files are only exposed as hyperlinks on a listing page.

In [None]:
from bs4 import BeautifulSoup
import os
import requests
from tqdm import tqdm
from utils.download_utils import chunk_creator, download, download_checker

In [None]:
# Base URL of the page that lists the files,
# e.g. https://www.uahirise.org/ESP_076649_1650 -> EDR Products
url = "https://hirise-pds.lpl.arizona.edu/PDS/EDR/ESP/ORB_076600_076699/ESP_076649_1650/"
# Local folder that will receive the downloaded files
ddir = './Data/edrtest'

In [None]:
from urllib.parse import urljoin

# Create the destination folder (no error if it already exists)
os.makedirs(ddir, exist_ok=True)

# Fetch the webpage. The timeout keeps the cell from hanging forever on an
# unresponsive server; raise_for_status() aborts on an HTTP error instead of
# silently parsing an error page for links.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the HTML content
soup = BeautifulSoup(response.content, "html.parser")

# Find all links in the HTML
links = soup.find_all("a")

# The first and last anchors are dropped, as before (presumably navigation
# links of the listing page -- verify if the page layout or URL changes).
# Anchors without an href are skipped rather than turned into a "...None" URL,
# and urljoin resolves relative as well as absolute hrefs against the base URL.
img_links = [
    urljoin(url, link["href"])
    for link in links[1:-1]
    if link.get("href")
]

print('Available files:\n')
img_links

In [None]:
def parallel_df(files, ddir, jobs):
    """Run ``download`` on every entry of ``files`` through joblib workers.

    Args:
        files: Iterable of items; each one is passed as the first argument
            to ``download`` (presumably file URLs -- confirm against
            ``utils.download_utils``).
        ddir: Passed unchanged as the second argument to ``download``.
        jobs: Forwarded to joblib's ``Parallel`` as ``n_jobs``.

    Returns:
        The list produced by joblib: one entry per item in ``files``, each
        holding whatever ``download`` returned for that item.
    """
    # joblib stays a function-local import, as in the original cell.
    from joblib import Parallel, delayed

    return Parallel(n_jobs=jobs)(
        delayed(download)(file, ddir) for file in files
    )


In [None]:
# Chunk creation for parallelization and download
dlist = download_checker(img_links, ddir)
chunks, jobs = chunk_creator(dlist)
with tqdm(total=len(dlist),
          desc='Generating files',
          unit='File') as pbar:
    for files in chunks:
        parallel_df(files, ddir, jobs)
        # Advance by the number of files handled in this chunk, not by the
        # worker count: a chunk whose size differs from `jobs` (e.g. a shorter
        # final chunk) would otherwise push the bar away from total=len(dlist).
        pbar.update(len(files))