## Working with components

Line-by-line rerun of the "download components" rule.

In [15]:
import os
import time
import hydra
import logging
import zipfile
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options


In [16]:
# webdriver_manager is not used in this notebook; the import is left disabled.
# from webdriver_manager.chrome import ChromeDriverManager
import shutil

# Logger for the progress messages emitted by the logger.info calls below.
# NOTE(review): INFO records are only shown if logging has been configured to
# emit that level — confirm whether a handler/level is set up elsewhere.
logger = logging.getLogger(__name__)

Diving into the main function

In [17]:
from hydra import initialize, compose
from omegaconf import OmegaConf

# The @hydra decorator does not work well inside notebooks (known Hydra issue:
# https://gist.github.com/bdsaglam/586704a98336a0cf0a65a6e7c247d248), so the
# config is composed explicitly through initialize/compose instead.
# config_path is the relative path from this notebook to the config directory.
with initialize(config_path="../conf", version_base=None):
    cfg = compose(config_name="config.yaml")

# Second name for the same config object.
hydra_cfg = cfg

In [18]:
# Sanity check: show which temporal frequency the loaded config selects.
hydra_cfg['temporal_freq']

'monthly'

In [49]:
# Pick one component by its position in the configured component list, then
# look up its download URL for the configured temporal frequency.
component_index = 7
component = cfg.component[component_index]
url_cfg = cfg.satellite_component[cfg.temporal_freq]
url = url_cfg.url[component]

In [50]:
# Display the URL resolved for the selected component.
url

'https://wustl.app.box.com/s/tfyt4uyuzbt4hbnw7bhos16aep9b5u7g/folder/257368204252'

In [51]:
# Per-component download folder, first written relative to this notebook and
# then resolved to an absolute path.
relative_download_dir = (
    f"../data/input/satellite_components/{cfg.temporal_freq}/{component}"
)
download_dir = os.path.abspath(relative_download_dir)

# Derived locations:
#   dest_dir     - where the extracted files should end up (the download folder itself)
#   src_dir      - folder named after the zip, inside the download folder
#   download_zip - the zip file expected inside the download folder
dest_dir = f"{download_dir}/"
src_dir = f"{download_dir}/{url_cfg.zipname}"
download_zip = f"{src_dir}.zip"

In [52]:
# == Chrome driver configuration
# Preferences that point Chrome's downloads at download_dir and disable the
# save prompt, so the download can run unattended.
download_prefs = {
    "download.default_directory": download_dir,
    "savefile.default_directory": download_dir,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "safebrowsing.enabled": True,
}

# Headless Chrome options carrying the download preferences above.
chrome_options = Options()
chrome_options.add_argument("--headless=new")
chrome_options.add_experimental_option("prefs", download_prefs)

In [53]:
# Start a Selenium Chrome WebDriver with the options configured earlier.
# webdriver_manager is not used here; its install call is kept only for reference:
# ChromeDriverManager().install()
driver = webdriver.Chrome(options=chrome_options)
logger.info("Chrome driver setup completed.")

Into the try statement

In [54]:
# Load the download page for the selected component in the browser.
driver.get(url)

In [55]:
# Reload the page before looking for the download button.
# NOTE(review): the reason for this refresh is not recorded — confirm it is still needed.
driver.refresh()

In [56]:
# Wait (up to 10 seconds) until the button labelled "Download" is clickable.
download_locator = (By.CSS_SELECTOR, "button[aria-label='Download']")
button_wait = WebDriverWait(driver, 10)
download_button = button_wait.until(
    EC.element_to_be_clickable(download_locator)
)

In [57]:
# Trigger the download, then poll until the zip file shows up on disk.
download_timeout_s = 30 * 60  # upper bound on the wait; raise it for very large archives
poll_interval_s = 5  # seconds between existence checks

try:
    # Click the button
    download_button.click()
    logger.info("Downloading...")

    # Bounded wait: without a deadline, a failed or blocked download would
    # leave this cell spinning forever.
    deadline = time.monotonic() + download_timeout_s
    while not os.path.exists(download_zip):
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"{download_zip} did not appear within {download_timeout_s} seconds"
            )
        time.sleep(poll_interval_s)
    logger.info("Download completed.")
finally:
    # The remaining steps only touch the filesystem, so close the browser here
    # (on success or failure) instead of leaving a headless Chrome process running.
    driver.quit()

In [58]:
# Unpack the downloaded archive into the download folder.
with zipfile.ZipFile(download_zip, mode="r") as archive:
    archive.extractall(path=download_dir)

# Flatten the layout: move every entry found in src_dir up into dest_dir.
# (Relies on the archive having produced src_dir when it was extracted.)
os.makedirs(dest_dir, exist_ok=True)
for entry_name in os.listdir(src_dir):
    entry_path = os.path.join(src_dir, entry_name)
    shutil.move(entry_path, dest_dir)

# Clean up: delete the zip file, then the now-empty extraction folder.
os.remove(download_zip)
os.rmdir(src_dir)


This seems to work, so we should be able to run it in main.