In [None]:
# Standard library modules for hashing, environment access, regex parsing, timing,
# timestamps, filesystem paths, and URL parsing
import hashlib
import os
import re
import time
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urlparse

# Third-party libraries for data handling and HTTP requests
import pandas as pd
import requests

# Selenium modules for browser automation and waiting for page elements
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [None]:
# Configuration: local directories, metadata file path, and runtime constants
# The bronze directory can be overridden with the BRONZE_DIR environment variable so the
# notebook is runnable on other machines; when unset (or empty) the original path is used.
BRONZE_DIR    = Path(
    os.environ.get("BRONZE_DIR")
    or r"D:\Data Engineer - Technical Test\Julian_Vallejo_DataEngTest\task1\bronze"
)
METADATA_PATH = BRONZE_DIR / "metadata_bronze.parquet"
new_records   = []                         # list of metadata dicts for files downloaded in this run
YEARS         = ["2025", "2024", "2023", "2022", "2021"]
URL           = "https://www.mineros.com.co/investors/financial-reports"

# Ensure the bronze directory exists and load existing metadata (or initialize empty)
BRONZE_DIR.mkdir(parents=True, exist_ok=True)
md_old = (
    pd.read_parquet(METADATA_PATH)
    if METADATA_PATH.exists()
    else pd.DataFrame(columns=["filename", "filesize", "sha256", "download_timestamp"])
)

In [None]:
# Set up the Selenium Chrome driver options and start the browser in headless mode.
# `driver` is a notebook-level global: the scraping cell uses it and closes it with
# driver.quit(), so this cell must be re-run before scraping again.
opts = Options()
opts.add_argument("--headless")    # run browser without opening a window
driver = webdriver.Chrome(options=opts)

In [None]:
# Scrape the financial-reports page: open every year tab, expand its accordions, and
# download each "Consolidated Financial Statements" PDF that is not already stored.
# Appends one metadata dict per newly saved file to `new_records`.
REQUEST_TIMEOUT = 30   # seconds to wait on each PDF download before giving up

# Hashes already persisted by previous runs plus those recorded earlier in this kernel
# session; checking against both makes re-running this cell idempotent.
known_hashes = set(md_old["sha256"]) | {rec["sha256"] for rec in new_records}

try:
    # Navigate to the target page and set up an explicit wait
    driver.get(URL)
    wait = WebDriverWait(driver, 10)

    # Attempt to close the cookies popup if it appears (best effort: absence is not an error)
    try:
        close_btn = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, "/html/body/div[3]/div/div[1]/div[2]")))
        close_btn.click()
    except TimeoutException:
        pass

    # Iterate over each year tab to reveal its content.
    # NOTE(review): tabs are addressed by position, so this assumes the page lists the
    # tabs in the same order as YEARS — confirm if the site layout changes.
    for idx, year in enumerate(YEARS, start=1):
        # Locate the year tab
        link = wait.until(EC.element_to_be_clickable((By.XPATH, f"/html/body/section[2]/div/div/div[2]/div[1]/a[{idx}]")))

        # Scroll to the top to ensure the tab is visible before clicking
        driver.execute_script("window.scrollTo(0,0);")
        time.sleep(0.2)

        link.click()
        # The tab's href ends in "#<pane id>"; that id identifies the panel it reveals
        pane_id = link.get_attribute("href").split("#")[1]

        # Wait for the corresponding panel to become visible
        panel = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.ID, pane_id)))

        # Expand each accordion section within the panel (JS click avoids overlay interception)
        headers = panel.find_elements(By.CSS_SELECTOR, "div.accordion-header-wrapper")
        for header in headers:
            driver.execute_script("arguments[0].scrollIntoView({block:'center'});", header)
            time.sleep(0.2)
            driver.execute_script("arguments[0].click();", header)
            time.sleep(0.5)

        # Find only the “Consolidated Financial Statements” PDF links
        pdf_anchors = panel.find_elements(By.XPATH,".//a[contains(@href, '.pdf') and .//div[text()='Consolidated Financial Statements']]")

        # Download each PDF and record its metadata
        for a in pdf_anchors:
            desc = a.find_element(By.CSS_SELECTOR, "div.financial-reports-description").text
            m_q = re.search(r"Q([1-4])", desc)
            m_y = re.search(r"(\d{4})", desc)
            # Skip entries whose description does not name both a quarter and a year
            if not (m_q and m_y):
                continue

            quarter = m_q.group(1)
            yr      = int(m_y.group(1))
            url_pdf = a.get_attribute("href")
            # Take the name from the URL path only, so a "?query" or "#fragment" on the
            # link never ends up in the filename (those characters are invalid on Windows)
            fname   = urlparse(url_pdf).path.rsplit("/", 1)[-1]

            # Bounded, status-checked download: never hang forever and never store an
            # HTTP error page as if it were a PDF
            try:
                response = requests.get(url_pdf, timeout=REQUEST_TIMEOUT)
                response.raise_for_status()
            except requests.RequestException as exc:
                print(f"Skipping {url_pdf}: download failed ({exc})")
                continue

            data = response.content
            sha  = hashlib.sha256(data).hexdigest()
            # Content already stored (previous run) or already saved earlier in this run
            if sha in known_hashes:
                continue
            known_hashes.add(sha)

            out_dir = BRONZE_DIR / f"{yr}_Q{quarter}"
            out_dir.mkdir(parents=True, exist_ok=True)
            out_path = out_dir / fname
            out_path.write_bytes(data)

            new_records.append({
                "filename":           fname,
                "filesize":           out_path.stat().st_size,
                "sha256":             sha,
                "download_timestamp": datetime.now(timezone.utc).isoformat()
            })
finally:
    # Close the browser even when scraping fails part-way, so no Chrome process is leaked
    driver.quit()

In [None]:
# Update metadata_bronze.parquet by appending new records and removing duplicate hashes.
# Nothing is written when this run produced no new records.
if new_records:
    md_new = pd.DataFrame(new_records)
    # On a first run md_old has no rows; concatenating an empty frame raises a pandas
    # FutureWarning and would let its object-typed columns influence the result dtypes,
    # so the new records are used directly in that case.
    if md_old.empty:
        md_merged = md_new
    else:
        md_merged = pd.concat([md_old, md_new], ignore_index=True)
    # Keep the first occurrence of each hash (existing metadata wins over new rows)
    md_merged = md_merged.drop_duplicates(subset=["sha256"], ignore_index=True)
    md_merged.to_parquet(METADATA_PATH, index=False)