In [None]:
import yaml
from pathlib import Path

# The project root is taken to be the parent of the notebook's working
# directory; the config file and every configured directory are resolved
# against it so they cannot drift apart.
project_root = Path.cwd().parent

# Read the YAML as UTF-8 explicitly: the default text encoding of open() is
# locale-dependent and can mis-decode non-ASCII values on some platforms.
with open(project_root / "config.yaml", encoding="utf-8") as f:
    config = yaml.safe_load(f)

PDFS_DIR = project_root / config["PDFS_DIR"]
METADATA_DIR = project_root / config["METADATA_DIR"]

In [None]:
import time
import pandas as pd
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait

In [None]:
def setup_pdf_driver(download_dir: Path):
    """Build a headless Chrome WebDriver configured to download into ``download_dir``.

    Args:
        download_dir: Directory passed to Chrome (as an absolute path string)
            as its default download location.

    Returns:
        A ``webdriver.Chrome`` instance created with the options below. The
        caller is responsible for calling ``quit()`` on it.
    """
    # Command-line switches handed to Chrome, in the order they are applied.
    chrome_flags = (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    )

    # Profile preferences: download without prompting and route PDFs to the
    # downloader rather than opening them in the browser.
    download_prefs = {
        "download.default_directory": str(download_dir.absolute()),
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "plugins.always_open_pdf_externally": True,
        "plugins.plugins_disabled": ["Chrome PDF Viewer"],
    }

    chrome_options = Options()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)
    chrome_options.add_experimental_option("prefs", download_prefs)

    return webdriver.Chrome(options=chrome_options)

In [None]:
def download_pdf(
    driver,
    pdf_url: str,
    page_timeout: float = 10,
    settle_seconds: float = 5,
):
    """Navigate ``driver`` to ``pdf_url`` and pause so the browser can fetch it.

    Args:
        driver: Selenium WebDriver used to open the URL.
        pdf_url: Address of the PDF to request.
        page_timeout: Maximum number of seconds to wait for
            ``document.readyState`` to become ``"complete"``.
        settle_seconds: Fixed pause, in seconds, after the page reports ready.

    Raises:
        Any exception raised by ``driver.get`` or by ``WebDriverWait.until``
        (the latter raises a timeout error when the page never reports ready).

    Note:
        This function does not check that a file was actually written; it
        only waits. Callers that need confirmation must inspect the download
        directory themselves.
    """
    driver.get(pdf_url)
    WebDriverWait(driver, page_timeout).until(
        lambda d: d.execute_script("return document.readyState") == "complete"
    )
    # Fixed grace period for the download to proceed before the caller
    # moves on to the next URL.
    time.sleep(settle_seconds)

In [None]:
def _wait_for_pending_downloads(download_dir, timeout=60.0, poll_interval=0.5):
    """Wait until no ``*.crdownload`` files remain in ``download_dir``.

    Chrome keeps in-progress downloads under a ``.crdownload`` suffix and
    renames them when they finish.

    Returns:
        True once no partial files remain, False if ``timeout`` seconds pass
        first (for example because of stale partials from an earlier run).
    """
    deadline = time.monotonic() + timeout
    while any(download_dir.glob("*.crdownload")):
        if time.monotonic() >= deadline:
            return False
        time.sleep(poll_interval)
    return True


# Reset on every run so re-executing the cell does not duplicate rows.
download_stats = []

for lang_code, lang_config in config["LANGUAGES"].items():
    lang_pdf_dir = PDFS_DIR / lang_code
    lang_pdf_dir.mkdir(parents=True, exist_ok=True)

    # Load metadata
    metadata_path = METADATA_DIR / f"{lang_code}_article_data.csv"
    if not metadata_path.exists():
        print("No metadata found, skipping")
        continue

    articles_df = pd.read_csv(metadata_path)

    # Snapshot the directory so files that appear during this run can be
    # counted afterwards, independently of whether navigation raised.
    files_before = set(lang_pdf_dir.iterdir())

    driver = setup_pdf_driver(lang_pdf_dir)

    # Download
    success_count = 0
    try:
        for _, article in tqdm(
            articles_df.iterrows(),
            total=len(articles_df),
            desc=f"Downloading {lang_config['name']}",
        ):
            try:
                download_pdf(driver, article["pdf_url"])
                success_count += 1
            except Exception as e:
                # Best effort: report the failure and carry on with the next article.
                print(f"Error downloading article PDF: {str(e)}")

        # Closing the browser aborts transfers that are still running, so give
        # unfinished downloads a bounded chance to complete first.
        if not _wait_for_pending_downloads(lang_pdf_dir):
            print(f"Timed out waiting for unfinished downloads in {lang_pdf_dir}")
    finally:
        # Always release the browser, even if the loop is interrupted.
        driver.quit()

    saved_files = [
        path
        for path in lang_pdf_dir.iterdir()
        if path not in files_before and path.suffix != ".crdownload"
    ]

    download_stats.append(
        {
            "Language": lang_config["name"],
            "Attempted": len(articles_df),
            # Number of URLs whose navigation completed without raising.
            "Downloaded": success_count,
            # Number of new, completed files found in the download directory.
            "Files saved": len(saved_files),
        }
    )

In [None]:
# Render the per-language download summary as a rich table.
display(pd.DataFrame(data=download_stats))