In [None]:
import time
import shutil
from pathlib import Path
from typing import Iterable, Dict, Any, Optional

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime, timedelta

import warnings
warnings.filterwarnings('ignore')

In [None]:
# helper functions
# this works for Chrome and Windows.

def system_downloads() -> Path:
    """Return the current user's download folder, i.e. ``<home>/Downloads``."""
    home = Path.home()
    return home.joinpath("Downloads")

def make_chrome(headless: bool = True) -> webdriver.Chrome:
    """Start a Chrome WebDriver whose downloads land in the system Downloads folder.

    Args:
        headless: when True, Chrome is started with ``--headless=new`` plus a
            fixed 1920x1080 window size and ``--disable-gpu``.

    Returns:
        The started driver. The caller owns it and must call ``quit()``.
    """
    downloads_dir = str(system_downloads().resolve())
    opts = Options()
    # Chrome profile preferences: send PDFs to disk instead of the built-in
    # viewer, never prompt for a location, and save into downloads_dir.
    prefs = {
        "plugins.always_open_pdf_externally": True,
        "download.default_directory": downloads_dir,
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "safebrowsing.enabled": True,
    }
    opts.add_experimental_option("prefs", prefs)
    if headless:
        opts.add_argument("--headless=new")
        opts.add_argument("--disable-gpu")
        opts.add_argument("--window-size=1920,1080")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opts)
    try:
        # Explicitly allow downloads over the DevTools protocol as well
        # (presumably needed for headless sessions -- TODO confirm).
        driver.execute_cdp_cmd(
            "Page.setDownloadBehavior",
            {"behavior": "allow", "downloadPath": downloads_dir}
        )
    except Exception as exc:
        # Still best-effort (the prefs above may be enough), but say so: a silent
        # failure here is the first thing to check when no file ever shows up.
        print(f"warning: could not set download behavior via CDP ({exc.__class__.__name__}: {exc})")

    return driver

# pick a free destination path: append " (n)" to the file name if that name already exists
def safe_rename(dest_dir: Path, filename: str) -> Path:
    """Return a path inside ``dest_dir`` for ``filename`` that does not exist yet.

    ``dest_dir`` is created if missing. A filename without an extension gets
    ``.pdf``. If the plain name is taken, `` (1)``, `` (2)``, ... is appended to
    the stem until an unused name is found. Nothing is renamed or written here;
    only the path is computed.
    """
    dest_dir.mkdir(parents=True, exist_ok=True)

    name = Path(filename)
    stem = name.stem
    suffix = name.suffix if name.suffix else ".pdf"

    target = dest_dir / (stem + suffix)
    counter = 0
    while target.exists():
        counter += 1
        target = dest_dir / f"{stem} ({counter}){suffix}"
    return target

# check for newest pdf downloaded
def newest_pdf_since(folder: Path, since_epoch: float) -> Optional[Path]:
    """Return the most recently modified ``*.pdf`` in ``folder`` at or after ``since_epoch``.

    Args:
        folder: directory to scan (not recursive).
        since_epoch: lower bound on the file's modification time, in seconds
            since the epoch (same clock as ``time.time()``).

    Returns:
        The newest matching path, or None when nothing qualifies.
    """
    newest: Optional[Path] = None
    newest_mtime = since_epoch
    for pdf in folder.glob("*.pdf"):
        try:
            # stat once per file; the entry can vanish (moved/deleted, dangling
            # link) between the directory listing and this call
            mtime = pdf.stat().st_mtime
        except OSError:
            continue
        if mtime >= since_epoch and (newest is None or mtime > newest_mtime):
            newest, newest_mtime = pdf, mtime
    return newest

# wait for download to finish
def wait_for_download_in_downloads(timeout: int = 180, since_epoch: float = 0.0) -> Optional[Path]:
    """Poll the system Downloads folder until a freshly downloaded PDF is complete.

    A PDF is accepted when it was modified at or after ``since_epoch``, no
    Chrome partial file (``*.crdownload``) touched since ``since_epoch`` is
    present, and its non-zero size is unchanged across consecutive reads
    0.5 s apart.

    Args:
        timeout: how long to keep polling, in seconds. The size check runs up
            to six reads, so the call can overrun the deadline by a few seconds.
        since_epoch: only files modified at or after this ``time.time()`` value
            are considered.

    Returns:
        Path of the finished PDF, or None if none appeared before the deadline.
    """
    downloads = system_downloads()
    deadline = time.time() + timeout
    while time.time() < deadline:
        # Only partial files touched during this attempt count as "in progress";
        # a stale .crdownload left by an older, aborted download must not block us.
        in_progress = False
        for partial in downloads.glob("*.crdownload"):
            try:
                if partial.stat().st_mtime >= since_epoch:
                    in_progress = True
                    break
            except OSError:
                # gone between listing and stat: nothing left to wait for
                continue

        candidate = newest_pdf_since(downloads, since_epoch)
        if candidate and not in_progress:
            # confirm size is stable
            stable = 0
            last_size = -1
            for _ in range(6):
                try:
                    sz = candidate.stat().st_size
                except OSError:
                    # candidate disappeared; go back to polling
                    break
                if sz == last_size and sz > 0:
                    stable += 1
                else:
                    stable = 0
                if stable >= 2:
                    return candidate
                last_size = sz
                time.sleep(0.5)
        time.sleep(0.5)
    return None

# moving the files
def move_to_target(src_file: Path, target_dir: Path) -> Path:
    """Move ``src_file`` into ``target_dir`` without overwriting anything.

    The destination name comes from ``safe_rename``, so an existing file with
    the same name is kept and the new one gets a numbered name.

    Returns:
        The path the file was moved to.
    """
    destination = safe_rename(target_dir, src_file.name)
    source_str, destination_str = str(src_file), str(destination)
    shutil.move(source_str, destination_str)
    return destination

# download the file process
def download_direct(driver: webdriver.Chrome, url: str, target_dir: Path, timeout: int = 180) -> Dict[str, Any]:
    """Open ``url`` in the browser, wait for the PDF to download, and file it in ``target_dir``.

    Args:
        driver: browser session used to request the URL.
        url: direct link to the PDF.
        target_dir: folder the finished file is moved into.
        timeout: seconds to wait for the file to show up in the Downloads folder.

    Returns:
        A log record (dict). ``status`` is the string "200" on success and
        "500" otherwise; these are labels set here, not HTTP responses.
        On failure ``note`` carries the reason and the file fields stay None.
        Exceptions are caught and reported in ``note``, never raised.
    """
    # local import: the file-level import only brings in datetime and timedelta
    from datetime import timezone

    t0 = time.time()
    # datetime.utcnow() is deprecated (Python 3.12+); take an aware UTC time
    # and render it in the same "...Z" form as before.
    started_at = datetime.now(timezone.utc)
    result: Dict[str, Any] = {
        "url": url,
        "status": "500",
        "filename": None,
        "bytes": None,
        "duration_s": None,
        "saved_path": None,
        "note": "",
        "started_utc": started_at.isoformat().replace("+00:00", "Z"),
    }
    try:
        driver.get(url)
        # t0 was taken before the request, so only files written by this
        # attempt (or later) are considered
        finished = wait_for_download_in_downloads(timeout=timeout, since_epoch=t0)
        if not finished:
            result["note"] = "no pdf in system downloads."
            return result
        moved = move_to_target(finished, target_dir)
        result.update({
            "status": "200",
            "filename": moved.name,
            "bytes": moved.stat().st_size,
            "duration_s": round(time.time() - t0, 3),
            "saved_path": str(moved),
        })
        return result
    except Exception as e:
        # keep the batch going: record the error on this row instead of raising
        result["note"] = f"{e.__class__.__name__}: {e}"
        return result

# run a batch of urls and save the per-url download log as a csv
def run_batch(urls: Iterable[str], dest_dir: str, csv_out: str, headless: bool = True, per_timeout: int = 180) -> pd.DataFrame:
    """Download every URL with one shared browser and write the results log to CSV.

    Args:
        urls: links to fetch, processed in order.
        dest_dir: folder the downloaded files are moved into.
        csv_out: path of the CSV log; parent folders are created as needed.
        headless: passed through to ``make_chrome``.
        per_timeout: per-URL wait passed to ``download_direct`` as ``timeout``.

    Returns:
        DataFrame with one row per URL (the records from ``download_direct``).
    """
    destination = Path(dest_dir).expanduser().resolve()
    browser = make_chrome(headless=headless)
    try:
        records = [
            download_direct(browser, link, destination, timeout=per_timeout)
            for link in urls
        ]
    finally:
        # always release the browser, even if the loop is interrupted
        browser.quit()

    report = pd.DataFrame(records)
    log_path = Path(csv_out).expanduser().resolve()
    log_path.parent.mkdir(parents=True, exist_ok=True)
    report.to_csv(log_path, index=False)
    print(f"saved log to: {log_path}")
    return report

In [None]:
CSV_OUT  = "./download_status.csv"

# get list of dates - current year (Jan 1 through today, inclusive)
today = datetime.today()
start = datetime(today.year, 1, 1)

# output folder follows the year being downloaded, so files from a later
# year are not filed under a hard-coded "./2025"
DEST_DIR = f"./{today.year}"

# build the calendar once and derive both string forms from it
days = [start + timedelta(days=offset) for offset in range((today - start).days + 1)]
dates = [day.strftime("%m%d%y") for day in days]        # file name part, MMDDYY
dates_short = [day.strftime("%y-%m") for day in days]   # folder part, YY-MM

# add to urls: .../files/20YY-MM/MMDDYY.pdf
URLS = [
    f"https://www.hupd.harvard.edu/sites/g/files/omnuum2276/files/20{month_part}/{day_part}.pdf"
    for month_part, day_part in zip(dates_short, dates)
]

In [None]:
# run batch! downloads every url in URLS into DEST_DIR and writes the per-url log to CSV_OUT.
# you can adjust timeout time if needed (per_timeout is in seconds, per url);
# harvard secure will download each file in 1-3 seconds
run_batch(URLS, dest_dir=DEST_DIR, csv_out=CSV_OUT, headless=True, per_timeout=25)

In [None]:
# The site sometimes files the last few reports of a month under the NEXT month's
# folder (e.g. the 1/30 and 1/31 reports live under 2025-02), so every date that
# failed in the main batch is retried against the following month's folder.
df = pd.read_csv(CSV_OUT)
# compare as text so the filter works whether pandas parsed status as int or str;
# .copy() because a column is added to the filtered frame below
df_failed = df[df['status'].astype(str) == "500"].copy()

# each url ends in "MMDDYY.pdf"; keep just the MMDDYY part
df_failed['dates'] = df_failed['url'].str[-10:-4]
dates_alter = list(df_failed['dates'].unique())

try_urls_short = []

for date in dates_alter:
    month = int(date[:2])
    year = 2000 + int(date[-2:])
    # roll December over to January of the next year instead of producing month "13"
    if month == 12:
        month, year = 1, year + 1
    else:
        month += 1
    next_month = f"{month:02d}"
    try_urls_short.append(f"https://www.hupd.harvard.edu/sites/g/files/omnuum2276/files/{year}-{next_month}/{date}.pdf")

In [None]:
# retry the failed dates with the next-month urls in try_urls_short; this run is
# logged to its own csv so the first run's log is not overwritten
run_batch(try_urls_short, dest_dir=DEST_DIR, csv_out="./try_download_status.csv", headless=True, per_timeout=25)