In [36]:
import os
import time
import requests, pathlib, re, urllib.parse
from bs4 import BeautifulSoup

In [25]:
# Search-API URL for the publication listing; `{page_idx}` is filled in per
# result page. One query parameter per line so the filters are easy to read.
html_url_template = (
    'https://aemint-search-client-funcapp-prod.azurewebsites.net/api/faceted-search'
    '?siteName=oecd'
    '&interfaceLanguage=en'
    '&orderBy=mostRecent'
    '&page={page_idx}'
    '&pageSize=10'
    '&hiddenFacets=oecd-search-config-pillars%3Apublications'
    '&facets=oecd-serials%3Ag1ghghge'
    '&facets=oecd-languages%3Aen'
    '&minPublicationYear=2020'
    '&maxPublicationYear=2025'
)

In [28]:
# Zero-based indices of the search-result pages to crawl (pages 0..11).
page_idxes = list(range(12))

In [29]:
# Walk the paginated search API and collect, for every hit, the URL of its
# publication page and its title. Both lists are filled from the same hits,
# so they stay index-aligned.
pdf_html_list = []
report_list = []

for idx in page_idxes:
    html_url = html_url_template.format(page_idx=idx)
    print('Crawling html page from', html_url)
    # Bound the wait and fail loudly on an HTTP error instead of hanging
    # forever or trying to JSON-decode an error page.
    response = requests.get(html_url, timeout=30)
    response.raise_for_status()
    web_json = response.json()
    
    results = web_json['results']
    url_list = [hit['url'] for hit in results]
    report_name_list = [hit['title'] for hit in results]
    pdf_html_list.extend(url_list)
    report_list.extend(report_name_list)
    print('Found', len(url_list), 'urls in page', idx)

print('Found', len(pdf_html_list), 'urls in total')

Crawling html page from https://aemint-search-client-funcapp-prod.azurewebsites.net/api/faceted-search?siteName=oecd&interfaceLanguage=en&orderBy=mostRecent&page=0&pageSize=10&hiddenFacets=oecd-search-config-pillars%3Apublications&facets=oecd-serials%3Ag1ghghge&facets=oecd-languages%3Aen&minPublicationYear=2020&maxPublicationYear=2025
Found 10 urls in page 0
Crawling html page from https://aemint-search-client-funcapp-prod.azurewebsites.net/api/faceted-search?siteName=oecd&interfaceLanguage=en&orderBy=mostRecent&page=1&pageSize=10&hiddenFacets=oecd-search-config-pillars%3Apublications&facets=oecd-serials%3Ag1ghghge&facets=oecd-languages%3Aen&minPublicationYear=2020&maxPublicationYear=2025
Found 10 urls in page 1
Crawling html page from https://aemint-search-client-funcapp-prod.azurewebsites.net/api/faceted-search?siteName=oecd&interfaceLanguage=en&orderBy=mostRecent&page=2&pageSize=10&hiddenFacets=oecd-search-config-pillars%3Apublications&facets=oecd-serials%3Ag1ghghge&facets=oecd-lang

In [30]:
# Show the collected publication page URLs. Despite the name, these point at
# the HTML landing pages (".html"), not at the PDF files themselves.
pdf_html_list

['https://www.oecd.org/en/publications/oecd-economic-surveys-luxembourg-2025_803b3ea1-en.html',
 'https://www.oecd.org/en/publications/oecd-economic-surveys-turkiye-2025_d01c660f-en.html',
 'https://www.oecd.org/en/publications/oecd-economic-surveys-israel-2025_d6dd02bc-en.html',
 'https://www.oecd.org/en/publications/oecd-economic-surveys-lithuania-2025_4abf1ea5-en.html',
 'https://www.oecd.org/en/publications/oecd-economic-surveys-costa-rica-2025_048cf07b-en.html',
 'https://www.oecd.org/en/publications/oecd-economic-surveys-czechia-2025_7a70af5c-en.html',
 'https://www.oecd.org/en/publications/oecd-economic-surveys-ireland-2025_9a368560-en.html',
 'https://www.oecd.org/en/publications/oecd-economic-surveys-poland-2025_483d3bb9-en.html',
 'https://www.oecd.org/en/publications/oecd-economic-surveys-chile-2025_efad96ce-en.html',
 'https://www.oecd.org/en/publications/oecd-economic-surveys-greece-2024_a35a56b6-en.html',
 'https://www.oecd.org/en/publications/oecd-economic-surveys-indone

In [32]:
def download_oecd_pdf(page_url: str, report_name: str, dest_dir: str) -> pathlib.Path:
    """
    Given an OECD publication page (HTML), fetch and save its companion PDF.

    The first link on the page whose path ends in ".pdf" is downloaded.

    NOTE: a later cell defines a function with this same name (using a
    Session with browser-like headers); once that cell has run, it shadows
    this definition.

    Args:
        page_url: URL of the publication's HTML page.
        report_name: File name stem; ".pdf" is appended. It is used as-is, so
            the caller must pass a name that is valid on the filesystem.
        dest_dir: Directory to write into; created if it does not exist.

    Returns:
        The local pathlib.Path of the downloaded file.

    Raises:
        requests.RequestException: the page or PDF request failed (including
            an HTTP error status, via raise_for_status()).
        RuntimeError: no PDF link was found on the page.
    """
    # 1 ── grab the HTML page
    html = requests.get(page_url, timeout=30)
    html.raise_for_status()

    # 2 ── look for an <a> tag whose href path ends in .pdf
    soup = BeautifulSoup(html.text, "html.parser")
    pdf_url = None
    for a in soup.find_all("a", href=True):
        href = a["href"]
        # Ignore any fragment / query string when testing the extension, so
        # links such as "report.pdf?download=1" are recognised as well.
        href_path = href.split("#", 1)[0].split("?", 1)[0]
        if href_path.lower().endswith(".pdf"):
            pdf_url = urllib.parse.urljoin(page_url, href)
            break

    # fallback: regex search (catches edge cases if the DOM structure changes)
    if pdf_url is None:
        m = re.search(r'href=["\']([^"\']+\.pdf(?:[?#][^"\']*)?)["\']', html.text, re.I)
        if m:
            pdf_url = urllib.parse.urljoin(page_url, m.group(1))

    if pdf_url is None:
        raise RuntimeError("No PDF link found on the page!")

    # 3 ── stream-download the PDF
    out_dir = pathlib.Path(dest_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    file_path = out_dir / (report_name + ".pdf")
    # Download into a sibling ".part" file and rename only on success, so an
    # interrupted transfer never leaves a truncated file under the final name.
    # with_name (not with_suffix) because report names may contain dots.
    tmp_path = file_path.with_name(file_path.name + ".part")

    try:
        with requests.get(pdf_url, stream=True, timeout=60) as r:
            r.raise_for_status()
            with open(tmp_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(tmp_path, file_path)
    except BaseException:
        # Clean up the partial download (also on KeyboardInterrupt), then
        # let the original error propagate unchanged.
        if tmp_path.exists():
            tmp_path.unlink()
        raise

    print(f"Saved → {file_path}")
    return file_path


In [38]:
# Browser-like request headers sent with every request of the download
# session (any recent browser User-Agent string will do).
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0",
    # Accepted media types, most preferred first.
    "Accept": ",".join(
        [
            "text/html",
            "application/xhtml+xml",
            "application/xml;q=0.9",
            "*/*;q=0.8",
        ]
    ),
    "Accept-Language": "en-US,en;q=0.5",
    "Referer": "https://www.oecd.org/",
}

def download_oecd_pdf(page_url: str, report_name: str, dest_dir: str) -> pathlib.Path:
    """Download the first PDF linked from an OECD publication page.

    Both requests go through one Session carrying the module-level HEADERS;
    the PDF request additionally sends the publication page as Referer.

    Args:
        page_url: URL of the publication's HTML page.
        report_name: File name stem; ".pdf" is appended. It is used as-is, so
            the caller must pass a name that is valid on the filesystem.
        dest_dir: Directory to write into; created if it does not exist.

    Returns:
        The local pathlib.Path of the downloaded file.

    Raises:
        requests.RequestException: the page or PDF request failed (including
            an HTTP error status, via raise_for_status()).
        RuntimeError: no PDF link was found on the page.
    """
    with requests.Session() as s:
        s.headers.update(HEADERS)

        # 1 ── fetch the HTML
        resp = s.get(page_url, timeout=30)
        resp.raise_for_status()

        # 2 ── parse out the PDF link
        soup = BeautifulSoup(resp.text, "html.parser")
        pdf_url = None
        for a in soup.find_all("a", href=True):
            href = a["href"]
            # Ignore any fragment / query string when testing the extension,
            # so links such as "report.pdf?download=1" are recognised too.
            href_path = href.split("#", 1)[0].split("?", 1)[0]
            if href_path.lower().endswith(".pdf"):
                pdf_url = urllib.parse.urljoin(page_url, href)
                break
        if not pdf_url:
            # fallback regex, in case the DOM structure changes
            m = re.search(r'href=["\']([^"\']+\.pdf(?:[?#][^"\']*)?)["\']', resp.text, re.I)
            if m:
                pdf_url = urllib.parse.urljoin(page_url, m.group(1))
        if not pdf_url:
            raise RuntimeError("No PDF link found on the page!")

        # 3 ── stream-download the PDF (pass Referer again)
        out_dir = pathlib.Path(dest_dir)
        out_dir.mkdir(parents=True, exist_ok=True)
        out_path = out_dir / (report_name + ".pdf")
        # Download into a sibling ".part" file and rename only on success, so
        # an interrupted transfer never leaves a truncated file under the
        # final name. with_name (not with_suffix): names may contain dots.
        tmp_path = out_path.with_name(out_path.name + ".part")

        try:
            with s.get(pdf_url, stream=True, timeout=60, headers={"Referer": page_url}) as r:
                r.raise_for_status()
                with open(tmp_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            os.replace(tmp_path, out_path)
        except BaseException:
            # Clean up the partial download (also on KeyboardInterrupt), then
            # let the original error propagate unchanged.
            if tmp_path.exists():
                tmp_path.unlink()
            raise

    print(f"✓  Saved → {out_path}")
    return out_path


In [41]:
save_dir = '/data/group_data/rag-robust-eval/data/economic/pdf_files'
os.makedirs(save_dir, exist_ok=True)

# Only the first three publications are attempted here.
for pdf_html, report_name in zip(pdf_html_list[:3], report_list[:3]):
    # Clean up the report name to make it a valid filename
    report_name = re.sub(r'[<>:"/\\|?*]', '_', report_name)
    report_name = re.sub(r'\s+', '_', report_name)
    report_name = re.sub(r'_{2,}', '_', report_name)  # Remove duplicate underscores
    report_name = report_name.strip('_')  # Remove leading/trailing underscores

    try:
        download_oecd_pdf(pdf_html, report_name, save_dir)
    except Exception as e:
        # Best effort: report the failure and carry on with the next report.
        print(f"Failed to download {report_name}: {e}")
    finally:
        # Pause between publications whether or not the download succeeded,
        # so failed requests are not fired at the server back-to-back.
        time.sleep(2)

Failed to download OECD_Economic_Surveys_Luxembourg_2025: 403 Client Error: Forbidden for url: https://www.oecd.org/en/publications/oecd-economic-surveys-luxembourg-2025_803b3ea1-en.html
Failed to download OECD_Economic_Surveys_Türkiye_2025: 403 Client Error: Forbidden for url: https://www.oecd.org/en/publications/oecd-economic-surveys-turkiye-2025_d01c660f-en.html
Failed to download OECD_Economic_Surveys_Israel_2025: 403 Client Error: Forbidden for url: https://www.oecd.org/en/publications/oecd-economic-surveys-israel-2025_d6dd02bc-en.html


In [1]:
import pandas as pd

In [4]:
# Load the content-list JSON for publication 803b3ea1-en into a DataFrame.
# NOTE(review): this file appears to come from a PDF-parsing step that is not
# shown in this notebook — confirm its provenance.
content = pd.read_json('/data/group_data/rag-robust-eval/data/economic/json_files/803b3ea1-en/auto/803b3ea1-en_content_list.json')

In [7]:
# Inspect the row at position 46. A later cell finds the first row with
# type == 'table' at index label 46, which is the row shown here.
content.iloc[46]

type                                                          table
text                                                            NaN
text_level                                                      NaN
page_idx                                                          8
img_path          images/7bfd4f4b10a6850eec4f70ff0d6512725699f1d...
img_caption                                                     NaN
img_footnote                                                    NaN
table_caption     [(Numbers in parentheses refer to the OECD ave...
table_footnote                                                   []
table_body        \n\n<html><body><table><tr><td colspan="6">N D...
Name: 46, dtype: object

In [9]:
# Index label of the first row whose `type` is 'table', as a plain int.
table_rows = content.loc[content['type'].eq('table')]
table_idx = int(table_rows.index[0])
table_idx

46