# PDF Download

This notebook downloads PDF files from the URLs collected in the metadata. It uses Selenium WebDriver with headless Chrome to download PDFs for each language.

## Setup

### Imports

In [7]:
import yaml
import time
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait

### Configuration and Paths

In [8]:
# Parse the YAML settings file that lives one directory above this notebook
with Path("../config.yaml").open() as f:
    config = yaml.safe_load(f)

# Anchor the configured output/input directories at the project root,
# i.e. the parent of the directory the notebook is executed from
project_root = Path.cwd().parent
PDFS_DIR = project_root.joinpath(config["PDFS_DIR"])
METADATA_DIR = project_root.joinpath(config["METADATA_DIR"])

# Layout string handed to tqdm via `bar_format`: left-aligned 25-char
# description, integer percentage, 20-char bar, then tqdm's default right side
PROGRESS_BAR_FORMAT = "{desc:<25}{percentage:3.0f}%|{bar:20}{r_bar}"

## Helper Functions

In [9]:
def setup_pdf_driver(download_dir: Path):
    """Build a headless Chrome WebDriver set up to save PDFs to disk.

    Args:
        download_dir: Directory handed to Chrome as its default download
            location; it is converted to an absolute path string first.

    Returns:
        The ``webdriver.Chrome`` instance created from the options below.
    """
    # Command-line switches, applied in this order
    chrome_flags = (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    )

    # Profile preferences: send downloads to `download_dir` without a prompt
    # and ask Chrome to hand PDFs off externally rather than display them
    download_prefs = {
        "download.default_directory": str(download_dir.absolute()),
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "plugins.always_open_pdf_externally": True,
        "plugins.plugins_disabled": ["Chrome PDF Viewer"],
    }

    chrome_options = Options()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)
    chrome_options.add_experimental_option("prefs", download_prefs)

    return webdriver.Chrome(options=chrome_options)


def download_pdf(
    driver,
    pdf_url: str,
    page_load_timeout: float = 10,
    settle_seconds: float = 5,
):
    """Navigate the browser to ``pdf_url`` and pause so the download can proceed.

    Args:
        driver: Selenium WebDriver used to open the URL.
        pdf_url: Address of the PDF to fetch.
        page_load_timeout: Maximum number of seconds to wait for
            ``document.readyState`` to become ``"complete"``. Defaults to
            the previously hard-coded 10 seconds.
        settle_seconds: Fixed pause after the page is ready. Defaults to the
            previously hard-coded 5 seconds; pass 0 to skip the pause.

    Returns:
        None.

    Raises:
        Any exception raised by ``driver.get`` or by the ``WebDriverWait``
        (for example when navigation fails or the wait times out) is
        propagated to the caller.
    """
    driver.get(pdf_url)
    WebDriverWait(driver, page_load_timeout).until(
        lambda d: d.execute_script("return document.readyState") == "complete"
    )
    # NOTE: nothing here observes the download itself; the pause only gives the
    # browser time to write the file. A transfer that takes longer than
    # `settle_seconds` is not detected by this function.
    if settle_seconds > 0:
        time.sleep(settle_seconds)

## Download PDFs

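Download PDFs for all configured languages and display download statistics. A download is counted as successful when loading its URL raises no error; the notebook does not check that a file was actually written to disk.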

In [10]:
# One summary row (dict) per language; turned into a DataFrame for display later
download_stats = []

for lang_code, lang_config in config["LANGUAGES"].items():
    lang_pdf_dir = PDFS_DIR / lang_code
    lang_pdf_dir.mkdir(parents=True, exist_ok=True)

    # Load metadata
    metadata_path = METADATA_DIR / f"{lang_code}_article_data.csv"
    if not metadata_path.exists():
        print(f"No metadata found for {lang_code}, skipping")
        continue

    articles_df = pd.read_csv(metadata_path)
    attempted = len(articles_df)

    # A fresh browser per language so each one downloads into its own folder
    driver = setup_pdf_driver(lang_pdf_dir)

    # Download PDFs
    success_count = 0
    try:
        for idx, article in tqdm(
            articles_df.iterrows(),
            total=attempted,
            desc=f"Downloading {lang_config['name']}",
            bar_format=PROGRESS_BAR_FORMAT,
        ):
            try:
                download_pdf(driver, article["pdf_url"])
                success_count += 1
            except Exception as exc:
                # Best effort: one bad URL must not stop the run, but the
                # failure is reported instead of being silently discarded.
                # Only the first line of the message is shown, which keeps
                # long driver stack traces out of the notebook output.
                reason = (str(exc).strip().splitlines() or [type(exc).__name__])[0]
                tqdm.write(f"Error downloading article PDF (row {idx}): {reason}")
    finally:
        # Always shut the browser down, even if the loop is interrupted,
        # so no headless Chrome process is left running
        driver.quit()

    # "Downloaded" counts URLs that loaded without raising; the files on disk
    # are not inspected here
    download_stats.append(
        {
            "Language": lang_config["name"],
            "Attempted": attempted,
            "Downloaded": success_count,
            "Success Rate": f"{success_count / attempted * 100:.1f}%" if attempted > 0 else "0%",
        }
    )

Downloading Tamil        100%|████████████████████| 460/460 [48:38<00:00,  6.34s/it]  
Downloading Bengali        4%|▊                   | 35/850 [04:33<1:05:34,  4.83s/it]

Error downloading article PDF: Message: unknown error: net::ERR_NAME_NOT_RESOLVED
  (Session info: chrome=143.0.7499.146)
Stacktrace:
Symbols not available. Dumping unresolved backtrace:
	0x7ff731968895
	0x7ff7319688f0
	0x7ff73174165d
	0x7ff73173e1dd
	0x7ff73172eb69
	0x7ff731730974
	0x7ff73172f10f
	0x7ff73172e871
	0x7ff73172e5bd
	0x7ff73172c20a
	0x7ff73172c9f2
	0x7ff731745950
	0x7ff7317ebad4
	0x7ff7317c1fda
	0x7ff7317eac97
	0x7ff73178ac29
	0x7ff73178ba93
	0x7ff731c805f0
	0x7ff731c7af30
	0x7ff731c99696
	0x7ff731985d94
	0x7ff73198ed3c
	0x7ff731971fb4
	0x7ff731972165
	0x7ff731957e92
	0x7ff8c87ae8d7
	0x7ff8c8c0c53c



Downloading Bengali      100%|████████████████████| 850/850 [1:35:52<00:00,  6.77s/it]
Downloading Thai          21%|████▏               | 176/850 [17:20<1:18:39,  7.00s/it]

Error downloading article PDF: Message: unknown error: net::ERR_CONNECTION_RESET
  (Session info: chrome=143.0.7499.146)
Stacktrace:
Symbols not available. Dumping unresolved backtrace:
	0x7ff731968895
	0x7ff7319688f0
	0x7ff73174165d
	0x7ff73173e1dd
	0x7ff73172eb69
	0x7ff731730974
	0x7ff73172f10f
	0x7ff73172e871
	0x7ff73172e5bd
	0x7ff73172c20a
	0x7ff73172c9f2
	0x7ff731745950
	0x7ff7317ebad4
	0x7ff7317c1fda
	0x7ff7317eac97
	0x7ff73178ac29
	0x7ff73178ba93
	0x7ff731c805f0
	0x7ff731c7af30
	0x7ff731c99696
	0x7ff731985d94
	0x7ff73198ed3c
	0x7ff731971fb4
	0x7ff731972165
	0x7ff731957e92
	0x7ff8c87ae8d7
	0x7ff8c8c0c53c



Downloading Thai          86%|█████████████████▏  | 731/850 [1:12:22<09:07,  4.60s/it]

Error downloading article PDF: Message: unknown error: net::ERR_NAME_NOT_RESOLVED
  (Session info: chrome=143.0.7499.146)
Stacktrace:
Symbols not available. Dumping unresolved backtrace:
	0x7ff731968895
	0x7ff7319688f0
	0x7ff73174165d
	0x7ff73173e1dd
	0x7ff73172eb69
	0x7ff731730974
	0x7ff73172f10f
	0x7ff73172e871
	0x7ff73172e5bd
	0x7ff73172c20a
	0x7ff73172c9f2
	0x7ff731745950
	0x7ff7317ebad4
	0x7ff7317c1fda
	0x7ff7317eac97
	0x7ff73178ac29
	0x7ff73178ba93
	0x7ff731c805f0
	0x7ff731c7af30
	0x7ff731c99696
	0x7ff731985d94
	0x7ff73198ed3c
	0x7ff731971fb4
	0x7ff731972165
	0x7ff731957e92
	0x7ff8c87ae8d7
	0x7ff8c8c0c53c



Downloading Thai         100%|████████████████████| 850/850 [1:24:52<00:00,  5.99s/it]


In [11]:
# Render the per-language summary rows collected in `download_stats` as a table
display(pd.DataFrame(download_stats))

Unnamed: 0,Language,Attempted,Downloaded,Success Rate
0,Tamil,460,460,100.0%
1,Bengali,850,849,99.9%
2,Thai,850,848,99.8%
