In [2]:
import zipfile
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

In [1]:
import requests

In [4]:
def download_pdf(pdf_id, base_url, temp_dir):
    """Download a single PDF file."""
    url = f"{base_url}{pdf_id}.pdf"
    filename = f"{pdf_id}.pdf"
    filepath = os.path.join(temp_dir, filename)
    
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        
        with open(filepath, 'wb') as f:
            f.write(response.content)
        
        return pdf_id, True, None
    except requests.exceptions.RequestException as e:
        return pdf_id, False, str(e)

def download_pdfs_batch(start_id=542, end_id=642, output_dir=r"D:\Y4 Research", output_filename="nih_pdfs.zip"):
    """Download multiple PDFs and save them to a zip file."""
    base_url = "https://api.ods.od.nih.gov/dsld/s3/pdf/"
    
    # Create output directory if it doesn't exist
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    
    # Full path for the zip file
    output_zip = os.path.join(output_dir, output_filename)
    
    # Create temporary directory in the same location
    temp_dir = os.path.join(output_dir, "temp_pdfs")
    Path(temp_dir).mkdir(exist_ok=True)
    
    print(f"Starting download of PDFs {start_id} to {end_id}...")
    print(f"Total files: {end_id - start_id + 1}")
    
    successful = []
    failed = []
    
    # Download PDFs concurrently
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = {
            executor.submit(download_pdf, pdf_id, base_url, temp_dir): pdf_id 
            for pdf_id in range(start_id, end_id + 1)
        }
        
        for future in as_completed(futures):
            pdf_id, success, error = future.result()
            if success:
                successful.append(pdf_id)
                print(f"✓ Downloaded: {pdf_id}.pdf ({len(successful)}/{end_id - start_id + 1})")
            else:
                failed.append((pdf_id, error))
                print(f"✗ Failed: {pdf_id}.pdf - {error}")
    
    # Create zip file
    print(f"\nCreating zip file: {output_zip}")
    with zipfile.ZipFile(output_zip, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for pdf_id in successful:
            filepath = os.path.join(temp_dir, f"{pdf_id}.pdf")
            zipf.write(filepath, f"{pdf_id}.pdf")
            os.remove(filepath)  # Clean up temp file
    
    # Remove temp directory
    os.rmdir(temp_dir)
    
    # Summary
    print(f"\n{'='*50}")
    print(f"Download Complete!")
    print(f"{'='*50}")
    print(f"Successfully downloaded: {len(successful)} PDFs")
    print(f"Failed downloads: {len(failed)} PDFs")
    print(f"Output file: {output_zip}")
    print(f"File size: {os.path.getsize(output_zip) / (1024*1024):.2f} MB")
    
    if failed:
        print(f"\nFailed IDs: {[pdf_id for pdf_id, _ in failed]}")
    
    return successful, failed

if __name__ == "__main__":
    # Download PDFs from ID 542 to 642 to D:\Y4 Research
    successful, failed = download_pdfs_batch(
        start_id=542,
        end_id=642,
        output_dir=r"D:\Y4 Research",
        output_filename="nih_pdfs_542_642.zip"
    )

Starting download of PDFs 542 to 642...
Total files: 101
✓ Downloaded: 544.pdf (1/101)
✓ Downloaded: 551.pdf (2/101)
✓ Downloaded: 549.pdf (3/101)
✓ Downloaded: 547.pdf (4/101)
✓ Downloaded: 550.pdf (5/101)
✓ Downloaded: 543.pdf (6/101)
✓ Downloaded: 545.pdf (7/101)
✓ Downloaded: 546.pdf (8/101)
✓ Downloaded: 542.pdf (9/101)
✓ Downloaded: 548.pdf (10/101)
✓ Downloaded: 552.pdf (11/101)
✓ Downloaded: 553.pdf (12/101)
✓ Downloaded: 561.pdf (13/101)
✓ Downloaded: 554.pdf (14/101)
✓ Downloaded: 555.pdf (15/101)
✓ Downloaded: 557.pdf (16/101)
✓ Downloaded: 560.pdf (17/101)
✓ Downloaded: 558.pdf (18/101)
✓ Downloaded: 556.pdf (19/101)
✓ Downloaded: 562.pdf (20/101)
✓ Downloaded: 559.pdf (21/101)
✓ Downloaded: 563.pdf (22/101)
✓ Downloaded: 564.pdf (23/101)
✓ Downloaded: 565.pdf (24/101)
✓ Downloaded: 566.pdf (25/101)
✓ Downloaded: 568.pdf (26/101)
✓ Downloaded: 567.pdf (27/101)
✓ Downloaded: 569.pdf (28/101)
✓ Downloaded: 570.pdf (29/101)
✓ Downloaded: 571.pdf (30/101)
✓ Downloaded: 572.pdf 

In [5]:
from pdf2image import convert_from_path

In [6]:
from PIL import Image

Extracting PDFs from zip file...
Found 101 PDF files
Starting conversion to PNG (DPI: 200)...

✗ Failed: 544 - Unable to get page count. Is poppler installed and in PATH?
✗ Failed: 551 - Unable to get page count. Is poppler installed and in PATH?
✗ Failed: 549 - Unable to get page count. Is poppler installed and in PATH?
✗ Failed: 550 - Unable to get page count. Is poppler installed and in PATH?
✗ Failed: 543 - Unable to get page count. Is poppler installed and in PATH?
✗ Failed: 547 - Unable to get page count. Is poppler installed and in PATH?
✗ Failed: 545 - Unable to get page count. Is poppler installed and in PATH?
✗ Failed: 546 - Unable to get page count. Is poppler installed and in PATH?
✗ Failed: 542 - Unable to get page count. Is poppler installed and in PATH?
✗ Failed: 548 - Unable to get page count. Is poppler installed and in PATH?
✗ Failed: 552 - Unable to get page count. Is poppler installed and in PATH?
✗ Failed: 553 - Unable to get page count. Is poppler installed and in

In [8]:
def convert_pdf_to_png(pdf_path, output_dir, dpi=200):
    """Convert a single PDF file to PNG images."""
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    
    try:
        # Convert PDF to images
        images = convert_from_path(pdf_path, dpi=dpi)
        
        png_files = []
        for i, image in enumerate(images):
            # If multiple pages, add page number to filename
            if len(images) > 1:
                png_filename = f"{pdf_name}_page_{i+1}.png"
            else:
                png_filename = f"{pdf_name}.png"
            
            png_path = os.path.join(output_dir, png_filename)
            image.save(png_path, 'PNG')
            png_files.append(png_path)
        
        return pdf_name, True, png_files, None
    except Exception as e:
        return pdf_name, False, [], str(e)

def convert_pdfs_to_png_batch(input_folder, output_dir=r"D:\Y4 Research", output_zip_name="nih_pdfs_as_pngs.zip", dpi=200):
    """Convert all PDFs in a folder to PNGs and create a new zip file."""
    
    # Create output directory if it doesn't exist
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    
    # Create temporary directory for PNGs
    temp_png_dir = os.path.join(output_dir, "temp_pngs")
    Path(temp_png_dir).mkdir(exist_ok=True)
    
    print(f"Scanning folder for PDFs: {input_folder}")
    
    # Get all PDF files from the folder
    pdf_files = []
    for file in os.listdir(input_folder):
        if file.endswith('.pdf'):
            pdf_files.append(os.path.join(input_folder, file))
    
    print(f"Found {len(pdf_files)} PDF files")
    print(f"Starting conversion to PNG (DPI: {dpi})...\n")
    
    successful = []
    failed = []
    all_png_files = []
    
    # Convert PDFs to PNGs concurrently
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = {
            executor.submit(convert_pdf_to_png, pdf_path, temp_png_dir, dpi): pdf_path 
            for pdf_path in pdf_files
        }
        
        completed = 0
        for future in as_completed(futures):
            pdf_name, success, png_files, error = future.result()
            completed += 1
            
            if success:
                successful.append(pdf_name)
                all_png_files.extend(png_files)
                print(f"✓ Converted: {pdf_name} ({len(png_files)} page(s)) [{completed}/{len(pdf_files)}]")
            else:
                failed.append((pdf_name, error))
                print(f"✗ Failed: {pdf_name} - {error}")
    
    # Create output zip file with PNGs
    output_zip_path = os.path.join(output_dir, output_zip_name)
    print(f"\nCreating zip file: {output_zip_path}")
    
    with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for png_file in all_png_files:
            zipf.write(png_file, os.path.basename(png_file))
    
    # Cleanup temporary files
    print("Cleaning up temporary files...")
    for png_file in all_png_files:
        os.remove(png_file)
    os.rmdir(temp_png_dir)
    
    # Summary
    print(f"\n{'='*50}")
    print(f"Conversion Complete!")
    print(f"{'='*50}")
    print(f"Successfully converted: {len(successful)} PDFs")
    print(f"Total PNG images created: {len(all_png_files)}")
    print(f"Failed conversions: {len(failed)} PDFs")
    print(f"Output file: {output_zip_path}")
    print(f"File size: {os.path.getsize(output_zip_path) / (1024*1024):.2f} MB")
    
    if failed:
        print(f"\nFailed files: {[pdf_name for pdf_name, _ in failed]}")
    
    return successful, failed

if __name__ == "__main__":
    # Convert PDFs from the extracted folder
    input_folder_path = r"D:\Y4 Research\nih_pdfs_542_642"
    
    # Check if input folder exists
    if not os.path.exists(input_folder_path):
        print(f"Error: Input folder not found: {input_folder_path}")
        print("Please make sure the folder path is correct.")
    else:
        successful, failed = convert_pdfs_to_png_batch(
            input_folder=input_folder_path,
            output_dir=r"D:\Y4 Research",
            output_zip_name="nih_pdfs_as_pngs.zip",
            dpi=200  # Higher DPI = better quality but larger file size
        )

Scanning folder for PDFs: D:\Y4 Research\nih_pdfs_542_642
Found 101 PDF files
Starting conversion to PNG (DPI: 200)...

✗ Failed: 543 - Unable to get page count. Is poppler installed and in PATH?
✗ Failed: 542 - Unable to get page count. Is poppler installed and in PATH?
✗ Failed: 544 - Unable to get page count. Is poppler installed and in PATH?
✗ Failed: 545 - Unable to get page count. Is poppler installed and in PATH?
✗ Failed: 546 - Unable to get page count. Is poppler installed and in PATH?
✗ Failed: 547 - Unable to get page count. Is poppler installed and in PATH?
✗ Failed: 548 - Unable to get page count. Is poppler installed and in PATH?
✗ Failed: 549 - Unable to get page count. Is poppler installed and in PATH?
✗ Failed: 550 - Unable to get page count. Is poppler installed and in PATH?
✗ Failed: 551 - Unable to get page count. Is poppler installed and in PATH?
✗ Failed: 552 - Unable to get page count. Is poppler installed and in PATH?
✗ Failed: 553 - Unable to get page count. Is

In [1]:
import os
import gzip
import requests
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# ---------------------------------------------------
# SETTINGS
# ---------------------------------------------------
DATA_KEYS_URL = "https://openfoodfacts-images.s3.eu-west-3.amazonaws.com/data/data_keys.gz"
BUCKET_URL = "https://openfoodfacts-images.s3.eu-west-3.amazonaws.com/data/"

# Output folders
BASE_DIR = r"D:\Y4 Research\datasets"
IMG_DIR = os.path.join(BASE_DIR, "images_400")
OCR_DIR = os.path.join(BASE_DIR, "ocr_json")
ERROR_LOG = os.path.join(BASE_DIR, "download_errors.log")

THREADS = 20
MAX_IMAGES = 1000
# ---------------------------------------------------

os.makedirs(IMG_DIR, exist_ok=True)
os.makedirs(OCR_DIR, exist_ok=True)

# Clear previous log
open(ERROR_LOG, "w").close()

def log_error(message):
    with open(ERROR_LOG, "a") as f:
        f.write(message + "\n")


def download_file(key, out_dir):
    """Download a single file from the bucket."""
    url = BUCKET_URL + key
    filename = key.replace("/", "_")
    filepath = os.path.join(out_dir, filename)

    if os.path.exists(filepath):
        return f"SKIPPED: {filename}"

    try:
        resp = requests.get(url, timeout=10)
        if resp.status_code == 200:
            with open(filepath, "wb") as f:
                f.write(resp.content)
            return f"DOWNLOADED: {filename}"
        else:
            msg = f"FAILED ({resp.status_code}): {filename}"
            log_error(msg)
            return msg
    except Exception as e:
        msg = f"ERROR: {filename} — {e}"
        log_error(msg)
        return msg


def main():
    print("Downloading index file data_keys.gz ...")
    r = requests.get(DATA_KEYS_URL)
    with open("data_keys.gz", "wb") as f:
        f.write(r.content)

    print("Extracting first 1000 image keys...")
    img_keys = []
    ocr_keys = []

    with gzip.open("data_keys.gz", "rt") as f:
        for line in f:
            key = line.strip()

            if key.endswith(".400.jpg"):
                img_keys.append(key)

                parent_dir = key.rsplit("/", 1)[0]
                ocr_keys.append(parent_dir + "/ocr.json.gz")

                if len(img_keys) >= MAX_IMAGES:
                    break

    print(f"Images selected: {len(img_keys)}")
    print(f"OCR files selected: {len(ocr_keys)}")

    tasks = []

    with ThreadPoolExecutor(max_workers=THREADS) as executor:
        # Progress bar length = total files (images + OCR)
        total_files = len(img_keys) + len(ocr_keys)

        with tqdm(total=total_files, desc="Downloading", unit="file") as pbar:

            for key in img_keys:
                tasks.append(executor.submit(download_file, key, IMG_DIR))

            for key in ocr_keys:
                tasks.append(executor.submit(download_file, key, OCR_DIR))

            for future in as_completed(tasks):
                future.result()
                pbar.update(1)

    print("\nDownload completed.")
    print(f"Failed downloads logged at: {ERROR_LOG}")


if __name__ == "__main__":
    main()


Downloading index file data_keys.gz ...
Extracting first 1000 image keys...
Images selected: 1000
OCR files selected: 1000


Downloading: 100%|██████████| 2000/2000 [03:18<00:00, 10.09file/s]


Download completed.
Failed downloads logged at: D:\Y4 Research\datasets\download_errors.log





In [None]:
import os
from pdf2image import convert_from_path
from tqdm import tqdm

# -----------------------------
# PATHS
# -----------------------------
pdf_dir = r"D:\Y4 Research\nih_pdfs_542_642"
output_dir = r"D:\Y4 Research\datasets\ingredient & nutrition Images"
poppler_path = r"C:\Program Files\poppler-23.11.0\Library\bin"  # change if needed

os.makedirs(output_dir, exist_ok=True)

# -----------------------------
# GET PDF LIST
# -----------------------------
pdf_files = [
    f for f in os.listdir(pdf_dir)
    if f.lower().endswith(".pdf")
]

# -----------------------------
# CONVERT WITH PROGRESS BAR
# -----------------------------
for pdf_file in tqdm(pdf_files, desc="Converting PDFs", unit="pdf"):
    pdf_path = os.path.join(pdf_dir, pdf_file)
    pdf_name = os.path.splitext(pdf_file)[0]

    try:
        pages = convert_from_path(
            pdf_path,
            dpi=300,
            poppler_path=poppler_path
        )

        for i, page in enumerate(pages):
            output_path = os.path.join(
                output_dir,
                f"{pdf_name}_page_{i+1}.png"
            )
            page.save(output_path, "PNG")

    except Exception as e:
        print(f"\n❌ Failed to convert: {pdf_file}")
        print(f"   Reason: {e}")
