This is a web scraper designed to work in a distributed fashion; the process of scraping data is made faster by scraping multiple pages concurrently.

It could be made even faster by using rotating proxies, which change the IP address per request and help you avoid being detected as a bot by the website owner. However, to respect the site owners and avoid accidentally DDoS-ing them, scrape with caution.

In [None]:
import asyncio
import csv
import os
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor
from playwright.async_api import async_playwright

In [None]:
# CONFIG
# CSV file that scraped rows are appended to.
OUTPUT_FILE = "boereport_articles.csv"
# Category listing URL; "{}" is replaced with the page number.
BASE_URL = "https://boereport.com/category/oil-and-gas-news-headlines/page/{}"
# Inclusive range of listing pages to scrape.
START_PAGE = 1
END_PAGE = 3388
# Number of pages scraped at the same time (at least 2).
# os.cpu_count() may return None when the count cannot be determined,
# so fall back to 1 instead of letting max() raise a TypeError.
CONCURRENCY = max(2, os.cpu_count() or 1)

In [None]:
# scrape one page
async def scrape_single_page(browser, page_number):
    """Scrape every article listed on one category page.

    Opens a new page in ``browser``, loads ``BASE_URL`` formatted with
    ``page_number`` and reads the title, publication date and first
    content paragraph of each ``<article>`` element.

    Args:
        browser: Playwright browser object; only ``new_page()`` is used.
        page_number: Page number substituted into ``BASE_URL``.

    Returns:
        A list of dicts with ``date``, ``title`` and ``description`` keys.
        Articles without a title or without a usable date are skipped.
        Any error is printed and results in an empty list.
    """
    url = BASE_URL.format(page_number)
    try:
        page = await browser.new_page()
        try:
            # Playwright timeouts are given in milliseconds.
            await page.goto(url, timeout=120000)
            await page.wait_for_selector("article", timeout=20000)

            results = []
            for article in await page.query_selector_all("article"):
                title_el = await article.query_selector("h2.entry-title a")
                date_el = await article.query_selector("time.entry-time")
                desc_el = await article.query_selector(".entry-content p")

                title = (await title_el.inner_text()) if title_el else ""
                date_raw = (await date_el.get_attribute("datetime")) if date_el else ""
                description = (await desc_el.inner_text()) if desc_el else ""

                # Extract only the date (YYYY-MM-DD)
                date = ""
                if date_raw:
                    try:
                        # Drop the "Z" UTC designator, which fromisoformat()
                        # rejects before Python 3.11.
                        date = datetime.fromisoformat(date_raw.replace("Z", "")).date().isoformat()
                    except ValueError:
                        # Fall back to the text before the "T" separator.
                        if "T" in date_raw:
                            date = date_raw.split("T")[0]

                if title and date:
                    results.append({
                        "date": date.strip(),
                        "title": title.strip(),
                        "description": description.strip()
                    })
        finally:
            # Always release the tab, including when navigation, the
            # selector wait or parsing fails; otherwise it stays open.
            await page.close()

        print(f"Page {page_number}: {len(results)} articles")
        return results

    except Exception as e:
        # Best-effort scraping: report the failure and let the caller
        # carry on with the remaining pages.
        print(f"Error scraping page {page_number}: {e}")
        return []

In [None]:
# WORKER GROUP FUNCTION
async def scrape_pages_group(playwright, page_numbers):
    """Scrape a group of pages one after another in a dedicated browser.

    Launches a headless Firefox instance, runs ``scrape_single_page`` for
    each entry of ``page_numbers`` in order, and closes the browser.

    Args:
        playwright: Playwright object exposing ``firefox.launch()``.
        page_numbers: Iterable of page numbers to scrape.

    Returns:
        A flat list with the article dicts collected from all pages.
    """
    browser = await playwright.firefox.launch(headless=True)
    try:
        group_results = []
        for num in page_numbers:
            group_results.extend(await scrape_single_page(browser, num))
        return group_results
    finally:
        # Close the browser even if scraping raises or the task is
        # cancelled, so no browser process is left behind.
        await browser.close()

In [None]:
# PARALLEL COORDINATOR
async def main():
    """Scrape pages START_PAGE..END_PAGE and append the rows to OUTPUT_FILE.

    Pages are split into batches of ``CONCURRENCY``. Batches run one after
    another; within a batch every page is scraped concurrently, each via
    its own ``scrape_pages_group`` call (and therefore its own browser).
    Rows are appended to the CSV after every batch.
    """
    os.makedirs(os.path.dirname(OUTPUT_FILE) or ".", exist_ok=True)
    # Write the header for a new file and also for an existing but empty
    # one, which would otherwise end up without column names.
    needs_header = (
        not os.path.exists(OUTPUT_FILE) or os.path.getsize(OUTPUT_FILE) == 0
    )

    # divide pages into batches based on concurrency
    all_pages = list(range(START_PAGE, END_PAGE + 1))
    page_groups = [
        all_pages[i:i + CONCURRENCY]
        for i in range(0, len(all_pages), CONCURRENCY)
    ]

    async with async_playwright() as p:
        # open CSV file once
        with open(OUTPUT_FILE, "a", newline="", encoding="utf-8") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=["date", "title", "description"])
            if needs_header:
                writer.writeheader()

            # process batches sequentially, each batch runs its pages concurrently
            for idx, group in enumerate(page_groups, start=1):
                print(f"\nProcessing batch {idx}/{len(page_groups)} with {len(group)} pages...")

                # one scrape_pages_group call per page, awaited together
                results_per_group = await asyncio.gather(
                    *[scrape_pages_group(p, [num]) for num in group]
                )

                # flatten all results
                all_results = [r for group_res in results_per_group for r in group_res]

                if all_results:
                    writer.writerows(all_results)
                    # Push rows to disk now so an interrupted run keeps
                    # everything scraped so far.
                    csvfile.flush()
                    print(f"Saved {len(all_results)} articles from batch {idx}")
                else:
                    print(f"No data from batch {idx}")

    print(f"\nDONE — Data saved to {OUTPUT_FILE}")

In [None]:
if __name__ == "__main__":
    # asyncio.run() raises RuntimeError when an event loop is already
    # running in this thread (as in a notebook kernel), so only use it
    # when no loop is active.
    try:
        running_loop = asyncio.get_running_loop()
    except RuntimeError:
        running_loop = None

    if running_loop is None:
        asyncio.run(main())
    else:
        # Schedule on the existing loop; keep a reference so the task is
        # not garbage-collected before it finishes.
        _main_task = running_loop.create_task(main())