In [4]:
import os
import re
import json
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin
# Site being scraped: https://books.flotwiskunde.nl



In [5]:


# --- Scraper configuration -------------------------------------------------
# Listing page the book links are collected from, and the headers sent with
# every HTTP request.
TOP_URL = "https://books.flotwiskunde.nl/browse/scores/top"
HEADERS = {"User-Agent": "Python Requests"}

# Output layout: metadata files go in OUT_DIR, downloaded texts in TEXT_DIR.
OUT_DIR = "data"
TEXT_DIR = os.path.join(OUT_DIR, "texts")

# Create both output directories up front (no error if they already exist).
for _directory in (OUT_DIR, TEXT_DIR):
    os.makedirs(_directory, exist_ok=True)


# Get top 100 book page links

# requests applies no timeout unless one is passed, so a stalled connection
# would otherwise hang this cell forever. 30 s is generous for one HTML page.
r = requests.get(TOP_URL, headers=HEADERS, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")

book_links = []  # unique book page URLs, in the order they appear on the page
seen = set()     # URLs already collected, used to skip duplicates

for a in soup.find_all("a", href=True):
    # Keep only links like /ebooks/12345 (anything after the number is dropped)
    m = re.search(r"/ebooks/\d+", a["href"].strip())
    if not m:
        continue

    book_url = urljoin(TOP_URL, m.group(0))
    if book_url in seen:
        continue

    seen.add(book_url)
    book_links.append(book_url)

    # Stop as soon as 100 distinct book pages have been collected.
    if len(book_links) >= 100:
        break

print("Collected book links:", len(book_links))


# Visit each book page and extract metadata + plaintext link

# Seconds to wait on any single HTTP request. requests waits indefinitely
# unless a timeout is passed, which would stall the whole crawl on one bad
# connection.
REQUEST_TIMEOUT = 30


def _field_for_label(label):
    """Map a normalised metadata label to the matching ``row`` key.

    ``label`` is the lower-cased label text with surrounding colons stripped.
    Returns the ``row`` key to store the value under, or ``None`` when the
    label is not one of the fields being collected.
    """
    if label in ("author", "authors"):
        return "author"
    if label == "language":
        return "language"
    if label in ("reading level", "level"):
        return "reading_level"
    if label in ("most recently updated", "last updated"):
        return "most_recently_updated"
    if "ebook" in label and ("no" in label or "number" in label):
        return "ebook_no"
    return None


def _decode_text(resp):
    """Return the body of a plain-text response as ``str``.

    When the server declares no charset, requests decodes ``text/*`` bodies
    as ISO-8859-1, which garbles UTF-8 books. In that case strict UTF-8 is
    tried first; if the bytes are not valid UTF-8, or the server did declare
    a charset, the regular ``resp.text`` decoding is used.
    """
    content_type = resp.headers.get("Content-Type", "").lower()
    if "charset=" not in content_type:
        try:
            return resp.content.decode("utf-8")
        except UnicodeDecodeError:
            pass
    return resp.text


def _safe_stem(name):
    """Make a scraped value safe to use as a file name (without extension).

    Every character other than word characters, ``.`` and ``-`` becomes
    ``_``, so a value containing path separators cannot place the file
    outside the output directory.
    """
    return re.sub(r"[^\w.-]", "_", name)


rows = []
total = len(book_links)  # real denominator for the progress line

for i, book_url in enumerate(book_links, start=1):
    print(f"[{i}/{total}] {book_url}")

    row = {
        "plaintext_url": "",
        "title": "",
        "author": "",
        "reading_level": "",
        "ebook_no": "",
        "language": "",
        "most_recently_updated": "",
        "book_page_url": book_url,
    }

    # fetch book page; on failure keep a mostly-empty row so the book still
    # appears in the metadata output
    try:
        r = requests.get(book_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        r.raise_for_status()
        s = BeautifulSoup(r.text, "html.parser")
    except Exception as e:
        print("  Failed to fetch book page:", e)
        rows.append(row)
        continue

    # title: first <h1>, falling back to the document <title>
    h1 = s.find("h1")
    if h1:
        row["title"] = h1.get_text(strip=True)
    elif s.title:
        row["title"] = s.title.get_text(strip=True)

    # ebook_no from URL; a matching table row below overrides it
    m = re.search(r"/ebooks/(\d+)", book_url)
    if m:
        row["ebook_no"] = m.group(1)

    # --- Try table rows first: <tr><th>Label</th><td>Value</td></tr>
    # Table values always overwrite, so the last matching row wins.
    for tr in s.find_all("tr"):
        th = tr.find("th")
        td = tr.find("td")
        if not th or not td:
            continue

        field = _field_for_label(th.get_text(" ", strip=True).lower().strip(":"))
        if field:
            row[field] = td.get_text(" ", strip=True)

    # --- If still missing, try dt/dd: <dt>Label</dt><dd>Value</dd>
    # These only fill fields that are still empty.
    for dt in s.find_all("dt"):
        dd = dt.find_next_sibling("dd")
        if not dd:
            continue

        field = _field_for_label(dt.get_text(" ", strip=True).lower().strip(":"))
        if field and not row[field]:
            row[field] = dd.get_text(" ", strip=True)

    # plaintext download link: rank every anchor and keep the best one.
    #   100 = link text mentions "plain text" and "utf"
    #    80 = link text mentions "plain text"
    #    60 = URL ends in .txt
    # Ties keep the earliest link on the page.
    plaintext_url = ""
    best_score = 0

    for a in s.find_all("a", href=True):
        link_text = a.get_text(" ", strip=True).lower()
        abs_url = urljoin(book_url, a["href"].strip())

        if "plain text" in link_text and "utf" in link_text:
            score = 100
        elif "plain text" in link_text:
            score = 80
        elif abs_url.lower().endswith(".txt"):
            score = 60
        else:
            continue

        if score > best_score:
            best_score = score
            plaintext_url = abs_url
            if best_score == 100:
                break  # nothing can outrank the first top-scoring link

    row["plaintext_url"] = plaintext_url

    rows.append(row)

    # Download ONLY English books as plaintext
    if row["language"].strip().lower() == "english" and row["plaintext_url"]:
        try:
            txt_resp = requests.get(
                row["plaintext_url"], headers=HEADERS, timeout=REQUEST_TIMEOUT
            )
            txt_resp.raise_for_status()

            stem = _safe_stem(row["ebook_no"]) if row["ebook_no"] else f"book_{i}"
            out_path = os.path.join(TEXT_DIR, f"{stem}.txt")

            # newline="" writes the text's line endings unchanged; the default
            # text mode would turn "\r\n" into "\r\r\n" on Windows.
            with open(out_path, "w", encoding="utf-8", newline="") as f:
                f.write(_decode_text(txt_resp))

        except Exception as e:
            print("  Failed to download text:", e)


# Save metadata: one CSV and one JSON Lines file, both built from `rows`

csv_path = os.path.join(OUT_DIR, "metadata.csv")
jsonl_path = os.path.join(OUT_DIR, "metadata.jsonl")

df = pd.DataFrame(rows)
df.to_csv(csv_path, index=False)

# One JSON object per line; ensure_ascii=False keeps non-ASCII characters
# readable instead of \u-escaped.
with open(jsonl_path, "w", encoding="utf-8") as jsonl_file:
    jsonl_file.writelines(
        json.dumps(rec, ensure_ascii=False) + "\n" for rec in rows
    )

print("Done.")
print("Saved:", csv_path)
print("Saved:", jsonl_path)
print("Texts in:", TEXT_DIR)


Collected book links: 100
[1/100] https://books.flotwiskunde.nl/ebooks/84
[2/100] https://books.flotwiskunde.nl/ebooks/2701
[3/100] https://books.flotwiskunde.nl/ebooks/1342
[4/100] https://books.flotwiskunde.nl/ebooks/1513
[5/100] https://books.flotwiskunde.nl/ebooks/26184
[6/100] https://books.flotwiskunde.nl/ebooks/100
[7/100] https://books.flotwiskunde.nl/ebooks/2641
[8/100] https://books.flotwiskunde.nl/ebooks/43
[9/100] https://books.flotwiskunde.nl/ebooks/145
[10/100] https://books.flotwiskunde.nl/ebooks/11
[11/100] https://books.flotwiskunde.nl/ebooks/37106
[12/100] https://books.flotwiskunde.nl/ebooks/2554
[13/100] https://books.flotwiskunde.nl/ebooks/768
[14/100] https://books.flotwiskunde.nl/ebooks/1260
[15/100] https://books.flotwiskunde.nl/ebooks/67979
[16/100] https://books.flotwiskunde.nl/ebooks/16389
[17/100] https://books.flotwiskunde.nl/ebooks/16328
[18/100] https://books.flotwiskunde.nl/ebooks/64317
[19/100] https://books.flotwiskunde.nl/ebooks/394
[20/100] https://b