# Advanced Machine Learning – Natural Language Processing
By: **Ivan Runov** & **Sebastiaan Craens** (group 8)

### Practical Assignment 1: Web Scraping

Build an automated pipeline that:
- Collects book links from the public ranking page
- Visits individual book pages to extract relevant properties (metadata)
- Downloads the plain text versions of all English books
- Stores the data in a structured and reproducible way

By doing so, you will simulate a simplified version of a web data ingestion workflow as used in industry and research.

You will use the stored texts and properties in subsequent assignments in this course.

### Imports

In [138]:
import io
import os
import re
import json
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin
#https://books.flotwiskunde.nl

### Settings

In [139]:
# Ranking page to scrape (page containing the top 100 EBooks of the last 30 days)
TOP_URL = "https://books.flotwiskunde.nl/browse/scores/top"
# Tag name and id of the header that precedes our target list
# (id determined using the browser's developer tools)
LIST_HEADER_TAG = "h2"
LIST_HEADER_ID = "books-last30"
# Id of the metadata table on a book page (determined using the browser's developer tools)
METADATA_TABLE_ID = "about_book_table"

# HTTP headers sent with requests; the User-Agent identifies the scraper as a Python script.
HEADERS = {"User-Agent": "Python Requests"}

# Output directories (TEXT_DIR is a subdirectory of OUT_DIR)
OUT_DIR = "data" # Stores metadata
TEXT_DIR = os.path.join(OUT_DIR, "texts") # Stores downloaded plaintext files

### Setup


In [140]:
# Create both output directories up front; directories that already exist are left as they are
for directory in (OUT_DIR, TEXT_DIR):
    os.makedirs(directory, exist_ok=True)

# Function for getting soup (requesting and parsing a page)
def get_soup(url: str, timeout: float = 30.0) -> BeautifulSoup:
    """Request a page and return its BeautifulSoup object.

    Args:
        url: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection.

    Returns:
        The response body parsed with the "html.parser" backend.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: For connection errors and timeouts.
    """
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    response.raise_for_status() # Raise an error for bad responses
    return BeautifulSoup(response.text, "html.parser")

# Functions for coloring terminal output (for better readability)
def red(text: str) -> str:
    """Wrap text in the ANSI escape codes that render it bright red."""
    start, reset = "\033[91m", "\033[0m"
    return "{}{}{}".format(start, text, reset)
def green(text: str) -> str:
    """Wrap text in the ANSI escape codes that render it bright green."""
    start, reset = "\033[92m", "\033[0m"
    return "{}{}{}".format(start, text, reset)
def red_or_green(text: str, condition: bool) -> str:
    """Color text green when condition is truthy, red otherwise."""
    if condition:
        return green(text)
    return red(text)

### Scraping the list

In [141]:
# Get the top page and parse it using BeautifulSoup.
soup = get_soup(TOP_URL)

# Find the specific header that introduces our target list.
target_header = soup.find(LIST_HEADER_TAG, id=LIST_HEADER_ID)
# Raise explicitly rather than assert: assert statements are removed when
# Python runs with -O, which would let a None header slip through.
if target_header is None:
    raise ValueError(f"Could not find header '{LIST_HEADER_ID}' in the page")
print(f"Found target header: {target_header}")

# From our scouting we know that this header is followed by an ordered list (<ol>) containing the top 100 books.
# find_next_sibling returns the first following sibling with that tag name, or None if there is none.
book_list = target_header.find_next_sibling("ol")
if book_list is None:
    raise ValueError("Could not find the ordered list of top 100 books.")

# Every link (<a> with an href) inside the list points to one book page.
top_books = book_list.find_all("a", href=True)
print(f"List containing {len(top_books)} links found:")
print('\n'.join(str(book) for book in top_books[:3]) + '\n...')

Found target header: <h2 id="books-last30">Top 100 EBooks last 30 days</h2>
List containing 100 links found:
<a href="/ebooks/84.html">Frankenstein; Or, The Modern Prometheus by Mary Wollstonecraft Shelley (145303)</a>
<a href="/ebooks/2701.html">Moby Dick; Or, The Whale by Herman Melville (115857)</a>
<a href="/ebooks/1342.html">Pride and Prejudice by Jane Austen (85439)</a>
...


### Scraping the books

In [None]:
# Seconds to wait for the server when downloading a text file, so that a
# stalled connection cannot hang the whole run.
DOWNLOAD_TIMEOUT = 30

print("Processing books...")

metadata_list = []
# Iterate over the list of top books and process each one.
# The metadata is saved in a list of dictionaries, which we will save at the end.
# A book only ends up in the list when its text file was actually stored, so
# every metadata record has a matching file in TEXT_DIR.
for i, book in enumerate(top_books, start=1):
    url = urljoin(TOP_URL, book['href'])
    print(f"[{i}/{len(top_books)}] ({url})", end='')

    # Get the book's page and parse it
    soup = get_soup(url)

    # Find metadata table (id comes from the settings, not a local copy)
    meta_table = soup.find("table", id=METADATA_TABLE_ID)
    # Raise explicitly rather than assert: assert statements are removed under -O.
    if meta_table is None:
        raise ValueError(
            f"Could not find metadata table with id '{METADATA_TABLE_ID}' in page {url}"
        )

    # Parse metadata table using pandas and convert into dictionary
    # (first column holds the field names, second column the values)
    meta = pd.read_html(io.StringIO(str(meta_table)))[0].set_index(0).loc[:, 1].to_dict()

    # Skip book if not English (.get() so a missing "Language" row skips the
    # book instead of raising a KeyError)
    if meta.get("Language") != "English":
        print(red(" Not English => Skipping..."))
        continue

    # Get relevant metadata (using .get() to avoid KeyErrors for missing fields)
    meta_relevant = {
        "url":                   url,
        "title":                 meta.get("Title", ""),
        # "author":                meta.get("Author", ""),
        # "credits":               meta.get("Credits", ""),
        "reading_level":         meta.get("Reading Level", ""),
        "ebook_no":              meta.get("EBook-No.", ""),
        "language":              meta.get("Language", ""),
        "release_date":          meta.get("Release Date", ""),
        "most_recently_updated": meta.get("Most Recently Updated", ""),
        "downloads":             meta.get("Downloads", ""),
    }

    # Find plaintext download link (if available)
    download_link = soup.find("a", string="Plain Text UTF-8")
    if download_link is None:
        print(red(" No plaintext download => Skipping..."))
        continue
    meta_relevant["plaintext_url"] = urljoin(url, download_link['href'])

    # Download plaintext file
    out_path = os.path.join(TEXT_DIR, f"{meta_relevant['ebook_no']}.txt")
    try:
        # Request the plaintext file
        txt_resp = requests.get(
            meta_relevant["plaintext_url"],
            headers=HEADERS,
            timeout=DOWNLOAD_TIMEOUT,
        )
        txt_resp.raise_for_status()
        txt_resp.encoding = "utf-8" # Ensure correct encoding

        # Save the plaintext file. newline="" writes the line endings exactly
        # as downloaded; the default text mode would turn "\r\n" into
        # "\r\r\n" on Windows.
        with open(out_path, "w", encoding="utf-8", newline="") as f:
            f.write(txt_resp.text)

    # Network problems (incl. timeouts and bad status codes) or filesystem errors
    except (requests.RequestException, OSError) as e:
        print(red(f" Failed to download text: {e}"))
        # No text was stored, so do not record metadata for this book
        continue

    # Print success message
    print(green(f" Downloaded text => Saved to {out_path}"))

    # Append relevant metadata to list
    metadata_list.append(meta_relevant)

print(f"Books processed: {len(metadata_list)}/{len(top_books)}")


# Write the collected metadata to a CSV file (one row per book, no index column)
csv_path = os.path.join(OUT_DIR, "metadata.csv")
print("Saving metadata as csv...", end="\r")
meta_df = pd.DataFrame(metadata_list)
meta_df.to_csv(csv_path, index=False)
print("Saving metadata as csv... Done.")

# Write the same records as JSON Lines (one JSON object per line, non-ASCII kept as-is)
jsonl_path = os.path.join(OUT_DIR, "metadata.jsonl")
print("Saving metadata as jsonl...", end="\r")
with open(jsonl_path, "w", encoding="utf-8") as jsonl_file:
    jsonl_file.writelines(
        json.dumps(record, ensure_ascii=False) + "\n" for record in metadata_list
    )
print("Saving metadata as jsonl... Done.")


Processing books...
[1/100] (https://books.flotwiskunde.nl/ebooks/84.html)[92m Downloaded text => Saved to data\texts\84.txt[0m
[2/100] (https://books.flotwiskunde.nl/ebooks/2701.html)[92m Downloaded text => Saved to data\texts\2701.txt[0m
[3/100] (https://books.flotwiskunde.nl/ebooks/1342.html)[92m Downloaded text => Saved to data\texts\1342.txt[0m
[4/100] (https://books.flotwiskunde.nl/ebooks/1513.html)[92m Downloaded text => Saved to data\texts\1513.txt[0m
[5/100] (https://books.flotwiskunde.nl/ebooks/26184.html)[92m Downloaded text => Saved to data\texts\26184.txt[0m
[6/100] (https://books.flotwiskunde.nl/ebooks/100.html)[92m Downloaded text => Saved to data\texts\100.txt[0m
[7/100] (https://books.flotwiskunde.nl/ebooks/2641.html)[92m Downloaded text => Saved to data\texts\2641.txt[0m
[8/100] (https://books.flotwiskunde.nl/ebooks/43.html)[92m Downloaded text => Saved to data\texts\43.txt[0m
[9/100] (https://books.flotwiskunde.nl/ebooks/145.html)[92m Downloaded text