# Collect all links

In [5]:
from pathlib import Path
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

import yaml


def collect_links(start_url, *, timeout=30):
    """Collect the links on the page at ``start_url`` that point at or below it.

    Only the single page at ``start_url`` is fetched; linked pages are not
    crawled. Every ``<a href=...>`` on the page is resolved to an absolute
    URL and kept if that URL begins with ``start_url``. URLs that differ only
    in their query string or fragment are kept as separate entries.

    Args:
        start_url: URL of the page to scan. It is also the prefix a resolved
            link must start with in order to be kept.
        timeout: Seconds to wait for the server, passed to ``requests.get``.
            Without a timeout a stalled server would block the call forever.

    Returns:
        A set of absolute URL strings.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(start_url, timeout=timeout)
    response.raise_for_status()  # Raise an exception for HTTP errors

    soup = BeautifulSoup(response.text, "html.parser")

    # Relative hrefs must be resolved against the URL of the document that
    # contains them (after redirects), not against the bare scheme://host.
    # Joining against the host alone turns "article" found on ".../39/" into
    # ".../article", which then fails the prefix test below and is dropped.
    page_url = response.url

    absolute_urls = (
        urljoin(page_url, tag["href"]) for tag in soup.find_all("a", href=True)
    )
    return {url for url in absolute_urls if url.startswith(start_url)}


def collect_ordered_list_links(
    start_url,
    *,
    selector="body > div.seite > div > div > nav.ausgabe > ol",
    timeout=30,
):
    """Return the links found inside one list element of a page, in page order.

    The page at ``start_url`` is fetched, the first element matching
    ``selector`` is located, and every ``<a href=...>`` inside it is resolved
    to an absolute URL relative to the final (post-redirect) page URL.

    Args:
        start_url: URL of the page to fetch.
        selector: CSS selector of the container whose anchors are collected.
            The default is the ``<ol>`` inside ``nav.ausgabe`` that this
            notebook targets.
        timeout: Seconds to wait for the server, passed to ``requests.get``.
            Without a timeout a stalled server would block the call forever.

    Returns:
        A list of absolute URL strings in document order; an empty list when
        nothing matches ``selector``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(start_url, timeout=timeout)
    response.raise_for_status()  # Raise an exception for HTTP errors

    soup = BeautifulSoup(response.text, "html.parser")

    # select_one() yields None when the selector matches nothing
    container = soup.select_one(selector)
    if container is None:
        return []

    # Resolve against the URL actually served, so relative hrefs stay correct
    # even when the request was redirected.
    page_url = response.url
    return [
        urljoin(page_url, tag["href"])
        for tag in container.find_all("a", href=True)
    ]


def save_links_to_yaml(links: list[str], file_path: Path) -> None:
    """Write ``links`` to ``file_path`` as a YAML mapping with a ``links`` key.

    Args:
        links: The URLs to store. Any iterable is accepted; it is copied into
            a list before being dumped.
        file_path: Destination file. An existing file is overwritten, and
            missing parent directories are created.

    Raises:
        OSError: If the directory or the file cannot be created or written.
    """
    target = Path(file_path)
    # open() does not create missing directories, so make sure the parent
    # exists instead of failing with FileNotFoundError.
    target.parent.mkdir(parents=True, exist_ok=True)

    data = {"links": list(links)}
    # Fix the encoding so the file does not depend on the platform default.
    with target.open("w", encoding="utf-8") as file:
        yaml.dump(data, file)


# Example usage: collect the article links of one issue, put the issue's own
# URL in front of them, print every entry and store the list as YAML.
start_url = "https://fahrradzukunft.de/39/"
links = [start_url, *collect_ordered_list_links(start_url)]
for link in links:
    print(link)

output_file = Path("data") / "links.yml"
save_links_to_yaml(links, output_file)

https://fahrradzukunft.de/39/
https://fahrradzukunft.de/39/leserbriefe
https://fahrradzukunft.de/39/hydraulische-bremsschalthebel-mit-mtb-komponenten-verbinden
https://fahrradzukunft.de/39/inkompatible-schaltkomponenten-mit-adaptern-verbinden
https://fahrradzukunft.de/39/fahrradanhaenger-und-fahrradtransportbox
https://fahrradzukunft.de/39/lieblingsrad
https://fahrradzukunft.de/39/leserumfrage
https://fahrradzukunft.de/39/wissenschaft-hamburgs-wetter-ist-perfekt-zum-radfahren
https://fahrradzukunft.de/39/tobis-fahrradgeschichten
https://fahrradzukunft.de/39/hohlspeiche
https://fahrradzukunft.de/39/impressum


In [3]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def collect_ordered_list_links(
    start_url,
    *,
    selector='body > div.seite > div > div > nav.ausgabe > ol',
    timeout=30,
):
    """Return the links found inside one list element of a page, in page order.

    The page at ``start_url`` is fetched, the first element matching
    ``selector`` is located, and every ``<a href=...>`` inside it is resolved
    to an absolute URL relative to the final (post-redirect) page URL.

    Args:
        start_url: URL of the page to fetch.
        selector: CSS selector of the container whose anchors are collected.
            The default is the ``<ol>`` inside ``nav.ausgabe`` that this
            notebook targets.
        timeout: Seconds to wait for the server, passed to ``requests.get``.
            Without a timeout a stalled server would block the call forever.

    Returns:
        A list of absolute URL strings in document order; an empty list when
        nothing matches ``selector``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(start_url, timeout=timeout)
    response.raise_for_status()  # Raise an exception for HTTP errors

    soup = BeautifulSoup(response.text, 'html.parser')

    # select_one() yields None when the selector matches nothing
    container = soup.select_one(selector)
    if container is None:
        return []

    # Resolve against the URL actually served, so relative hrefs stay correct
    # even when the request was redirected.
    page_url = response.url
    return [
        urljoin(page_url, tag['href'])
        for tag in container.find_all('a', href=True)
    ]

# Example usage: print the article links of one issue as a list numbered
# from 1.
start_url = "https://fahrradzukunft.de/39/"
links = collect_ordered_list_links(start_url)

for index, link in enumerate(links, 1):
    print(f"{index}. {link}")

1. https://fahrradzukunft.de/39/leserbriefe
2. https://fahrradzukunft.de/39/hydraulische-bremsschalthebel-mit-mtb-komponenten-verbinden
3. https://fahrradzukunft.de/39/inkompatible-schaltkomponenten-mit-adaptern-verbinden
4. https://fahrradzukunft.de/39/fahrradanhaenger-und-fahrradtransportbox
5. https://fahrradzukunft.de/39/lieblingsrad
6. https://fahrradzukunft.de/39/leserumfrage
7. https://fahrradzukunft.de/39/wissenschaft-hamburgs-wetter-ist-perfekt-zum-radfahren
8. https://fahrradzukunft.de/39/tobis-fahrradgeschichten
9. https://fahrradzukunft.de/39/hohlspeiche
10. https://fahrradzukunft.de/39/impressum
