In [None]:
import os
import re
import requests
from pathlib import Path
from typing import List, Tuple


def find_markdown_files(directory: Path) -> List[Path]:
    """
    Collect every ``*.md`` file located anywhere beneath ``directory``.

    Args:
        directory (Path): Root of the tree to walk.

    Returns:
        List[Path]: Paths of the Markdown files found, in the order the
        recursive glob yields them.
    """
    pattern = "*.md"
    # Materialise the lazy glob so callers can take len() and iterate again.
    return [*directory.rglob(pattern)]


# Markdown image: ![alt](URL) or ![alt](URL "title").
# The URL part must not contain whitespace or ")" so that the match cannot
# run past the end of one image into a following link target (as happens
# with linked images of the form [![alt](small.png "title")](large.png)).
_IMAGE_URL_RE = re.compile(
    r'!\[.*?\]'                              # ![alt text]
    r'\(\s*'                                 # opening parenthesis
    r'(https?://[^\s)]+?\.(?:jpg|png))'      # captured http(s) URL
    r'''(?:\s+(?:"[^"]*"|'[^']*'))?'''       # optional quoted title
    r'\s*\)'                                 # closing parenthesis
)


def extract_image_urls(markdown_content: str) -> List[str]:
    """
    Extract the URLs of all remote .jpg/.png images in the Markdown content.

    Only ``http``/``https`` URLs whose path ends in ``.jpg`` or ``.png`` are
    returned. An optional image title (``![alt](url "title")``) is accepted
    and is not part of the returned URL.

    Args:
        markdown_content (str): The content of a Markdown file.

    Returns:
        List[str]: The image URLs in order of appearance (duplicates kept).
    """
    return _IMAGE_URL_RE.findall(markdown_content)

def download_image(url: str, destination: Path) -> None:
    """
    Download an image from a URL and save it to the destination.

    The body is streamed into a sibling ``<name>.part`` file and only renamed
    to ``destination`` once the download finished, so an interrupted transfer
    never leaves a truncated file at ``destination``.

    Failures are not raised: they are appended to ``error_log.txt`` in the
    current working directory and reported on stdout, so that one bad URL
    does not abort a batch run.

    Args:
        url (str): The URL of the image.
        destination (Path): The path to save the image.
    """
    destination = Path(destination)
    partial = destination.with_name(destination.name + ".part")
    try:
        # The context manager releases the connection even when the status
        # check or the streaming loop raises.
        with requests.get(url, stream=True, timeout=10) as response:
            response.raise_for_status()
            with open(partial, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
        partial.replace(destination)
    except Exception as e:  # deliberate best-effort boundary: log and go on
        try:
            partial.unlink()
        except OSError:
            # Nothing to clean up (the partial file was never created) or it
            # cannot be removed; the original error is what gets reported.
            pass
        log_file = Path("error_log.txt")
        with log_file.open("a", encoding="utf-8") as log:
            log.write(f"Failed to download {url}: {e}\n")
        print(f"Error downloading {url}. Logged to {log_file}.")

def replace_image_urls(markdown_content: str, url_map: List[Tuple[str, str]]) -> str:
    """
    Substitute remote image URLs in a Markdown document with local paths.

    The pairs are applied one after another, in list order, each as a plain
    (non-regex) replacement of every occurrence in the text.

    Args:
        markdown_content (str): The content of a Markdown file.
        url_map (List[Tuple[str, str]]): ``(old_url, new_path)`` pairs.

    Returns:
        str: The Markdown content after all replacements.
    """
    rewritten = markdown_content
    for mapping in url_map:
        remote_url, local_ref = mapping
        rewritten = rewritten.replace(remote_url, local_ref)
    return rewritten


def process_markdown_files(directory: Path, assets_folder: Path) -> None:
    """
    Process all Markdown files in the directory:
    - Extract image URLs.
    - Download images to the assets folder (skipping files already present).
    - Replace URLs in the Markdown files with local paths.

    A URL is only rewritten when its image actually exists locally after the
    download attempt; URLs whose download failed are left untouched so the
    document keeps its working remote link. Files whose content does not
    change are not rewritten.

    Args:
        directory (Path): The directory containing Markdown files.
        assets_folder (Path): The folder to store downloaded images.

    Raises:
        ValueError: If ``assets_folder`` is not located inside ``directory``
            (the local paths are expressed relative to ``directory``).
    """
    assets_folder.mkdir(parents=True, exist_ok=True)
    md_files = find_markdown_files(directory)
    print(f"found {len(md_files)} markdown files")
    for markdown_file in md_files:
        print(f"Processing {markdown_file}")
        with open(markdown_file, "r", encoding="utf-8") as file:
            content = file.read()

        image_urls = extract_image_urls(content)
        print(f"found {len(image_urls)} images in {markdown_file.name}")
        url_map = []

        # dict.fromkeys de-duplicates while keeping first-seen order, so a
        # URL used several times in one file is attempted only once.
        for url in dict.fromkeys(image_urls):
            image_name = url.split("/")[-1]
            local_path = assets_folder / image_name
            if not local_path.exists():
                download_image(url, local_path)
            if not local_path.exists():
                # The download failed; keep the remote URL in the document
                # instead of pointing it at a file that does not exist.
                continue
            # as_posix() keeps forward slashes in the Markdown link on every
            # platform.
            # NOTE(review): the path is relative to `directory`, not to the
            # Markdown file's own folder — confirm this is what the renderer
            # expects for files found in subdirectories.
            url_map.append((url, local_path.relative_to(directory).as_posix()))

        updated_content = replace_image_urls(content, url_map)
        if updated_content == content:
            # Nothing to rewrite; leave the file untouched.
            continue

        with open(markdown_file, "w", encoding="utf-8") as file:
            file.write(updated_content)


if __name__ == "__main__":
    # Use the current working directory as the Markdown root and store the
    # downloaded images in an "assets" folder directly beneath it.
    markdown_directory = Path.cwd()
    assets_directory = markdown_directory.joinpath("assets")

    # Download the images and rewrite the Markdown files in place.
    process_markdown_files(markdown_directory, assets_directory)

found 68 markdown files
Processing /home/richard/programmieren/spurtreu/posts/2022-08-27-wie-reist-man-heutzutage.md
found 0 images in 2022-08-27-wie-reist-man-heutzutage.md
Processing /home/richard/programmieren/spurtreu/posts/2023-05-01-heraus-zum-ersten-mai.md
found 0 images in 2023-05-01-heraus-zum-ersten-mai.md
Processing /home/richard/programmieren/spurtreu/posts/2022-09-27-overrated-overtouristed.md
found 0 images in 2022-09-27-overrated-overtouristed.md
Processing /home/richard/programmieren/spurtreu/posts/2022-10-17-salve.md
found 0 images in 2022-10-17-salve.md
Processing /home/richard/programmieren/spurtreu/posts/2022-10-30-korrumpel.md
found 0 images in 2022-10-30-korrumpel.md
Processing /home/richard/programmieren/spurtreu/posts/2023-06-12-sachsen-rundfahrt-.md
found 0 images in 2023-06-12-sachsen-rundfahrt-.md
Processing /home/richard/programmieren/spurtreu/posts/2022-10-05-baggern.md
found 3 images in 2022-10-05-baggern.md
Processing /home/richard/programmieren/spurtreu/

HTTPError: 400 Client Error: Bad Request for url: https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEg6LluVrS1hp3O-Inii7-0y2IQuXLeJhLhP5lxwGo4XkqOcvSyPw_8NhKuNFuQQdvQg7x8R-oWplNQqmXqbCB1_6TlqISyv1ZeAmrA4jdEIe-x1wucbo5fiLLL43vBlwFJ5L0ZZMow7CL4/w386-h640/1664641343013793-0.png%20%22Radweg%20in%20Tirana%22)%5D(https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEg6LluVrS1hp3O-Inii7-0y2IQuXLeJhLhP5lxwGo4XkqOcvSyPw_8NhKuNFuQQdvQg7x8R-oWplNQqmXqbCB1_6TlqISyv1ZeAmrA4jdEIe-x1wucbo5fiLLL43vBlwFJ5L0ZZMow7CL4/s1600/1664641343013793-0.png