# To-Do
- Fix the parsing errors in the code
- Check the consistency of the output compared to the web source

# Code

In [27]:
import inspect
import re
import time
from datetime import datetime
from urllib.parse import urldefrag, urljoin

import requests
from bs4 import BeautifulSoup
from html2text import HTML2Text

**Example code from ChatGPT**

In [None]:
# Root URL of the Python 3 documentation; page paths are appended to it.
BASE_URL = "https://docs.python.org/3/"
# Path, relative to BASE_URL, of the page whose links seed the scrape.
START_PAGE = "howto/logging.html"

def get_links_from_index(start_url):
    """
    Collects the URLs of the "howto" documentation pages linked from a page.

    Every ``href`` is first resolved against ``start_url`` (the way a browser
    would), so page-relative links such as ``logging-cookbook.html`` are kept
    instead of being silently discarded, and ``#fragment`` suffixes are
    removed so that several anchors into one page yield a single URL.

    Parameters:
    - start_url (str): Absolute URL of the page whose links are inspected.

    Returns:
    - list[str]: Unique absolute URLs located under ``BASE_URL + "howto/"``
      and ending in ``.html``, in order of first appearance on the page.

    Raises:
    - requests.HTTPError: If the server answers with an error status code.
    - requests.RequestException: On connection problems or timeout.
    """
    # A timeout prevents the scrape from hanging forever on a stalled socket.
    response = requests.get(start_url, timeout=10)
    # Fail loudly rather than parse an error page for links.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    howto_prefix = BASE_URL + "howto/"
    # A dict is used as an ordered set: it removes duplicates while keeping
    # the order in which links appear, so the result is deterministic.
    links = {}
    for a in soup.select("a[href]"):
        full_url, _fragment = urldefrag(urljoin(start_url, a["href"]))
        if full_url.startswith(howto_prefix) and full_url.endswith(".html"):
            links[full_url] = None

    return list(links)

def scrape_page(url):
    """
    Downloads a page and extracts its title and paragraph text as Markdown.

    Parameters:
    - url (str): Absolute URL of the page to scrape.

    Returns:
    - str: A Markdown fragment made of a ``# <title>`` heading (``Untitled``
      when the page has no ``<h1>``), the text of every ``<p>`` element joined
      by newlines, and a trailing ``---`` separator line.

    Raises:
    - requests.HTTPError: If the server answers with an error status code.
    - requests.RequestException: On connection problems or timeout.
    """
    # A timeout prevents the scrape from hanging forever on a stalled socket.
    response = requests.get(url, timeout=10)
    # Without this check the body of an error page would be saved as content.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Look the heading up once instead of searching the tree twice.
    heading = soup.find("h1")
    title = heading.get_text(strip=True) if heading is not None else "Untitled"
    content = "\n".join(p.get_text(strip=True) for p in soup.find_all("p"))

    return f"# {title}\n\n{content}\n\n---\n"

def save_to_file(content, filename="docs_python.md"):
    """
    Writes the extracted text to a file, replacing any existing content.

    Parameters:
    - content (str): The text to write.
    - filename (str): Path of the destination file; it is opened in write
      mode with UTF-8 encoding.
    """
    with open(filename, mode="w", encoding="utf-8") as output_file:
        output_file.write(content)

def main():
    """
    Scrapes every page linked from the start page into one Markdown file.

    The links are taken from ``BASE_URL + START_PAGE``; each page is scraped
    in turn with a one-second pause between requests, and the concatenated
    result is written with ``save_to_file`` using its default filename.
    """
    links = get_links_from_index(BASE_URL + START_PAGE)

    pages = []
    for idx, link in enumerate(links):
        print(f"Scraping {idx+1}/{len(links)}: {link}")
        pages.append(scrape_page(link))
        time.sleep(1)  # pause between requests to avoid overloading the server
    all_content = "".join(pages)

    save_to_file(all_content)
    print("Scraping terminé ! Fichier `docs_python.md` généré.")


**Functional test code**

In [None]:
def get_html_element(element: str, soup: "BeautifulSoup") -> str:
    """
    Searches for the first occurrence of a specified HTML element in a BeautifulSoup object and returns its text.

    Parameters:
    - element (str): The tag name of the HTML element to search for (e.g., 'h1', 'div').
    - soup (BeautifulSoup): A BeautifulSoup object containing the parsed HTML document.

    Returns:
    - str: The text of the first occurrence of the specified element if found; otherwise, an empty string
      (a notice naming the missing element is printed in that case).
    """
    result = soup.find(element)
    # Test against None explicitly: "not found" is the only failure case.
    if result is None:
        # The original message had a stray "$" before the placeholder.
        print(f"No element {element} found.")
        return ""
    return result.text

# Download one documentation page, convert it to Markdown and save it as
# "<YYYY-MM-DD>-<slugified-title>.md".
url = 'https://docs.python.org/3/howto/logging.html'
# A timeout prevents the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=10)
if (error := response.status_code) == 200:
    html_content = response.text
else:
    # Raising a bare string is itself a TypeError; raise a real exception so
    # the status code actually reaches the user.
    raise RuntimeError(f"Status code error: {error}")

### define soup
soup = BeautifulSoup(html_content, "html.parser")

### get title
title = get_html_element('h1', soup)  # for front matter
# Collapse every run of non-word characters (spaces, punctuation, the
# heading's permalink glyph, path separators) into a single hyphen so the
# title is always usable as a filename.
title_name = re.sub(r"\W+", "-", title.lower()).strip("-")  # for filename

### get subtitle
subtitle = get_html_element('h2', soup)  # for front matter

### code blocks
# Wrap every <pre> element in Markdown code fences before conversion.
html_content = html_content.replace("<pre", "```<pre")
html_content = html_content.replace("</pre>", "</pre>```")

### text separators
# Find all elements with role="separator"
separator_elements = soup.find_all(attrs={"role": "separator"})

# replace with <hr> element, markdown recognizes this
# NOTE(review): str(element) is BeautifulSoup's serialization of the tag; if
# it differs from the raw HTML text the replace is a silent no-op — confirm.
for element in separator_elements:
    html_content = html_content.replace(str(element), "<hr>")

html_converter = HTML2Text()
html_converter.ignore_links = False  # keep hyperlinks in the Markdown output
markdown_content = html_converter.handle(html_content)

### get formatted date
today = datetime.now()
formatted_date_str = today.strftime("%Y-%m-%d")

### save file
# NOTE: the path has no directory component, so the file is written to the
# current working directory (not to a _posts folder).
filename = f"{formatted_date_str}-{title_name}.md"

with open(filename, 'w', encoding='utf-8') as file:
    file.write(markdown_content)