In [1]:
import pandas as pd

from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.remote.webelement import WebElement

####
def get_head_and_table(driver):
    """
    Pair every "wikitable" table on the loaded page with the heading before it.

    A single XPath query collects all `<h2>`, `<h3>` and `<h4>` elements together
    with every `<table>` whose class attribute contains "wikitable". The result
    is walked once: each heading replaces the remembered section title, and each
    table is recorded together with the title remembered at that moment.

    Parameters:
    ----------
    driver : selenium.webdriver.remote.webdriver.WebDriver
        WebDriver instance whose current page is scanned. The page must already
        be loaded; this function does not navigate or wait.

    Returns:
    -------
    list of tuple
        One `(header_text, table_element)` tuple per matched table, in the order
        the elements were returned by the driver:
        - header_text (str): stripped text of the last heading seen before the
          table, or "" when no heading was seen yet.
        - table_element (WebElement): the table itself.

    Notes:
    -----
    The pairing assumes `find_elements` yields the matches in document order,
    so that "last heading seen" means "nearest heading above the table".

    Example Usage:
    --------------
    driver = webdriver.Firefox()
    driver.get("https://wiki.openstreetmap.org/wiki/Map_features#Primary_features")

    for section, table in get_head_and_table(driver):
        print(f"Header: {section}")
        print(f"Table HTML: {table.get_attribute('outerHTML')}")

    driver.quit()
    """
    heading_tags = ("h2", "h3", "h4")

    # Headings and tables are fetched together so they arrive interleaved.
    nodes = driver.find_elements(
        By.XPATH,
        "//h2 | //h3 | //h4 | //table[contains(@class, 'wikitable')]",
    )

    latest_heading = ""
    paired = []
    for node in nodes:
        tag = node.tag_name
        if tag == "table":
            # Attach the table to whichever heading was seen most recently.
            paired.append((latest_heading, node))
            continue
        if tag in heading_tags:
            latest_heading = node.text.strip()
    return paired

####
# Column order of every DataFrame returned by parse_table.
_PARSED_COLUMNS = ["Key", "Value", "Element", "Description", "Header", "Category"]


def _image_names(cell: WebElement) -> str:
    """
    Return the short names of all images inside a table cell, joined by " / ".

    For each `<img>` the `src` attribute is reduced to the text after the last
    "/" and the last "_", cut at the first "." that follows. Images without a
    `src` are skipped. Returns the string "None" when no usable image is found.
    """
    names = []
    for img in cell.find_elements(By.TAG_NAME, "img"):
        src = img.get_attribute("src")
        if not src:
            # get_attribute returns None for a missing attribute; skip instead of crashing.
            continue
        names.append(src.split("/")[-1].split("_")[-1].split(".")[0])
    return " / ".join(names) if names else "None"


def parse_table(table_name: str, table_element: WebElement) -> pd.DataFrame:
    """
    Parse a "wikitable" table held in a Selenium WebElement into a DataFrame.

    Two table layouts are handled, selected by the table's class attribute:

    - class contains "wikitable" but not "taginfo-taglist": rows are read one
      by one, and sub-headings found inside the table populate "Header".
    - class contains both "wikitable" and "taginfo-taglist" (in any order):
      every row with at least four `<td>` cells is read and "Header" is the
      lower-cased `table_name`.

    A table whose class does not contain "wikitable" (or has no class at all)
    yields an empty DataFrame with the standard columns.

    Parameters:
    ----------
    table_name : str
        Name of the page section the table belongs to. Stored lower-cased in
        the "Category" column and used as the initial "Header" value.

    table_element : WebElement
        The `<table>` element to parse.

    Returns:
    -------
    pd.DataFrame
        One row per parsed table row, with the columns:
        - "Key": text of the first cell (or the part before "=" for two-cell rows).
        - "Value": text of the second cell (or the part after "=" for two-cell rows).
        - "Element": short image names from the third cell joined by " / ",
          or the string "None" when there is no element column or no image.
        - "Description": text of the fourth cell when an element column
          exists, otherwise of the third cell (second cell for two-cell rows).
        - "Header": lower-cased sub-heading in effect for the row.
        - "Category": lower-cased `table_name`.

    Notes:
    -----
    - Plain "wikitable" tables: a row containing any `<th>` is never emitted as
      data. If its first `<th>` contains an element with class "mw-headline",
      that element's text becomes the "Header" for the following rows.
    - Whether a plain table has an element column is decided from the first
      row only: it needs at least three `<th>` cells, one of which reads
      "element" (case-insensitive).
    - Plain-table rows with exactly two cells are kept only when the first
      cell contains "="; it is split on the first "=" into Key and Value.
    - Rows with one or three cells, and taginfo rows with fewer than four
      cells, are skipped. Cells beyond the fourth are ignored.
    - A table without any `<tr>` yields an empty DataFrame.
    """
    # get_attribute returns None when the attribute is absent; treat as "no class".
    table_class = table_element.get_attribute("class") or ""
    category = table_name.lower()
    rows_data = []

    if "wikitable" not in table_class:
        return pd.DataFrame(rows_data, columns=_PARSED_COLUMNS)

    rows = table_element.find_elements(By.TAG_NAME, "tr")

    if "taginfo-taglist" in table_class:
        # Case 2: taginfo tables always use Key / Value / Element / Description.
        for row in rows:
            columns = row.find_elements(By.TAG_NAME, "td")
            if len(columns) < 4:
                continue
            rows_data.append({
                "Key": columns[0].text.strip(),
                "Value": columns[1].text.strip(),
                "Element": _image_names(columns[2]),
                "Description": columns[3].text.strip(),
                "Header": category,
                "Category": category,
            })
        return pd.DataFrame(rows_data, columns=_PARSED_COLUMNS)

    # Case 1: plain "wikitable" tables.
    header_text = table_name  # replaced whenever an in-table headline is met
    has_element_column = False
    if rows:  # guard: a table without rows has no header row to inspect
        header_cells = rows[0].find_elements(By.TAG_NAME, "th")
        if len(header_cells) >= 3:
            header_labels = [th.text.strip().lower() for th in header_cells]
            has_element_column = "element" in header_labels

    for row in rows:
        th_elements = row.find_elements(By.TAG_NAME, "th")
        if th_elements:
            # Header rows carry no data; only a "mw-headline" updates the sub-heading.
            headlines = th_elements[0].find_elements(By.CLASS_NAME, "mw-headline")
            if headlines:
                header_text = headlines[0].text.strip()
            continue

        columns = row.find_elements(By.TAG_NAME, "td")

        if len(columns) == 2:
            # Two-cell layout: "key=value" in the first cell, description in the second.
            key_value = columns[0].text.strip()
            if "=" not in key_value:
                continue
            key, _, value = key_value.partition("=")
            rows_data.append({
                "Key": key.strip(),
                "Value": value.strip(),
                "Element": "None",
                "Description": columns[1].text.strip(),
                "Header": header_text.lower(),
                "Category": category,
            })
        elif len(columns) >= 4:
            if has_element_column:
                element = _image_names(columns[2])
                description = columns[3].text.strip()
            else:
                # No element column: the third cell already holds the description.
                element = "None"
                description = columns[2].text.strip()
            rows_data.append({
                "Key": columns[0].text.strip(),
                "Value": columns[1].text.strip(),
                "Element": element,
                "Description": description,
                "Header": header_text.lower(),
                "Category": category,
            })

    return pd.DataFrame(rows_data, columns=_PARSED_COLUMNS)

In [7]:
import time

from selenium.common.exceptions import TimeoutException

# Path to your geckodriver
driver_path = r'C:\Users\lilou\geckodriver\geckodriver-v0.35.0-win64\geckodriver.exe'  # Adjust if your path is different
url = "https://wiki.openstreetmap.org/wiki/Map_features#Primary_features"

# The taginfo-taglist tables are filled in by JavaScript after the initial page
# load, so the page is re-read until at least this many tables are present.
MIN_TABLES = 30
TABLE_WAIT_SECONDS = 120  # give up instead of polling forever
POLL_SECONDS = 1          # pause between reads so the browser is not hammered

# Initialize the Firefox service and WebDriver
service = Service(executable_path=driver_path)
driver = webdriver.Firefox(service=service)

try:
    # Open the page and wait until the <body> element exists
    driver.get(url)
    WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.TAG_NAME, "body")))

    # Get all tables with their page section, re-reading until enough have rendered
    deadline = time.monotonic() + TABLE_WAIT_SECONDS
    tables = get_head_and_table(driver)
    while len(tables) < MIN_TABLES:
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Only {len(tables)} of {MIN_TABLES} expected tables loaded "
                f"within {TABLE_WAIT_SECONDS} seconds."
            )
        print('DRIVER: reading again')
        time.sleep(POLL_SECONDS)
        tables = get_head_and_table(driver)
except TimeoutException:
    # Raised by WebDriverWait; stop here rather than continuing with a closed driver.
    print("Error: Page took too long to load or element was not found.")
    driver.quit()
    raise
except Exception:
    # Any other failure: close the browser so it is not left running, then re-raise.
    driver.quit()
    raise

DRIVER: reading again
DRIVER: reading again


In [8]:
# Page sections whose tables are not tag lists and are therefore skipped.
exclude_tables = ['Additional attributes','Addresses','Annotations','Name','Properties','References','Restrictions']
tables_run = [tab for tab in tables if tab[0] not in exclude_tables]

# Parse every table first and concatenate once at the end; the browser is
# closed even if parsing fails part-way through.
frames = []
try:
    for i, (name, element) in enumerate(tables_run, start=1):
        print(f'Parsing Table {i}/{len(tables_run)}: {name}')
        tdf = parse_table(name, element)
        if not tdf.empty:
            frames.append(tdf)
finally:
    driver.quit()

output_columns = ['Category','Header','Key','Value','Description','Element']
if frames:
    df = pd.concat(frames, ignore_index=True)
else:
    # Nothing was parsed: keep the expected columns so the export below still works.
    df = pd.DataFrame(columns=output_columns)

df = df[output_columns].rename(columns={"Category":"Key_category",'Header':"Key_subcategory"})
df.to_csv('osm_tags_20240823.csv', index=False)

df.head()

Parsing Table 1/43: Aerialway
Parsing Table 2/43: Aeroway
Parsing Table 3/43: Amenity
Parsing Table 4/43: Linear barriers
Parsing Table 5/43: Access control on highways
Parsing Table 6/43: Boundary
Parsing Table 7/43: Building
Parsing Table 8/43: Craft
Parsing Table 9/43: Medical rescue
Parsing Table 10/43: Firefighters
Parsing Table 11/43: Lifeguards
Parsing Table 12/43: Lifeguards
Parsing Table 13/43: Assembly point
Parsing Table 14/43: Other structure
Parsing Table 15/43: Geological
Parsing Table 16/43: Healthcare
Parsing Table 17/43: Highway
Parsing Table 18/43: Historic
Parsing Table 19/43: Landuse
Parsing Table 20/43: Leisure
Parsing Table 21/43: Man made
Parsing Table 22/43: Military
Parsing Table 23/43: Vegetation
Parsing Table 24/43: Water related
Parsing Table 25/43: Geology related
Parsing Table 26/43: Office
Parsing Table 27/43: Administratively declared places
Parsing Table 28/43: Populated settlements, urban
Parsing Table 29/43: Populated settlements, urban and rural
Pars

Unnamed: 0,Key_category,Key_subcategory,Key,Value,Description,Element
0,aerialway,aerialway,aerialway,cable_car,A cable car run. Just one or two large cars. T...,way
1,aerialway,aerialway,aerialway,gondola,An aerialway where the cabins go around in a c...,way
2,aerialway,aerialway,aerialway,mixed_lift,"A mixed lift, containing both gondolas and cha...",way
3,aerialway,aerialway,aerialway,chair_lift,An open chairlift run. These have one or more ...,way
4,aerialway,aerialway,aerialway,drag_lift,An overhead tow-line for skiers and riders.,way
