In [4]:
import requests
import bs4
import pandas
import json
import selenium
from pprint import pprint
import random
from datetime import timedelta

In [5]:
base_url = "https://www.bergsteigen.com/"

In [6]:
with open("data/urls.txt", "r") as f:
    urls = f.readlines()

urls = [url.strip().replace("\n", "") for url in urls if len(url) > 20]
urls[0:3]

['https://www.bergsteigen.com/touren/klettersteig/goldgrat-klettersteig-nauders/',
 'https://www.bergsteigen.com/touren/klettersteig/klettersteig-der-24er-hochgebirgsjaeger/',
 'https://www.bergsteigen.com/touren/klettersteig/tauernblick-klettersteig/']

In [7]:
klettersteige = {}
for url in urls:
    klettersteige[url] = {}
    response = requests.get(url)
    soup = bs4.BeautifulSoup(response.content, "html.parser")
    klettersteige[url]["soup"] = soup

In [8]:
def p():
    key = random.choice(list(klettersteige.keys()))
    print(key)
    pprint({k: v for k, v in klettersteige[key].items() if k != "soup"})

def parse_first_non_nan(string_to_parse):
    string_to_parse = string_to_parse.replace("\\n", "\n")
    parsed_data = string_to_parse.split("\n")

    for item in parsed_data:
        if item.strip() != "":
            return item.strip()

    return None

def parse_timedelta_string(timedelta_string):
    parts = timedelta_string.split()
    hours, minutes = map(int, parts[0].split(":"))
    delta = timedelta(hours=hours, minutes=minutes)
    return delta

def parse_times_and_lengths(soup):
    div = klettersteige[url]["soup"].find("div", {"data-tip": "Steighöhe / Gesamthöhe"})
    icon_info_value = div.find('span', class_='iconInfoValue').get_text(separator=" ").strip()
    info_parts = icon_info_value.split('\n')
    info_parts = [info.strip().replace('\n', ' ').strip().split()[0] for info in info_parts if len(info.strip()) > 3]

    if len(info_parts) == 4:
        kletterlänge, gesamthöhe, kletterzeit, gesamtzeit = info_parts
    else:
        kletterlänge, kletterzeit, gesamtzeit = info_parts
        gesamthöhe = None

    return kletterlänge, gesamthöhe, parse_timedelta_string(kletterzeit), parse_timedelta_string(gesamtzeit)


def parse_activity_info(soup):
    activity_info = {}
    div = klettersteige[url]["soup"].find("div", {"class": "scalaInfoContainer"})
    scala_items = soup.find_all('div', class_='scalaItem')
    
    for item in scala_items:
        label = item.find('div', class_='itemLabel').text.strip()
        if "Beste Jahreszeit" not in label:
            # Assume the rating is indicated by the size-X class
            value = item.find('div', class_='itemValue').div['class'][0].split('-')[-1]
        else:
            # Extract best months for activity
            months = item.find_all('div', class_='seasonYear')
            best_months = [month.text.strip() for month in months if 'bgGreen' in month['class']]
            value = ', '.join(best_months)
        
        activity_info[label] = value
    
    return activity_info

def get_topo_url(soup):
    div = soup.find("div", {"class": "topo"})
    a = div.find('a')
    return base_url + a['href']

def get_description(soup):
    item_wraps = soup.find_all('div', class_='itemWrap')
    
    # Create a dictionary to store the extracted details
    details = {}
    
    # Loop through each 'itemWrap' to extract information
    for item in item_wraps:
        try:
            label = item.find('div', class_='itemLabel').text.strip().rstrip(':')
            value_container = item.find('div', class_='itemValue')
            
            # Check if value contains links
            if value_container.find('a'):
                # Extract all links with their text
                links = value_container.find_all('a')
                value = {link.text.strip(): link['href'] for link in links}
            else:
                # Get text without child tags (like <small>)
                value = value_container.text.strip()
            
            # Store the label and value in the dictionary
            details[label] = value
        except AttributeError:
            pass
    
    return details

print(url)
get_description(klettersteige[url]["soup"])

https://www.bergsteigen.com/touren/klettersteig/elfer-nordwand-klettersteig/


{'Kondition': '',
 'Regionen': {'Österreich': '/region/?tx_webxbergsteigen_bergsteigen%5Baction%5D=show&tx_webxbergsteigen_bergsteigen%5Bcontroller%5D=Region&tx_webxbergsteigen_bergsteigen%5Blevel_0%5D=2&tx_webxbergsteigen_bergsteigen%5Bregion%5D=2&cHash=994ef274939f3d654331ef441aae07a1',
  'Tirol': '/region/?tx_webxbergsteigen_bergsteigen%5Baction%5D=show&tx_webxbergsteigen_bergsteigen%5Bcontroller%5D=Region&tx_webxbergsteigen_bergsteigen%5Blevel_0%5D=2&tx_webxbergsteigen_bergsteigen%5Blevel_1%5D=11&tx_webxbergsteigen_bergsteigen%5Bregion%5D=11&cHash=565fa854c4945fdba70b1c083bee8d06'},
 'Gebirge': {'Stubaier Alpen': '/gebirge-1/?tx_webxbergsteigen_bergsteigen%5Baction%5D=show&tx_webxbergsteigen_bergsteigen%5Bcontroller%5D=Mountain&tx_webxbergsteigen_bergsteigen%5Bmountain%5D=629&cHash=fb474db486103d2bb945fe08cbdaba55'},
 'Berg': 'Westlicher Elferturm\n\xa0(2483 m)',
 'Charakter': 'Der Elfer Nordwand Klettersteig ist einer der Ferrata-Klassiker im Stubaital. Dieser schöne, steile Klett

In [9]:
for url in klettersteige.keys():
    try:
        klettersteige[url]["titel"] = klettersteige[url]["soup"].find("h1").text
        klettersteige[url]["schwierigkeit"] = parse_first_non_nan(klettersteige[url]["soup"].find("div", {"data-tip": "Schwierigkeit"}).find("span", {"class": "iconInfoValue"}).text)
        klettersteige[url]["zustieg"] = parse_timedelta_string(klettersteige[url]["soup"].find("div", {"data-tip": "Zustiegszeit"}).find("span", {"class": "iconInfoValue"}).text)
        klettersteige[url]["abstieg"] = parse_timedelta_string(klettersteige[url]["soup"].find("div", {"data-tip": "Abstiegzeit"}).find("span", {"class": "iconInfoValue"}).text)
        klettersteige[url]["kletterlänge"], klettersteige[url]["gesamthöhe"], klettersteige[url]["kletterzeit"], klettersteige[url]["gesamtzeit"] = parse_times_and_lengths(klettersteige[url]["soup"])
        klettersteige[url]["topo_url"] = get_topo_url(klettersteige[url]["soup"])
        klettersteige[url]["gpx"] = base_url + klettersteige[url]["soup"].find("a", {"data-tip": "Download Track / GPX"}).get("href")

        for key, value in parse_activity_info(klettersteige[url]["soup"]).items():
            klettersteige[url][key] = value

        desc = get_description(klettersteige[url]["soup"])
        klettersteige[url]["höhe_einstieg"] = desc.get("Höhe Einstieg").split()[0] if desc.get("Höhe Einstieg") else None
        klettersteige[url]["info"] = desc.get("Infostand") if desc.get("Infostand") else None
        klettersteige[url]["region"] = list(desc.get("Regionen").keys())[-1] if desc.get("Regionen") else None
        klettersteige[url]["ausgangspunkt_höhe"] = desc.get("Ausgangspunkt / Höhe").replace("\xa0", " ").split()[-2] if desc.get("Ausgangspunkt / Höhe") else None
        klettersteige[url]["img0"] = ""
        klettersteige[url]["img1"] = ""
        klettersteige[url]["img2"] = ""
        
    except Exception as e:
        print(f"Error for {url}: {e}")
        
print()     
p()


https://www.bergsteigen.com/touren/klettersteig/rotschitza-klamm-klettersteig/
{'Beste Jahreszeit:': 'Mai, Jun, Jul, Aug, Sep, Okt',
 'Erfahrung:': '3',
 'Kondition:': '1',
 'Kraft:': '2',
 'Landschaft:': '4',
 'abstieg': datetime.timedelta(seconds=1800),
 'ausgangspunkt_höhe': '919',
 'gesamthöhe': '350',
 'gesamtzeit': datetime.timedelta(seconds=5400),
 'gpx': 'https://www.bergsteigen.com//fileadmin/userdaten/import/tracks/gpx/rotschitza_klamm_klettersteig_track.gpx',
 'höhe_einstieg': None,
 'img0': '',
 'img1': '',
 'img2': '',
 'info': None,
 'kletterlänge': '100',
 'kletterzeit': datetime.timedelta(seconds=1800),
 'region': 'Kärnten',
 'schwierigkeit': 'C',
 'titel': 'Rotschitza Klamm Klettersteig',
 'topo_url': 'https://www.bergsteigen.com//fileadmin/userdaten/tour/topo/9398/rotschitza-klamm-klettersteig-topo.png',
 'zustieg': datetime.timedelta(seconds=1800)}


In [10]:
# save to json but remove the soup object
klettersteige_copy = klettersteige.copy()
for key in klettersteige_copy.keys():
    klettersteige_copy[key].pop("soup") if "soup" in klettersteige_copy[key] else None

with open("data/klettersteige.json", "w", encoding="utf-8") as f:
    json.dump(klettersteige_copy, f, indent=4, sort_keys=True, default=str, ensure_ascii=False)