# Web Scraper Wasserstände Kanton Zürich

## Libraries and settings

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

## Web Scraper Hydrological Data

In [2]:
# Scrape the table of current readings from hydroproweb.zh.ch and turn it into
# a DataFrame in which the combined time/date cell is split into two columns.

# Send an HTTP request to the URL.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops early on an HTTP error instead of parsing an error page.
url = 'https://hydroproweb.zh.ch/Listen/AktuelleWerte/aktuelle_werte.html'
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.content

# Parse the HTML content
soup = BeautifulSoup(html_content, 'html.parser')

# Locate the first table on the page; fail with a clear message if the page
# layout changed and no table is present.
table = soup.find('table')
if table is None:
    raise ValueError(f'No <table> element found at {url}')

# Extract table headers (non-breaking spaces removed). They are kept for
# inspection only; the final column names are defined explicitly below.
headers = [header.get_text().replace('\xa0', '') for header in table.find_all('th')]

# Extract table rows: every <tr> after the first one (the header row) becomes
# a list with the cleaned text of its <td> cells.
rows = [
    [cell.get_text().replace('\xa0', '').strip() for cell in row.find_all('td')]
    for row in table.find_all('tr')[1:]
]

# Split the combined "ZeitDatum" cell (index 2) into separate "Zeit" and
# "Datum" values: the first 5 characters are the time (e.g. "11:00"), the rest
# is the date. Only rows with exactly 9 cells are kept, so each cleaned row
# ends up with the 10 values that match `columns`.
clean_rows = [
    row[:2] + [row[2][:5], row[2][5:]] + row[3:]
    for row in rows
    if len(row) == 9
]

# Define the final columns
columns = ['Gewaesser', 'Einheit', 'Zeit', 'Datum', 'Wert_Aktuell', '24h_vorher', 'Differenz', 'Mittel_24h', 'Maximum_24h', 'Minimum_24h']

# Create DataFrame
df = pd.DataFrame(clean_rows, columns=columns)

# Optionally, save the DataFrame to a CSV file
df.to_csv('hydodata_table.csv', index=False)

# Display the DataFrame as the cell output
df

Unnamed: 0,Gewaesser,Einheit,Zeit,Datum,Wert_Aktuell,24h_vorher,Differenz,Mittel_24h,Maximum_24h,Minimum_24h
0,Aa-Stegen-Wetzikon,l/s,11:00,09.11.2024,95,1'966,-1'871.2,564,1'982,29
1,Aabach-Käpfnach,l/s,11:00,09.11.2024,48,48,0.0,47,48,47
2,Aabach-Mönchaltorf,m3/s,11:00,09.11.2024,0.30,0.31,-0.010,0.28,0.32,0.23
3,Aabach-Niederuster,m3/s,11:00,09.11.2024,0.48,2.68,-2.200,1.09,3.09,0.37
4,Abistbach-Marthalen HW-RB,müM,11:00,09.11.2024,398.88,398.88,0.000,398.88,398.88,398.87
...,...,...,...,...,...,...,...,...,...,...
61,Türlersee,müM,12:30,08.11.2024,644.27,644.27,-0.003,644.27,644.27,644.27
62,Türlersee Abfluss,l/s,12:00,08.11.2024,31,32,-1.1,32,33,31
63,"Wildbach-Grosswies, Wetzikon",müM,11:00,09.11.2024,533.00,533.00,-0.002,533.00,533.00,533.00
64,Wildbach-Wetzikon,m3/s,11:00,09.11.2024,0.14,0.15,-0.007,0.13,0.16,0.09


In [None]:
# Attempt to scrape the station table from pegelalarm.at for Switzerland.
# NOTE: the page very likely needs a browser driver such as selenium to
# produce the table; with plain requests the table may simply be absent, in
# which case this cell yields an empty DataFrame.

# Send an HTTP request to the URL (the timeout keeps the cell from hanging)
url = 'https://www.pegelalarm.at/de/country.php?country=CH'
response = requests.get(url, timeout=30)
html_content = response.content

# Parse the HTML content
soup = BeautifulSoup(html_content, 'html.parser')

# Locate the table. The argument is a CSS selector, so select_one() is needed;
# find() would treat the whole string as a tag name and never match.
table = soup.select_one('div.features div.table-spacer table')

# Start from empty results so this cell never silently reuses `headers` or
# `rows` left over from an earlier cell when no table is found.
headers = []
rows = []

# Only extract data if the table was found
if table is not None:
    # Extract the table headers
    headers = [header.get_text().strip() for header in table.find_all('th')]

    # Extract the table rows; [1:] skips the header row
    rows = [
        [cell.get_text().strip() for cell in row.find_all('td')]
        for row in table.find_all('tr')[1:]
    ]

# NOTE(review): this filter and the time/date split of cell 2 appear to be
# carried over from the hydroproweb cell. The column names below contain no
# time or date field, so the logic probably does not fit this table; confirm
# against the real table layout once the page can actually be fetched.
clean_rows = [
    row[:2] + [row[2][:5], row[2][5:]] + row[3:]
    for row in rows
    if len(row) == 4
]

# Define the final columns
columns = ['Station', 'Gewässer', 'Wasserstand', 'Durchfluss', 'Map']

# Create DataFrame
df = pd.DataFrame(clean_rows, columns=columns)

# Optionally, save the DataFrame to a CSV file. A separate file name is used
# so the export of the hydroproweb cell ('hydodata_table.csv') is not
# overwritten by this (possibly empty) table.
df.to_csv('pegelalarm_table.csv', index=False)

# Display the DataFrame as the cell output
df

Unnamed: 0,Station,Gewässer,Wasserstand,Durchfluss,Map


### Jupyter notebook --footer info-- (please always provide this at the end of each notebook)

In [4]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

# Print a short summary of the execution environment: OS family, platform
# name and release, current timestamp, and Python version, framed by rulers.
separator = '-----------------------------------'
timestamp = datetime.now()

print(separator)
print(os.name.upper())
print(f"{platform.system()} | {platform.release()}")
print(f"Datetime: {timestamp:%Y-%m-%d %H:%M:%S}")
print(f"Python Version: {python_version()}")
print(separator)

-----------------------------------
POSIX
Linux | 6.5.0-1025-azure
Datetime: 2024-11-09 10:13:50
Python Version: 3.11.10
-----------------------------------
