# Convert CAMELS data to a readable form for Frostbyte

In [2]:
from pathlib import Path
import pandas as pd
import xarray as xr
import numpy as np

In [3]:
# CAMELS data: one semicolon-separated CSV file per gauging station.
CAMELS_data_path = Path('../CH_data/CAMELS')

# CSV column -> variable name in the dataset. The insertion order of this
# mapping defines the order of the last axis of `data_array`.
column_to_variable = {
    'discharge_vol(m3/s)': 'discharge_vol',
    'waterlevel(m)': 'waterlevel',
    'precipitation(mm/d)': 'precipitation',
    'temperature_min(°C)': 'temperature_min',
    'temperature_mean(°C)': 'temperature_mean',
    'temperature_max(°C)': 'temperature_max',
    'rel_sun_dur(%)': 'rel_sun_dur',
    'swe(mm)': 'swe',
}

# glob() yields files in a filesystem-dependent order; sorting keeps the
# station axis reproducible between runs and machines.
csv_files = sorted(CAMELS_data_path.glob('*.csv'))
if not csv_files:
    raise FileNotFoundError(f'No CSV files found in {CAMELS_data_path}')

data_list = []          # one (time, variable) array per station
station_ids = []        # station ID per file, same order as data_list
time_list = []          # time axis, taken from the first file
reference_dates = None  # numpy view of the first file's dates, used for the alignment check

# Read every station file exactly once.
for file_path in csv_files:
    # The station ID is the last '_'-separated token of the file name (extension removed).
    station_id = file_path.stem.split('_')[-1]

    df = pd.read_csv(file_path, sep=';')
    # The CSV is expected to have a 'date' column formatted as YYYY-MM-DD.
    dates = pd.to_datetime(df['date'], format='%Y-%m-%d')

    if reference_dates is None:
        reference_dates = dates.to_numpy()
        time_list = dates.tolist()
    elif not np.array_equal(dates.to_numpy(), reference_dates):
        # All stations are stacked on one shared time axis, so a file with a
        # different set of dates would otherwise be misaligned silently.
        raise ValueError(
            f'{file_path.name}: dates differ from those in {csv_files[0].name}; '
            'the station files cannot share one time axis'
        )

    station_ids.append(station_id)
    data_list.append(df[list(column_to_variable)].to_numpy())

# Stack along a new station axis -> shape (time, station, variable).
data_array = np.stack(data_list, axis=1)

# Build the dataset: one (time, Station_ID) variable per CSV column.
Camels_dataset = xr.Dataset(
    {
        variable: (('time', 'Station_ID'), data_array[:, :, position])
        for position, variable in enumerate(column_to_variable.values())
    },
    coords={
        'time': ('time', time_list),
        'Station_ID': ('Station_ID', station_ids),
    },
)

# Attach the gauge longitude / latitude read from a separate CSV.
coordinates_df = pd.read_csv('../CH_data/gauge_coordinates.csv', dtype={'gauge_id': str})

# Station IDs are handled as strings throughout.
station_ids = [str(station_id) for station_id in station_ids]

# Report gauges listed in the coordinate file that have no data file.
gauge_ids = coordinates_df['gauge_id'].astype(str)
for gauge_id in gauge_ids[~gauge_ids.isin(station_ids)]:
    print(f"Gauge ID {gauge_id} not found in station_ids.")

# Look the coordinates up per station in one step. Stations that are missing
# from the coordinate file keep NaN; for a repeated gauge_id the last row wins.
gauge_table = (
    coordinates_df.assign(gauge_id=gauge_ids)
    .drop_duplicates(subset='gauge_id', keep='last')
    .set_index('gauge_id')
)
lon_array = xr.DataArray(
    gauge_table['lon'].reindex(station_ids).to_numpy(dtype=float),
    coords={'Station_ID': station_ids}, dims=['Station_ID'], name='lon',
)
lat_array = xr.DataArray(
    gauge_table['lat'].reindex(station_ids).to_numpy(dtype=float),
    coords={'Station_ID': station_ids}, dims=['Station_ID'], name='lat',
)
Camels_dataset = Camels_dataset.assign_coords(lon=lon_array, lat=lat_array)

print('this is Camels_dataset:')
print(Camels_dataset)

this is Camels_dataset:
<xarray.Dataset>
Dimensions:           (Station_ID: 331, time: 14610)
Coordinates:
  * time              (time) datetime64[ns] 1981-01-01 1981-01-02 ... 2020-12-31
  * Station_ID        (Station_ID) <U4 '2004' '2007' '2009' ... '6010' '6011'
    lon               (Station_ID) float64 7.117 6.324 6.889 ... 8.579 8.649
    lat               (Station_ID) float64 46.93 46.67 46.35 ... 45.94 45.7
Data variables:
    discharge_vol     (time, Station_ID) float64 nan nan 49.8 ... nan nan nan
    waterlevel        (time, Station_ID) float64 429.0 nan 374.4 ... nan nan nan
    precipitation     (time, Station_ID) float64 0.89 3.17 1.83 ... 0.15 0.46
    temperature_min   (time, Station_ID) float64 -1.63 -6.33 ... nan -10.32
    temperature_mean  (time, Station_ID) float64 2.02 -2.42 -6.89 ... nan -6.76
    temperature_max   (time, Station_ID) float64 3.68 0.46 -2.84 ... nan -3.65
    rel_sun_dur       (time, Station_ID) float64 0.84 0.05 31.12 ... nan 63.69
    swe       

In [4]:
# Observed discharge (Qobs) from CAMELS.
# Selecting with a list returns a new Dataset, so Camels_dataset is left
# untouched; 'discharge_vol' is the only variable kept and is renamed to 'Flow'.
Qobs_Camels = Camels_dataset[['discharge_vol']].rename({'discharge_vol': 'Flow'})

# `.sizes` is the mapping from dimension name to length
# (`Dataset.dims[...]` as a length lookup is deprecated in recent xarray).
n_stations = Qobs_Camels.sizes['Station_ID']

# Tag every station with the constant data-source label 'CAMELS'.
source = np.full(n_stations, 'CAMELS')
Qobs_Camels = Qobs_Camels.assign_coords(source=('Station_ID', source))

print('this is Qobs_Camels:')
print(Qobs_Camels)

# Spot check: print the flow series of station '2319' (IDs are strings).
# Only the lookup itself sits inside the try, so an unrelated KeyError is not hidden.
try:
    flow_2319 = Qobs_Camels.sel(Station_ID='2319')['Flow']
except KeyError:
    print("Station_ID '2319' not found in Qobs_Camels.")
else:
    flow_2319_array = flow_2319.values
    print(flow_2319_array)

# Write the discharge dataset to disk.
Qobs_Camels.to_netcdf('../CH_data/CH_input_data/Qobs_Camels.nc')


this is Qobs_Camels:
<xarray.Dataset>
Dimensions:     (Station_ID: 331, time: 14610)
Coordinates:
    lat         (Station_ID) float64 46.93 46.67 46.35 ... 45.96 45.94 45.7
  * Station_ID  (Station_ID) <U4 '2004' '2007' '2009' ... '6009' '6010' '6011'
  * time        (time) datetime64[ns] 1981-01-01 1981-01-02 ... 2020-12-31
    lon         (Station_ID) float64 7.117 6.324 6.889 ... 8.525 8.579 8.649
    source      (Station_ID) <U6 'CAMELS' 'CAMELS' ... 'CAMELS' 'CAMELS'
Data variables:
    Flow        (time, Station_ID) float64 nan nan 49.8 26.26 ... nan nan nan
[0.196 0.195 0.195 ... 0.243 0.24  0.238]


In [5]:

# SWE CAMELS: snow water equivalent, located at the catchment centroids.
from pyproj import Transformer

# Keep only 'swe'. The gauge lon/lat inherited from Camels_dataset are dropped
# because the centroid coordinates are attached below instead.
SWE_Camels = Camels_dataset[['swe']].drop_vars(['lat', 'lon'])

# Centroid longitude / latitude per gauge.
coordinates_df = pd.read_csv('../CH_data/centroids_coordinates.csv', dtype={'gauge_id': str})

# Take the station IDs from the dataset itself (as plain strings) so this cell
# does not depend on a list left over from an earlier cell.
station_ids = [str(station_id) for station_id in SWE_Camels['Station_ID'].values]

# Report centroids listed in the CSV that have no matching station.
centroid_ids = coordinates_df['gauge_id'].astype(str)
for gauge_id in centroid_ids[~centroid_ids.isin(station_ids)]:
    print(f"Gauge ID {gauge_id} not found in station_ids. (SWE)")

# Per-station lookup. Stations without a centroid row keep NaN; for a
# repeated gauge_id the last row wins.
centroid_table = (
    coordinates_df.assign(gauge_id=centroid_ids)
    .drop_duplicates(subset='gauge_id', keep='last')
    .set_index('gauge_id')
)
lon_wgs84 = centroid_table['lon'].reindex(station_ids).to_numpy(dtype=float)
lat_wgs84 = centroid_table['lat'].reindex(station_ids).to_numpy(dtype=float)

# Project WGS84 (EPSG:4326) to EPSG:21781. always_xy=True fixes the axis order
# to (lon, lat) in and (easting, northing) out.
# EPSG:21781 is CH1903 / LV03, not LV95 (EPSG:2056) as the previous comment
# and variable names claimed; the target CRS itself is left unchanged here.
# NOTE(review): the precipitation cell projects to EPSG:2056 - confirm which
# CRS the consumer of SWE_Camels.nc expects.
transformer = Transformer.from_crs("EPSG:4326", "EPSG:21781", always_xy=True)
lon_lv03, lat_lv03 = transformer.transform(lon_wgs84, lat_wgs84)

# Human-readable label per station: "<Station_ID>_centroid".
station_names = [f"{station_id}_centroid" for station_id in station_ids]

# The 'lon' / 'lat' coordinates hold the projected easting / northing.
# The station dimension is renamed to lower-case 'station_id'.
SWE_Camels = SWE_Camels.assign_coords(
    lon=('Station_ID', lon_lv03),
    lat=('Station_ID', lat_lv03),
    station_name=('Station_ID', station_names),
).rename({'Station_ID': 'station_id'})

print(SWE_Camels)


<xarray.Dataset>
Dimensions:       (station_id: 331, time: 14610)
Coordinates:
  * station_id    (station_id) <U4 '2004' '2007' '2009' ... '6009' '6010' '6011'
  * time          (time) datetime64[ns] 1981-01-01 1981-01-02 ... 2020-12-31
    lon           (station_id) float64 5.587e+05 5.049e+05 ... 6.934e+05
    lat           (station_id) float64 1.753e+05 1.593e+05 ... 1.138e+05
    station_name  (station_id) <U13 '2004_centroid' ... '6011_centroid'
Data variables:
    swe           (time, station_id) float64 nan nan nan ... 114.0 80.0 102.8


In [6]:
# Reshape SWE_Camels into the layout that is written to SWE_Camels.nc.
# xarray (xr) is already imported in the first cell of the notebook.

# 1. Rename 'station_id' to 'Station_ID' and store the IDs as strings.
#    rename() returns a new object, so SWE_Camels itself is not modified.
new_ds = SWE_Camels.rename({'station_id': 'Station_ID'})
new_ds = new_ds.assign_coords(Station_ID=("Station_ID", new_ds['Station_ID'].values.astype(str)))

# 2. Add a dimensionless scalar variable 'Projection_Type'.
#    0.0 is the example value used by the original author.
#    NOTE(review): confirm the value the reader of this file expects.
new_ds['Projection_Type'] = xr.DataArray(data=0.0)

# 3. 'station_name' is not part of the exported file.
new_ds = new_ds.drop_vars('station_name')

# 4. Rename only the dimension back to 'station_id'. The coordinate variable
#    keeps the name 'Station_ID', so it is no longer the dimension coordinate.
new_ds = new_ds.rename_dims({'Station_ID': 'station_id'})

print(new_ds)
new_ds.to_netcdf('../CH_data/CH_input_data/SWE_Camels.nc')


<xarray.Dataset>
Dimensions:          (station_id: 331, time: 14610)
Coordinates:
    Station_ID       (station_id) <U4 '2004' '2007' '2009' ... '6010' '6011'
  * time             (time) datetime64[ns] 1981-01-01 1981-01-02 ... 2020-12-31
    lon              (station_id) float64 5.587e+05 5.049e+05 ... 6.934e+05
    lat              (station_id) float64 1.753e+05 1.593e+05 ... 1.138e+05
Dimensions without coordinates: station_id
Data variables:
    swe              (time, station_id) float64 nan nan nan ... 114.0 80.0 102.8
    Projection_Type  float64 0.0


In [8]:
# P_Camels: precipitation at the catchment centroids, written in a
# station / nday / lle layout.
from pyproj import Transformer

# Keep only 'precipitation'. The gauge lon/lat inherited from Camels_dataset
# are dropped because the centroid coordinates are attached below instead.
P_Camels = Camels_dataset[['precipitation']].drop_vars(['lat', 'lon'])

# Centroid longitude / latitude per gauge.
coordinates_df = pd.read_csv('../CH_data/centroids_coordinates.csv', dtype={'gauge_id': str})

# Take the station IDs from the dataset itself (as plain strings) so this cell
# does not depend on a list left over from an earlier cell.
station_ids = [str(station_id) for station_id in P_Camels['Station_ID'].values]

# Report centroids listed in the CSV that have no matching station.
centroid_ids = coordinates_df['gauge_id'].astype(str)
for gauge_id in centroid_ids[~centroid_ids.isin(station_ids)]:
    print(f"Gauge ID {gauge_id} not found in station_ids. (p)")

# Per-station lookup. Stations without a centroid row keep NaN; for a
# repeated gauge_id the last row wins.
centroid_table = (
    coordinates_df.assign(gauge_id=centroid_ids)
    .drop_duplicates(subset='gauge_id', keep='last')
    .set_index('gauge_id')
)
lon_wgs84 = centroid_table['lon'].reindex(station_ids).to_numpy(dtype=float)
lat_wgs84 = centroid_table['lat'].reindex(station_ids).to_numpy(dtype=float)

# Project WGS84 (EPSG:4326) to LV95 (EPSG:2056). always_xy=True fixes the axis
# order to (lon, lat) in and (easting, northing) out.
transformer = Transformer.from_crs("EPSG:4326", "EPSG:2056", always_xy=True)
lon_lv95, lat_lv95 = transformer.transform(lon_wgs84, lat_wgs84)

# Attach the projected coordinates on the dataset's existing 'Station_ID'
# dimension. The earlier code used the name 'station_id' here, which silently
# created a second, unrelated dimension of the same length.
P_Camels = P_Camels.assign_coords(
    lon=('Station_ID', lon_lv95),
    lat=('Station_ID', lat_lv95),
)

# 1. Labels of the 'lle' axis. 'lon' / 'lat' hold the projected LV95
#    easting / northing computed above.
lle_coords = ["lon", "lat", "elev"]

# 2. Elevation is filled with zeros.
elev_values = np.zeros(P_Camels.sizes['Station_ID'])

# 3. Stack to shape (3, station); transposed below to (station, lle).
lle_data = np.stack([P_Camels['lon'].values, P_Camels['lat'].values, elev_values], axis=0)

# 4. Build the output dataset with dimensions station / nday / lle.
#    transpose() pins the (time, Station_ID) axis order that the
#    ("nday", "station") labels rely on.
precipitation_values = P_Camels['precipitation'].transpose('time', 'Station_ID').values
transformed_ds = xr.Dataset(
    {
        "LLE": (["station", "lle"], lle_data.T),
        "precipitation": (["nday", "station"], precipitation_values),
    },
    coords={
        "station": P_Camels["Station_ID"].values,
        "nday": P_Camels["time"].values,
        "lle": lle_coords,
    },
)

P_Camels = transformed_ds
print(P_Camels)
P_Camels.to_netcdf('../CH_data/CH_input_data/P_Camels.nc')

<xarray.Dataset>
Dimensions:        (lle: 3, nday: 14610, station: 331)
Coordinates:
  * station        (station) <U4 '2004' '2007' '2009' ... '6009' '6010' '6011'
  * nday           (nday) datetime64[ns] 1981-01-01 1981-01-02 ... 2020-12-31
  * lle            (lle) <U4 'lon' 'lat' 'elev'
Data variables:
    LLE            (station, lle) float64 2.559e+06 1.175e+06 ... 1.114e+06 0.0
    precipitation  (nday, station) float64 0.89 3.17 1.83 ... 0.11 0.15 0.46
