In [51]:
import pandas as pd
import xarray as xr
import os
import geopandas as gpd

# Define paths to the CSV folder and shapefile
folder_path = "N:/gebhyd/3_Hyv/Diplomanden/2_Running/L_Nuesch/data_sweden/CAMELS_SW/2023-173-1/data/catchment time series/catchment_time_series"
shapefile_path = "N:/gebhyd/3_Hyv/Diplomanden/2_Running/L_Nuesch/data_sweden/CAMELS_SW/2023-173-1/data/catchment_GIS_shapefiles/catchment_GIS_shapefiles/Sweden_catchments_50_stations_WGS84.shp"

# Load the shapefile; its attribute table provides the station 'id' column
gdf = gpd.read_file(shapefile_path)
gdf = gdf.set_geometry('geometry')

# Extract lon and lat from the geometry
# NOTE(review): the .x/.y accessors are only defined for Point geometries —
# this assumes the shapefile holds one point per station; confirm if the file changes.
gdf['lon'] = gdf.geometry.x
gdf['lat'] = gdf.geometry.y

# Keep only the lookup table: one row per station id with its lon and lat
station_info = gdf[['id', 'lon', 'lat']].set_index('id')

# Station IDs parsed from the CSV file names are strings, so the lookup index
# must be strings too for the later reindex to find matches.
station_info.index = station_info.index.astype(str)
# `dataset` is deliberately not touched here: it is only created further down
# (after the CSV loop), so referencing it at this point raises NameError on a
# fresh kernel.

# Initialize a list to hold each station's data as a DataArray
data_arrays = []

# os.listdir() returns entries in arbitrary order, so sort the CSV names to make
# the station order of the resulting dataset reproducible across machines/runs.
csv_file_names = sorted(
    name for name in os.listdir(folder_path) if name.endswith('.csv')
)

# Build one (time x variable) DataArray per station CSV
for file_name in csv_file_names:
    # Station_ID is the third '_'-separated token of the file name, extension removed
    station_id = file_name.split('_')[2].split('.')[0]  # Adjust the index if necessary

    # Load the CSV file
    file_path = os.path.join(folder_path, file_name)
    df = pd.read_csv(file_path)

    # Combine Year, Month, and Day columns into a datetime column
    df['time'] = pd.to_datetime(df[['Year', 'Month', 'Day']])

    # Set the 'time' as the index and drop Year, Month, and Day columns
    df = df.set_index('time').drop(columns=['Year', 'Month', 'Day'])

    # Wrap the remaining columns as a 2-D array; Station_ID is attached as a
    # scalar coordinate so the arrays can later be concatenated along it
    data_array = xr.DataArray(
        data=df.values,
        dims=['time', 'variable'],
        coords={
            'time': df.index,  # time coordinate
            'variable': df.columns,  # each column as a variable
            'Station_ID': station_id  # each file provides one station's data
        }
    )

    # Append the DataArray for this station to the list
    data_arrays.append(data_array)

# Stack the per-station arrays along a new 'Station_ID' dimension, then split the
# 'variable' axis so that every CSV column becomes its own data variable
dataset = xr.concat(data_arrays, dim='Station_ID').to_dataset(dim='variable')

# Look up each station's position in dataset order; station_info is indexed by
# the station id as a string (IDs without a match yield NaN from reindex)
station_coords = station_info.reindex(dataset['Station_ID'].values)
lon_values = station_coords['lon'].values
lat_values = station_coords['lat'].values
dataset = dataset.assign_coords(
    lon=('Station_ID', lon_values),
    lat=('Station_ID', lat_values),
)

# Tag every data variable with its own name as the 'long_name' attribute
for var in dataset.data_vars:
    dataset[var].attrs['long_name'] = var

# Show the assembled dataset
print(dataset)
# Optional: persist the full dataset to a NetCDF file
#dataset.to_netcdf('output_dataset.nc')


<xarray.Dataset>
Dimensions:     (Station_ID: 50, time: 21915)
Coordinates:
  * time        (time) datetime64[ns] 1961-01-01 1961-01-02 ... 2020-12-31
  * Station_ID  (Station_ID) object '1069' '1083' '1123' ... '751' '855' '97'
    lon         (Station_ID) float64 14.13 12.13 21.81 ... 11.54 16.16 15.71
    lat         (Station_ID) float64 56.66 62.64 66.17 ... 58.88 57.01 62.82
Data variables:
    Qobs_m3s    (Station_ID, time) float64 27.0 27.0 28.0 ... 17.9 18.0 18.2
    Qobs_mm     (Station_ID, time) float64 2.274 2.274 2.358 ... 0.7188 0.7268
    Pobs_mm     (Station_ID, time) float64 2.081 0.3805 7.043 ... 5.704 9.268
    Tobs_C      (Station_ID, time) float64 -0.5748 0.7144 ... -2.064 -3.794


In [54]:
# Streamflow: keep only the 'Qobs_m3s' series from the combined dataset and
# expose it under the name "Flow"
Q_Camels_SW = (
    dataset.copy()
    .drop_vars(['Pobs_mm', 'Qobs_mm', 'Tobs_C'])
    .rename({"Qobs_m3s": "Flow"})
)

print(Q_Camels_SW)

# Write the streamflow-only dataset to disk
Q_Camels_SW.to_netcdf('../CAMELS_SW/input_data/Q_Camels_SW.nc')

<xarray.Dataset>
Dimensions:     (Station_ID: 50, time: 21915)
Coordinates:
  * time        (time) datetime64[ns] 1961-01-01 1961-01-02 ... 2020-12-31
  * Station_ID  (Station_ID) object '1069' '1083' '1123' ... '751' '855' '97'
    lon         (Station_ID) float64 14.13 12.13 21.81 ... 11.54 16.16 15.71
    lat         (Station_ID) float64 56.66 62.64 66.17 ... 58.88 57.01 62.82
Data variables:
    Flow        (Station_ID, time) float64 27.0 27.0 28.0 ... 17.9 18.0 18.2


In [58]:
#precipitation
import pandas as pd
import xarray as xr

# Precipitation: start from the combined dataset and keep only 'Pobs_mm'
P_Camels_SW = dataset.copy().drop_vars(['Qobs_m3s', 'Qobs_mm', 'Tobs_C'])

# Catchment physical properties: 'ID' identifies the station and
# 'Elevation_mabsl' holds its elevation; index the table by station
elevation_data = (
    pd.read_csv('N:/gebhyd/3_Hyv/Diplomanden/2_Running/L_Nuesch/data_sweden/CAMELS_SW/2023-173-1/data/catchment properties/catchment properties/catchments_physical_properties.csv')
    .rename(columns={'ID': 'Station_ID', 'Elevation_mabsl': 'elevation'})
    .set_index('Station_ID')
)

# Compare station IDs as strings on both sides so the lookup below can match
elevation_data.index = elevation_data.index.astype(str)
P_Camels_SW['Station_ID'] = P_Camels_SW['Station_ID'].astype(str)

# Fetch elevations in dataset station order (IDs missing from the CSV yield NaN)
# and attach them as a per-station coordinate
elevation_values = elevation_data.reindex(P_Camels_SW['Station_ID'].values)['elevation'].values
P_Camels_SW = P_Camels_SW.assign_coords(elevation=('Station_ID', elevation_values))

# Collect the three per-station attributes that make up 'LLE'
lon_values = P_Camels_SW['lon'].values
lat_values = P_Camels_SW['lat'].values
elevation_values = P_Camels_SW['elevation'].values

# Pack lon, lat and elevation into one array with a labelled 'lle' axis
lle_data = xr.DataArray(
    data=[lon_values, lat_values, elevation_values],
    dims=["lle", "Station_ID"],
    coords={
        "lle": ["lon", "lat", "elev"],
        "Station_ID": P_Camels_SW["Station_ID"].values,
    },
)

# Store 'LLE' as a data variable laid out as (station, lle); the separate
# lon/lat/elevation coordinates are then redundant and removed
P_Camels_SW['LLE'] = lle_data.transpose('Station_ID', 'lle')
P_Camels_SW = P_Camels_SW.drop_vars(['lon', 'lat', 'elevation'])

# Rename to the target structure: time -> nday, Station_ID -> station,
# Pobs_mm -> precipitation
P_Camels_SW = P_Camels_SW.rename(
    {"time": "nday", "Pobs_mm": "precipitation", "Station_ID": "station"}
)

# Show the final structure of the transformed dataset
print(P_Camels_SW)

# Write the precipitation dataset to disk
P_Camels_SW.to_netcdf('../CAMELS_SW/input_data/P_Camels_SW.nc')

<xarray.Dataset>
Dimensions:        (lle: 3, nday: 21915, station: 50)
Coordinates:
  * nday           (nday) datetime64[ns] 1961-01-01 1961-01-02 ... 2020-12-31
  * station        (station) <U5 '1069' '1083' '1123' ... '751' '855' '97'
  * lle            (lle) <U4 'lon' 'lat' 'elev'
Data variables:
    precipitation  (station, nday) float64 2.081 0.3805 7.043 ... 5.704 9.268
    LLE            (station, lle) float64 14.13 56.66 165.7 ... 62.82 365.6
