## Run Regime Classification
Run regime classification for all CAMELS basins

In [22]:
import numpy as np
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor
import pandas as pd
import warnings
import io
import re  # For regular expression operations

# Suppress warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load the Basin IDs
#ids = ["2327", "2498"]

file_path = '../CH_data/gauge_coordinates.csv'  # Replace with the actual file path
data = pd.read_csv(file_path)
gauge_id = data['gauge_id']
ids = gauge_id.tolist()


def clean_data(raw_data):
    """
    Clean the extracted data list by removing unwanted elements and parentheses.
    """
    cleaned_data = []
    
    for item in raw_data:
        if item == '0' or item == 'POINT':
            continue  # Skip '0' and 'POINT'
        if '(' in item:  # For latitude
            item = item.replace('(', '')  # Remove '('
        if ')' in item:  # For longitude
            item = item.replace(')', '')  # Remove ')'
        
        cleaned_data.append(item)  # Add the cleaned item to the list
    
    return cleaned_data

def extract_data_array(raw_output):
    """
    Extract data from the raw output string into an array without headers.
    """
    # Split the raw output into lines
    lines = raw_output.strip().splitlines()
    
    # Use a regex to capture only the relevant lines for DataFrame
    data_lines = []
    
    for line in lines:
        # Ignore lines that don't start with a valid pattern
        if re.match(r'^\d+', line) or 'CAMELS' in line:  # Keeping lines starting with digits or containing 'CAMELS'
            # Remove unwanted characters and split into elements
            cleaned_line = re.sub(r'\\', '', line)  # Remove backslashes
            elements = cleaned_line.split()  # Split by whitespace
            data_lines.append(elements)

    # Flatten the list of lists into a single list, removing headers
    return [item for sublist in data_lines for item in sublist]

def run_regime_classification(ids):
    # Load the rc_simulation Notebook
    notebook_file = '../notebooks/1_RegimeClassification.ipynb'
    
    # Initialize lists to store all results
    regime_annualmax_data = []
    regime_POT_data = []
    regime_COM_data = []

    for basin_id in ids:
        print(f"Processing Basin ID: {basin_id}")  # Log the current Basin ID
        with open(notebook_file) as f:
            nb = nbformat.read(f, as_version=4)

        # Set basin_id in the notebook as test_basin_id
        replacement_done = False  # Flag to track if replacement is done
        
        for cell in nb.cells:
            if cell.cell_type == 'code' and not replacement_done:
                # Search for the line that contains 'test_basin_id ='
                if 'test_basin_id' in cell.source:
                    # Replace the entire line where test_basin_id is defined
                    cell_lines = cell.source.split('\n')
                    for i, line in enumerate(cell_lines):
                        if 'test_basin_id' in line:
                            # Replace the line with the new basin_id assignment
                            cell_lines[i] = f"test_basin_id = '{basin_id}'  # Set basin_id for testing"
                            replacement_done = True
                            break  # Exit the loop after the first replacement
                    # Re-join the cell source code
                    cell.source = '\n'.join(cell_lines)

        # Execute the notebook
        ep = ExecutePreprocessor(timeout=600, kernel_name='python3')
        try:
            ep.preprocess(nb, {'metadata': {'path': './'}})

            # Extract the outputs from the last 3 cells
            regime_annualmax_gdf = nb.cells[-3].outputs[0]['data']['text/plain']
            regime_POT_gdf = nb.cells[-2].outputs[0]['data']['text/plain']
            regime_COM_gdf = nb.cells[-1].outputs[0]['data']['text/plain']

            # Debug: Print raw outputs
            print("\nRaw outputs:")
            print("Annual Max:", regime_annualmax_gdf)
            print("POT:", regime_POT_gdf)
            print("COM:", regime_COM_gdf)

            # Extract data as arrays without headers
            annualmax_data = extract_data_array(regime_annualmax_gdf)
            POT_data = extract_data_array(regime_POT_gdf)
            COM_data = extract_data_array(regime_COM_gdf)

            # Clean the extracted data
            cleaned_annualmax_data = clean_data(annualmax_data)
            cleaned_POT_data = clean_data(POT_data)
            cleaned_COM_data = clean_data(COM_data)

            # Debug: Print cleaned arrays
            print("\nCleaned Data Arrays:")
            print("Annual Max Data:\n", cleaned_annualmax_data)
            print("POT Data:\n", cleaned_POT_data)
            print("COM Data:\n", cleaned_COM_data)

            # Append the current basin's cleaned data to the final results
            regime_annualmax_data.append(cleaned_annualmax_data)  # Add Basin ID
            regime_POT_data.append(cleaned_POT_data)  # Add Basin ID
            regime_COM_data.append(cleaned_COM_data)  # Add Basin ID

            print(f"Results for Basin ID {basin_id} were successfully processed.")

        except Exception as e:
            print(f"Error while executing the notebook for Basin ID {basin_id}: {e}")

    # Save the results as CSV files with headers
    header = 'source,Station_ID,lon,lat,circular_stats_theta_rad,circular_stats_regularity,mean_peak_doy'
    #np.savetxt('../CH_data/CH_output_data/regime_annualmax_data.csv', regime_annualmax_data, delimiter=',', fmt='%s')
    np.savetxt('../CH_data/CH_output_data/regime_annualmax_data.csv', regime_annualmax_data, delimiter=',', fmt='%s', header=header, comments='')
    np.savetxt('../CH_data/CH_output_data/regime_POT_data.csv', regime_POT_data, delimiter=',', fmt='%s', header=header, comments='')
    np.savetxt('../CH_data/CH_output_data/regime_COM_data.csv', regime_COM_data, delimiter=',', fmt='%s', header=header, comments='')

    print("All results have been successfully exported as CSV files.")

# Start the function
run_regime_classification(ids)


Processing Basin ID: 2004
Error while executing the notebook for Basin ID 2004: An error occurred while executing the following cell:
------------------
# Regime classification based on streamflow annual maxima (flag=1)
regime_annualmax_gdf, theta_rad_events_annualmax = regime_classification(Qobs_testbasin_ds, start_water_year=(month_start_water_year_default, day_start_water_year_default), max_gap_days=max_gap_days_default, flag=1)

display(regime_annualmax_gdf)
print(theta_rad_events_annualmax)
------------------

[1;31m---------------------------------------------------------------------------[0m
[1;31mUnboundLocalError[0m                         Traceback (most recent call last)
[1;32m<ipython-input-12-f832bed16c58>[0m in [0;36m<module>[1;34m[0m
[0;32m      1[0m [1;31m# Regime classification based on streamflow annual maxima (flag=1)[0m[1;33m[0m[1;33m[0m[1;33m[0m[0m
[1;32m----> 2[1;33m [0mregime_annualmax_gdf[0m[1;33m,[0m [0mtheta_rad_events_annualmax[0m 

In [23]:
import pandas as pd

# Lade die CSV-Dateien
file_path1 = '../CH_data/CH_output_data/regime_annualmax_data.csv'
file_path2 = '../CH_data/CH_output_data/regime_COM_data.csv'
file_path3 = '../CH_data/CH_output_data/regime_POT_data.csv'

regime_annualmax_data = pd.read_csv(file_path1)
regime_COM_data = pd.read_csv(file_path2)
regime_POT_data = pd.read_csv(file_path3)

nival_start_doy_default = 60 # nival regime starting day of year, corresponds to the 1st of March
nival_end_doy_default = 213  # nival regime ending day of year, corresponds to the 1st of August
nival_regularity_threshold_default = 0.65

# Filtere die Daten nach 'mean_peak_doy' zwischen 150 und 200
filtered_annualmax = regime_annualmax_data[(regime_annualmax_data['mean_peak_doy'] >= nival_start_doy_default) & (regime_annualmax_data['mean_peak_doy'] <= nival_end_doy_default) & (regime_annualmax_data['circular_stats_regularity']>= nival_regularity_threshold_default)]
filtered_COM = regime_COM_data[(regime_COM_data['mean_peak_doy'] >= nival_start_doy_default) & (regime_COM_data['mean_peak_doy'] <= nival_end_doy_default) & (regime_annualmax_data['circular_stats_regularity']>= nival_regularity_threshold_default)]
filtered_POT = regime_POT_data[(regime_POT_data['mean_peak_doy'] >= nival_start_doy_default) & (regime_POT_data['mean_peak_doy'] <= nival_end_doy_default) & (regime_annualmax_data['circular_stats_regularity']>= nival_regularity_threshold_default)]

# Finde die Schnittmenge der Station_IDs in allen drei gefilterten DataFrames
nival_basins_id = set(filtered_annualmax['Station_ID']).intersection(set(filtered_COM['Station_ID'])).intersection(set(filtered_POT['Station_ID']))

# Gebe die Station_IDs aus, die in allen drei Metriken die Bedingung erfüllen
print("Nival basins:")
print(nival_basins_id)

# Umwandeln in ein DataFrame
df = pd.DataFrame(nival_basins_id, columns=['Station_ID'])

# Speichern als CSV
df.to_csv('../CH_data/CH_output_data/nival_basins_id.csv', index=False)

Nival basins:
{2304, 2053, 2056, 2185, 2319, 3031, 2067, 2327, 2200, 2457, 2462, 5025, 5027, 2469, 5031, 2473, 2346, 2219, 4010, 2351, 2607, 2481, 2355, 2104, 2105, 2617, 2491, 2620, 2109, 2110, 2239, 3001, 3003, 3005, 3006, 2244, 2117, 2372, 3008, 3009, 3010, 3018, 3019, 3021, 2256, 3025, 3027, 3030, 2262, 2135, 2263, 3033, 2009, 2265, 2011, 2268, 2141, 2269, 3032, 2018, 2019, 2403, 2150, 2024, 2152, 2030, 2288, 2033, 2418, 2419, 2299}
