In [10]:
import os
import shutil
import random
import pandas as pd
import shutil
import math
from collections import defaultdict

In [2]:
seed_value = 123 
random.seed(seed_value)

In [3]:
def normalize_dis(column, area):
    time = 24 * 3600
    area_m = area*1000000
    new_column = column * (time / area_m)
    return new_column


def normalize_columns_with_dis(df, area):
    for column in df.columns:
        if 'dis' in column:
            df[column] = normalize_dis(df[column], area)
    return df

In [4]:
def subsample_table(stations, filenames, area):
    grdc_nos = [str(grdc_no) for grdc_no in stations['grdc_no']]
    sub_filenames = [filename for filename in filenames if any(grdc_no in filename for grdc_no in grdc_nos)]
    sub_datas = []
    for filename in sub_filenames:
        sub_data = pd.read_csv(filename)
        sub_datas.append(sub_data)
    
    sub_table = pd.concat(sub_datas, ignore_index=True)
    
    return sub_table

In [5]:
# File paths
file_path_preds = '../R/data/allpredictors/'
file_list_preds = os.listdir(file_path_preds)
file_paths = [os.path.join(file_path_preds, file) for file in file_list_preds]

station_info = pd.read_csv("../R/data/stations_rhine_elbe.csv")

rhine_stations = station_info.loc[(station_info["sub_reg"] == 6351) | (station_info["sub_reg"] == 6361)]
elbe_stations = station_info.loc[(station_info["sub_reg"] == 6401)]
maas_stations = station_info.loc[(station_info["sub_reg"] == 6211)]

## Sampling cross validation Rhine datasetrhine_stations_ids

In [7]:
# Create an empty dictionary to store the samples
train_samples = {}
validation_samples = {}

# Get the station ids
list_station_ids = rhine_stations["grdc_no"].to_list()
list_validation = list_station_ids.copy()

# Define the number of stations per sample
stations_per_sample = 6

# Loop five times to create five samples
for sample_num in range(1, 6):
    # Randomly select 6 unique stations for the current sample
    validation_stations = random.sample(list_validation, stations_per_sample)
        
    # Add the current sample to the test_samples dictionary
    validation_samples[sample_num] = validation_stations
    
    # Remove the test stations from the list of station ids
    list_validation = list(set(list_validation) - set(validation_stations))  # Remove the selected values

    # Remove the selected stations from the list of station ids
    train_stations_id = [station_id for station_id in list_station_ids if station_id not in validation_stations]

    # Add the current sample to the train_samples dictionary
    train_samples[sample_num] = train_stations_id


for sample_num in range(1, 6):
    train_stations = train_samples[sample_num]
    validation_stations = validation_samples[sample_num]
    
    # Check if there is any overlap between train and test samples
    if any(station_id in validation_stations for station_id in train_stations):
        print(f"Overlap found in Sample {sample_num}")
        print(f"Train Sample {sample_num}: {train_stations}")
        print(f"Test Sample {sample_num}: {validation_stations}")
    else:
        print("No overlap")

No overlap
No overlap
No overlap
No overlap
No overlap


## Subsample Elbe and Maas as testing

In [8]:
list_test = elbe_stations["grdc_no"].to_list()

test_samples_elbe = {}

# Generate samples with 2 stations each
for i in range(5):
    sample = random.sample(list_test, 2)
    test_samples_elbe[i + 1] = sample
    list_test = list(set(list_test) - set(sample))  # Remove the selected values

# Generate the last sample with 3 stations
sample = random.sample(list_test, 3)
test_samples_elbe[5] = sample

# Print the generated samples dictionary
for sample_num in range(1, 6):
    print(test_samples_elbe[sample_num])

[6340300, 6340160]
[6340170, 6340130]
[6340301, 6340110]
[6140400, 6340140]
[6340190, 6340180, 6140401]


In [9]:
list_test_maas = maas_stations["grdc_no"].to_list()
list_test = list_test_maas.copy()

test_samples_maas = {}

# Generate samples with 1 station each for the first three subsamples
for i in range(1, 4):
    sample = random.sample(list_test, 1)
    test_samples_maas[i] = sample
    list_test = list(set(list_test) - set(sample))  # Remove the selected value

# Generate the last two samples with 2 stations each
sample_4 = random.sample(list_test, 2)
test_samples_maas[4] = sample_4
list_test = list(set(list_test) - set(sample_4))

sample_5 = random.sample(list_test, 2)
test_samples_maas[5] = sample_5

# Print the test samples
for sample_num, selected_stations in test_samples_maas.items():
    print(f"Test Sample {sample_num}: {selected_stations}")


Test Sample 1: [6421100]
Test Sample 2: [6221100]
Test Sample 3: [6421101]
Test Sample 4: [6221101, 6221102]
Test Sample 5: [6421102, 6421500]


# Elbe & Maas subsampling

In [12]:
# Iterate over sub-samples
# Path configurations
data_setup = {1: "rhine_only_lag6",
              2: "rhine_elbe_lag6",
              3: "rhine_maas_lag6"
                }

#Select the sampling setup
setup_number = 3
setup = data_setup[setup_number]

output_base_dir = f'../R/data/{setup}/'
os.makedirs(output_base_dir, exist_ok=True)

for subsample in range(1, 6):
    output_dir = os.path.join(output_base_dir, f'subsample_{subsample}')
    os.makedirs(output_dir, exist_ok=True)
    
    print(f'Sampling Subsample {subsample}...')
    
    ## Subset train stations randomly:
    train_stations = train_samples[subsample]
    train_stations = station_info[station_info['grdc_no'].isin(train_stations)]
   
    #train_stations = station_info[station_info['grdc_no'].isin(rhine_stations["grdc_no"].to_list())]

    if setup_number == 1:
        #Subset test stations for rhine only setup
        test_stations = validation_samples[subsample]
        test_stations = station_info[station_info['grdc_no'].isin(test_stations)]
    elif setup_number == 2:
        
        # Subset test stations elbe
        test_stations = test_samples_elbe[subsample]
        test_stations = station_info[station_info['grdc_no'].isin(test_stations)]
        
    elif setup_number == 3:
        
        # Subset test stations 
        test_stations = test_samples_maas[subsample]
        test_stations = station_info[station_info['grdc_no'].isin(test_stations)]

        
     # Create train table
    train_table = subsample_table(train_stations, file_paths, station_info)
    train_table['datetime'] = pd.to_datetime(train_table['datetime']).dt.date
    
    # Create train table
    test_table = subsample_table(test_stations, file_paths,station_info)
    test_table['datetime'] = pd.to_datetime(test_table['datetime']).dt.date
    


    nrow_train = train_table.shape[0]
    nrow_test = test_table.shape[0]
    
    ratio_subsamples = nrow_train / (nrow_train + nrow_test)
    
    print(ratio_subsamples)
    
    # Sample file paths for test stations
    test_file_paths = random.sample(file_paths, k=len(test_stations))
    
    # Filter file paths for train stations
    train_file_paths = [file_path for file_path in file_paths if file_path not in test_file_paths]
    
    # Write tables: train_stations, test_stations, validation_stations, train_table, test_table, validation_table
    train_stations.to_csv(os.path.join(output_dir, 'train_stations.csv'), index=False)
    test_stations.to_csv(os.path.join(output_dir, 'test_stations.csv'), index=False)

    train_table.to_csv(os.path.join(output_dir, 'train_table_allpredictors.csv'), index=False)
    test_table.to_csv(os.path.join(output_dir, 'test_table_allpredictors.csv'), index=False)
    
    # Save test file paths
    with open(os.path.join(output_dir, 'test_file_paths.txt'), 'w') as f:
        for file_path in test_file_paths:
            f.write(file_path + '\n')
    
     
    # Save train file paths
    with open(os.path.join(output_dir, 'train_file_paths.txt'), 'w') as f:
        for file_path in train_file_paths:
            f.write(file_path + '\n')

    print(f'Finished Subsample {subsample}...')
    

Sampling Subsample 1...
0.9579213128550389
Finished Subsample 1...
Sampling Subsample 2...
0.9787188306104901
Finished Subsample 2...
Sampling Subsample 3...
0.9779830308237568
Finished Subsample 3...
Sampling Subsample 4...
0.9521374514215586
Finished Subsample 4...
Sampling Subsample 5...
0.9455346415698839
Finished Subsample 5...


#station_info = pd.read_csv("../data/stations_rhine_elbe.csv")
# Random sampling all_stations (70/30)

In [1]:
# Iterate over sub-samples
# Path configurations
setup = "all_stations"

output_base_dir = f'../R/data/{setup}/'
os.makedirs(output_base_dir, exist_ok=True)

for subsample in range(1, 6):
    output_dir = os.path.join(output_base_dir, f'subsample_{subsample}')
    os.makedirs(output_dir, exist_ok=True)
    
    print(f'Sampling Subsample {subsample}...')
    
    ## Subset train stations randomly:
    train_stations = random.sample(list(station_info['grdc_no']), 35)
    train_stations = station_info[station_info['grdc_no'].isin(train_stations)]
    
    # Subset test stations
    test_stations = station_info[~station_info['grdc_no'].isin(train_stations['grdc_no'])]
    
    
    # Create train table
    train_table = subsample_table(train_stations, file_paths, station_info)
    train_table['datetime'] = pd.to_datetime(train_table['datetime']).dt.date
    
    # Create train test
    test_table = subsample_table(test_stations, file_paths,station_info)
    test_table['datetime'] = pd.to_datetime(test_table['datetime']).dt.date
    

    nrow_train = train_table.shape[0]
    nrow_test = test_table.shape[0]
    
    ratio_subsamples = nrow_train / (nrow_train + nrow_test)
    
    print(ratio_subsamples)
    
    # Sample file paths for test stations
    test_file_paths = random.sample(file_paths, k=len(test_stations))
    
    # Filter file paths for train stations
    train_file_paths = [file_path for file_path in file_paths if file_path not in test_file_paths]
    
    # Write tables: train_stations, test_stations, train_table
    train_stations.to_csv(os.path.join(output_dir, 'train_stations.csv'), index=False)
    test_stations.to_csv(os.path.join(output_dir, 'test_stations.csv'), index=False)
    train_table.to_csv(os.path.join(output_dir, 'train_table_allpredictors.csv'), index=False)
    test_table.to_csv(os.path.join(output_dir, 'test_table_allpredictors.csv'), index=False)

    # Save test file paths
    with open(os.path.join(output_dir, 'test_file_paths.txt'), 'w') as f:
        for file_path in test_file_paths:
            f.write(file_path + '\n')
    
     
    # Save train file paths
    with open(os.path.join(output_dir, 'train_file_paths.txt'), 'w') as f:
        for file_path in train_file_paths:
            f.write(file_path + '\n')

    print(f'Finished Subsample {subsample}...')

NameError: name 'os' is not defined

In [52]:
subregion_train_counts

{6351: 21, 6401: 10, 6211: 5}

10
5
21


In [None]:
################
setup = "Maas_lag6"  # 2, 3 and 4 => test stations for rhine, elbe, mass respectively

output_dir = f'../R/data/{setup}/'
os.makedirs(output_dir, exist_ok=True)


print(f'Setup {setup}...')

# Select test stations
if setup == "Elbe_lag6":
    test_stations = elbe_stations
elif setup == "Maas_lag6":
    test_stations = Maas_stations


################

Setup Maas_lag6...


In [None]:
#station_info[~station_info['grdc_no'].isin(test_stations['grdc_no'])]

# Create train stations
train_stations = rhine_stations

# Create train table
train_table = subsample_table(train_stations, file_paths, station_info)
train_table['datetime'] = pd.to_datetime(train_table['datetime']).dt.date

 # Create train test
test_table = subsample_table(test_stations, file_paths,station_info)
test_table['datetime'] = pd.to_datetime(test_table['datetime']).dt.date


nrow_train = train_table.shape[0]
nrow_test = test_table.shape[0]

ratio_subsamples = nrow_train / (nrow_train + nrow_test)

print(ratio_subsamples)

# Sample file paths for test stations
test_file_paths = random.sample(file_paths, k=len(test_stations))

# Filter file paths for train stations
train_file_paths = [file_path for file_path in file_paths if file_path not in test_file_paths]

# Write tables: train_stations, test_stations, train_table
train_stations.to_csv(os.path.join(output_dir, 'train_stations.csv'), index=False)
test_stations.to_csv(os.path.join(output_dir, 'test_stations.csv'), index=False)
train_table.to_csv(os.path.join(output_dir, 'train_table_allpredictors.csv'), index=False)
test_table.to_csv(os.path.join(output_dir, 'test_table_allpredictors.csv'), index=False)

# Save test file paths
with open(os.path.join(output_dir, 'test_file_paths.txt'), 'w') as f:
    for file_path in test_file_paths:
        f.write(file_path + '\n')


# Save train file paths
with open(os.path.join(output_dir, 'train_file_paths.txt'), 'w') as f:
    for file_path in train_file_paths:
        f.write(file_path + '\n')

0.8638715967899198
