In [9]:
import os
import shutil
import random
import pandas as pd
import shutil

In [10]:
seed_value = 123 
random.seed(seed_value)

In [11]:
def normalize_dis(column, area):
    time = 24 * 3600
    area_m = area*1000000
    new_column = column * (time / area_m)
    return new_column


def normalize_columns_with_dis(df, area):
    for column in df.columns:
        if 'dis' in column:
            df[column] = normalize_dis(df[column], area)
    return df

In [12]:
def subsample_table(stations, filenames, area):
    grdc_nos = [str(grdc_no) for grdc_no in stations['grdc_no']]
    sub_filenames = [filename for filename in filenames if any(grdc_no in filename for grdc_no in grdc_nos)]
    sub_datas = []
    
    
    for filename in sub_filenames:
        sub_data = pd.read_csv(filename)
        
        # Convert 'datetime' column to datetime type
        sub_data['datetime'] = pd.to_datetime(sub_data['datetime'])
        
        # Subset the data based on the datetime range
        start_date = pd.to_datetime('1979-01-01')
        end_date = pd.to_datetime('2012-12-31')
        sub_data = sub_data[(sub_data['datetime'] >= start_date) & (sub_data['datetime'] <= end_date)]
        
        
        sub_datas.append(sub_data)
    
    sub_table = pd.concat(sub_datas, ignore_index=True)
    
    return sub_table

In [13]:
# File paths
file_path_preds = '../R/data/allpredictors_pcr/'
file_list_preds = os.listdir(file_path_preds)
file_paths = [os.path.join(file_path_preds, file) for file in file_list_preds]

station_info = pd.read_csv("../R/data/stations_rhine_elbe.csv")

rhine_stations = station_info.loc[(station_info["sub_reg"] == 6351) | (station_info["sub_reg"] == 6361)]
elbe_stations = station_info.loc[(station_info["sub_reg"] == 6401)]
maas_stations = station_info.loc[(station_info["sub_reg"] == 6211)]

In [14]:
elbe_stations = elbe_stations[~elbe_stations["grdc_no"].isin([6340300, 6340301])]

# Elbe & Maas subsampling

In [15]:
list_ids_rhine = rhine_stations["grdc_no"].to_list()
list_ids_all = station_info['grdc_no'].to_list()

station_info_rhine = station_info[station_info["grdc_no"].isin([x for x in list_ids_rhine])]
setup = "rhine_pcr"

output_base_dir = f'../R/data/{setup}/'
os.makedirs(output_base_dir, exist_ok=True)

sample_number = 21

for subsample in range(1, 6):
    output_dir = os.path.join(output_base_dir, f'subsample_{subsample}')
    os.makedirs(output_dir, exist_ok=True)
    
    print(f'Sampling Subsample {subsample}...')
    
    ## Subset train stations randomly:
    train_station_ids = random.sample(list_ids_rhine, sample_number)
    train_stations = station_info_rhine[station_info_rhine['grdc_no'].isin([x for x in train_station_ids])]
    
    # Subset test stations
    test_stations =  station_info_rhine[~station_info_rhine["grdc_no"].isin([x for x in train_station_ids])]
    
    
    # Create train table
    train_table = subsample_table(train_stations, file_paths, station_info)
    train_table['datetime'] = pd.to_datetime(train_table['datetime']).dt.date
    
    # Create train test
    test_table = subsample_table(test_stations, file_paths,station_info)
    test_table['datetime'] = pd.to_datetime(test_table['datetime']).dt.date
    

    nrow_train = train_table.shape[0]
    nrow_test = test_table.shape[0]
    
    ratio_subsamples = nrow_train / (nrow_train + nrow_test)
    
    print(ratio_subsamples)
    
    
    # Write tables: train_stations, test_stations, train_table
    train_stations.to_csv(os.path.join(output_dir, 'train_stations.csv'), index=False)
    test_stations.to_csv(os.path.join(output_dir, 'test_stations.csv'), index=False)
    train_table.to_csv(os.path.join(output_dir, 'train_table_allpredictors.csv'), index=False)
    test_table.to_csv(os.path.join(output_dir, 'test_table_allpredictors.csv'), index=False)



    print(f'Finished Subsample {subsample}...')

Sampling Subsample 1...
0.7
Finished Subsample 1...
Sampling Subsample 2...
0.7
Finished Subsample 2...
Sampling Subsample 3...
0.7
Finished Subsample 3...
Sampling Subsample 4...
0.7
Finished Subsample 4...
Sampling Subsample 5...
0.7
Finished Subsample 5...


#station_info = pd.read_csv("../data/stations_rhine_elbe.csv")
# Random sampling all_stations (70/30)

In [7]:
# Iterate over sub-samples
# Path configurations
setup = "all_stations"

output_base_dir = f'../R/data/{setup}/'
os.makedirs(output_base_dir, exist_ok=True)

for subsample in range(1, 6):
    output_dir = os.path.join(output_base_dir, f'subsample_{subsample}')
    os.makedirs(output_dir, exist_ok=True)
    
    print(f'Sampling Subsample {subsample}...')
    
    ## Subset train stations randomly:
    train_stations = random.sample(list(station_info['grdc_no']), 35)
    train_stations = station_info[station_info['grdc_no'].isin(train_stations)]
    
    # Subset test stations
    test_stations = station_info[~station_info['grdc_no'].isin(train_stations['grdc_no'])]
    
    
    # Create train table
    train_table = subsample_table(train_stations, file_paths, station_info)
    train_table['datetime'] = pd.to_datetime(train_table['datetime']).dt.date
    
    # Create train test
    test_table = subsample_table(test_stations, file_paths,station_info)
    test_table['datetime'] = pd.to_datetime(test_table['datetime']).dt.date
    

    nrow_train = train_table.shape[0]
    nrow_test = test_table.shape[0]
    
    ratio_subsamples = nrow_train / (nrow_train + nrow_test)
    
    print(ratio_subsamples)
    
    # Sample file paths for test stations
    test_file_paths = random.sample(file_paths, k=len(test_stations))
    
    # Filter file paths for train stations
    train_file_paths = [file_path for file_path in file_paths if file_path not in test_file_paths]
    
    # Write tables: train_stations, test_stations, train_table
    train_stations.to_csv(os.path.join(output_dir, 'train_stations.csv'), index=False)
    test_stations.to_csv(os.path.join(output_dir, 'test_stations.csv'), index=False)
    train_table.to_csv(os.path.join(output_dir, 'train_table_allpredictors.csv'), index=False)
    test_table.to_csv(os.path.join(output_dir, 'test_table_allpredictors.csv'), index=False)

    # Save test file paths
    with open(os.path.join(output_dir, 'test_file_paths.txt'), 'w') as f:
        for file_path in test_file_paths:
            f.write(file_path + '\n')
    
     
    # Save train file paths
    with open(os.path.join(output_dir, 'train_file_paths.txt'), 'w') as f:
        for file_path in train_file_paths:
            f.write(file_path + '\n')

    print(f'Finished Subsample {subsample}...')

Sampling Subsample 1...
0.6754865051169203
Finished Subsample 1...
Sampling Subsample 2...
0.6934089824486982
Finished Subsample 2...
Sampling Subsample 3...
0.7060289516941514
Finished Subsample 3...
Sampling Subsample 4...
0.6969616628665358
Finished Subsample 4...
Sampling Subsample 5...
0.6981812397263906
Finished Subsample 5...


In [18]:
################
setup = "maas_pcr"  # 2, 3 and 4 => test stations for rhine, elbe, mass respectively

output_dir = f'../R/data/{setup}/'
os.makedirs(output_dir, exist_ok=True)


print(f'Setup {setup}...')

# Select test stations
if setup == "elbe_pcr":
    test_stations = elbe_stations
elif setup == "maas_pcr":
    test_stations = maas_stations


################

Setup maas_pcr...


In [19]:
#station_info[~station_info['grdc_no'].isin(test_stations['grdc_no'])]

# Create train stations
train_stations = rhine_stations

# Create train table
train_table = subsample_table(train_stations, file_paths, station_info)
train_table['datetime'] = pd.to_datetime(train_table['datetime']).dt.date

 # Create train test
test_table = subsample_table(test_stations, file_paths,station_info)
test_table['datetime'] = pd.to_datetime(test_table['datetime']).dt.date


nrow_train = train_table.shape[0]
nrow_test = test_table.shape[0]

ratio_subsamples = nrow_train / (nrow_train + nrow_test)

print(ratio_subsamples)

# Sample file paths for test stations
test_file_paths = random.sample(file_paths, k=len(test_stations))

# Filter file paths for train stations
train_file_paths = [file_path for file_path in file_paths if file_path not in test_file_paths]

# Write tables: train_stations, test_stations, train_table
train_stations.to_csv(os.path.join(output_dir, 'train_stations.csv'), index=False)
test_stations.to_csv(os.path.join(output_dir, 'test_stations.csv'), index=False)
train_table.to_csv(os.path.join(output_dir, 'train_table_allpredictors.csv'), index=False)
test_table.to_csv(os.path.join(output_dir, 'test_table_allpredictors.csv'), index=False)

0.8108108108108109
