In [None]:
import os
import shutil
import random
import pandas as pd
import shutil
import math
from collections import defaultdict

In [None]:
# File paths
# Directory with the predictor CSV files; their names are later matched against
# GRDC station numbers to pick the files of a set of stations.
file_path_preds = '../R/data/allpredictors/'
# os.listdir() returns entries in arbitrary, platform-dependent order. Sort the
# listing so everything derived from file_paths (e.g. the row order of the
# concatenated train/test tables) is identical on every run and machine.
file_list_preds = sorted(os.listdir(file_path_preds))
file_paths = [os.path.join(file_path_preds, file) for file in file_list_preds]


In [1]:
def subsample_table(stations, filenames, area):
    """Load and concatenate the CSV files that belong to ``stations``.

    A file is selected when the string form of at least one value of
    ``stations['grdc_no']`` occurs in the file's base name. Only the base name
    is inspected, so digits in a parent directory name can never select a file.
    The match is still a substring test: a station number that is a substring
    of another station's file name selects that file as well.

    Parameters
    ----------
    stations : pandas.DataFrame
        Table with a ``grdc_no`` column.
    filenames : iterable of str
        Paths of the candidate CSV files.
    area : object
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        The selected files read with ``pd.read_csv`` and stacked row-wise in
        the order of ``filenames``, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no file name matches any station number.
    """
    # A set removes duplicate station numbers; the result of any() is unaffected.
    grdc_nos = {str(grdc_no) for grdc_no in stations['grdc_no']}
    sub_filenames = [
        filename
        for filename in filenames
        if any(grdc_no in os.path.basename(filename) for grdc_no in grdc_nos)
    ]

    # pd.concat([]) fails with an unhelpful "No objects to concatenate";
    # raise the same exception type with a message that names the real cause.
    if not sub_filenames:
        raise ValueError(
            f'No file name matches any of the {len(grdc_nos)} requested grdc_no values'
        )

    sub_table = pd.concat(
        [pd.read_csv(filename) for filename in sub_filenames],
        ignore_index=True,
    )

    return sub_table

## Stratified sampling: all stations

In [None]:
# Desired train-test ratio, applied separately to every sub-region
train_test_ratio = 0.7  # 70% for training, 30% for testing
# Stations labelled 6361 are counted as part of sub-region 6351.
station_info['sub_reg'] = station_info['sub_reg'].replace({6361: 6351})

# Number of stations per sub-region and the number of training stations that
# results from the ratio (rounded up).
subregion_counts = station_info['sub_reg'].value_counts().to_dict()
subregion_train_counts = {sub_reg: math.ceil(count * train_test_ratio) for sub_reg, count in subregion_counts.items()}

# grdc_no -> number of training stations of the station's sub-region
grdc_no_train_counts = {}

# number of training stations -> grdc_no of every station whose sub-region has
# that number of training stations
grouped_grdc = defaultdict(list)

# Iterate over the two columns directly: this keeps the column dtypes
# (iterrows() up-casts every row to one common dtype) and fills both
# dictionaries in a single pass.
for grdc_no, sub_reg in zip(station_info['grdc_no'], station_info['sub_reg']):
    # value_counts() drops missing sub-regions, so such stations have no
    # training count and are left out of both dictionaries.
    if sub_reg not in subregion_train_counts:
        continue
    n_train = subregion_train_counts[sub_reg]
    grdc_no_train_counts[grdc_no] = n_train
    grouped_grdc[n_train].append(grdc_no)

# grouped_grdc is keyed by the training count, not by the sub-region, so two
# sub-regions with the same count would be merged into one group and could no
# longer be sampled separately. Make that situation visible.
if len(set(subregion_train_counts.values())) < len(subregion_train_counts):
    print('WARNING: several sub-regions share the same number of training '
          'stations and were merged into one group in grouped_grdc')

# Show the group keys (number of training stations per group)
for num_train_samples in grouped_grdc:
    print(num_train_samples)
    

In [None]:
# Iterate over sub-samples
# Path configurations
setup = "all_stations"

output_base_dir = f'../R/data/{setup}/'
os.makedirs(output_base_dir, exist_ok=True)

# Number of training stations per group (70% of available stations). Each
# number is also the key under which the group's stations are stored in
# grouped_grdc, so it is used both as lookup key and as sample size.
rhine = 21
elbe = 10
maas = 5

# Fixed seed so the five sub-samples are reproducible when the cell is re-run.
random.seed(42)


def _paths_for_stations(stations, paths):
    """Return the entries of ``paths`` whose base name contains a grdc_no of ``stations``."""
    grdc_nos = [str(grdc_no) for grdc_no in stations['grdc_no']]
    return [
        path for path in paths
        if any(grdc_no in os.path.basename(path) for grdc_no in grdc_nos)
    ]


for subsample in range(1, 6):
    output_dir = os.path.join(output_base_dir, f'subsample_{subsample}')
    os.makedirs(output_dir, exist_ok=True)

    print(f'Sampling Subsample {subsample}...')

    # Draw the training stations randomly, group by group
    train_ids = []
    for n_train in (rhine, elbe, maas):
        train_ids.extend(random.sample(grouped_grdc[n_train], n_train))

    # Split the station table: sampled stations train, all others test
    is_train = station_info['grdc_no'].isin(train_ids)
    train_stations = station_info[is_train]
    test_stations = station_info[~is_train]

    # Create train table
    train_table = subsample_table(train_stations, file_paths, station_info)
    train_table['datetime'] = pd.to_datetime(train_table['datetime']).dt.date

    # Create test table
    test_table = subsample_table(test_stations, file_paths, station_info)
    test_table['datetime'] = pd.to_datetime(test_table['datetime']).dt.date

    # Share of rows (not stations) that ended up in the training table
    nrow_train = train_table.shape[0]
    nrow_test = test_table.shape[0]

    ratio_subsamples = nrow_train / (nrow_train + nrow_test)

    print(ratio_subsamples)

    # File paths belonging to the test and train stations. They are derived
    # from the station tables so the written path lists agree with the split;
    # drawing them at random from file_paths would not.
    test_file_paths = _paths_for_stations(test_stations, file_paths)
    train_file_paths = _paths_for_stations(train_stations, file_paths)

    # Write tables: train_stations, test_stations, train_table, test_table
    train_stations.to_csv(os.path.join(output_dir, 'train_stations.csv'), index=False)
    test_stations.to_csv(os.path.join(output_dir, 'test_stations.csv'), index=False)
    train_table.to_csv(os.path.join(output_dir, 'train_table_allpredictors.csv'), index=False)
    test_table.to_csv(os.path.join(output_dir, 'test_table_allpredictors.csv'), index=False)

    # Save test file paths
    with open(os.path.join(output_dir, 'test_file_paths.txt'), 'w') as f:
        for file_path in test_file_paths:
            f.write(file_path + '\n')

    # Save train file paths
    with open(os.path.join(output_dir, 'train_file_paths.txt'), 'w') as f:
        for file_path in train_file_paths:
            f.write(file_path + '\n')

    print(f'Finished Subsample {subsample}...')