In [1]:
import os
import glob
import pandas as pd
from sklearn.decomposition import PCA

In [2]:
# Folders used by the per-subsample catchment-attribute merge
rhine_only = "../R/data/rhine_only/"      # Rhine-only train/test tables and station lists (read)
pcr_rhine = "../R/data/pcr_rhine/"        # PCR train/test tables holding the catchment attributes (read)
new_folder = "../R/data/mmc_catchAtt/"    # merged tables and station lists are written here

In [8]:
# Columns taken from the PCR tables: the merge key ("datetime") followed by
# the catchment attributes that get appended to the Rhine-only tables.
catchmentAtt = ["datetime", "airEntry1", "airEntry2", "aqThick", "area_pcr", "aridityIdx", "bankArea", "bankDepth", "bankWidth",
                   "demAverage", "forestFraction", "groundwaterDepth", "KSat1", "KSat2", "kSatAquifer", "percolationImp",
                   "poreSize1", "poreSize2", "recessionCoeff", "resWC1", "resWC2", "satWC1", "satWC2", "slopeLength",
                   "specificYield", "storage2", "storDepth1", "tanSlope"]

# For each subsample (1..5): read the Rhine-only train/test tables, append the
# catchment attributes from the matching PCR tables, and write the merged tables
# together with the unchanged station lists to new_folder/subsample_<n>/.
for subsample in range(1, 6):
    sub_folder = 'subsample_' + str(subsample)
    output_dir = os.path.join(new_folder, sub_folder)

    # exist_ok=True keeps the cell re-runnable without a separate existence check
    os.makedirs(output_dir, exist_ok=True)

    # Rhine files
    train_file = os.path.join(rhine_only, sub_folder, 'train_table_allpredictors.csv')
    test_file = os.path.join(rhine_only, sub_folder, 'test_table_allpredictors.csv')
    train_stations = os.path.join(rhine_only, sub_folder, 'train_stations.csv')
    test_stations = os.path.join(rhine_only, sub_folder, 'test_stations.csv')

    df_train_rhine = pd.read_csv(train_file)
    df_test_rhine = pd.read_csv(test_file)

    df_train_stations = pd.read_csv(train_stations)
    df_test_stations = pd.read_csv(test_stations)

    # Reduce the timestamps to plain dates so both sides of the merge share the same key type
    df_train_rhine['datetime'] = pd.to_datetime(df_train_rhine['datetime']).dt.date
    df_test_rhine['datetime'] = pd.to_datetime(df_test_rhine['datetime']).dt.date

    # PCR files
    train_file = os.path.join(pcr_rhine, sub_folder, 'train_table_allpredictors.csv')
    test_file = os.path.join(pcr_rhine, sub_folder, 'test_table_allpredictors.csv')

    # .copy() makes each subset an independent frame, so the datetime assignment
    # below is not a write into a slice of the full PCR table.
    df_train_pcr = pd.read_csv(train_file)
    df_train_catchAtt_pcr = df_train_pcr[catchmentAtt].copy()
    df_train_catchAtt_pcr['datetime'] = pd.to_datetime(df_train_catchAtt_pcr['datetime']).dt.date

    df_test_pcr = pd.read_csv(test_file)
    df_test_catchAtt_pcr = df_test_pcr[catchmentAtt].copy()
    df_test_catchAtt_pcr['datetime'] = pd.to_datetime(df_test_catchAtt_pcr['datetime']).dt.date

    # Add catchment attributes to the Rhine-only dataset.
    # The test table must be built from the *test* frames; it was previously
    # merged from the training frames, which wrote a copy of the training table.
    # NOTE(review): the only merge key is 'datetime'. If a table holds several
    # rows per date (e.g. one per station), every pairing of rows sharing a date
    # is produced -- confirm whether a station key should be part of `on`.
    train_mmc_catchAtt = df_train_rhine.merge(df_train_catchAtt_pcr, on='datetime')
    test_mmc_catchAtt = df_test_rhine.merge(df_test_catchAtt_pcr, on='datetime')

    # Drop the 'Unnamed: 0' column carried over from the input CSVs
    # (errors='ignore' so an input without that column does not abort the run).
    train_mmc_catchAtt = train_mmc_catchAtt.drop(columns=['Unnamed: 0'], errors='ignore')
    test_mmc_catchAtt = test_mmc_catchAtt.drop(columns=['Unnamed: 0'], errors='ignore')

    # Write tables: merged train/test tables plus the station lists (copied as read)
    train_mmc_catchAtt.to_csv(os.path.join(output_dir, 'train_table_allpredictors.csv'), index=False)
    test_mmc_catchAtt.to_csv(os.path.join(output_dir, 'test_table_allpredictors.csv'), index=False)
    df_train_stations.to_csv(os.path.join(output_dir, 'train_stations.csv'), index=False)
    df_test_stations.to_csv(os.path.join(output_dir, 'test_stations.csv'), index=False)

In [5]:
# Folders for the per-station merge: existing predictor files (read), PCR
# predictor files (read), and the output folder for predictors + catchment attributes.
old_folder = '../R/data/allpredictors/'
folder_pcr = '../R/data/pcr_allpredictors/'
new_allpredictors = '../R/data/all_predCachAtt/'

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race
os.makedirs(new_allpredictors, exist_ok=True)

In [6]:
# For every CSV in old_folder, append the catchment attributes (columns listed
# in catchmentAtt, defined in an earlier cell) from the matching PCR file and
# write the merged table to new_allpredictors.
for filename in glob.glob(os.path.join(old_folder, '*.csv')):
    df_old = pd.read_csv(filename)
    # Reduce timestamps to plain dates so both sides of the merge share the same key type
    df_old['datetime'] = pd.to_datetime(df_old['datetime']).dt.date

    # Split only the file name, so an underscore anywhere in the directory path
    # cannot shift which piece is picked. For names shaped "<prefix>_<id>.csv"
    # the piece keeps its ".csv" suffix, which is why no extension is appended
    # to the paths built below -- TODO confirm the naming scheme.
    station_id = os.path.basename(filename).split("_")[1]

    # .copy() makes the subset an independent frame, so the datetime assignment
    # is not a write into a slice of df_pcr.
    df_pcr = pd.read_csv(os.path.join(folder_pcr, f"pcr_allpredictors_{station_id}"))
    df_pcr_subset = df_pcr[catchmentAtt].copy()
    df_pcr_subset['datetime'] = pd.to_datetime(df_pcr_subset['datetime']).dt.date

    new_path = os.path.join(new_allpredictors, f"all_predCachAtt_{station_id}")
    df_new = df_old.merge(df_pcr_subset, on='datetime')
    # Drop the 'Unnamed: 0' column carried over from the input CSV
    # (errors='ignore' so an input without that column does not abort the run).
    df_new = df_new.drop(columns=['Unnamed: 0'], errors='ignore')
    # reset_index returns a new frame; the result was previously discarded
    df_new = df_new.reset_index(drop=True)

    # NOTE(review): the index is written as an unnamed first column here, whereas
    # the subsample tables are written with index=False -- confirm which format
    # the downstream readers expect before changing it.
    df_new.to_csv(new_path)