In [8]:
import os 
import pandas as pd
from os.path import join as pjoin # Joining file directories
import glob
import warnings

In [2]:
# set your working directory and go there
work_dir = "data"
# Remember the directory the notebook was started from the first time this cell
# runs. A plain relative os.chdir(work_dir) is not idempotent: executing the
# cell a second time would try to descend into "data/data" and fail.
if "_notebook_start_dir" not in globals():
    _notebook_start_dir = os.getcwd()
os.chdir(os.path.join(_notebook_start_dir, work_dir))

In [3]:
# Load the station metadata table (it holds, among others, the lat/lon columns)
stations_all = pd.read_csv('stationLatLon.csv')

sub_regions = [6351, 6361, 6401, 6211]

# Build the row selection one criterion at a time; a station is kept only if
# every criterion holds.
keep = stations_all.wmo_reg == 6
keep &= stations_all.lat.between(45, 54.5)
keep &= stations_all.lon.between(4, 15.5)
keep &= stations_all.sub_reg.isin(sub_regions)
keep &= stations_all.river != "HAVEL"

# Persist the selection (the DataFrame index is written as the first column).
loc = stations_all[keep]
loc.to_csv("stations_rhine_elbe.csv")

In [4]:
# GRDC station numbers of the filtered stations, as a plain Python list
stations_desired = loc["grdc_no"].to_list()

In [5]:
# Number of stations that passed the filter
len(stations_desired)

50

In [6]:
models = ["pcr", "meteo", "wg3", "lis"]

# Trial run for a single model before processing all of them
model = models[1]
input_folder = f'{model}/upstream_station_all/'

# NOTE: this silences every warning for the rest of the kernel session.
warnings.filterwarnings("ignore")

stations_desired = loc["grdc_no"].to_list()
# Set for O(1) membership tests while scanning the files
desired_lookup = set(stations_desired)

file_pattern = f"{input_folder}*.csv"
# glob returns paths in arbitrary order; sorting keeps the column order of the
# output files reproducible across machines.
file_list = sorted(glob.glob(file_pattern))

# Collect the frames of every desired station. The station number is the
# integer in front of the first underscore of the file name.
frames_by_station = {}
for path_csv_file in file_list:
    station = int(os.path.basename(path_csv_file).split("_")[0])
    if station not in desired_lookup:
        continue

    df = pd.read_csv(path_csv_file)
    # Parse the key up front so that files are joined on real timestamps
    df['datetime'] = pd.to_datetime(df['datetime'], format='%Y_%m')
    frames_by_station.setdefault(station, []).append(df)

# Combine the files of one station on the datetime column. A positional
# pd.concat(axis=1) would silently misalign rows whenever two files differ in
# length, start date or row order.
aggregated_dfs = {}
for station, frames in frames_by_station.items():
    combined = frames[0]
    for extra in frames[1:]:
        # Bring in the join key plus columns not seen yet: for a repeated
        # column name the first occurrence wins.
        new_columns = [c for c in extra.columns if c == 'datetime' or c not in combined.columns]
        combined = combined.merge(extra[new_columns], on='datetime', how='outer')
    aggregated_dfs[station] = combined.sort_values('datetime', kind='mergesort', ignore_index=True)

# Save the combined data for each station to separate CSV files
output_folder = f'predictors/{model}'
# to_csv does not create missing directories
os.makedirs(output_folder, exist_ok=True)
for station, data in aggregated_dfs.items():
    output_file = pjoin(output_folder, f"{station}_combined.csv")
    data.to_csv(output_file, index=False)


In [7]:
warnings.filterwarnings("ignore")

def combine_variables(models):
    """Write one combined predictor CSV per station for every model.

    For each entry of ``models`` all files matching
    ``{model}/upstream_station_all/*.csv`` (relative to the current working
    directory) are read. The station number is the integer in front of the
    first underscore of the file name; files of stations that are not in the
    selection read from ``stationLatLon.csv`` are skipped. All files of one
    station are joined on their ``datetime`` column (parsed with the format
    ``%Y_%m``) and written to ``predictors/{model}/{station}_combined.csv``.

    The join is an outer join on ``datetime`` instead of a positional
    ``pd.concat(axis=1)``, so files that differ in length, start date or row
    order can no longer be misaligned. When a column name occurs in several
    files, the first occurrence (in sorted file order) is kept. The output
    rows are sorted by ``datetime``.

    NOTE(review): the station selection here has no ``sub_reg``/``river``
    criteria, unlike the one used to write ``stations_rhine_elbe.csv`` --
    confirm that the wider selection is intended.

    Parameters
    ----------
    models : iterable of str
        Names of the model folders to process.

    Returns
    -------
    None
        Results are written to disk.

    Raises
    ------
    ValueError
        If a file name does not start with an integer station number or a
        ``datetime`` value does not match ``%Y_%m``.
    KeyError
        If a file of a selected station has no ``datetime`` column.
    """
    loc = pd.read_csv('stationLatLon.csv')
    loc = loc[(loc.wmo_reg == 6) & (loc.lat.between(45, 54.5)) & (loc.lon.between(4, 15.5))]
    # Set for O(1) membership tests while scanning the files
    stations_desired = set(loc["grdc_no"].to_list())

    for model in models:
        input_folder = f'{model}/upstream_station_all/'

        # glob returns paths in arbitrary order; sorting keeps the column
        # order of the output reproducible.
        file_list = sorted(glob.glob(f"{input_folder}*.csv"))

        # Collect the frames per station and join them once at the end
        # instead of growing a DataFrame inside the loop.
        frames_by_station = {}
        for path_csv_file in file_list:
            station = int(os.path.basename(path_csv_file).split("_")[0])
            if station not in stations_desired:
                continue

            df = pd.read_csv(path_csv_file)
            # Parse the key up front so that files are joined on timestamps
            df['datetime'] = pd.to_datetime(df['datetime'], format='%Y_%m')
            frames_by_station.setdefault(station, []).append(df)

        # Save the combined data for each station to separate CSV files
        output_folder = f'predictors/{model}'
        os.makedirs(output_folder, exist_ok=True)
        for station, frames in frames_by_station.items():
            combined = frames[0]
            for extra in frames[1:]:
                # Join key plus columns not seen yet (first occurrence wins)
                new_columns = [c for c in extra.columns if c == 'datetime' or c not in combined.columns]
                combined = combined.merge(extra[new_columns], on='datetime', how='outer')
            combined = combined.sort_values('datetime', kind='mergesort', ignore_index=True)

            output_file = os.path.join(output_folder, f"{station}_combined.csv")
            combined.to_csv(output_file, index=False)

In [8]:
# Write the per-station combined files (predictors/<model>/<station>_combined.csv)
# for each of the four model folders.
models = ["pcr", "meteo", "wg3", "lis"]
combine_variables(models)

In [6]:
def normalize_dis(column, area):
    """Scale ``column`` by the seconds in a day divided by ``area * 1e6``.

    NOTE(review): presumably this turns a discharge in m^3/s into a depth per
    day over a catchment whose ``area`` is given in km^2 -- confirm the units
    against the input data.

    Parameters
    ----------
    column : number or pandas.Series
        Values to rescale.
    area : number
        Area used for the normalisation; it is multiplied by 1,000,000 first.

    Returns
    -------
    Same kind as ``column``
        ``column * (86400 / (area * 1000000))``.
    """
    seconds_per_day = 24 * 3600
    scaled_area = area * 1000000
    return column * (seconds_per_day / scaled_area)


def normalize_columns_with_dis(df, area):
    """Apply ``normalize_dis`` to every column whose name contains ``'dis'``.

    The frame is modified in place and also returned, so the call can be used
    either as a statement or in an assignment.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose matching columns are rescaled.
    area : number
        Area passed on to ``normalize_dis`` for every matching column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    dis_columns = [name for name in df.columns if 'dis' in name]
    for name in dis_columns:
        df[name] = normalize_dis(df[name], area)
    return df

In [9]:
warnings.filterwarnings("ignore")
def combine_models(models):
    """Write one CSV per station that holds the predictors of all models.

    For each entry of ``models`` all files matching
    ``{model}/upstream_station_all/*.csv`` (relative to the current working
    directory) are read. The station number is the integer in front of the
    first underscore of the file name; files of stations that are not in the
    selection read from ``stationLatLon.csv`` are skipped. Columns whose name
    contains ``'dis'`` are rescaled with ``normalize_columns_with_dis`` using
    the station's ``area`` value. All files of one station, across all models,
    are then joined on their ``datetime`` column (parsed with the format
    ``%Y_%m``) and written to ``predictors/combined/allpredictors_{station}.csv``.

    The join is an outer join on ``datetime`` instead of a positional
    ``pd.concat(axis=1)``, so sources that cover different periods are no
    longer shifted against each other. When a column name occurs in several
    files, the first occurrence (model order, then sorted file order) is kept.
    The output rows are sorted by ``datetime``.

    Parameters
    ----------
    models : iterable of str
        Names of the source folders to process.

    Returns
    -------
    None
        Results are written to disk.

    Raises
    ------
    ValueError
        If a file name does not start with an integer station number or a
        ``datetime`` value does not match ``%Y_%m``.
    KeyError
        If a file of a selected station has no ``datetime`` column.
    """
    loc = pd.read_csv('stationLatLon.csv')
    loc = loc[(loc.wmo_reg == 6) & (loc.lat.between(45, 54.5)) & (loc.lon.between(4, 15.5))]
    # One lookup table instead of filtering the catalogue for every file. If a
    # station number is listed more than once, its first row is used.
    area_by_station = loc.drop_duplicates("grdc_no").set_index("grdc_no")["area"].to_dict()
    frames_by_station = {}

    for model in models:
        input_folder = f'{model}/upstream_station_all/'
        # glob returns paths in arbitrary order; sorting keeps the column
        # order of the output reproducible.
        file_list = sorted(glob.glob(f"{input_folder}*.csv"))

        for path_csv_file in file_list:
            station = int(os.path.basename(path_csv_file).split("_")[0])

            # Only stations of the selection are processed
            if station not in area_by_station:
                continue

            df = pd.read_csv(path_csv_file)
            df = normalize_columns_with_dis(df, area_by_station[station])  # Apply normalization here
            # Parse the key up front so that files are joined on timestamps
            df['datetime'] = pd.to_datetime(df['datetime'], format='%Y_%m')
            frames_by_station.setdefault(station, []).append(df)

    # Save the combined data for each station to separate CSV files
    output_folder = 'predictors/combined'
    os.makedirs(output_folder, exist_ok=True)
    for station, frames in frames_by_station.items():
        combined = frames[0]
        for extra in frames[1:]:
            # Join key plus columns not seen yet (first occurrence wins)
            new_columns = [c for c in extra.columns if c == 'datetime' or c not in combined.columns]
            combined = combined.merge(extra[new_columns], on='datetime', how='outer')
        combined = combined.sort_values('datetime', kind='mergesort', ignore_index=True)

        output_file = os.path.join(output_folder, f"allpredictors_{station}.csv")
        combined.to_csv(output_file, index=False)


In [10]:
# Write predictors/combined/allpredictors_<station>.csv from all sources.
# Note that "grdc" is processed here in addition to the four model folders.
models = ["meteo", "wg3", "lis", "pcr", "grdc"]
combine_models(models)