In [1]:
import sys
import os

# Make the project's ``scripts`` directory (one level above the notebook's
# working directory) importable -- presumably this is where ``utils`` lives.
# The membership test has to look for the exact entry that gets appended;
# checking the parent directory instead would add a duplicate entry to
# ``sys.path`` every time this cell is re-run.
module_path = os.path.abspath(os.path.join('..'))
scripts_path = os.path.join(module_path, 'scripts')
if scripts_path not in sys.path:
    sys.path.append(scripts_path)

In [2]:
import pandas as pd

import utils

In [3]:
# Delegates to the project helper. Presumably this switches the working
# directory to the project root so that the relative paths used later
# ('swat/...', 'ml/...') resolve -- TODO confirm against scripts/utils.py.
utils.set_project_dir()

In [4]:
# Run configuration.
# NOTE(review): `time_interval` is reassigned by the `for time_interval in ...`
# loop in the summary cell below, so the value set here does not drive that loop.
time_interval = 'd'
# Country codes to summarise; each one selects an Excel sheet and a set of
# model result files in the summary cell.
country_codes = ['ESP', 'USA', 'EST', 'ETH']
# Test-set fraction; converted to an integer percentage (0.5 -> 50) to build
# the result file names.
test_size = 0.5

In [6]:
# For every time interval, build a table with the warm-up / training / test
# date ranges of the SWAT and RF models for each catchment, display it and
# write it to 'ml/{target}_train_test_time.csv'.

SWAT_TRAIN_YEARS = 6   # SWAT training period length, counted from the first SWAT date
SWAT_WARMUP_YEARS = 2  # SWAT warm-up period length, immediately before training
DATE_FMT = '%Y-%m-%d'

time_intervals = ['d', 'm']
test_size_int = int(test_size * 100)

# NOTE: the loop variable keeps the name `time_interval`, so it overwrites the
# value assigned in the configuration cell (it is 'm' once this cell finishes).
for time_interval in time_intervals:
    # One plain dict per (catchment, model) row; the DataFrame is built once
    # after the inner loop instead of concatenating many one-row frames.
    results_list = []
    target = f'Q_{time_interval}+1'
    feat_set = f'FS3_{time_interval}'

    for country_code in country_codes:

        # Read SWAT results; the first three columns are renamed by position.
        excel_file = 'swat/SWAT_results.xlsx'
        sheet_name = f'{country_code}_{time_interval.upper()}'
        swat_df = pd.read_excel(excel_file, sheet_name=sheet_name)
        swat_df = swat_df.rename(
            columns={
                swat_df.columns[0]: 'Date',
                swat_df.columns[1]: 'Observed',
                swat_df.columns[2]: 'SWAT',
            }
        )

        # SWAT periods: training starts at the first SWAT date and lasts
        # SWAT_TRAIN_YEARS; testing runs from the following day to the last
        # SWAT date; warm-up is the SWAT_WARMUP_YEARS right before training.
        # assumes the first column holds datetime values -- TODO confirm
        swat_dates = swat_df.iloc[:, 0]
        swat_train_min_date = swat_dates.min()
        swat_test_max_date = swat_dates.max()
        swat_test_start = swat_train_min_date + pd.DateOffset(years=SWAT_TRAIN_YEARS)
        swat_train_max_date = (swat_test_start - pd.DateOffset(days=1)).strftime(DATE_FMT)
        swat_test_min_date = swat_test_start.strftime(DATE_FMT)
        warmup_min_date = (swat_train_min_date - pd.DateOffset(years=SWAT_WARMUP_YEARS)).strftime(DATE_FMT)
        warmup_max_date = (swat_train_min_date - pd.DateOffset(days=1)).strftime(DATE_FMT)

        model_dir = utils.get_model_dir(country_code, target, feat_set)
        file_prefix = f'{model_dir}/{country_code}_{target}_{feat_set}'

        # Get indices of training samples
        train_indices = pd.read_csv(
            f'{file_prefix}_feat_train_{test_size_int}.csv', usecols=['Index']
        )['Index'].values

        # Get indices of test samples
        test_indices = pd.read_csv(
            f'{file_prefix}_feat_test_{test_size_int}.csv', usecols=['Index']
        )['Index'].values

        # Read RF results
        obs_vs_pred = pd.read_csv(f'{file_prefix}_obs_vs_pred_{test_size_int}.csv', parse_dates=['Date'])
        obs_vs_pred['Index'] = obs_vs_pred.index

        # Training set: every row up to and including the last training index.
        # NOTE(review): this treats the training samples as one leading,
        # contiguous block of rows -- confirm against how the split is made.
        end_index_train = train_indices[-1]
        train_dates = obs_vs_pred.loc[:end_index_train, 'Date'].dt.strftime(DATE_FMT)
        rf_train_min_date = train_dates.min()
        rf_train_max_date = train_dates.max()

        # Test set: every row from the first test index onwards.
        start_index_test = test_indices[0]
        test_dates = obs_vs_pred.loc[start_index_test:, 'Date'].dt.strftime(DATE_FMT)
        rf_test_min_date = test_dates.min()
        rf_test_max_date = test_dates.max()

        catchment_name = utils.get_catchment_name(country_code)

        # SWAT results
        results_list.append({
            'catchment_name': catchment_name,
            'country_code': country_code,
            'time_interval': time_interval,
            'model': 'SWAT',
            'warmup_min_date': warmup_min_date,
            'warmup_max_date': warmup_max_date,
            'train_min_date': swat_train_min_date.strftime(DATE_FMT),
            'train_max_date': swat_train_max_date,
            'test_min_date': swat_test_min_date,
            'test_max_date': swat_test_max_date.strftime(DATE_FMT),
        })

        # RF results (no warm-up period is recorded for RF)
        results_list.append({
            'catchment_name': catchment_name,
            'country_code': country_code,
            'time_interval': time_interval,
            'model': 'RF',
            'warmup_min_date': None,
            'warmup_max_date': None,
            'train_min_date': rf_train_min_date,
            'train_max_date': rf_train_max_date,
            'test_min_date': rf_test_min_date,
            'test_max_date': rf_test_max_date,
        })

    # Column order follows the key order of the record dicts.
    out_df = pd.DataFrame(results_list)
    display(out_df)
    out_df.to_csv(f'ml/{target}_train_test_time.csv', index=False)

Unnamed: 0,catchment_name,country_code,time_interval,model,warmup_min_date,warmup_max_date,train_min_date,train_max_date,test_min_date,test_max_date
0,Argos,ESP,d,SWAT,2004-01-01,2005-12-31,2006-01-01,2011-12-31,2012-01-01,2017-12-31
1,Argos,ESP,d,RF,,,2004-02-01,2010-12-31,2011-01-01,2017-11-30
2,Bald Eagle,USA,d,SWAT,2000-01-01,2001-12-31,2002-01-01,2007-12-31,2008-01-01,2013-11-30
3,Bald Eagle,USA,d,RF,,,2000-02-01,2006-12-31,2007-01-01,2013-11-30
4,Porijõgi,EST,d,SWAT,2007-01-01,2008-12-31,2009-01-01,2014-12-31,2015-01-01,2020-12-31
5,Porijõgi,EST,d,RF,,,2007-02-01,2013-12-31,2014-01-01,2020-11-30
6,Rib,ETH,d,SWAT,1995-01-01,1996-12-31,1997-01-01,2002-12-31,2003-01-01,2008-12-31
7,Rib,ETH,d,RF,,,1995-02-01,2001-12-31,2002-01-01,2008-11-30


Unnamed: 0,catchment_name,country_code,time_interval,model,warmup_min_date,warmup_max_date,train_min_date,train_max_date,test_min_date,test_max_date
0,Argos,ESP,m,SWAT,2004-01-01,2005-12-31,2006-01-01,2011-12-31,2012-01-01,2017-12-01
1,Argos,ESP,m,RF,,,2005-01-31,2010-12-31,2011-01-31,2016-12-31
2,Bald Eagle,USA,m,SWAT,2000-01-01,2001-12-31,2002-01-01,2007-12-31,2008-01-01,2013-11-01
3,Bald Eagle,USA,m,RF,,,2001-01-31,2006-12-31,2007-01-31,2012-12-31
4,Porijõgi,EST,m,SWAT,2007-01-01,2008-12-31,2009-01-01,2014-12-31,2015-01-01,2020-12-01
5,Porijõgi,EST,m,RF,,,2008-01-31,2013-12-31,2014-01-31,2019-12-31
6,Rib,ETH,m,SWAT,1995-01-01,1996-12-31,1997-01-01,2002-12-31,2003-01-01,2008-12-01
7,Rib,ETH,m,RF,,,1996-01-31,2001-12-31,2002-01-31,2007-12-31
