In [1]:
import pandas as pd
import re
from post_processing import series_name, add_weights

In [2]:
from experiment_utils import get_freq, get_univariate_series
from timecave.data_characteristics import get_features
from statsmodels.tsa.stattools import adfuller, kpss

## Utils

Constants

In [3]:
# Display labels for the validation-method identifiers found in the results
# tables. Both the underscore and the CamelCase spelling of each identifier
# are listed so either form maps to the same label.
pretty_methods = {
    # Holdout family
    'Holdout': 'Holdout',
    'Repeated_Holdout': 'Repeated Holdout',
    'RepeatedHoldout': 'Repeated Holdout',
    # Window-based methods
    'Growing_Window': 'Growing Window',
    'GrowingWindow': 'Growing Window',
    'Rolling_Window': 'Rolling Window',
    'RollingWindow': 'Rolling Window',
    # Cross-validation methods
    'Block_CV': 'Block CV',
    'BlockCV': 'Block CV',
    'AdaptedhvBlockCV': 'Adapted hv-Block CV',
    'MarkovCV': 'Markov CV',
    # Weighted variants (CamelCase identifiers)
    'BlockCV_with_weights_paper': 'Weighted Block CV var.1',
    'BlockCV_with_weights': 'Weighted Block CV var.2',
    'GrowingWindow_with_weights': 'Weighted Growing Window',
    'RollingWindow_with_weights': 'Weighted Rolling Window',
    # Weighted variants (underscore identifiers)
    'Block_CV_with_weights_paper': 'Weighted Block CV var.1',
    'Block_CV_with_weights': 'Weighted Block CV var.2',
    'Growing_Window_with_weights': 'Weighted Growing Window',
    'Rolling_Window_with_weights': 'Weighted Rolling Window',
}

Functions

In [4]:
def clean_filename(filepath, st_split = '/'):
    """
    Reduce a file path to a bare dataset name.

    The path is cut at the last occurrence of ``st_split``, the text after
    the final '.' (the extension) is dropped, and a trailing underscore
    followed by exactly eight digits (e.g. a ``_DDMMYYYY`` date stamp) is
    removed if present.

    Only ``st_split`` is treated as a path separator; any other separator
    character in ``filepath`` is left in the result.
    """
    # Keep only what follows the last separator.
    basename = filepath.rsplit(st_split, 1)[-1]

    # Drop the extension; names without a '.' are kept whole.
    head, dot, _ = basename.rpartition('.')
    stem = head if dot else basename

    # Strip a trailing "_<8 digits>" date stamp, if any.
    return re.sub(r'_\d{8}$', '', stem)

def clean_method(method_str):
    """
    Extract the method name from a stringified validation-method object.

    The string is split on '.'; the fourth dot-separated piece is used when
    there are more than three pieces, otherwise the last piece. The first
    whitespace-delimited token of that piece is returned, e.g.
    '<timecave.validation_methods.OOS.Holdout object at ...>' -> 'Holdout'.
    """
    dotted = method_str.split('.')
    # The class name sits in position 3 of a full repr; shorter strings
    # fall back to their final component.
    segment = dotted[3] if len(dotted) > 3 else dotted[-1]
    return segment.split()[0]

def prettify(df, pretty_methods: dict = pretty_methods):
    """
    Return a copy of ``df`` whose 'method' column holds display labels.

    Each value of ``df['method']`` is looked up in ``pretty_methods``;
    values that are not keys of the mapping become NaN. ``df`` itself is
    left untouched.
    """
    labels = df['method'].map(pretty_methods)
    result = df.copy()
    result['method'] = labels
    return result

In [5]:
def stationarity_tests(time_series: pd.Series, significance_level=0.05) -> bool:
    """
    Decide whether a single time series is stationary using two tests:
    -> Augmented Dickey-Fuller (ADF), lag order selected with BIC
    -> KPSS with a constant-plus-trend regression ("ct")

    This function can only handle one time series at a time.

    Parameters
    ----------
    time_series : pd.Series
        The series to test.
    significance_level : float, default 0.05
        Threshold applied to both p-values.

    Returns
    -------
    bool
        True only when both tests agree that the series is stationary,
        i.e. the ADF p-value is at or below ``significance_level`` and the
        KPSS p-value is above it. Every other combination returns False:
        - both tests indicate non-stationarity;
        - KPSS stationary, ADF non-stationary (trend stationary);
        - KPSS non-stationary, ADF stationary (difference stationary).
    """

    # ADF Test: a small p-value counts as evidence of stationarity.
    adf_p_value = adfuller(time_series, autolag="BIC")[1]
    adf_stationary = adf_p_value <= significance_level

    # KPSS Test: here a large p-value counts as evidence of stationarity.
    kpss_p_value = kpss(time_series, regression="ct")[1]
    kpss_stationary = kpss_p_value > significance_level

    # The four possible outcomes reduce to "both tests say stationary";
    # bool() turns the NumPy comparison result into a plain Python bool.
    return bool(adf_stationary and kpss_stationary)


In [6]:
def create_analytics_df(data_files: list):
    """
    Build one table of per-series features for every CSV in ``data_files``.

    For each file the frequency is obtained with ``get_freq`` (using the
    file's first column), the file is split with ``get_univariate_series``,
    and each resulting series gets its ``get_features`` output plus an
    'is_stationary' entry from ``stationarity_tests``.

    Returns a single DataFrame with a 'filename' column (the path as given
    in ``data_files``) and a 'column_index' column (the series' position,
    as a string) in front of the feature columns.
    """
    per_file = {}
    for path in data_files:
        raw = pd.read_csv(path)
        freq = get_freq(raw, raw.columns[0])

        per_series = {}
        for position, series in enumerate(get_univariate_series(raw)):
            feats = get_features(series, freq)
            feats['is_stationary'] = stationarity_tests(series)
            per_series[str(position)] = feats

        # The dict keys become the outer index level; move them to a column.
        per_file[path] = (
            pd.concat(per_series)
            .reset_index(0)
            .rename(columns={'level_0': 'column_index'})
        )

    combined = pd.concat(per_file).reset_index(0)
    return combined.rename(columns={'level_0': 'filename'})

## Upload Data

### Real Data

Be careful with "get_latest_files" when LSTM results are ready!

In [7]:
# Validation Results
# Paths use '/' throughout: the previous mix of '/' and '\\' only resolved on
# Windows, whereas forward slashes are accepted on every platform.
# column_index is read as a string rather than a number.
saude = pd.read_csv('results/saude/table_A_saude.csv', dtype={'column_index': 'string'})
transportes = pd.read_csv('results/transportes/table_A_transportes.csv', dtype={'column_index': 'string'})
eco_financas = pd.read_csv('results/eco_financas/table_A_eco_financas.csv', dtype={'column_index': 'string'})
eng_ciencias = pd.read_csv('results/eng_ciencias/table_A_eng_ciencias.csv', dtype={'column_index': 'string'})
energia = pd.read_csv('results/energia/table_A_energia.csv', dtype={'column_index': 'string'})
ambiente = pd.read_csv('results/ambiente/table_A_ambiente.csv', dtype={'column_index': 'string'})
meteorologia = pd.read_csv('results/meteorologia/table_A_meteorologia.csv', dtype={'column_index': 'string'})

In [8]:
# Field name -> de-duplicated validation results for that field.
val_dict = {
    field: frame.drop_duplicates()
    for field, frame in [
        ('saude', saude),
        ('transportes', transportes),
        ('eco_financas', eco_financas),
        ('eng_ciencias', eng_ciencias),
        ('energia', energia),
        ('ambiente', ambiente),
        ('meteorologia', meteorologia),
    ]
}

In [9]:
# Stack the per-field frames; the dict keys form the outer index level,
# which is moved into a 'Field' column.
real_data_val = (
    pd.concat(val_dict)
    .reset_index(level=0)
    .rename(columns={'level_0': 'Field'})
)
real_data_val

Unnamed: 0,Field,filename,column_index,method,iteration,model,mse,mae,rmse
0,saude,datasets/processed_data/covid19_17052024.csv,0,<timecave.validation_methods.OOS.Holdout objec...,0,Tree,1122.564394,20.017677,33.504692
1,saude,datasets/processed_data/covid19_17052024.csv,0,<timecave.validation_methods.OOS.Holdout objec...,0,ARMA,1097.670935,20.360957,33.131117
2,saude,datasets/processed_data/covid19_17052024.csv,0,<timecave.validation_methods.OOS.Repeated_Hold...,0,Tree,197.821429,8.396104,14.064901
3,saude,datasets/processed_data/covid19_17052024.csv,0,<timecave.validation_methods.OOS.Repeated_Hold...,0,ARMA,4652.950871,64.066829,68.212542
4,saude,datasets/processed_data/covid19_17052024.csv,0,<timecave.validation_methods.OOS.Repeated_Hold...,1,Tree,182.480132,8.099338,13.508521
...,...,...,...,...,...,...,...,...,...
3856,meteorologia,datasets/processed_data/jena_climate_data.csv,6,<timecave.validation_methods.markov.MarkovCV o...,5,LSTM,0.336334,0.376041,0.579943
3857,meteorologia,datasets/processed_data/jena_climate_data.csv,6,<timecave.validation_methods.markov.MarkovCV o...,6,LSTM,0.331249,0.375486,0.575543
3858,meteorologia,datasets/processed_data/jena_climate_data.csv,6,<timecave.validation_methods.markov.MarkovCV o...,7,LSTM,0.330733,0.368902,0.575094
3859,meteorologia,datasets/processed_data/jena_climate_data.csv,6,<timecave.validation_methods.markov.MarkovCV o...,8,LSTM,0.326968,0.365175,0.571811


In [10]:
# Test Results
# Paths use '/' throughout: the previous mix of '/' and '\\' only resolved on
# Windows, whereas forward slashes are accepted on every platform.
# column_index is read as a string rather than a number.
saude_test = pd.read_csv('results/saude/table_B_saude.csv', dtype={'column_index': 'string'})
transportes_test = pd.read_csv('results/transportes/table_B_transportes.csv', dtype={'column_index': 'string'})
eco_financas_test = pd.read_csv('results/eco_financas/table_B_eco_financas.csv', dtype={'column_index': 'string'})
eng_ciencias_test = pd.read_csv('results/eng_ciencias/table_B_eng_ciencias.csv', dtype={'column_index': 'string'})
energia_test = pd.read_csv('results/energia/table_B_energia.csv', dtype={'column_index': 'string'})
ambiente_test = pd.read_csv('results/ambiente/table_B_ambiente.csv', dtype={'column_index': 'string'})
meteorologia_test = pd.read_csv('results/meteorologia/table_B_meteorologia.csv', dtype={'column_index': 'string'})

# Field name -> de-duplicated test results for that field.
test_dict = {
    'saude': saude_test.drop_duplicates(),
    'transportes': transportes_test.drop_duplicates(),
    'eco_financas': eco_financas_test.drop_duplicates(),
    'eng_ciencias': eng_ciencias_test.drop_duplicates(),
    'energia': energia_test.drop_duplicates(),
    'ambiente': ambiente_test.drop_duplicates(),
    'meteorologia': meteorologia_test.drop_duplicates(),
}

In [11]:
# Stack the per-field frames; the dict keys form the outer index level,
# which is moved into a 'Field' column.
real_data_test = (
    pd.concat(test_dict)
    .reset_index(level=0)
    .rename(columns={'level_0': 'Field'})
)
real_data_test

Unnamed: 0,Field,filename,column_index,model,mse,mae,rmse
0,saude,datasets/processed_data/covid19_17052024.csv,0,Tree,17.316252,2.493517,4.161280
1,saude,datasets/processed_data/covid19_17052024.csv,0,ARMA,35.113134,5.037637,5.925634
2,saude,datasets/processed_data/covid19_17052024.csv,1,Tree,5.463435,1.555446,2.337399
3,saude,datasets/processed_data/covid19_17052024.csv,1,ARMA,14.434002,3.375995,3.799211
4,saude,datasets/processed_data/covid19_17052024.csv,2,Tree,8.104641,1.812699,2.846865
...,...,...,...,...,...,...,...
93,meteorologia,datasets/processed_data/jena_climate_data.csv,2,LSTM,72.096596,6.881391,8.490971
94,meteorologia,datasets/processed_data/jena_climate_data.csv,3,LSTM,0.035177,0.124977,0.187557
95,meteorologia,datasets/processed_data/jena_climate_data.csv,4,LSTM,1.254967,0.765258,1.120253
96,meteorologia,datasets/processed_data/jena_climate_data.csv,5,LSTM,0.091031,0.155307,0.301713


### Synthetic Data

In [12]:
# Validation Results
# Paths use '/' throughout: the previous mix of '/' and '\\' only resolved on
# Windows, whereas forward slashes are accepted on every platform.
s1 = pd.read_csv('results/s1/table_A_s1.csv', dtype={'column_index': 'string'})
s2 = pd.read_csv('results/s2/table_A_s2.csv', dtype={'column_index': 'string'})
s3 = pd.read_csv('results/s3/table_A_s3.csv', dtype={'column_index': 'string'})

# NOTE: rebinds val_dict, replacing the real-data mapping built earlier.
val_dict = {
    's1': s1,
    's2': s2,
    's3': s3,
}

In [13]:
# Stack the synthetic validation frames; the dict keys form the outer index
# level, which is moved into a 'Field' column.
syn_data_val = (
    pd.concat(val_dict)
    .reset_index(level=0)
    .rename(columns={'level_0': 'Field'})
)
syn_data_val

Unnamed: 0,Field,filename,column_index,method,iteration,model,mse,mae,rmse
0,s1,datasets/synthetic_data/s1.csv,0,<timecave.validation_methods.OOS.Holdout objec...,0,Tree,2.290200e+00,1.212833,1.513341
1,s1,datasets/synthetic_data/s1.csv,0,<timecave.validation_methods.OOS.Holdout objec...,0,ARMA,1.747274e+00,1.008874,1.321845
2,s1,datasets/synthetic_data/s1.csv,0,<timecave.validation_methods.OOS.Repeated_Hold...,0,Tree,2.557555e+00,1.351258,1.599236
3,s1,datasets/synthetic_data/s1.csv,0,<timecave.validation_methods.OOS.Repeated_Hold...,0,ARMA,1.652459e+00,1.030527,1.285480
4,s1,datasets/synthetic_data/s1.csv,0,<timecave.validation_methods.OOS.Repeated_Hold...,1,Tree,3.059749e+00,1.464176,1.749214
...,...,...,...,...,...,...,...,...,...
117580,s3,datasets/synthetic_data/s3.csv,999,<timecave.validation_methods.markov.MarkovCV o...,5,LSTM,1.078563e+07,3252.978832,3284.147807
117581,s3,datasets/synthetic_data/s3.csv,999,<timecave.validation_methods.markov.MarkovCV o...,6,LSTM,7.446406e+06,2670.193913,2728.810370
117582,s3,datasets/synthetic_data/s3.csv,999,<timecave.validation_methods.markov.MarkovCV o...,7,LSTM,7.340841e+06,2169.858197,2709.398580
117583,s3,datasets/synthetic_data/s3.csv,999,<timecave.validation_methods.markov.MarkovCV o...,8,LSTM,5.157779e+06,2171.699985,2271.074347


In [14]:
# Test Results
# Paths use '/' throughout: the previous mix of '/' and '\\' only resolved on
# Windows, whereas forward slashes are accepted on every platform.
s1_test = pd.read_csv('results/s1/table_B_s1.csv', dtype={'column_index': 'string'})
s2_test = pd.read_csv('results/s2/table_B_s2.csv', dtype={'column_index': 'string'})
s3_test = pd.read_csv('results/s3/table_B_s3.csv', dtype={'column_index': 'string'})

# NOTE: rebinds test_dict, replacing the real-data mapping built earlier.
test_dict = {
    's1': s1_test,
    's2': s2_test,
    's3': s3_test,
}

In [15]:
# Stack the synthetic test frames; the dict keys form the outer index
# level, which is moved into a 'Field' column.
syn_data_test = (
    pd.concat(test_dict)
    .reset_index(level=0)
    .rename(columns={'level_0': 'Field'})
)
syn_data_test

Unnamed: 0,Field,filename,column_index,model,mse,mae,rmse
0,s1,datasets/synthetic_data/s1.csv,0,Tree,2.239900e+00,1.216184,1.496630
1,s1,datasets/synthetic_data/s1.csv,0,ARMA,2.726234e+00,1.342448,1.651131
2,s1,datasets/synthetic_data/s1.csv,1,Tree,6.658242e+00,2.083506,2.580357
3,s1,datasets/synthetic_data/s1.csv,1,ARMA,1.337243e+01,2.930200,3.656833
4,s1,datasets/synthetic_data/s1.csv,2,Tree,1.130832e+01,2.703739,3.362784
...,...,...,...,...,...,...,...
2995,s3,datasets/synthetic_data/s3.csv,995,LSTM,5.398360e+07,7322.236320,7347.353537
2996,s3,datasets/synthetic_data/s3.csv,996,LSTM,1.655345e+08,12791.898134,12866.021662
2997,s3,datasets/synthetic_data/s3.csv,997,LSTM,5.086851e+07,7080.503968,7132.216305
2998,s3,datasets/synthetic_data/s3.csv,998,LSTM,8.350341e+07,9086.432205,9138.019857


# Analysing Results

In [16]:
# Choose the result set to analyse (synthetic here; real_data_val /
# real_data_test are the real-data counterparts).
# Copies are taken because the preprocessing cells assign columns on
# data_val / data_test; without them those assignments would overwrite the
# loaded frames in place.
data_val, data_test = syn_data_val.copy(), syn_data_test.copy()

## Preprocessing results

In [17]:
# Shorten file paths to dataset names and object reprs to method names,
# then let series_name build the per-series identifier.
data_val['filename'] = data_val['filename'].map(clean_filename)
data_val['method'] = data_val['method'].map(clean_method)
data_val = series_name(data_val)
data_val

Unnamed: 0,series,Field,method,iteration,model,mse,mae,rmse
0,s1_0,s1,Holdout,0,Tree,2.290200e+00,1.212833,1.513341
1,s1_0,s1,Holdout,0,ARMA,1.747274e+00,1.008874,1.321845
2,s1_0,s1,Repeated_Holdout,0,Tree,2.557555e+00,1.351258,1.599236
3,s1_0,s1,Repeated_Holdout,0,ARMA,1.652459e+00,1.030527,1.285480
4,s1_0,s1,Repeated_Holdout,1,Tree,3.059749e+00,1.464176,1.749214
...,...,...,...,...,...,...,...,...
117580,s3_999,s3,MarkovCV,5,LSTM,1.078563e+07,3252.978832,3284.147807
117581,s3_999,s3,MarkovCV,6,LSTM,7.446406e+06,2670.193913,2728.810370
117582,s3_999,s3,MarkovCV,7,LSTM,7.340841e+06,2169.858197,2709.398580
117583,s3_999,s3,MarkovCV,8,LSTM,5.157779e+06,2171.699985,2271.074347


In [18]:
# Add the weighted variants of the window-based and Block CV methods, then
# swap raw method identifiers for their display labels.
data_val = add_weights(
    data_val,
    preq_methods = ['Growing_Window', 'Rolling_Window', 'GrowingWindow', 'RollingWindow'],
    CV_methods = ['Block_CV', 'BlockCV'],
)
data_val = prettify(data_val)
data_val

Unnamed: 0,series,Field,method,iteration,model,mse,mae,rmse,weights
0,s1_0,s1,Holdout,0,Tree,2.290200e+00,1.212833,1.513341,1.000000
1,s1_0,s1,Holdout,0,ARMA,1.747274e+00,1.008874,1.321845,1.000000
2,s1_0,s1,Repeated Holdout,0,Tree,2.557555e+00,1.351258,1.599236,1.000000
3,s1_0,s1,Repeated Holdout,0,ARMA,1.652459e+00,1.030527,1.285480,1.000000
4,s1_0,s1,Repeated Holdout,1,Tree,3.059749e+00,1.464176,1.749214,1.000000
...,...,...,...,...,...,...,...,...,...
513850,s3_999,s3,Weighted Rolling Window,1,LSTM,1.316166e+06,1000.216088,1147.242794,0.066667
513851,s3_999,s3,Weighted Rolling Window,2,LSTM,5.532977e+06,2275.366738,2352.228111,0.133333
513852,s3_999,s3,Weighted Rolling Window,3,LSTM,1.163932e+07,3375.021583,3411.644142,0.266667
513853,s3_999,s3,Weighted Rolling Window,1,LSTM,5.534582e+06,2275.719361,2352.569214,0.066667


In [19]:
# Shorten file paths to dataset names, then let series_name build the
# per-series identifier.
data_test['filename'] = data_test['filename'].map(clean_filename)
data_test = series_name(data_test)
data_test

Unnamed: 0,series,Field,model,mse,mae,rmse
0,s1_0,s1,Tree,2.239900e+00,1.216184,1.496630
1,s1_0,s1,ARMA,2.726234e+00,1.342448,1.651131
2,s1_1,s1,Tree,6.658242e+00,2.083506,2.580357
3,s1_1,s1,ARMA,1.337243e+01,2.930200,3.656833
4,s1_2,s1,Tree,1.130832e+01,2.703739,3.362784
...,...,...,...,...,...,...
2995,s3_995,s3,LSTM,5.398360e+07,7322.236320,7347.353537
2996,s3_996,s3,LSTM,1.655345e+08,12791.898134,12866.021662
2997,s3_997,s3,LSTM,5.086851e+07,7080.503968,7132.216305
2998,s3_998,s3,LSTM,8.350341e+07,9086.432205,9138.019857


In [20]:
# Remove exact duplicate rows from both result tables.
data_val = data_val.drop_duplicates()
data_test = data_test.drop_duplicates()

In [21]:
#data_val.to_csv('results/real_data_val.csv', index=False)
#data_test.to_csv('results/real_data_test.csv', index=False)

In [22]:
#data_val.to_csv('results/syn_data_val.csv', index=False)
#data_test.to_csv('results/syn_data_test.csv', index=False)

## Stats

In [23]:
#real_data_files = get_csv_filenames('datasets/processed_data')
#create_analytics_df(real_data_files).to_csv('datasets/other/stats_new.csv', index=False)

In [24]:
# Precomputed per-series statistics; column_index is kept as a string.
stats_raw = pd.read_csv(
    'datasets/other/stats.csv',
    dtype={'column_index': 'string'},
)
stats_raw

Unnamed: 0,filename,column_index,Mean,Median,Min,Max,Variance,P2P_amplitude,Trend_slope,Spectral_centroid,Spectral_rolloff,Spectral_entropy,Strength_of_trend,Mean_crossing_rate,Median_crossing_rate,is_stationary
0,c:\Users\User\Desktop\Project_repos\tsvalidati...,0,3.236646,0.754264,0.000,43.581806,3.135014e+01,43.581806,0.001134,0.210916,0.466772,0.910387,0.847788,0.286619,0.307205,True
1,c:\Users\User\Desktop\Project_repos\tsvalidati...,1,3.545170,0.772238,0.000,51.703699,3.747002e+01,51.703699,0.001233,0.211844,0.466772,0.911376,0.842911,0.302454,0.306413,True
2,c:\Users\User\Desktop\Project_repos\tsvalidati...,2,19.092943,11.050000,3.300,192.000000,4.540312e+02,188.700000,0.008202,0.139688,0.430380,0.738486,1.671502,0.080760,0.063341,True
3,c:\Users\User\Desktop\Project_repos\tsvalidati...,3,8.188431,4.160000,0.515,85.300000,1.199358e+02,84.785000,0.004134,0.135518,0.425633,0.744166,1.727200,0.090261,0.067300,True
4,c:\Users\User\Desktop\Project_repos\tsvalidati...,4,0.885597,0.577500,-0.179,4.286000,8.136213e-01,4.465000,0.000307,0.130385,0.423259,0.687693,1.905978,0.090261,0.068092,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263,c:\Users\User\Desktop\Project_repos\tsvalidati...,0,542.062745,536.000000,412.000,725.000000,3.354341e+03,313.000000,-0.212687,0.064824,0.372549,0.589454,1.200526,0.240157,0.255906,False
264,c:\Users\User\Desktop\Project_repos\tsvalidati...,1,6767.615686,6565.000000,3224.000,10857.000000,2.082331e+06,7633.000000,-3.613374,0.069888,0.364706,0.537016,1.622677,0.165354,0.200787,False
265,c:\Users\User\Desktop\Project_repos\tsvalidati...,2,2870.447059,2980.000000,757.000,4822.000000,7.663633e+05,4065.000000,-8.886310,0.089182,0.407843,0.464625,1.816557,0.129921,0.173228,False
266,c:\Users\User\Desktop\Project_repos\tsvalidati...,3,49.407843,45.000000,32.000,83.000000,1.752611e+02,51.000000,0.099019,0.089059,0.415686,0.507671,1.602214,0.173228,0.200787,False


In [25]:
# preprocessing file
# The stored paths may use either '/' or '\' as separator (the rows displayed
# above use backslashes). Normalise to '/' first so clean_filename runs exactly
# once: applying it once per separator also stripped the extension and the
# date suffix twice, which would truncate names that contain an extra '.'
# or a second "_<8 digits>" group.
stats_raw['filename'] = (
    stats_raw['filename']
    .str.replace('\\', '/', regex=False)
    .apply(clean_filename)
)
stats_raw = series_name(stats_raw)

In [26]:
# One row per distinct (Field, series) pair in the test results.
field_series_map = data_test[['Field', 'series']].drop_duplicates()
# Right join: every row of stats_raw is kept; Field is filled in where the
# series also appears in data_test.
stats = field_series_map.merge(stats_raw, how = 'right', on = 'series')

In [27]:
# Number of series flagged as stationary.
stats['is_stationary'].sum()

118

In [28]:
#stats.to_csv('results/time_series_stats.csv', index=False)