In [1]:
import os
from pathlib import Path
import numpy as np
import pandas as pd

from validation import mbe, nmbe, rmse, nrmse, get_summary_from_validation_metrics

In [19]:
# Common root folder of this analysis inside the user's home directory.
# Inputs and outputs both live underneath it, so the long location is
# spelled out only once.
shared_folder_path = Path.home() / "OneDrive-3E/Research/Solar/tickets/2024/IN3043_accuracy_factors_analysis/IN3136_EURAC_shared_folder"

# Input folder: sites metadata and the parsed reference/model data.
data_path = shared_folder_path / "data_sites"
# Output folder: destination of the validation results CSV.
results_path = shared_folder_path / "validation_removed"

## read sites metadata

In [5]:
# Load the sites metadata table; the first CSV column is used as the index.
sites_metadata_file = data_path / "sites_metadata.csv"
sites_metadata = pd.read_csv(sites_metadata_file, index_col=0)
sites_metadata

Unnamed: 0_level_0,country,latitude,longitude,altitude,provider
station_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
camborne,United Kingdom,50.22,-5.32,88.0,bsrn
carpentras,France,44.08,5.06,100.0,bsrn
budapest_lorinc,Hungary,47.43,19.18,139.0,bsrn
magurele_mars,Romania,44.34,26.01,110.0,bsrn
cabauw,Netherlands,51.97,4.93,0.0,bsrn
...,...,...,...,...,...
dublin_arpt,Ireland,53.43,-6.23,82.0,wrdc
grossenzersdorf,Austria,48.20,16.57,157.0,wrdc
gonzaga,Italy,44.96,10.77,16.0,lombardy
landriano,Italy,45.32,9.27,88.0,lombardy


## based on daily data

### read data

In [6]:
# Both input tables sit in the same sub-folder of the data directory;
# the first CSV column of each file is used as the index.
parsed_removed_dir = data_path / "parsed_removed"
reference_data = pd.read_csv(parsed_removed_dir / "parsed_removed_reference_data.csv", index_col=0)
model_data = pd.read_csv(parsed_removed_dir / "parsed_removed_model_data.csv", index_col=0)

In [7]:
# Replace the index of each frame with a pd.DatetimeIndex built from its
# current labels (needed for the time-based resampling further down).
for daily_frame in (reference_data, model_data):
    daily_frame.index = pd.DatetimeIndex(daily_frame.index)

### parse

In [8]:
# Keep a value only where the other dataset has one too.
# Order matters: the reference is blanked where the model is missing first,
# then the model is blanked where the (already masked) reference is missing.
model_available = model_data.notna()
reference_data = reference_data.where(model_available)
reference_available = reference_data.notna()
model_data = model_data.where(reference_available)

In [9]:
# Aggregate both datasets to month-start ("MS") and year-start ("YS") sums.
# min_count=1 leaves a period as NaN when it holds no valid value at all,
# instead of reporting a sum of 0 for it.
# "YS" is used in place of the alias "AS", which is deprecated since
# pandas 2.2; both denote the year-start frequency.
reference_data_monthly = reference_data.resample("MS").sum(min_count=1)
model_data_monthly = model_data.resample("MS").sum(min_count=1)
reference_data_yearly = reference_data.resample("YS").sum(min_count=1)
model_data_yearly = model_data.resample("YS").sum(min_count=1)

### calculate sites validation metrics

In [10]:
# Number of non-NaN reference values per column (one column per site).
valid_data_pairs_daily = reference_data.notna().sum()

In [12]:
# Collect the metrics of every site (one column of the reference data each):
#   nmbe           computed on the unaggregated data
#   nrmse_yearly   computed on the yearly sums
#   nrmse_monthly  computed on the monthly sums
#   nrmse_daily    computed on the unaggregated data
site_metrics = {}
for site in reference_data.columns:
    site_metrics[site] = {
        "nmbe": nmbe(model_data[site], reference_data[site]),
        "nrmse_yearly": nrmse(model_data_yearly[site], reference_data_yearly[site]),
        "nrmse_monthly": nrmse(model_data_monthly[site], reference_data_monthly[site]),
        "nrmse_daily": nrmse(model_data[site], reference_data[site]),
    }

# Transpose so that rows are sites and columns are metrics, and scale by 100
# (presumably fractions -> percent; confirm against the validation module).
validation_metrics = 100 * pd.DataFrame(site_metrics).T

In [13]:
# Attach the per-site count of valid pairs as an extra column (aligned on the site index).
validation_metrics = validation_metrics.assign(valid_data_pairs_daily=valid_data_pairs_daily)

## format and save results

In [14]:
# Metric columns that get rounded to 2 decimals.
columns_round_2 = [
    "nmbe",
    "nrmse_yearly",
    "nrmse_monthly",
    "nrmse_daily",
]

In [15]:
# Columns that get cast to the nullable integer dtype.
columns_int = [
    "valid_data_pairs_daily",
]

In [16]:
# Format the table column by column: metric columns are rounded to two
# decimals, count columns are cast to the nullable integer dtype; any
# other column is left untouched.
for column in validation_metrics.columns.tolist():
    values = validation_metrics[column]
    if column in columns_round_2:
        validation_metrics[column] = values.round(2)
    elif column in columns_int:
        validation_metrics[column] = values.astype("Int64")

In [17]:
# Fix the column order of the final table: metrics first, pair count last.
column_order = ["nmbe", "nrmse_yearly", "nrmse_monthly", "nrmse_daily", "valid_data_pairs_daily"]
validation_metrics = validation_metrics.loc[:, column_order]

In [20]:
# Write the per-site metrics table to the results folder.
# The folder is created first when it is missing, because to_csv does not
# create directories. Parent folders are deliberately NOT created, so a
# mistyped base path still fails loudly instead of spawning a bogus tree.
Path(results_path).mkdir(exist_ok=True)
validation_metrics.to_csv(os.path.join(results_path, "validation_results_sites.csv"))

In [18]:
# Display the per-site validation metrics table as the cell output.
validation_metrics

Unnamed: 0,nmbe,nrmse_yearly,nrmse_monthly,nrmse_daily,valid_data_pairs_daily
camborne,-1.56,2.25,2.87,9.90,941
carpentras,-0.46,0.79,2.25,6.23,1459
budapest_lorinc,-0.21,0.37,2.50,6.64,935
magurele_mars,-0.12,0.12,2.78,6.35,244
cabauw,0.06,0.63,2.18,7.64,2547
...,...,...,...,...,...
zuerich_kloten,0.77,1.00,2.68,7.44,2192
basel,2.36,2.63,3.72,7.86,2192
gurteen,2.35,2.52,4.56,10.82,2192
dublin_arpt,3.36,3.49,5.24,9.97,2187
