In [1]:
import pandas as pd
import os
import yaml

import sys
from pathlib import Path
sys.path.insert(0,str(Path(os.path.abspath('.')).parent.parent))

from eee.resource.validation.database import ValidationDatabase
import eee.solar.model.util

In [2]:
# Root of the Solar research folder under the user's home directory; every
# location used by this notebook is derived from it.
solar_research_dir = Path.home() / "OneDrive-3E" / "Research" / "Solar"
tickets_2024_dir = solar_research_dir / "tickets" / "2024"

# Validation database handed to ValidationDatabase.
validation_database_path = solar_research_dir / "validation" / "validation_database"

# YAML file listing the site names to process.
yaml_path = (
    tickets_2024_dir
    / "IN3043_accuracy_factors_analysis"
    / "IN3471_get_model_data"
    / "db_sites_ready.yml"
)

# Directory that receives one CSV per processed site.
result_path = (
    tickets_2024_dir
    / "IN3475_check_measurement_data"
    / "reference_data_difference_removed"
)

In [3]:
# Instantiate the validation database from validation_database_path. `db` is the
# handle the later cells use to fetch the site inventory and the measurement /
# simulation time series.
db = ValidationDatabase(validation_database_path)

In [4]:
# Fetch the inventory of sites from the validation database; individual sites
# are looked up from it by id later on.
sites_inv = db.get_sites()
# Bare expression: shows the inventory's repr as the cell output.
sites_inv

SitesInventory(n=291)

In [5]:
# sites_inv.df.name.unique()

In [6]:
# Load the site names to process from the YAML file (safe_load: no arbitrary
# object construction).
with yaml_path.open('r') as handle:
    sites_name = yaml.safe_load(handle)

# Keep every site whose name does not carry the 'bsrn_' prefix.
filtered_sites_name = [name for name in sites_name if not name.startswith('bsrn_')]


In [7]:
# Per-site list of (start, end) timestamp ranges (UTC offset, as strings).
# The processing loop sets the reference series to NaN over these ranges
# before the CSV export.
# NOTE(review): entries within a site are not in chronological order
# (e.g. 'enermena_1' lists 2000-12 after 2007-01) — confirm the years are as intended.
period_dict = {
    'energydata_38': [
        ('2015-07-05 09:40:00+00:00', '2015-08-27 09:40:00+00:00'),
        ('2016-05-27 09:40:00+00:00', '2016-08-23 09:40:00+00:00'),
    ],
    'energydata_54': [
        ('2020-03-09 00:00:00+00:00', '2020-12-07 00:00:00+00:00'),
    ],
    'energydata_62': [
        ('2021-05-01 05:48:00+00:00', '2021-09-07 05:48:00+00:00'),
    ],
    'energydata_64': [
        ('2016-12-16 21:05:00+00:00', '2017-04-14 21:05:00+00:00'),
    ],
    'enermena_1': [
        ('2007-01-01 00:00:00+00:00', '2007-01-10 00:00:00+00:00'),
        ('2000-12-02 00:00:00+00:00', '2000-12-17 00:00:00+00:00'),
        ('2008-12-18 00:00:00+00:00', '2009-01-05 00:00:00+00:00'),
        ('2012-09-24 00:00:00+00:00', '2012-10-27 00:00:00+00:00'),
        ('2020-09-08 00:00:00+00:00', '2020-11-11 00:00:00+00:00'),
        ('2021-10-12 00:00:00+00:00', '2022-01-16 00:00:00+00:00'),
        ('2022-09-15 00:00:00+00:00', '2022-12-30 00:00:00+00:00'),
        ('2023-10-06 00:00:00+00:00', '2023-11-19 00:00:00+00:00'),
    ],
    'enermena_2': [
        ('2014-05-19 00:00:00+00:00', '2014-06-17 00:00:00+00:00'),
        ('2021-04-25 00:00:00+00:00', '2021-06-05 00:00:00+00:00'),
    ],
    'enermena_8': [
        ('2007-12-05 00:00:00+00:00', '2007-12-16 00:00:00+00:00'),
        ('2013-09-19 00:00:00+00:00', '2013-10-26 00:00:00+00:00'),
        ('2012-09-27 00:00:00+00:00', '2012-11-16 00:00:00+00:00'),
        ('2018-01-01 00:00:00+00:00', '2018-01-28 00:00:00+00:00'),
    ],
    'enermena_9': [
        ('2013-09-21 00:00:00+00:00', '2013-10-16 00:00:00+00:00'),
    ],
    'enermena_10': [
        ('2019-12-16 00:00:00+00:00', '2020-03-10 00:00:00+00:00'),
        ('2020-12-15 00:00:00+00:00', '2021-04-11 00:00:00+00:00'),
        ('2023-03-16 00:00:00+00:00', '2023-10-02 00:00:00+00:00'),
    ],
    'enermena_11': [
        ('2017-04-07 00:00:00+00:00', '2017-05-16 00:00:00+00:00'),
        ('2011-05-28 00:00:00+00:00', '2011-06-09 00:00:00+00:00'),
        ('2012-05-22 00:00:00+00:00', '2012-06-02 00:00:00+00:00'),
    ],
}


In [8]:
# Sites that have no corresponding model data; the processing loop skips them.
empty_list = [
    'energydata_20',
    'energydata_21',
    'energydata_22',
    'energydata_23',
    'energydata_36',
    'energydata_37',
    'energydata_40',
    'energydata_41',
    'energydata_42',
    'energydata_43',
]

# Sites whose whole time range shows a large difference; also skipped entirely.
drop_list = [
    'energydata_20',
    'energydata_52',
    'energydata_53',
    'energydata_55',
    'energydata_56',
    'energydata_57',
    'energydata_58',
    'energydata_59',
    'energydata_72',
]

In [9]:
# Absolute daily (model - measurement) difference above which a day is removed
# from the reference series.
DIFF_THRESHOLD = 1300
# Simulation model whose daily sums are compared against the measurements.
SIM_MODEL = "solar-resource-api-1.6.12"

# Sites that are skipped entirely (no model data, or bad over their whole range).
excluded_sites = set(empty_list) | set(drop_list)

# For every remaining site: build daily sums of measured and simulated GHI,
# blank out the days where they disagree too much plus the manually listed
# periods of that site, and export the cleaned reference series to CSV.
for site_id in filtered_sites_name:
    if site_id in excluded_sites:
        continue

    site = sites_inv.get_site(site_id)
    print(f'start {site_id}')

    # --- Reading: measurement data ---
    ts = db.get_measurement_timeseries(
        site=site,
        validation_variables=["ghi"],
    )
    df = ts.to_dataframe()

    # Valid measurement range; it bounds the simulation data and anchors the
    # daily resampling bins of both series so they line up.
    start_time = df.first_valid_index()
    end_time = df.last_valid_index()

    # NOTE(review): presumably converts irradiance to irradiation so that the
    # daily sum is meaningful — confirm against eee.solar.model.util.
    df_converted = eee.solar.model.util.ire_to_irn(df)
    df_converted_resampled_day = df_converted.resample(
        '1D', origin=start_time, label='left', closed='left'
    ).sum()

    # --- Reading: simulation data ---
    ts_sim = db.get_simulation_timeseries(
        site=site,
        validation_variables=["ghirrn"],
        model=SIM_MODEL,
    )
    df_sim = ts_sim.to_dataframe()
    df_sim = df_sim[start_time:end_time]
    df_sim_resampled_day = df_sim.resample(
        '1D', origin=start_time, label='left', closed='left'
    ).sum()

    # First column of each daily frame. Copy so that the NaN assignments below
    # modify an independent Series rather than a slice of the parent frame.
    ref = df_converted_resampled_day.iloc[:, 0].copy()
    ref.index = pd.DatetimeIndex(ref.index)
    mod = df_sim_resampled_day.iloc[:, 0].copy()
    mod.index = pd.DatetimeIndex(mod.index)

    # --- Difference ---
    # Align on the union of both indexes; days missing on one side become 0 and
    # are then excluded from the comparison, as are days whose daily sum is 0.
    ref_aligned, mod_aligned = ref.align(mod, fill_value=0)
    difference = mod_aligned - ref_aligned
    difference[(ref_aligned == 0) | (mod_aligned == 0)] = float('nan')

    # --- Threshold ---
    mask = difference.abs() > DIFF_THRESHOLD
    datetime_index_to_record = difference.loc[mask].index

    ## list of datetimes to delete
    print(f"DatetimeIndex where absolute difference > {DIFF_THRESHOLD} for site_id {site_id}:")
    print(datetime_index_to_record)

    # Blank exactly the days reported above. The previous code used the signed
    # difference here, so large negative deviations were reported but kept.
    # The mask lives on the aligned (union) index, so bring it back onto ref's.
    ref[mask.reindex(ref.index, fill_value=False)] = float('nan')
    deleted_values = mask.sum()
    print(f"number of absolute difference > {DIFF_THRESHOLD} for site_id {site_id}: {deleted_values}")

    # --- Periods to drop ---
    # Only the periods registered for THIS site; previously every site's
    # periods were applied to every site's series.
    for start, end in period_dict.get(site_id, []):
        ref.loc[start:end] = float('nan')

    ref.to_csv(os.path.join(result_path, f'{site_id}_difference_removed.csv'))

start energydata_1
DatetimeIndex where absolute difference > 1300 for site_id energydata_1:
DatetimeIndex([], dtype='datetime64[ns, UTC]', name='time', freq='D')
number of absolute difference > 1300 for site_id energydata_1: 0
start energydata_2
DatetimeIndex where absolute difference > 1300 for site_id energydata_2:
DatetimeIndex(['2022-05-22 05:43:00+00:00'], dtype='datetime64[ns, UTC]', name='time', freq='D')
number of absolute difference > 1300 for site_id energydata_2: 1
start energydata_3
DatetimeIndex where absolute difference > 1300 for site_id energydata_3:
DatetimeIndex(['2021-11-26 05:37:00+00:00', '2021-12-02 05:37:00+00:00',
               '2021-12-31 05:37:00+00:00', '2022-01-28 05:37:00+00:00',
               '2022-02-04 05:37:00+00:00', '2022-02-12 05:37:00+00:00',
               '2022-02-13 05:37:00+00:00', '2022-02-23 05:37:00+00:00',
               '2022-03-04 05:37:00+00:00'],
              dtype='datetime64[ns, UTC]', name='time', freq=None)
number of absolute diff