In [1]:
import os
from pathlib import Path
import pandas as pd
import yaml
import timezonefinder


import sys
from pathlib import Path
sys.path.insert(0,str(Path(os.path.abspath('.')).parent.parent))
import library.validation.parsing as parsing
import library.validation.accuracy.comparison_new_measurement as comparison
from eee.resource.validation.database import ValidationDatabase
import eee.solar.model.util

In [2]:
# Every location used by this notebook sits under the current user's home directory.
home_dir = Path.home()

# Directory handed to ValidationDatabase.
validation_database_path = home_dir / "OneDrive-3E/Research/Solar/validation/validation_database"
# Parent directory under which one sub-directory of plots/CSVs is created per site.
plot_path = home_dir / "OneDrive-3E/Research/Solar/tickets/2024/IN3475_check_measurement_data/comparison_plots"
# YAML file listing the site identifiers to process.
yaml_path = home_dir / "OneDrive-3E/Research/Solar/tickets/2024/IN3043_accuracy_factors_analysis/IN3471_get_model_data/db_sites_ready.yml"
# Directory containing the station metadata table (data.csv).
meta_path = home_dir / "OneDrive-3E/Research/Solar/tickets/2023/IN2946_EnergyData_measurement_data/metadata"

In [3]:
# Load the station metadata table and append a 'normalized_station_name'
# column, which the per-site loop compares against each site's name.
metadata = pd.read_csv(os.path.join(meta_path, 'data.csv')).assign(
    normalized_station_name=lambda frame: frame['station name'].apply(
        parsing.normalize_station_name
    )
)


In [4]:
# Instantiate the validation database from the directory configured above.
# `db` is the handle used by the following cells to list sites and to load
# the measurement and simulation time series.
db = ValidationDatabase(validation_database_path)

In [5]:
# Fetch the inventory of sites known to the validation database; individual
# sites are later looked up in it by identifier.
sites_inv = db.get_sites()
# Bare expression: shows the inventory's repr as the cell output.
sites_inv

SitesInventory(n=291)

In [6]:
# Read the collection of site identifiers from the YAML file.
with open(yaml_path) as site_list_file:
    sites_name = yaml.safe_load(site_list_file)

# Keep only the identifiers that start with the 'energydata_' prefix.
filtered_sites_name = list(
    filter(lambda name: name.startswith('energydata_'), sites_name)
)


In [7]:
# For every selected site: load the measured GHI, put it on the site's local
# time zone, aggregate it to 15-minute and daily sums, load the simulated
# series, and write the comparison plots and difference tables to a per-site
# directory under `plot_path`.
#
# Sites are skipped (with a message) when no metadata row, no time zone, or no
# valid measurement data is available, so one bad site cannot crash the run or
# silently reuse the previous site's time zone.

# Building a TimezoneFinder is expensive and does not depend on the site, so
# create it once instead of once per iteration.
tf = timezonefinder.TimezoneFinder()

for site_id in filtered_sites_name:
    site = sites_inv.get_site(site_id)

    # --- Time zone lookup -------------------------------------------------
    # Done first so that sites without usable metadata are skipped before any
    # time series is loaded.
    metadata_row = metadata[metadata['normalized_station_name'] == site.name]
    if metadata_row.empty:
        print(f"No metadata found for {site_id}. Skipping this dataframe.")
        continue

    # Determine the local time zone from latitude and longitude.
    timezone_str = tf.certain_timezone_at(
        lat=metadata_row["latitude"].iloc[0],
        lng=metadata_row["longitude"].iloc[0],
    )
    if timezone_str is None:
        # certain_timezone_at() returns None when the point lies in no zone;
        # without a zone the index cannot be localized or converted below.
        print(f"No time zone found for {site_id}. Skipping this dataframe.")
        continue

    # --- Measurement data -------------------------------------------------
    ts = db.get_measurement_timeseries(
        site=site,
        validation_variables=["ghi"],
    )
    df = ts.to_dataframe()

    # Convert "ire" to "irn" with the project helper.
    # NOTE(review): presumably irradiance -> irradiation; confirm against
    # eee.solar.model.util.ire_to_irn.
    df_converted = eee.solar.model.util.ire_to_irn(df)

    # --- Time shift -------------------------------------------------------
    if df_converted.index.tzinfo is None:
        # Naive timestamps: interpret them as local time of the site.
        # Ambiguous DST timestamps become NaT; non-existent ones are shifted
        # forward to the next valid instant.
        df_converted.index = pd.DatetimeIndex(df_converted.index).tz_localize(
            tz=timezone_str,
            ambiguous="NaT",
            nonexistent='shift_forward',
        )
    else:
        # Already timezone-aware: express the same instants in local time.
        df_converted.index = df_converted.index.tz_convert(tz=timezone_str)

    # --- Start / end day --------------------------------------------------
    first_valid = df_converted.first_valid_index()
    last_valid = df_converted.last_valid_index()
    if first_valid is None or last_valid is None:
        # Both return None when the frame holds no valid value at all.
        print(f"No valid measurement data for {site_id}. Skipping this dataframe.")
        continue
    # Local midnight of the first / last day that carries valid data.
    start_time = first_valid.normalize()
    end_time = last_valid.normalize()

    # --- Resampling -------------------------------------------------------
    # Bins are anchored at the local midnight of the first valid day.
    # '15min' is used instead of the deprecated '15T' alias.
    df_converted_resampled_15min = df_converted.resample(
        '15min', origin=start_time, label='left', closed='left'
    ).sum()
    df_converted_resampled_day = df_converted.resample(
        '1D', origin=start_time, label='left', closed='left'
    ).sum()

    # --- Simulation data --------------------------------------------------
    ts_sim = db.get_simulation_timeseries(
        site=site,
        validation_variables=["ghirrn"],
        model="solar-resource-api-1.6.12",
    )
    # Only keep the simulation from the first measured day onwards.
    df_sim = ts_sim.to_dataframe().loc[start_time:]

    # 15-minute granularity: first column of each frame, on the local zone.
    ref_15m = df_converted_resampled_15min.iloc[:, 0]
    mod_15m = df_sim.iloc[:, 0]
    ref_15m.index = ref_15m.index.tz_convert(timezone_str)
    mod_15m.index = mod_15m.index.tz_convert(timezone_str)

    # 1-day granularity (for the difference check).
    ref_1d = df_converted_resampled_day.iloc[:, 0]
    ref_1d.index = ref_1d.index.tz_convert(timezone_str)
    mod_1d = mod_15m.resample('1D', origin=start_time, label='left', closed='left').sum()

    # --- Outputs ----------------------------------------------------------
    site_directory = os.path.join(plot_path, site_id)
    os.makedirs(site_directory, exist_ok=True)

    print(f'start {site_id}')

    # Shift check around the first and the last day of measurement data.
    comparison.shift_check_start_end(site_id, ref_15m, mod_15m, start_time, site_directory, start=True)
    comparison.shift_check_start_end(site_id, ref_15m, mod_15m, end_time, site_directory)

    # Interactive plots of daily reference vs. model values.
    comparison.ref_mod_interactive(site_id, ref_1d, mod_1d, site_directory)

    # Tables of the top differences between daily reference and model values.
    merged_as, merged_de = comparison.diff_ratio_merged(site_id, ref_1d, mod_1d)
    merged_as.to_csv(os.path.join(site_directory, 'top_negative_differences.csv'))
    merged_de.to_csv(os.path.join(site_directory, 'top_positive_differences.csv'))

start energydata_1
start energydata_2
start energydata_3
start energydata_4
start energydata_5
start energydata_6
start energydata_7
start energydata_8
start energydata_9
start energydata_10
start energydata_11
start energydata_12
start energydata_13
start energydata_14
start energydata_15
start energydata_16
start energydata_17
start energydata_18
start energydata_19
start energydata_20
start energydata_21
start energydata_22
start energydata_23
start energydata_24
start energydata_25
start energydata_26
start energydata_27
start energydata_28
start energydata_29
start energydata_30
start energydata_31
start energydata_32
start energydata_33
start energydata_34
start energydata_35
start energydata_36
start energydata_37
start energydata_38
start energydata_39
start energydata_40
start energydata_41
start energydata_42
start energydata_43
start energydata_44
start energydata_45
start energydata_46
start energydata_47
start energydata_48
start energydata_49
start energydata_50
start ene