In [1]:
from pathlib import Path
import pandas as pd
import os
import numpy as np

from eee.resource.validation.database import ValidationDatabase
from eee.resource.validation.models.site import Site, SitesInventory
from eee.ontology.variables import Variable
from eee.resource.validation import REGISTRY
from eee.resource.validation.models.timeseries import TimeseriesVariable,TimeseriesDataset, TimeseriesVariableStandardizer

In [2]:
# Root folder of this ticket's working files, located under the current user's home directory.
TICKET_DIR = Path.home().joinpath(
    "OneDrive-3E/Research/Solar/tickets/2023/IN2833_centralize_measurement_data_collection/IN2835_BSRN"
)

In [3]:
# Per-stage working folders below the ticket directory; os.path.join yields plain strings.
raw_data_dir, intermediate_data_dir, parsed_data_dir = (
    os.path.join(TICKET_DIR, stage_folder)
    for stage_folder in ("raw_data", "intermediate_data", "parsed_data")
)

In [4]:
# Location of the validation database, under the current user's home directory.
validation_database_path = Path.home().joinpath(
    "OneDrive-3E/Research/Solar/validation/validation_database_lhn"
)

In [5]:
# Name-to-name mapping for the three variables handled here; every key currently maps to itself.
variables_match = {variable_key: variable_key for variable_key in ("ghi", "dhi", "dni")}

## Get the list of stations already pushed

In [6]:
# Instantiate the validation database from `validation_database_path`.
# `db` is the handle the later cells use to list existing sites and to push new ones.
db = ValidationDatabase(validation_database_path)

In [7]:
# Sites returned by the database when no filter is passed; used to find out
# which station names have already been pushed.
db_stations = db.get_sites()

In [8]:
# Distinct values of the `name` column of the sites dataframe, displayed for inspection.
db_stations_name = db_stations.df.name.unique()
db_stations_name

array([], dtype=object)

In [9]:
# Disabled maintenance snippet: look up one already-pushed site by name.
# NOTE(review): presumably uncommented by hand together with the remove_site cell
# below when a single station has to be pushed again — confirm before use.
# site_name = "kwajalein"
# site_id = db.get_sites(names=site_name).site_ids[0]
# site = db_stations.get_site(site_id)
# site

In [10]:
# Disabled maintenance snippet: needs a `site` object to exist before it is uncommented.
# NOTE(review): presumably deletes that site from the sites inventory — confirm before use.
# db_stations.remove_site(site)

## Read the inventory and remove already-pushed sites

In [11]:
# Load the inventory CSV stored in the parsed-data folder and show its row count.
inventory_csv_path = os.path.join(parsed_data_dir, "inventory.csv")
inventory = pd.read_csv(inventory_csv_path)
len(inventory)

150

In [12]:
# Keep only the inventory rows whose station name is not yet among the database's site names.
is_already_pushed = inventory["name"].isin(db_stations_name)
inventory = inventory.loc[~is_already_pushed]
len(inventory)

150

## Push sites and their measurement timeseries to the validation database

In [13]:
# Display the inventory's column labels (site fields, `variable_*` fields and `timeseries_path`).
inventory.columns

Index(['name', 'domain', 'latitude', 'longitude', 'source', 'classification',
       'device_type', 'pyrnanometer_type', 'variable_name',
       'variable_physical_parameter_id', 'variable_units',
       'variable_time_granularity', 'variable_start', 'variable_end',
       'variable_temporal_aggregation_method',
       'variable_temporal_aggregation_period',
       'variable_temporal_aggregation_convention',
       'variable_data_availability_percent', 'timeseries_path', 'institution',
       'description', 'references'],
      dtype='object')

In [14]:
# The inventory stores absolute timeseries paths under another user's home directory
# (presumably the machine the inventory was generated on). Re-root them onto the
# local parsed-data folder so the files can be opened here.
old_path_prefix = '/home/pmalcorps/OneDrive-3E/RD_ResourceData/Research/Solar/tickets/2023/IN2833_centralize_measurement_data_collection/IN2835_BSRN/parsed_data/'
# Derived from `parsed_data_dir` instead of a hard-coded home directory, so the cell
# works for any user; joining with "" appends the trailing separator that
# `old_path_prefix` also carries.
new_path_prefix = os.path.join(parsed_data_dir, "")

# `assign` builds a new frame rather than writing a column into the filtered
# inventory (which triggers pandas' chained-assignment warning). `regex=False`
# makes this a literal substring replacement, like `str.replace`.
inventory = inventory.assign(
    timeseries_path=inventory["timeseries_path"].str.replace(
        old_path_prefix, new_path_prefix, regex=False
    )
)


In [15]:
# Distinct timeseries file paths referenced by the inventory, in order of first appearance.
list_files = pd.unique(inventory["timeseries_path"])
len(list_files)

50

In [16]:
# Debug aid (disabled): uncomment to restrict the push below to the first three files.
# list_files = list_files[:3]
# len(list_files)

In [17]:
# Push every station of the inventory to the validation database.
# Each distinct timeseries file is handled as one station; every inventory row that
# points at that file describes one variable, stored as a column of the file.
#
# NOTE(review): the site is registered before its timeseries are written. If
# `add_measurement_timeseries` fails, the site name is already in the database and
# the "remove already pushed sites" filter will skip it on the next run — check
# such stations by hand.
for file in list_files:
    inventory_station = inventory[inventory["timeseries_path"] == file]
    # Site-level fields are read from the first of the station's rows.
    station_row = inventory_station.iloc[0]
    station = station_row["name"]
    print(f"{station} push start")

    # The first CSV column becomes the index and is converted to a DatetimeIndex.
    df_timeseries = pd.read_csv(file, index_col=0)
    df_timeseries.index = pd.DatetimeIndex(df_timeseries.index)

    list_of_variables = []
    list_of_standardized_ts_variable = []

    for index, row in inventory_station.iterrows():
        # Use the inventory's own aggregation-period column. Fall back to the time
        # granularity when the period is absent or empty, so rows without a period
        # keep the value they were given before.
        aggregation_period = row.get("variable_temporal_aggregation_period")
        if pd.isna(aggregation_period):
            aggregation_period = row.variable_time_granularity

        variable = Variable(
            name=row.variable_name,
            physical_parameter_id=row.variable_physical_parameter_id,
            units=row.variable_units,
            time_granularity=row.variable_time_granularity,
            start=row.variable_start,
            end=row.variable_end,
            temporal_aggregation_method=row.variable_temporal_aggregation_method,
            temporal_aggregation_period=aggregation_period,
            temporal_aggregation_timestamp_convention=row.variable_temporal_aggregation_convention,
            data_availability_percent=row.variable_data_availability_percent,
        )

        # The timeseries column is selected by the variable's name.
        var_timeseries = df_timeseries.loc[:, row.variable_name]
        ts_variable = TimeseriesVariable(variable=variable, data=var_timeseries)

        list_of_variables.append(variable)
        list_of_standardized_ts_variable.append(ts_variable)

    site = Site(
        name=station,
        domain=station_row["domain"],
        latitude=station_row["latitude"],
        longitude=station_row["longitude"],
        variables=list_of_variables,
        source=REGISTRY.datasources(id=station_row["source"]),
        classification=station_row["classification"],
        device_type=station_row["device_type"],
    )
    ts_dataset = TimeseriesDataset(timeseries_variables=list_of_standardized_ts_variable)

    db.add_site(site)
    db.add_measurement_timeseries(site, ts_dataset)
    print(f"{station} push completed")

abashiri push start
abashiri push completed
alert push start
alert push completed
alice_springs push start
alice_springs push completed
barrow push start
barrow push completed
bermuda push start
bermuda push completed
billings push start
billings push completed
bondville push start
bondville push completed
boulder1 push start
boulder1 push completed
boulder push start
boulder push completed
budapest_lorinc push start
budapest_lorinc push completed
cabauw push start
cabauw push completed
camborne push start
camborne push completed
carpentras push start
carpentras push completed
chesapeake_light push start
chesapeake_light push completed
concordia_station_dome_c push start
concordia_station_dome_c push completed
darwin push start
darwin push completed
desert_rock push start
desert_rock push completed
de_aar push start
de_aar push completed
eastern_north_atlantic push start
eastern_north_atlantic push completed
fort_peck push start
fort_peck push completed
fukuoka push start
fukuoka push 