# Demonstration of the v3 `PlantData` class

In [1]:
import json
from copy import deepcopy
from pprint import pprint
from pathlib import Path

import yaml
import numpy as np
import pandas as pd
from openoa import PlantData
from openoa.plant import PlantMetaData  # NOTE(review): confirm import path for the installed OpenOA version

import project_ENGIE

In [2]:
# Load the prepared ENGIE data sets as separate objects, one per data category
(
    scada_df,
    meter_df,
    curtail_df,
    asset_df,
    reanalysis_dict,
) = project_ENGIE.prepare(return_value="dataframes")

INFO:root:Loading SCADA data
INFO:root:SCADA data loaded
INFO:root:Timestamp conversion to datetime and UTC
INFO:root:Removing out of range of temperature readings
INFO:root:Flagging unresponsive sensors
INFO:root:Converting pitch to the range [-180, 180]
INFO:root:Calculating energy production
INFO:root:Reading in the meter data
INFO:root:Reading in the curtailment data
INFO:root:Reading in the reanalysis data and calculating the extra fields
INFO:root:Reading in the asset data


In [10]:
# Read the plant metadata (column-name mappings) from YAML and pretty-print it
meta_path = Path("data/plant_meta.yml")
meta = yaml.safe_load(meta_path.read_text())
pprint(meta)

{'asset': {'elevation': 'elevation_m',
           'hub_height': 'Hub_height_m',
           'id': 'Wind_turbine_name',
           'latitude': 'Latitude',
           'longitude': 'Longitude',
           'rated_power': 'Rated_power',
           'rotor_diameter': 'Rotor_diameter_m'},
 'curtail': {'availability': 'availability_kwh',
             'curtailment': 'curtailment_kwh',
             'frequency': '10T',
             'net_energy': 'net_energy_kwh',
             'time': 'time'},
 'latitude': 48.4497,
 'longitude': 5.5896,
 'meter': {'energy': 'net_energy_kwh', 'time': 'time'},
 'reanalysis': {'era5': {'frequency': 'H',
                         'surface_pressure': 'surf_pres',
                         'temperature': 't_2m',
                         'time': 'datetime',
                         'windspeed_u': 'u_100',
                         'windspeed_v': 'v_100'},
                'merra2': {'frequency': 'H',
                           'surface_pressure': 'surface_pressure',
          

In [3]:
# Build the plant object from the in-memory data and the YAML metadata file.
# `analysis_type=None`: per the notes later in this notebook, `None` runs no validation,
# so construction is not expected to fail on missing columns.
engie = PlantData(
    analysis_type=None,
    metadata="data/plant_meta.yml",
    scada=scada_df,
    meter=meter_df,
    curtail=curtail_df,
    asset=asset_df,
    reanalysis=reanalysis_dict
)

In [4]:
# Require the data for every analysis type, then revalidate.
engie.analysis_type = "all"

# Validation raises a ValueError listing every missing/invalid column for this
# data set. The error is the point of the demonstration, so catch it and print
# the message instead of halting a "Restart & Run All".
try:
    engie.validate()
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: `scada` data is missing the following columns: ['status']
`meter` data is missing the following columns: ['power']
`tower` data is missing the following columns: ['time', 'id']
`status` data is missing the following columns: ['time', 'id', 'status_id', 'status_code', 'status_text']
`scada` data columns were of the wrong type: ['status']
`meter` data columns were of the wrong type: ['power']
`tower` data columns were of the wrong type: ['time', 'id']
`status` data columns were of the wrong type: ['time', 'id', 'status_id', 'status_code', 'status_text']
`scada` data is of the wrong frequency: None

In [12]:
# Preview the first rows of the SCADA data held by the plant object
engie.scada.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Wind_turbine_name,Date_time,Ba_avg,P_avg,Ws_avg,Va_avg,Ot_avg,Ya_avg,Wa_avg,time,energy_kwh
time,id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2014-01-01 00:00:00,R80736,R80736,2014-01-01T01:00:00+01:00,-1.0,642.78003,7.12,0.66,4.69,181.34,182.00999,2014-01-01 00:00:00,107.130005
2014-01-01 00:00:00,R80721,R80721,2014-01-01T01:00:00+01:00,-1.01,441.06,6.39,-2.48,4.94,179.82001,177.36,2014-01-01 00:00:00,73.51
2014-01-01 00:00:00,R80790,R80790,2014-01-01T01:00:00+01:00,-0.96,658.53003,7.11,1.07,4.55,172.39,173.50999,2014-01-01 00:00:00,109.755005
2014-01-01 00:00:00,R80711,R80711,2014-01-01T01:00:00+01:00,-0.93,514.23999,6.87,6.95,4.3,172.77,179.72,2014-01-01 00:00:00,85.706665
2014-01-01 00:10:00,R80790,R80790,2014-01-01T01:10:00+01:00,-0.96,640.23999,7.01,-1.9,4.68,172.39,170.46001,2014-01-01 00:10:00,106.706665


## Load the data and create file mappings for later use

In [None]:
# File locations for each La Haute Borne data set (meter and curtailment share one file)
fpath = Path("data/la_haute_borne")
fn_scada = fpath / "la-haute-borne-data-2014-2015.csv"
fn_meter = fpath / "plant_data.csv"
fn_curtail = fpath / "plant_data.csv"
fn_reanalysis_merra2 = fpath / "merra2_la_haute_borne.csv"
fn_reanalysis_era5 = fpath / "era5_wind_la_haute_borne.csv"
fn_asset = fpath / "la-haute-borne_asset_table.csv"

yaml_meta = "data/plant_meta.yml"

# Load everything straight from file. `analysis_type=None` is used so that, per the
# notes in this notebook, no validation is run while loading.
reanalysis_files = {"era5": fn_reanalysis_era5, "merra2": fn_reanalysis_merra2}
project = PlantData(
    analysis_type=None,
    metadata=yaml_meta,
    scada=fn_scada,
    meter=fn_meter,
    curtail=fn_curtail,
    asset=fn_asset,
    reanalysis=reanalysis_files,
)

# The asset table has no "type" column, so create it here
project.asset["type"] = "turbine"

In [None]:
# File locations for each La Haute Borne data set (meter and curtailment share one file)
fpath = Path("data/la_haute_borne")
fn_scada = fpath / "la-haute-borne-data-2014-2015.csv"
fn_meter = fpath / "plant_data.csv"
fn_curtail = fpath / "plant_data.csv"
fn_reanalysis_merra2 = fpath / "merra2_la_haute_borne.csv"
fn_reanalysis_era5 = fpath / "era5_wind_la_haute_borne.csv"
fn_asset = fpath / "la-haute-borne_asset_table.csv"

# Read every CSV into its own DataFrame, in the same order as the file list
scada, meter, curtail, reanalysis_era5, reanalysis_merra2, asset = (
    pd.read_csv(csv_file)
    for csv_file in (
        fn_scada,
        fn_meter,
        fn_curtail,
        fn_reanalysis_era5,
        fn_reanalysis_merra2,
        fn_asset,
    )
)

# Plant-level coordinates
latitude, longitude = 48.4497, 5.5896

# Metadata files that are (re)written from the dictionary defined later on
yaml_meta = "data/plant_meta.yml"
json_meta = "data/plant_meta.json"

## TODO
 - [x] read data from spark, csv, pandas
 - [x] read metadata from json, yaml, dict, and pre-loaded object
 - [x] automatically calculate wind direction from u/v windspeed
 - [x] call planetos api if API key is provided
   - [x] validate this works
 - [x] support flags for if csv/planetos/data object/etc
 - datetime column frequency checks
    - [ ] check against the provided metadata
    - [ ] validate against the analysis requirements
    - **note**: bring Lewis into this conversation on datetime & frequency validation, but is ok to use pandas for now
 - [x] expand metadata to contain plant-level identifiers (latitude, longitude)
 - check against the -25 namings and (likely) adopt that naming convention for the plant data
   - [ ] update internal column naming convention to the -25 schema (Eric/Lewis)
 - [x] map the input column names, and provide a method to provide them back as the original inputs
 - [x] get the 0 notebook working, or at least as a means to understand what will be required for refactoring
 - [x] no failures for tower data as it's not used
 - [x] none flag for raising warning, not error, for missing/bad data
   - `None` will run no validation
 - [ ] flag to not raise an error for known missing data
 - [x] metadata keyword argument for validate() to recreate `PlantMetaData`
     - allows for more flexibility in use cases, especially in the exploratory phase, or for changing analysis types
 - [ ] review the v3 todo workbook to stay on track with the rest of v3 development


## Create a dictionary of plant meta data 

**NOTE**: the datetime frequency checking is not in place, but the placeholder exists to implement it later

In [None]:
# Mapping of the internal column names (keys) to the column names found in the
# raw data (values), plus plant-level identifiers and per-data-set frequencies.
plant_meta = dict(
    latitude=latitude,
    longitude=longitude,
    scada=dict(
        time="Date_time",
        id="Wind_turbine_name",
        power="P_avg",
        windspeed="Ws_avg",
        # wtur_wspd="Ws_avg",  # TODO: adopt the -25 naming
        wind_direction="Wa_avg",
        # status="?",
        pitch="Ba_avg",
        temperature="Ot_avg",
        frequency="10T",
    ),
    meter=dict(
        time="time_utc",
        energy="net_energy_kwh",
    ),
    curtail=dict(
        time="time_utc",
        curtailment="curtailment_kwh",
        availability="availability_kwh",
        net_energy="net_energy_kwh",
        frequency="10T",
    ),
    reanalysis=dict(  # keys are informational/product-type, not pre-defined
        era5=dict(
            time="datetime",
            # windspeed="ws_100m",  # Commented out to demonstrate variable creation from base windspeed data
            windspeed_u="u_100",
            windspeed_v="v_100",
            temperature="t_2m",
            # density="dens_100m",  # Commented out to demonstrate variable creation from base windspeed data
            surface_pressure="surf_pres",
            frequency="H",
        ),
        merra2=dict(
            time="datetime",
            # windspeed="ws_50",  # Commented out to demonstrate variable creation from base windspeed data
            windspeed_u="u_50",
            windspeed_v="v_50",
            temperature="temp_10m",
            # density="dens_50",  # Commented out to demonstrate variable creation from base windspeed data
            surface_pressure="surface_pressure",
            frequency="H",
        ),
    ),
    asset=dict(
        id="id",
        latitude="Latitude",
        longitude="Longitude",
        rated_power="Rated_power",
        hub_height="Hub_height_m",
        rotor_diameter="Rotor_diameter_m",
        elevation="elevation_m",
        # type="?",
    ),
)

# Recreate the YAML and JSON metadata files so they stay in sync with the
# dictionary above (`yaml` and `json` are imported in the notebook's import cell).
with open(yaml_meta, "w") as f:
    yaml.safe_dump(plant_meta, f, default_flow_style=False)

with open(json_meta, "w") as f:
    json.dump(plant_meta, f, indent=4)

## Demonstrate that loading from YAML, JSON, and a dictionary produces the exact same metadata

In [None]:
# Build the metadata object from each supported source
meta_from_dict = PlantMetaData.from_dict(plant_meta)
meta_from_json = PlantMetaData.from_json(json_meta)
meta_from_yaml = PlantMetaData.from_yaml(yaml_meta)

# Display whether all three are equal, alongside the resulting object type
metadata_matches = meta_from_dict == meta_from_json == meta_from_yaml
metadata_matches, type(meta_from_dict)

## Show the PlantData capabilities

### Load from `DataFrame`s and a metadata dictionary

In [None]:
# Build the plant object from in-memory DataFrames and a pre-loaded metadata object.
# Uses `PlantData`, the class imported at the top of this notebook.
plant_from_data = PlantData(
    metadata=meta_from_dict,
    scada=scada,
    meter=meter,
    curtail=curtail,
    reanalysis={"merra2": reanalysis_merra2, "era5": reanalysis_era5},  # preferred, and enable API pulling
    asset=asset,
    analysis_type="MonteCarloAEP",
)
type(plant_from_data)

### Show that "windspeed", "wind_direction", and "density" columns are all created from the core variables

In [None]:
# Preview the first rows of the ERA5 reanalysis data held by the plant object
plant_from_data.reanalysis["era5"].head()

In [None]:
# Preview the first rows of the MERRA-2 reanalysis data held by the plant object
plant_from_data.reanalysis["merra2"].head()

### Show loading the data from file for both the meta data (JSON and YAML) and data (CSV)

In [None]:
# Build the plant object from CSV file paths with the YAML metadata file.
# Uses `PlantData`, the class imported at the top of this notebook.
plant_from_file1 = PlantData(
    metadata=yaml_meta,
    scada=fn_scada,
    meter=fn_meter,
    curtail=fn_curtail,
    reanalysis={"merra2": fn_reanalysis_merra2, "era5": fn_reanalysis_era5},  # preferred, and enable API pulling
    asset=fn_asset,
    analysis_type="MonteCarloAEP",
)
type(plant_from_file1)

In [None]:
# Build the plant object from CSV file paths with the JSON metadata file.
# Uses `PlantData`, the class imported at the top of this notebook.
plant_from_file2 = PlantData(
    metadata=json_meta,
    scada=fn_scada,
    meter=fn_meter,
    curtail=fn_curtail,
    reanalysis={"merra2": fn_reanalysis_merra2, "era5": fn_reanalysis_era5},  # preferred, and enable API pulling
    asset=fn_asset,
    analysis_type="MonteCarloAEP",
)
type(plant_from_file2)

### When updating the `analysis_type` to "all", note all the column data errors that are saved until the end of the validation

In [None]:
# Request validation for every analysis type at construction time.
# Uses `PlantData`, the class imported at the top of this notebook.
# NOTE(review): construction is expected to fail with the collected column errors
# (assumed to be a ValueError, as raised by `validate()` earlier in this notebook);
# the message is printed so the notebook keeps running. If it fails,
# `plant_from_data` keeps the object created earlier.
try:
    plant_from_data = PlantData(
        metadata=meta_from_dict,
        scada=scada,
        meter=meter,
        curtail=curtail,
        reanalysis={"merra2": reanalysis_merra2, "era5": reanalysis_era5},  # preferred, and enable API pulling
        asset=asset,
        analysis_type="all",
    )
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

### Demonstrate changing a parameter (`analysis_type`) and revalidating with `PlantData.validate()`

In [None]:
# Work on a deep copy so the revalidation cells below do not modify `plant_from_data`
plant = deepcopy(plant_from_data)

In [None]:
# Per the TODO notes in this notebook, an `analysis_type` of `None` runs no validation
plant.analysis_type = None
plant.validate()

In [None]:
# Require the data for every analysis type, then revalidate.
plant.analysis_type = "all"

# NOTE(review): this is expected to fail in the same way as the "all" validation
# near the top of this notebook (ValueError listing the column problems); print
# the message rather than halting a "Restart & Run All".
try:
    plant.validate()
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

In [None]:
# Switch to the "TurbineLongTermGrossEnergy" analysis type, then revalidate
plant.analysis_type = "TurbineLongTermGrossEnergy"
plant.validate()

In [None]:
# Switch to the "ElectricalLosses" analysis type, then revalidate
plant.analysis_type = "ElectricalLosses"
plant.validate()

#### Direct copy of the analysis requirements for easy reference

In [None]:
# Reference copy of the per-analysis data requirements. For each analysis type,
# every listed data set gives its required "columns" and, where specified, the
# accepted "freq" codes; "conditional_columns" lists extra columns keyed by option name.
ANALYSIS_REQUIREMENTS = dict(
    MonteCarloAEP=dict(
        meter=dict(
            columns=["energy"],
            freq=("MS", "D", "H", "T"),
        ),
        curtail=dict(
            columns=["availability", "curtailment"],
            freq=("MS", "D", "H", "T"),
        ),
        reanalysis=dict(
            columns=["windspeed", "rho"],
            conditional_columns=dict(
                reg_temperature=["temperature"],
                reg_winddirection=["windspeed_u", "windspeed_v"],
            ),
        ),
    ),
    TurbineLongTermGrossEnergy=dict(
        scada=dict(
            columns=["id", "windspeed", "power"],  # TODO: wtur_W_avg vs energy_kwh ?
            freq=("D", "H", "T"),
        ),
        reanalysis=dict(
            columns=["windspeed", "wind_direction", "rho"],
        ),
    ),
    ElectricalLosses=dict(
        scada=dict(
            columns=["energy"],
            freq=("D", "H", "T"),
        ),
        meter=dict(
            columns=["energy"],
            freq=("MS", "D", "H", "T"),
        ),
    ),
)

### Show the updated column names and how to map them back to the original data

In [None]:
# Column names of the raw SCADA DataFrame as read from CSV
scada.columns.tolist()

In [None]:
# Column names of the SCADA data as held by the plant object
plant.scada.columns.tolist()

In [None]:
# Ask the plant object to switch its column names back to the original inputs
# (`to_original=True`), then list the SCADA columns again for comparison
plant.update_column_names(to_original=True)
plant.scada.columns.tolist()

## Demonstrate the PlanetOS integration

In [None]:
# Location of the file holding the PlanetOS API key (kept out of the notebook itself)
apikey_file = Path("./APIKEY").resolve()

# Column mapping used for the reanalysis data in this PlanetOS demonstration;
# the same mapping applies to both reanalysis products.
planetos_columns = dict(
    time="datetime",
    windspeed="windspeed_ms",
    wind_direction="winddirection_deg",
    windspeed_u="u_ms",
    windspeed_v="v_ms",
    temperature="temperature_K",
    density="rho_kgm-3",
    surface_pressure="surf_pres_Pa",
    frequency="H",
)

# Copy the base metadata and swap in the PlanetOS column mapping for each product
# (each product gets its own dict so later edits to one do not affect the other).
plant_meta_planetos = deepcopy(plant_meta)
for product in ("era5", "merra2"):
    plant_meta_planetos["reanalysis"][product] = dict(planetos_columns)

# Use the PlanetOS-specific metadata so the column mapping above is applied to the
# reanalysis data. `PlantData` is the class imported at the top of this notebook.
plant_from_data = PlantData(
    metadata=plant_meta_planetos,
    scada=scada,
    meter=meter,
    curtail=curtail,
    reanalysis={
        "merra2": {"apikey_file": apikey_file, "save_pathname": ".", "save_filename": "merra2"},
        "era5": {"apikey_file": apikey_file, "save_pathname": ".", "save_filename": "era5"},
    },
    asset=asset,
    analysis_type="all",
)