# Demonstration of the v3 `PlantData` class

In [1]:
from copy import deepcopy
from pprint import pprint
from pathlib import Path

import numpy as np
import pandas as pd

from openoa import PlantData

In [63]:
import itertools
from openoa.types.plant_v2 import ANALYSIS_REQUIREMENTS

In [65]:
analysis_types = ["MonteCarloAEP", "TurbineLongTermGrossEnergy"]
categories = ("scada", "meter", "tower", "curtail", "reanalysis", "asset")
requirements = {key: ANALYSIS_REQUIREMENTS[key] for key in analysis_types}

frequency_requirements = {key: {name: value["freq"] for name, value in values.items()} for key, values in requirements.items()}
frequency = {k: [] for k in set(itertools.chain.from_iterable([[*val] for val in frequency_requirements.values()]))}

for vals in frequency_requirements.values():
    for name, req in vals.items():
        reqs = frequency[name]
        if reqs == []:
            frequency[name] = set(req)
        else:
            frequency[name] = reqs.intersection(req)
            
frequency

{'scada': {'D', 'H', 'L', 'N', 'S', 'T', 'U', 'min', 'ms', 'us'},
 'meter': {'D',
  'H',
  'L',
  'M',
  'MS',
  'N',
  'S',
  'T',
  'U',
  'W',
  'min',
  'ms',
  'us'},
 'reanalysis': {'D', 'H', 'L', 'N', 'S', 'T', 'U', 'min', 'ms', 'us'},
 'curtail': {'D',
  'H',
  'L',
  'M',
  'MS',
  'N',
  'S',
  'T',
  'U',
  'W',
  'min',
  'ms',
  'us'}}

In [69]:
def check_freq(actual_freq, desired_freq, exact):
    if exact:
        return actual_freq != desired_freq
    
    if desired_freq is None:
        return True

    actual_freq = "".join(filter(str.isalpha, actual_freq))
    return actual_freq in desired_freq

check_freq("10T", frequency["scada"], exact=False)

True

In [11]:
# TODO
# create a frequency based comparison for hour, day, month categories
# be able to prioritize how one is chosen: most/least restrictive
# pass the existing frequency to the _validate_frequency() outputs

In [93]:
plant.scada.set_index("time").index.freq

In [109]:
reanalysis_era5.head()

Unnamed: 0.1,Unnamed: 0,datetime,u_100,v_100,t_2m,surf_pres,ws_100m,dens_100m,windspeed,wind_direction,density
0,0,1999-01-01 00:00:00+00:00,-4.456234,4.999991,277.496492,97020.784132,6.697606,1.216004,6.697606,138.291024,1.216004
1,1,1999-01-01 01:00:00+00:00,-5.006666,4.540059,277.330357,97002.61796,6.758612,1.216525,6.758612,132.201828,1.216525
2,2,1999-01-01 02:00:00+00:00,-5.101353,4.032729,276.968939,97035.490081,6.502823,1.218571,6.502823,128.327129,1.218571
3,3,1999-01-01 03:00:00+00:00,-5.787051,2.111372,276.721193,97021.649188,6.160183,1.219518,6.160183,110.044204,1.219518
4,4,1999-01-01 04:00:00+00:00,-6.349969,1.693571,276.29128,97005.213128,6.571932,1.221259,6.571932,104.93349,1.221259


In [110]:
reanalysis_era5.datetime

0        1999-01-01 00:00:00+00:00
1        1999-01-01 01:00:00+00:00
2        1999-01-01 02:00:00+00:00
3        1999-01-01 03:00:00+00:00
4        1999-01-01 04:00:00+00:00
                    ...           
187167   2020-05-08 17:00:00+00:00
187168   2020-05-08 18:00:00+00:00
187169   2020-05-08 19:00:00+00:00
187170   2020-05-08 20:00:00+00:00
187171   2020-05-08 21:00:00+00:00
Name: datetime, Length: 187172, dtype: datetime64[ns, UTC]

In [111]:
df = pd.read_csv(fn_reanalysis_era5)
df.head()

Unnamed: 0.1,Unnamed: 0,datetime,u_100,v_100,t_2m,surf_pres,ws_100m,dens_100m
0,0,1999-01-01 00:00:00,-4.456234,4.999991,277.496492,97020.784132,6.697606,1.216004
1,1,1999-01-01 01:00:00,-5.006666,4.540059,277.330357,97002.61796,6.758612,1.216525
2,2,1999-01-01 02:00:00,-5.101353,4.032729,276.968939,97035.490081,6.502823,1.218571
3,3,1999-01-01 03:00:00,-5.787051,2.111372,276.721193,97021.649188,6.160183,1.219518
4,4,1999-01-01 04:00:00,-6.349969,1.693571,276.29128,97005.213128,6.571932,1.221259


In [114]:
dt = pd.to_datetime(df.datetime)
dt

0        1999-01-01 00:00:00
1        1999-01-01 01:00:00
2        1999-01-01 02:00:00
3        1999-01-01 03:00:00
4        1999-01-01 04:00:00
                 ...        
187167   2020-05-08 17:00:00
187168   2020-05-08 18:00:00
187169   2020-05-08 19:00:00
187170   2020-05-08 20:00:00
187171   2020-05-08 21:00:00
Name: datetime, Length: 187172, dtype: datetime64[ns]

In [115]:
pd.DatetimeIndex(dt[:5], freq="H")

DatetimeIndex(['1999-01-01 00:00:00', '1999-01-01 01:00:00',
               '1999-01-01 02:00:00', '1999-01-01 03:00:00',
               '1999-01-01 04:00:00'],
              dtype='datetime64[ns]', name='datetime', freq='H')

In [117]:
asset.head()

Unnamed: 0,Wind_turbine_name,Latitude,Longitude,elevation_m,Rated_power,Hub_height_m,Rotor_diameter_m,Manufacturer,Model
0,R80711,48.4569,5.5847,411,2050.0,80,82,Senvion,MM82
1,R80721,48.4497,5.5869,411,2050.0,80,82,Senvion,MM82
2,R80736,48.4461,5.5925,411,2050.0,80,82,Senvion,MM82
3,R80790,48.4536,5.5875,411,2050.0,80,82,Senvion,MM82


## Load the data and create file mappings for later use

In [70]:
fpath = Path("data/la_haute_borne")
fn_scada = fpath / "la-haute-borne-data-2014-2015.csv"
fn_meter = fpath / "plant_data.csv"
fn_curtail = fpath / "plant_data.csv"
fn_reanalysis_merra2 = fpath / "merra2_la_haute_borne.csv"
fn_reanalysis_era5 = fpath / "era5_wind_la_haute_borne.csv"
fn_asset = fpath / "la-haute-borne_asset_table.csv"

scada = pd.read_csv(fn_scada)
meter = pd.read_csv(fn_meter)
curtail = pd.read_csv(fn_curtail)
reanalysis_era5 = pd.read_csv(fn_reanalysis_era5)
reanalysis_merra2 = pd.read_csv(fn_reanalysis_merra2)
asset = pd.read_csv(fn_asset)

latitude = 48.4497
longitude = 5.5896

yaml_meta = "data/plant_meta.yml"
json_meta = "data/plant_meta.json"

## TODO
 - [x] read data from spark, csv, pandas
 - [x] read metadata from json, yaml, dict, and pre-loaded object
 - [x] automatically calculate wind direction from u/v windspeed
 - [x] call planetos api if API key is provided
   - [x] validate this works
 - [x] support flags for if csv/planetos/data object/etc
 - datetime column frequency checks
    - [ ] check against the provided metadata
    - [ ] validate against the analysis requirements
    - **note**: bring Lewis into this conversation on datetime & frequency validation, but is ok to use pandas for now
 - [x] expand metadata to contain plant-level identifiers (latitude, longitude)
 - check against the -25 namings and (likely) adopt that naming convention for the plant data
   - [ ] update internal column naming convention to the -25 schema (Eric/Lewis)
 - [x] map the input column names, and provide a method to provide them back as the original inputs
 - [x] get the 0 notebook working, or at least as a means to understand what will be required for refactoring
 - [x] no failures for tower data as it's not used
 - [x] none flag for raising warning, not error, for missing/bad data
   - `None` will run no validation
 - [ ] flag to not raise an error for known missing data
 - [x] metadata keyword argument for validate() to recreate `PlantMetaData`
     - allows for more flexibility in use cases, especially in the exploratory phase, or for changing analysis types
 - [ ] review the v3 todo workbook to stay on track with the rest of v3 development


## Create a dictionary of plant meta data 

**NOTE**: the datetime frequency checking is not in place, but the placeholder exists to implement it later

In [71]:
plant_meta = dict(
    latitude=latitude,
    longitude=longitude,
    scada=dict(
        time="Date_time",
        id="Wind_turbine_name",
        power="P_avg",
        windspeed="Ws_avg",
#         wtur_wspd="Ws_avg",  # TODO: adopt the -25 naming
        wind_direction="Wa_avg",
#         status="?",
        pitch="Ba_avg",
        temperature="Ot_avg",
        frequency="10T",
    ),
    meter=dict(
        time="time_utc",
        energy="net_energy_kwh",
    ),
    curtail=dict(
        time="time_utc",
        curtailment="curtailment_kwh",
        availability="availability_kwh",
        net_energy="net_energy_kwh",
        frequency="10T",
    ),
    reanalysis=dict(  # keys are informational/product-type, not pre-defined
        era5=dict(
            time="datetime",
            # windspeed="ws_100m",  # Commented out to demonstrate variable creation from base windspeed data
            windspeed_u="u_100",
            windspeed_v="v_100",
            temperature="t_2m",
            # density="dens_100m",  # Commented out to demonstrate variable creation from base windspeed data
            surface_pressure="surf_pres",
            frequency="H",
        ),
        merra2=dict(
            time="datetime",
            # windspeed="ws_50",  # Commented out to demonstrate variable creation from base windspeed data
            windspeed_u="u_50",
            windspeed_v="v_50",
            temperature="temp_10m",
            # density="dens_50",  # Commented out to demonstrate variable creation from base windspeed data
            surface_pressure="surface_pressure",
            frequency="H",
        )
    ),
    asset=dict(
        id="id",
        latitude="Latitude",
        longitude="Longitude",
        rated_power="Rated_power",
        hub_height="Hub_height_m",
        rotor_diameter="Rotor_diameter_m",
        elevation="elevation_m",
#         type="?",
    ),
)

# Recreate the YAML and JSON meta data objects as the dictionary above gets updated
import yaml
import json

with open(yaml_meta, "w") as f:
    yaml.safe_dump(plant_meta, f, default_flow_style=False)
    
with open(json_meta, "w") as f:
    json.dump(plant_meta, f, indent=4)

## Demonstrate the loading from YAML, JSON, and dictionary produce the exact same meta data

In [72]:
meta_from_dict = PlantMetaData.from_dict(plant_meta)
meta_from_json = PlantMetaData.from_json(json_meta)
meta_from_yaml = PlantMetaData.from_yaml(yaml_meta)
meta_from_dict == meta_from_json == meta_from_yaml, type(meta_from_dict)

(True, openoa.types.plant_v2.PlantMetaData)

## Show the PlantData capabilities

### Load from `DataFrame`s and a metadata dictionary

In [73]:
plant_from_data = PlantDataV3(
    metadata=meta_from_dict,
    scada=scada,
    meter=meter,
    curtail=curtail,
    reanalysis={"merra2": reanalysis_merra2, "era5": reanalysis_era5},  # preferred, and enable API pulling
    asset=asset,
    analysis_type="MonteCarloAEP",
)
type(plant_from_data)

openoa.types.plant_v2.PlantDataV3

### Show that "windspeed", "wind_direction", and "density" columns are all created from the core variables

In [74]:
plant_from_data.reanalysis["era5"].head()

Unnamed: 0.1,Unnamed: 0,time,windspeed_u,windspeed_v,temperature,surface_pressure,ws_100m,dens_100m,windspeed,wind_direction,density
0,0,1999-01-01 00:00:00+00:00,-4.456234,4.999991,277.496492,97020.784132,6.697606,1.216004,6.697606,138.291024,1.216004
1,1,1999-01-01 01:00:00+00:00,-5.006666,4.540059,277.330357,97002.61796,6.758612,1.216525,6.758612,132.201828,1.216525
2,2,1999-01-01 02:00:00+00:00,-5.101353,4.032729,276.968939,97035.490081,6.502823,1.218571,6.502823,128.327129,1.218571
3,3,1999-01-01 03:00:00+00:00,-5.787051,2.111372,276.721193,97021.649188,6.160183,1.219518,6.160183,110.044204,1.219518
4,4,1999-01-01 04:00:00+00:00,-6.349969,1.693571,276.29128,97005.213128,6.571932,1.221259,6.571932,104.93349,1.221259


In [75]:
plant_from_data.reanalysis["merra2"].head()

Unnamed: 0.1,Unnamed: 0,time,surface_pressure,surface_skin_temperature,u_10,v_10,windspeed_u,windspeed_v,temp_2m,temperature,u_850,v_850,temp_850,ws_50m,dens_50m,windspeed,wind_direction,density
0,0,1997-01-01 00:30:00+00:00,97353.04,258.85715,-5.517786,-3.35072,-7.167111,-4.020257,258.64264,258.5481,-10.273251,-0.722203,264.77643,8.217661,1.310616,8.217661,60.710568,1.311099
1,1,1997-01-01 01:30:00+00:00,97372.805,258.3825,-5.922551,-3.048721,-7.753224,-3.629969,258.16272,258.06815,-11.208241,-0.177881,265.06012,8.560909,1.313339,8.560909,64.911556,1.313824
2,2,1997-01-01 02:30:00+00:00,97371.945,258.06287,-5.918024,-2.92526,-7.74793,-3.49686,257.8359,257.74847,-11.152638,0.983774,265.516,8.500497,1.315005,8.500497,65.709001,1.315454
3,3,1997-01-01 03:30:00+00:00,97354.71,257.95093,-5.58718,-3.09337,-7.363984,-3.77929,257.74982,257.68427,-9.815676,2.255801,266.19235,8.277155,1.315215,8.277155,62.832528,1.315552
4,4,1997-01-01 04:30:00+00:00,97348.41,257.9991,-5.471163,-3.099406,-7.257393,-3.8353,257.8432,257.80655,-8.338905,3.257462,266.77216,8.208488,1.314649,8.208488,62.14498,1.314838


### Show loading the data from file for both the meta data (JSON and YAML) and data (CSV)

In [76]:
plant_from_file1 = PlantDataV3(
    metadata=yaml_meta,
    scada=fn_scada,
    meter=fn_meter,
    curtail=fn_curtail,
    reanalysis={"merra2": fn_reanalysis_merra2, "era5": fn_reanalysis_era5},  # preferred, and enable API pulling
    asset=fn_asset,
    analysis_type="MonteCarloAEP"
)
type(plant_from_file1)

openoa.types.plant_v2.PlantDataV3

In [77]:
plant_from_file2 = PlantDataV3(
    metadata=json_meta,
    scada=fn_scada,
    meter=fn_meter,
    curtail=fn_curtail,
    reanalysis={"merra2": fn_reanalysis_merra2, "era5": fn_reanalysis_era5},  # preferred, and enable API pulling
    asset=fn_asset,
    analysis_type="MonteCarloAEP"
)
type(plant_from_file2)

openoa.types.plant_v2.PlantDataV3

### When updating the `analysis_type` to "all", note all the column data errors that are saved until the end of the validation

In [78]:
plant_from_data = PlantDataV3(
    metadata=meta_from_dict,
    scada=scada,
    meter=meter,
    curtail=curtail,
    reanalysis={"merra2": reanalysis_merra2, "era5": reanalysis_era5},  # preferred, and enable API pulling
    asset=asset,
    analysis_type="all"
)

ValueError: `scada` data is missing the following columns: ['status']
`meter` data is missing the following columns: ['power']
`asset` data is missing the following columns: ['id', 'type']
`status` data is missing the following columns: ['time', 'id', 'status_id', 'status_code', 'status_text']
`scada` data columns were of the wrong type: ['status']
`meter` data columns were of the wrong type: ['power']
`asset` data columns were of the wrong type: ['id', 'type']
`status` data columns were of the wrong type: ['time', 'id', 'status_id', 'status_code', 'status_text']

### Demonstrate changing a parameter (`analysis_type`) and revalidating with `PlantDataV3.validate()`

In [79]:
plant = deepcopy(plant_from_data)

In [24]:
plant.analysis_type = None
plant.validate()

TypeError: 'NoneType' object is not iterable

In [20]:
plant.analysis_type = "all"
plant.validate()

ValueError: `scada` data is missing the following columns: ['Ba_avg', 'status', 'Date_time', 'Wa_avg', 'P_avg', 'Wind_turbine_name', 'Ws_avg', 'Ot_avg']
`meter` data is missing the following columns: ['time_utc', 'power', 'net_energy_kwh']
`tower` data is missing the following columns: ['time', 'id']
`asset` data is missing the following columns: ['Longitude', 'Rated_power', 'type', 'id', 'Latitude']
`status` data is missing the following columns: ['time', 'id', 'status_id', 'status_code', 'status_text']
`curtail` data is missing the following columns: ['availability_kwh', 'time_utc', 'net_energy_kwh', 'curtailment_kwh']
`reanalysis-merra2` data is missing the following columns: ['temp_10m', 'u_50', 'v_50', 'datetime']
`reanalysis-era5` data is missing the following columns: ['v_100', 't_2m', 'surf_pres', 'datetime', 'u_100']
`scada` data columns were of the wrong type: ['Date_time', 'Wind_turbine_name', 'P_avg', 'Ws_avg', 'Wa_avg', 'status', 'Ba_avg', 'Ot_avg']
`meter` data columns were of the wrong type: ['time_utc', 'power', 'net_energy_kwh']
`tower` data columns were of the wrong type: ['time', 'id']
`asset` data columns were of the wrong type: ['id', 'Latitude', 'Longitude', 'Rated_power', 'type']
`status` data columns were of the wrong type: ['time', 'id', 'status_id', 'status_code', 'status_text']
`curtail` data columns were of the wrong type: ['time_utc', 'curtailment_kwh', 'availability_kwh', 'net_energy_kwh']
`reanalysis` data columns were of the wrong type: ['era5', 'merra2']
`reanalysis-merra2` data columns were of the wrong type: ['datetime', 'u_50', 'v_50', 'temp_10m']
`reanalysis-era5` data columns were of the wrong type: ['datetime', 'u_100', 'v_100', 't_2m', 'surf_pres']

In [21]:
plant.analysis_type = "TurbineLongTermGrossEnergy"
plant.validate()

In [22]:
plant.analysis_type = "ElectricalLosses"
plant.validate()

#### Direct copy of the analysis requirements for easy referece

In [None]:
ANALYSIS_REQUIREMENTS = {
    "MonteCarloAEP": {
        "meter": {
            "columns": ["energy"],
            "freq": ("MS", "D", "H", "T"),
        },
        "curtail": {
            "columns": ["availability", "curtailment"],
            "freq": ("MS", "D", "H", "T"),
        },
        "reanalysis": {
            "columns": ["windspeed", "rho"],
            "conditional_columns": {
                "reg_temperature": ["temperature"],
                "reg_winddirection": ["windspeed_u", "windspeed_v"],
            },
        },
    },
    "TurbineLongTermGrossEnergy": {
        "scada": {
            "columns": ["id", "windspeed", "power"],  # TODO: wtur_W_avg vs energy_kwh ?
            "freq": ("D", "H", "T"),
        },
        "reanalysis": {
            "columns": ["windspeed", "wind_direction", "rho"],
        },
    },
    "ElectricalLosses": {
        "scada": {
            "columns": ["energy"],
            "freq": ("D", "H", "T"),
        },
        "meter": {
            "columns": ["energy"],
            "freq": ("MS", "D", "H", "T"),
        },
    },
}

### Show the updated column names and how to map them back to the original data

In [None]:
scada.columns.tolist()

In [None]:
plant.scada.columns.tolist()

In [None]:
plant.update_column_names(to_original=True)
plant.scada.columns.tolist()

## Demonstrate the PlanetOS integration

In [17]:
apikey_file = Path("./APIKEY").resolve()
plant_meta_planetos = deepcopy(plant_meta)
plant_meta_planetos["reanalysis"]["era5"] = dict(
    time="datetime",
    windspeed="windspeed_ms",
    wind_direction="winddirection_deg",
    windspeed_u="u_ms",
    windspeed_v="v_ms",
    temperature="temperature_K",
    density="rho_kgm-3",
    surface_pressure="surf_pres_Pa",
    frequency="H",
)
plant_meta_planetos["reanalysis"]["merra2"] = dict(
    time="datetime",
    windspeed="windspeed_ms",
    wind_direction="winddirection_deg",
    windspeed_u="u_ms",
    windspeed_v="v_ms",
    temperature="temperature_K",
    density="rho_kgm-3",
    surface_pressure="surf_pres_Pa",
    frequency="H",
)

plant_from_data = PlantDataV3(
    metadata=meta_from_dict,
    scada=scada,
    meter=meter,
    curtail=curtail,
    reanalysis={
        "merra2": {"apikey_file": apikey_file, "save_pathname": ".", "save_filename": "merra2"},
        "era5": {"apikey_file": apikey_file, "save_pathname": ".", "save_filename": "era5"},
    },
    asset=asset,
    analysis_type="all"
)

ValueError: `scada` data is missing the following columns: ['status']
`meter` data is missing the following columns: ['power']
`asset` data is missing the following columns: ['type', 'id']
`status` data is missing the following columns: ['time', 'id', 'status_id', 'status_code', 'status_text']
`reanalysis-merra2` data is missing the following columns: ['temp_10m', 'surface_pressure', 'windspeed', 'u_50', 'v_50', 'density', 'wind_direction']
`reanalysis-era5` data is missing the following columns: ['windspeed', 'v_100', 't_2m', 'surf_pres', 'u_100', 'density', 'wind_direction']
`scada` data columns were of the wrong type: ['status']
`meter` data columns were of the wrong type: ['power']
`asset` data columns were of the wrong type: ['id', 'type']
`status` data columns were of the wrong type: ['time', 'id', 'status_id', 'status_code', 'status_text']
`reanalysis-merra2` data columns were of the wrong type: ['windspeed', 'u_50', 'v_50', 'wind_direction', 'temp_10m', 'density', 'surface_pressure']
`reanalysis-era5` data columns were of the wrong type: ['windspeed', 'u_100', 'v_100', 'wind_direction', 't_2m', 'density', 'surf_pres']