## Imports and set data path

In [None]:
# Imports
import h5py
import numpy as np
import pandas as pd

# Display every DataFrame column instead of truncating wide tables
pd.options.display.max_columns = None

# Location of the .h5 data file; edit this to point at your local copy
data_path = 'wind_plant_data.h5'


## Explore contents and architecture of .h5 data file

`Data Structure` <br>
The .h5 data file is structured as follows: <br>

|-- root (group, 501 members) <br>
 &emsp;&emsp;	|-- LayoutXXX (group, 3 members) <br>
 &emsp;&emsp;	 &emsp;&emsp;	|-- Number of Turbines (dataset) <br>
 &emsp;&emsp;	 &emsp;&emsp;	|-- Scenarios (group, 500 members) <br>
 &emsp;&emsp;	 &emsp;&emsp;	 &emsp;&emsp;	|-- ScenarioXXX (group, 6 members) <br>
 &emsp;&emsp;	 &emsp;&emsp;	 &emsp;&emsp;	 &emsp;&emsp;	|-- Optimal Yaw (group, 3 members)** <br>
 &emsp;&emsp;	 &emsp;&emsp;	 &emsp;&emsp;	 &emsp;&emsp;	 &emsp;&emsp;	|-- Turbine Power (dataset) <br>
 &emsp;&emsp;	 &emsp;&emsp;	 &emsp;&emsp;	 &emsp;&emsp;	 &emsp;&emsp;	|-- Turbine Wind Speed (dataset) <br>
 &emsp;&emsp;	 &emsp;&emsp;	 &emsp;&emsp;	 &emsp;&emsp;	 &emsp;&emsp;	|-- Yaw Angles (dataset) <br>
 &emsp;&emsp;	 &emsp;&emsp;	 &emsp;&emsp;	 &emsp;&emsp;	|-- Turbine Power (dataset) <br>
 &emsp;&emsp;	 &emsp;&emsp;	 &emsp;&emsp;	 &emsp;&emsp;	|-- Turbine Wind Speed (dataset) <br>
 &emsp;&emsp;	 &emsp;&emsp;	 &emsp;&emsp;	 &emsp;&emsp;	|-- Turbulence Intensity (dataset) <br>
 &emsp;&emsp;	 &emsp;&emsp;	 &emsp;&emsp;	 &emsp;&emsp;	|-- Wind Direction (dataset) <br>
 &emsp;&emsp;	 &emsp;&emsp;	 &emsp;&emsp;	 &emsp;&emsp;	|-- Wind Speed (dataset) <br>
 &emsp;&emsp;	 &emsp;&emsp;	 &emsp;&emsp;	 &emsp;&emsp;	|-- Yaw Angles (dataset) <br>
 &emsp;&emsp;	 &emsp;&emsp;	|--  Turbines (group, variable number of members) <br>
 &emsp;&emsp;	 &emsp;&emsp;	 &emsp;&emsp;	|-- TurbineXXX (group, 4 members) <br>
 &emsp;&emsp;	 &emsp;&emsp;	 &emsp;&emsp;	 &emsp;&emsp;	|-- Hub Height (dataset) <br>
 &emsp;&emsp;	 &emsp;&emsp;	 &emsp;&emsp;	 &emsp;&emsp;	|-- Rotor Diameter (dataset) <br>
 &emsp;&emsp;	 &emsp;&emsp;	 &emsp;&emsp;	 &emsp;&emsp;	|-- X Location (dataset) <br>
 &emsp;&emsp;	 &emsp;&emsp;	 &emsp;&emsp;	 &emsp;&emsp;	|-- Y Location (dataset)  <br>
 &emsp;&emsp;	|-- One-hot Encoded (group, 2 members) <br>
 &emsp;&emsp;	 &emsp;&emsp;	|-- 2D Representation (group, 1 member) <br>
 &emsp;&emsp;	 &emsp;&emsp;	 &emsp;&emsp;	|-- Full 2D array (dataset) <br>
 &emsp;&emsp;	 &emsp;&emsp;	|-- 3D Representation (group, 2 members) <br>
 &emsp;&emsp;	 &emsp;&emsp;	 &emsp;&emsp;	|-- Plant Level 2D array (dataset) <br>
 &emsp;&emsp;	 &emsp;&emsp;	 &emsp;&emsp;	|-- Turbine Level 3D array (dataset) <br>

** The Optimal Yaw group is only present in a subset of layouts/scenarios, which are identified in the opt_yaw_list.csv file.

In [None]:
# Open the file and collect the names of the top-level layout groups

# This data set contains 500 unique wind plant layouts, one top-level group each.
with h5py.File(data_path, 'r') as hf:
    # Iterating the file object yields its top-level member names;
    # keep only those whose name contains 'Layout'.
    layout_names = [member for member in hf if 'Layout' in member]
    print(layout_names)


In [None]:
# Explore contents of LayoutXXX groups (first 3)

# Each LayoutXXX Group contains 3 members: 2 groups and 1 dataset.
# The dataset is a single int32 value representing the number of turbines in the plant layout.
# The two groups contain (i) plant turbine details and (ii) a collection of atmospheric flow scenarios.
# Requires `layout_names` from the cell that lists the top-level groups.
with h5py.File(data_path, 'r') as hf:
    for name in layout_names[0:3]:
        layout_group = hf[name]
        print('-------------------------------------------------')
        print('Group Name:', name)
        print('Group Info:', layout_group)
        layout_keys = list(layout_group.keys())
        print('Group Members:')
        for key in layout_keys:
            member = layout_group[key]
            print(member)
            # Test the object's type directly instead of matching text in its
            # type name; only the dataset member carries a readable value.
            if isinstance(member, h5py.Dataset):
                # [()] reads the whole (scalar) dataset into memory
                print('Number of Turbines =', member[()])
    print('-------------------------------------------------')
    

In [None]:
# Peek at the /LayoutXXX/Turbines group (first 10 members only)

# A LayoutXXX/Turbines group holds one sub group per turbine in the layout, so its
# size varies from layout to layout. In the original version of the dataset the
# count ranged between [25-199]; this may change over time as new data is added.
with h5py.File(data_path, 'r') as hf:
    print('Members in /Layout000/Turbines')
    turbines_group = hf['Layout000/Turbines']
    turbines_keys = list(turbines_group)
    for turbine_name in turbines_keys[:10]:
        print(turbines_group[turbine_name])
    # Signal that the listing above is truncated
    print('...')


In [None]:
# Inspect a single /LayoutXXX/Turbines/TurbineXXX group

# Every /LayoutXXX/Turbines/TurbineXXX group holds 4 datasets with the turbine details.
# Each dataset is a single float32 value giving that turbine's:
#       - Hub Height (m)
#       - Rotor Diameter (m)
#       - X Location (m)
#       - Y Location (m)
# Note: The wind plant center of mass is defined at the origin (0, 0)
with h5py.File(data_path, 'r') as hf:
    print('Members in /Layout000/Turbines/Turbine000')
    turbine_group = hf['Layout000/Turbines/Turbine000']
    turbineXXX_keys = list(turbine_group)
    for dataset_name in turbineXXX_keys:
        dataset = turbine_group[dataset_name]
        # First the dataset description, then its stored value
        print(dataset)
        print(dataset_name, '=', dataset[()])


In [None]:
# Peek at the /LayoutXXX/Scenarios group (first 10 members only)

# A LayoutXXX/Scenarios group holds 500 subgroups, each defining a randomly sampled
# atmospheric inflow scenario for which FLORIS was evaluated.
with h5py.File(data_path, 'r') as hf:
    print('Members in /Layout000/Scenarios')
    scenarios_group = hf['Layout000/Scenarios']
    scenarios_keys = list(scenarios_group)
    for scenario_name in scenarios_keys[:10]:
        print(scenarios_group[scenario_name])
    # Signal that the listing above is truncated
    print('...')


In [None]:
# Inspect a single /LayoutXXX/Scenarios/ScenarioXXX group

# Every /LayoutXXX/Scenarios/ScenarioXXX group holds 6 datasets:
#       - Turbine Power: power output vector of all turbines (W) float32
#       - Turbine Wind Speed: local wind speed vector at each turbine (m/s) float32
#       - Turbulence Intensity: single value (%) float32
#       - Wind Direction: single value (degrees) float32
#       - Wind Speed: single value (m/s) float32
#       - Yaw Angles: yaw angle vector for all turbines (degrees) float32
# Note: some scenarios contain an additional subgroup 'Optimal Yaw' with data from the optimized yaw simulations
with h5py.File(data_path, 'r') as hf:
    print('Members in /Layout000/Scenarios/Scenario000')
    scenario_group = hf['Layout000/Scenarios/Scenario000']
    scenarioXXX_keys = list(scenario_group)
    for member_name in scenarioXXX_keys:
        print(scenario_group[member_name])


In [None]:
# For a randomly selected subset of the data (50 scenarios for 50 different layouts), yaw angles
# were determined using FLORIS's wake steering routine that optimizes overall plant power.

# The opt_yaw_list.csv file provides a list of LayoutXXX/ScenarioXXX where these wake steering cases were performed.
# It is read without a header row, so its columns are labeled 0 (layout name) and 1 (scenario name).
opt_yaw_cases = pd.read_csv('opt_yaw_list.csv', header=None)

# Explore contents of /LayoutXXX/Scenarios/ScenarioXXX/Optimal Yaw groups

# For ScenarioXXXs that contain them, each 'Optimal Yaw' group contains 3 datasets:
#       - Turbine Power: power output vector of all turbines (W) float32
#       - Turbine Wind Speed: local wind speed vector at each turbine (m/s) float32
#       - Yaw Angles: yaw angle vector for all turbines (degrees) float32
# NOTE: these datasets have the same names and layout as their counterparts inside of the
#       ScenarioXXX groups; however, they come from FLORIS simulations with optimized yaw control.
with h5py.File(data_path, 'r') as hf:
    # Preview only the first 5 listed cases. head() returns fewer rows when the
    # list is shorter, so a short CSV does not raise an IndexError.
    preview_cases = opt_yaw_cases.head(5)
    for opt_layout, opt_scenario in zip(preview_cases[0], preview_cases[1]):
        print('-------------------------------------------------')
        print(opt_layout, opt_scenario)
        opt_yaw_group = hf[opt_layout]['Scenarios'][opt_scenario]['Optimal Yaw']
        for key in opt_yaw_group.keys():
            print(opt_yaw_group[key])
    print('-------------------------------------------------')

## Read and Load one-hot encoded data from .h5


For convenience, we provide one-hot encodings of the wind farm data that can be more easily loaded and processed in machine learning workflows. Below we examine these encodings to support accessibility.

In [None]:
# Explore one-hot encoded data

# The one-hot encoded data is contained within a separate group, labeled 'One-hot Encoded'.
# Two representations are provided here: a complete 2D encoding and a paired 2D/3D encoding

# In these representations all vectors that contain individual turbine data have been parsed into features/columns for each turbine
# Additionally, index and binary / dummy columns have been added for ease of training and parsing
with h5py.File(data_path, 'r') as hf:
    onehot_group = hf['One-hot Encoded']

    print('Members in /One-hot Encoded group:')
    onehot_keys = list(onehot_group.keys())
    for key in onehot_keys:
        print(onehot_group[key])

    print('-------------------------------------------------')

    rep_2d = onehot_group['2D Representation']
    print('Members in 2D Representation group:')
    print('')

    # List the group's members, mirroring the 3D Representation section below
    print(list(rep_2d.keys()))
    print('Columns in "Full 2D array":')
    # The column description is split over two prints; end=' ' keeps it on one
    # line with a space before the '+'.
    print('[layout, scenario, opt_yaw, num_turbines, hub_h, rotor_d, wind_dir, wind_speed, turbulence]', end=' ')
    print('+ [t_###, t_X_###, t_Y_###, t_ws_###, t_yaw_###, t_power_###] for 0-199')
    print(rep_2d['Full 2D array'])

    print('-------------------------------------------------')

    rep_3d = onehot_group['3D Representation']
    print('Members in 3D Representation group:')
    print('')

    print(list(rep_3d.keys()))
    print('Columns in "Plant Level 2D array":')
    print('[layout, scenario, opt_yaw, num_turbines, hub_h, rotor_d, wind_dir, wind_speed, turbulence]')
    print(rep_3d['Plant Level 2D array'])
    print('')

    print('Rows in "Turbine Level 3D array":')
    print('[t_###, t_X_###, t_Y_###, t_ws_###, t_yaw_###, t_power_###] for 0-199')
    print(rep_3d['Turbine Level 3D array'])
           

In [None]:
# Rebuild a labeled pandas DataFrame from the Full 2D array for exploratory data analysis

# Plant-level columns come first...
column_names = ['layout', 'scenario', 'opt_yaw', 'num_turbines', 'hub_h', 'rotor_d',
                'wind_dir', 'wind_speed', 'turbulence']
# ...followed by the per-turbine columns, generated from these name templates.
column_template = ['t_{:03d}', 't_X_{:03d}', 't_Y_{:03d}', 't_ws_{:03d}', 't_yaw_{:03d}', 't_power_{:03d}']
# Template-major order: all 200 indices for one template before moving to the next.
column_names += [
    template.format(turbine_idx)
    for template in column_template
    for turbine_idx in range(200)
]
with h5py.File(data_path, 'r') as hf:
    # [()] reads the entire dataset into memory
    full_2d_values = hf['One-hot Encoded/2D Representation/Full 2D array'][()]
    df = pd.DataFrame(data=full_2d_values, columns=column_names)
df
