## WTK Data Preparation

Caleb Phillips (caleb.phillips@nrel.gov), Dmitry Duplyakin (dmitry.duplyakin@nrel.gov) and Jenna Ruzekowicz (jenna.ruzekowicz@nrel.gov)

The purpose of this notebook is to read in WTK and WTK-LED data at the turbine locations (the entire time period available, spatially and vertically interpolated).

Notes: 
Might need to install Rex if it isn't installed already:
conda install nrel-rex --channel=nrel

More about rex: https://github.com/NREL/rex
2018 5-min monthly h5 (the file you referenced on the 21st):
/campaign/tap/CONUS/wtk/5min/2018/{month}/conus_2018-{month}.h5
 
2018 5-min yearly h5 slices:
/shared-projects/wtk-led/CONUS/wtk/2018/yearly_h5/conus_2018_{height}m.h5
 
2019 60-min yearly h5:
/campaign/tap/CONUS/wtk/60min/2019/conus_2019.h5

In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd

import h5pyd
from rex.resource_extraction import MultiYearWindX
from dw_tap.data_fetching import get_data_wtk_led_on_eagle 
from dw_tap.data_fetching import getData

from tqdm import tqdm

from site_index import SiteIndex

#Added for OneEnergy
from site_index_oe import SiteIndexOE

### Read in index of turbines

If running just the windtoolkit code, you only need this not the next two sections.

In [2]:
#Bergey
#index = SiteIndex()

#One Energy
index = SiteIndexOE()

index.head()

Unnamed: 0,APRS ID,Public Site Name,State,Model,Rotor Diameter (m),Latitude,Longitude,Hub Height (m),Rating (kW)
0,p1w1,Findlay,OH,GW 87/1500,87,41.101219,-83.644394,80,1500
1,p1w2,Findlay,OH,GW 87/1500,87,41.09975,-83.643533,80,1500
2,p1z1,Findlay,OH,GW 87/1500,87,41.097669,-83.642588,80,1500
3,p1z2,Findlay,OH,GW 87/1500,87,41.096111,-83.64138,80,1500
4,p1z3,Findlay,OH,GW 87/1500,87,41.094388,-83.639116,80,1500


### Fetch Wind Toolkit Data

In [3]:
# Open the wind data "file"
# server endpoint, username, password are found in ~/.hscfg
f = h5pyd.File("/nrel/wtk-us.h5", 'r', bucket="nrel-pds-hsds") 

def fetch_dfs(index,wtk_dfs=[]):
    for row in tqdm(index.itertuples()):
        tid = row[1]
        lat = row[6]
        lon = row[7]
        z_turbine = row[8]
        # avoid re-running things that have already run
        if tid in [x['tid'][0] for x in wtk_dfs]:
            print("Skipping "+str(tid))
            continue
        atmospheric_df = getData(f, lat, lon, z_turbine, "IDW", 
                                 power_estimate=True,
                                 inverse_monin_obukhov_length=True)
        atmospheric_df['tid'] = tid
        wtk_dfs.append(atmospheric_df)

    return wtk_dfs

# in case HSDS has a connection error we will try a second time
try:
    wtk_dfs = fetch_dfs(index)
except ConnectionError:
    print("Caught a connection error, trying to resume...")
    wtk_dfs = fetch_dfs(indx,wtk_dfs)

21it [1:04:48, 185.18s/it]


In [4]:
wtk_df = pd.concat(wtk_dfs)

In [5]:
wtk_df.head()

Unnamed: 0,datetime,ws,wd,temp,pres,inversemoninobukhovlength_2m,tid
0,2007-01-01 00:00:00,11.429972,160.344914,285.184784,95416.112667,0.004876,p1w1
1,2007-01-01 01:00:00,12.601066,170.244337,285.679199,95341.99029,0.003894,p1w1
2,2007-01-01 02:00:00,11.008472,222.11003,285.239716,95438.777724,0.007028,p1w1
3,2007-01-01 03:00:00,12.665842,219.703827,284.333313,95451.063645,0.013516,p1w1
4,2007-01-01 04:00:00,11.366636,213.22369,283.002716,95438.52509,0.019596,p1w1


In [6]:
wtk_df['packet_date'] = pd.to_datetime(wtk_df["datetime"]).dt.tz_localize('UTC')

#Bergey
#wtk_df.to_csv("01 Bergey Turbine Data/wtk_tp.csv.bz2",index=False)

#One Energy
wtk_df.to_csv("01 One Energy Turbine Data/wtk_tp.csv.bz2",index=False)

### Fetch WTK-LED Data

**Requirement:** The following code must run on `Eagle` to access h5 files for WTK-LED

#### 2018 5-minute data from WTK-LED

In [3]:
#Bergy
#index = pd.read_csv("01 Bergey Turbine Data/bergey_sites.csv")

#One Energy
index = pd.read_csv("01 One Energy Turbine Data/OneEnergyTurbineData.csv")

index.head()

Unnamed: 0,APRS ID,Public Site Name,State,Model,Rotor Diameter (m),Latitude,Longitude,Hub Height (m),Rating (kW)
0,p1w1,Findlay,OH,GW 87/1500,87,41.101219,-83.644394,80,1500
1,p1w2,Findlay,OH,GW 87/1500,87,41.09975,-83.643533,80,1500
2,p1z1,Findlay,OH,GW 87/1500,87,41.097669,-83.642588,80,1500
3,p1z2,Findlay,OH,GW 87/1500,87,41.096111,-83.64138,80,1500
4,p1z3,Findlay,OH,GW 87/1500,87,41.094388,-83.639116,80,1500


In [4]:
def fetch_dfs_wtk_led_2018(index, wtk_dfs=[]):
    # 12 monthly files for 2018
    files = ['/campaign/tap/CONUS/wtk/5min/2018/%s/conus_2018-%s.h5' % \
             (str(i).zfill(2), str(i).zfill(2)) for i in range(1,13)]
    
    for row in tqdm(index.itertuples()):
        tid = row[1]
        lat = row[5]
        lon = row[6]
        z_turbine = row[7]
        # avoid re-running things that have already run
        if tid in [x['tid'][0] for x in wtk_dfs]:
            print("Skipping "+str(tid))
            continue
            
        atmospheric_df = pd.DataFrame()
        # Iterate over all monthly files
        for file in files:
            myr = MultiYearWindX(file, hsds=False)
            d = get_data_wtk_led_on_eagle(myr, 
                                          lat, lon, z_turbine, "IDW", 
                                          power_estimate=False,
                                          start_time_idx=None, 
                                          end_time_idx=None,
                                          time_stride=None)
            atmospheric_df = pd.concat([atmospheric_df, d])
        
        atmospheric_df['tid'] = tid
        atmospheric_df.reset_index(drop=True, inplace=True)
        wtk_dfs.append(atmospheric_df)

    return wtk_dfs

# in case HSDS has a connection error we will try a second time
try:
    wtk_dfs = fetch_dfs_wtk_led_2018(index)
except ConnectionError:
    print("Caught a connection error, trying to resume...")
    wtk_dfs = fetch_dfs_wtk_led_2018(indx,wtk_dfs)

0it [00:00, ?it/s]


FileInputError: Could not find any file paths with pattern: /campaign/tap/CONUS/wtk/5min/2018/01/conus_2018-01.h5

In [7]:
wtk_df_2018 = pd.concat(wtk_dfs)
wtk_df_2018.head()

Unnamed: 0,datetime,ws,wd,tid
0,2018-01-01 00:00:00+00:00,8.338811,296.537156,t007
1,2018-01-01 00:05:00+00:00,8.282,297.003766,t007
2,2018-01-01 00:10:00+00:00,8.18532,297.41583,t007
3,2018-01-01 00:15:00+00:00,8.10486,297.803657,t007
4,2018-01-01 00:20:00+00:00,8.058473,298.054636,t007


In [8]:
len(wtk_df_2018)

1997280

In [11]:
wtk_df_2018.tid.value_counts()

t007    105120
t139    105120
t207    105120
t192    105120
t183    105120
t182    105120
t170    105120
t169    105120
t140    105120
t135    105120
t024    105120
t133    105120
t114    105120
t083    105120
t074    105120
t041    105120
t034    105120
t028    105120
t221    105120
Name: tid, dtype: int64

In [10]:
# The following causes the error: Already tz-aware, use tz_convert to convert.
#wtk_df_2018['packet_date'] = \
#    pd.to_datetime(wtk_df_2018["datetime"]).dt.tz_localize('UTC')
# Skip localization: 
wtk_df_2018['packet_date'] = wtk_df_2018["datetime"]

#Bergey
#wtk_df_2018[['tid','packet_date','ws','wd']].\
#    to_csv("01 Bergey Turbine Data/wtk_led_2018.csv.bz2",index=False)

#One Energy
wtk_df_2018[['tid','packet_date','ws','wd']].\
    to_csv("01 One Energy Turbine Data/wtk_led_2018.csv.bz2",index=False)

#### 2019 hourly data from WTK-LED

In [5]:
def fetch_dfs_wtk_led_2019(index, wtk_dfs=[]):

    # 2019 hourly file
    myr = MultiYearWindX('/campaign/tap/CONUS/wtk/60min/2019/conus_2019.h5', hsds=False)
    
    for row in tqdm(index.itertuples()):
        tid = row[1]
        lat = row[5]
        lon = row[6]
        z_turbine = row[7]
        # avoid re-running things that have already run
        if tid in [x['tid'][0] for x in wtk_dfs]:
            print("Skipping "+str(tid))
            continue
            
        atmospheric_df = pd.DataFrame()

        atmospheric_df = get_data_wtk_led_on_eagle(myr, 
                                          lat, lon, z_turbine, "IDW", 
                                          power_estimate=False,
                                          start_time_idx=None, 
                                          end_time_idx=None,
                                          time_stride=None)
        
        atmospheric_df['tid'] = tid
        wtk_dfs.append(atmospheric_df)

    return wtk_dfs

# in case HSDS has a connection error we will try a second time
try:
    wtk_dfs = fetch_dfs_wtk_led_2019(index)
except ConnectionError:
    print("Caught a connection error, trying to resume...")
    wtk_dfs = fetch_dfs_wtk_led_2019(indx,wtk_dfs)

21it [02:28,  7.09s/it]


In [6]:
wtk_df_2019 = pd.concat(wtk_dfs)
wtk_df_2019.head()

Unnamed: 0,datetime,ws,wd,tid
0,2019-01-01 00:00:00+00:00,5.801474,137.273211,p1w1
1,2019-01-01 01:00:00+00:00,5.540089,137.964004,p1w1
2,2019-01-01 02:00:00+00:00,5.250237,138.607502,p1w1
3,2019-01-01 03:00:00+00:00,4.937005,139.520113,p1w1
4,2019-01-01 04:00:00+00:00,4.881869,142.493312,p1w1


In [7]:
len(wtk_df_2019)

183960

In [8]:
# The following causes the error: Already tz-aware, use tz_convert to convert.
#wtk_df_2019['packet_date'] = \
#    pd.to_datetime(wtk_df_2019["datetime"]).dt.tz_localize('UTC')
# Skip localization: 
wtk_df_2019['packet_date'] = wtk_df_2019["datetime"]

#Bergey
#wtk_df_2019[['tid','packet_date','ws','wd']].\
#    to_csv("01 Bergey Turbine Data/wtk_led_2019.csv.bz2",index=False)

#One Energy
wtk_df_2019[['tid','packet_date','ws','wd']].\
    to_csv("01 One Energy Turbine Data/wtk_led_2019.csv.bz2",index=False)

In [9]:
wtk_df_2019.tid.value_counts()

p1w1      8760
p2wg2     8760
p6l2      8760
p6l1      8760
p5w1      8760
p4w3      8760
p4w2      8760
p4w1      8760
p3wtg1    8760
p2wg3     8760
p2wg1     8760
p1w2      8760
p1v2      8760
p1v1      8760
p1z6      8760
p1z5      8760
p1z4      8760
p1z3      8760
p1z2      8760
p1z1      8760
p6l3      8760
Name: tid, dtype: int64