## WTK Data Preparation

Caleb Phillips (caleb.phillips@nrel.gov), Dmitry Duplyakin (dmitry.duplyakin@nrel.gov) and Jenna Ruzekowicz (jenna.ruzekowicz@nrel.gov)

The purpose of this notebook is to read in WTK and WTK-LED data at the turbine locations (the entire time period available, spatially and vertically interpolated).

Notes: 
You may need to install rex if it isn't installed already:
`conda install nrel-rex --channel=nrel`

More about rex: https://github.com/NREL/rex
2018 5-min monthly h5 (the file you referenced on the 21st):
/campaign/tap/CONUS/wtk/5min/2018/{month}/conus_2018-{month}.h5
 
2018 5-min yearly h5 slices:
/shared-projects/wtk-led/CONUS/wtk/2018/yearly_h5/conus_2018_{height}m.h5
 
2019 60-min yearly h5:
/campaign/tap/CONUS/wtk/60min/2019/conus_2019.h5

In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd

import h5pyd
from rex.resource_extraction import MultiYearWindX
from dw_tap.data_fetching import get_data_wtk_led_on_eagle 
from dw_tap.data_fetching import getData

from tqdm import tqdm

from site_index import SiteIndex

#Added for OneEnergy
from site_index_oe import SiteIndexOE

### Read in index of turbines

If you are running only the Wind Toolkit (WTK) code, you need only this section, not the next two.

In [2]:
# Choose which turbine index to load. Keep exactly one of the two
# assignments below active and leave the other commented out.

#Bergey
#index = SiteIndex()

#One Energy
index = SiteIndexOE()

# Display the first rows of the selected index as a sanity check.
index.head()

Unnamed: 0,APRS ID,AID,Public Site Name,Internal Site Name,State,Latitude,Longitude,Hub Height (m),Lidar Quality,Lidar Collection Year,Site Type,Site Notes,Building Data Quality,Turbine,Periods with Consistent Generation Data,Met Tower,Met Tower Latitude,Met Tower Longitude,Measurement Height (m),Measurement Privacy
0,t007,A2719,Fremont,Fremont,MN,43.918622,-91.899498,41,N/A (Legacy Data),,Rural,No lidar data. This turbine is absolutely surr...,"Partial, some missing",Bergey Excel 10,2012/01 - 2013/12 (outage starting mid-2013/11...,mn_prairiestar.PROPRIETARY.pruf.csv,43.673,-92.698,50.0,Proprietary
1,t024,A2672,Washoe,Thomas Danzinger,NV,39.331088,-119.82023,30,QL1,2020.0,Rural/Suburban,Straight forward high desert site outside Reno...,Good coverage,Bergey Excel 10,2011/01 - 2022/12,,,,,
2,t028,A3479,Towamensing,Derr,PA,40.851353,-75.598395,30,QL2,2019.0,Suburban,Impressive amount of vegetation – some section...,"Partial, some missing",Bergey Excel 10,2011/01 - 2022/12,,,,,
3,t034,A2715,Lycoming,Dan Poor,NY,43.524158,-76.37229,31,QL2,2018.0,Rural,"Lakeshore, heavily forested.",Good coverage,Bergey Excel 10,"2013/01 - 2017/12, 2019/01 - 2021/12",ny_oswego.qc.csv,43.464,-76.511,15.0,Public
4,t041,A2272,Rockford,Fossil Park,IA,43.047418,-92.981674,37,QL2,2020.0,Rural,"Very simple site, one building and a couple sm...",Good coverage,Bergey Excel 10,"2013/01 - 2013/12, 2015/01 - 2022/12",,,,,


### Fetch Wind Toolkit Data

In [3]:
# Open the wind data "file" read-only ('r') through h5pyd. The resulting
# handle `f` is what fetch_dfs() passes to getData().
# The server endpoint, username and password are found in ~/.hscfg
# NOTE(review): the handle is not closed anywhere in the visible cells.
f = h5pyd.File("/nrel/wtk-us.h5", 'r', bucket="nrel-pds-hsds") 

def fetch_dfs(index,wtk_dfs=None):
    """Fetch WTK data for every turbine in `index`, one DataFrame per turbine.

    Parameters
    ----------
    index : object exposing ``itertuples()``
        Turbine index. Fields are read by position: row[1] is the turbine id,
        row[6] the latitude, row[7] the longitude and row[8] the hub height.
        (Assumes pandas-style tuples where row[0] is the row index -- TODO
        confirm for the SiteIndex classes.)
    wtk_dfs : list of DataFrame, optional
        Frames fetched by an earlier, interrupted run. Turbines whose id is
        already present are skipped. The list is appended to in place.
        When omitted, a new empty list is used.

    Returns
    -------
    list of DataFrame
        `wtk_dfs` with one frame (carrying a constant 'tid' column) appended
        for each turbine fetched during this call.
    """
    # Create the accumulator per call. The former `wtk_dfs=[]` default was
    # built once at definition time and silently shared by every call that
    # did not pass a list.
    if wtk_dfs is None:
        wtk_dfs = []

    # Ids already fetched, collected once instead of rebuilding a list for
    # every row. `.iloc[0]` reads the first row regardless of index labels;
    # empty frames are ignored because they carry no id.
    done_tids = {df['tid'].iloc[0] for df in wtk_dfs if len(df)}

    for row in tqdm(index.itertuples()):
        tid = row[1]
        lat = row[6]
        lon = row[7]
        z_turbine = row[8]
        # avoid re-running things that have already run
        if tid in done_tids:
            print("Skipping "+str(tid))
            continue
        atmospheric_df = getData(f, lat, lon, z_turbine, "IDW",
                                 power_estimate=True,
                                 inverse_monin_obukhov_length=True)
        atmospheric_df['tid'] = tid
        wtk_dfs.append(atmospheric_df)
        done_tids.add(tid)

    return wtk_dfs

# Create the accumulator before the first attempt. fetch_dfs() appends to the
# list it is given, so whatever was fetched before a failure is still in
# `wtk_dfs` when the retry runs.
wtk_dfs = []

# in case HSDS has a connection error we will try a second time
try:
    wtk_dfs = fetch_dfs(index, wtk_dfs)
except ConnectionError:
    print("Caught a connection error, trying to resume...")
    # The old handler called fetch_dfs(indx, wtk_dfs): `indx` is undefined
    # (typo for `index`) and `wtk_dfs` was never bound when the first call
    # raised, so the retry itself failed with a NameError.
    # NOTE(review): only the built-in ConnectionError is caught here --
    # confirm that this is the exception type the HSDS client raises.
    wtk_dfs = fetch_dfs(index, wtk_dfs)

2it [05:58, 179.18s/it]


KeyboardInterrupt: 

In [None]:
# Stack the per-turbine frames returned by fetch_dfs() into one DataFrame;
# the 'tid' column added in fetch_dfs() tells the turbines apart.
wtk_df = pd.concat(wtk_dfs)

In [None]:
# Display the first rows of the combined WTK frame as a sanity check.
wtk_df.head()

In [7]:
# Store the timestamps as tz-aware UTC in 'packet_date'.
# `utc=True` localizes naive values as UTC, which is what the previous
# `.dt.tz_localize('UTC')` did, and also converts values that are already
# tz-aware. tz_localize raises "Already tz-aware, use tz_convert to convert."
# on such input, the failure recorded in the WTK-LED cells of this notebook.
wtk_df['packet_date'] = pd.to_datetime(wtk_df["datetime"], utc=True)

# Write the combined frame as a bz2-compressed CSV (compression is inferred
# from the file extension). Keep exactly one destination active.

#Bergey
#wtk_df.to_csv("01 Bergey Turbine Data/wtk_tp.csv.bz2",index=False)

#One Energy
wtk_df.to_csv("01 One Energy Turbine Data/wtk_tp.csv.bz2",index=False)

### Fetch WTK-LED Data

**Requirement:** The following code must run on `Eagle` to access h5 files for WTK-LED

#### 2018 5-minute data from WTK-LED

In [5]:
# Load the turbine index used by the WTK-LED fetch functions from CSV.
# This rebinds `index`. The preview recorded below has no 'AID' column, so
# latitude, longitude and hub height sit one position earlier than in the
# index used for the WTK fetch.
# NOTE(review): the recorded preview shows Bergey sites although the One
# Energy CSV is the active choice -- the output may be stale; confirm the
# column order of the One Energy file.

#Bergey
#index = pd.read_csv("01 Bergey Turbine Data/bergey_sites.csv")

#One Energy
index = pd.read_csv("01 One Energy Turbine Data/OneEnergyTurbineData.csv")

# Display the first rows of the loaded index.
index.head()

Unnamed: 0,APRS ID,Public Site Name,Internal Site Name,State,Latitude,Longitude,Hub Height (m),Turbine,Periods with Consistent Generation Data,Met Tower,Met Tower Latitude,Met Tower Longitude,Measurement Height (m)
0,t007,Fremont,Fremont,MN,43.918622,-91.899498,41,Bergey Excel 10,2012/01 - 2013/12 (outage starting mid-2013/11...,,,,
1,t024,Washoe,Thomas Danzinger,NV,39.331088,-119.820234,30,Bergey Excel 10,2011/01 - 2022/12,,,,
2,t028,Towamensing,Derr,PA,40.851353,-75.598395,30,Bergey Excel 10,2011/01 - 2022/12,,,,
3,t034,Lycoming,Dan Poor,NY,43.524158,-76.37229,31,Bergey Excel 10,"2013/01 - 2017/12, 2019/01 - 2021/12",Oswego,43.464,-76.511,15.0
4,t041,Rockford,Fossil Park,IA,43.047418,-92.981674,37,Bergey Excel 10,"2013/01 - 2013/12, 2015/01 - 2022/12",,,,


In [6]:
def fetch_dfs_wtk_led_2018(index, wtk_dfs=None):
    """Fetch the 2018 5-minute WTK-LED series for every turbine in `index`.

    The twelve monthly h5 files are read for each turbine and joined into a
    single frame per turbine.

    Parameters
    ----------
    index : pandas.DataFrame
        Turbine index. Fields are read by position from ``itertuples()``:
        row[1] is the turbine id, row[5] the latitude, row[6] the longitude
        and row[7] the hub height (row[0] is the frame index).
    wtk_dfs : list of DataFrame, optional
        Frames fetched by an earlier, interrupted run. Turbines whose id is
        already present are skipped. The list is appended to in place.
        When omitted, a new empty list is used.

    Returns
    -------
    list of DataFrame
        `wtk_dfs` with one frame per newly fetched turbine appended; each has
        a fresh 0..n-1 index and a constant 'tid' column.
    """
    # Create the accumulator per call. The former `wtk_dfs=[]` default was
    # shared between all calls that did not pass a list.
    if wtk_dfs is None:
        wtk_dfs = []

    # 12 monthly files for 2018 (month zero-padded to two digits)
    files = ['/campaign/tap/CONUS/wtk/5min/2018/%02d/conus_2018-%02d.h5' % (i, i)
             for i in range(1, 13)]

    # Ids already fetched, collected once instead of once per row.
    done_tids = {df['tid'].iloc[0] for df in wtk_dfs if len(df)}

    for row in tqdm(index.itertuples()):
        tid = row[1]
        lat = row[5]
        lon = row[6]
        z_turbine = row[7]
        # avoid re-running things that have already run
        if tid in done_tids:
            print("Skipping "+str(tid))
            continue

        # Collect the monthly pieces and concatenate once, rather than
        # growing a DataFrame with pd.concat inside the loop.
        monthly_dfs = []
        for file in files:
            # The context manager closes the file handle once the month has
            # been read; previously the handles were never closed.
            with MultiYearWindX(file, hsds=False) as myr:
                monthly_dfs.append(
                    get_data_wtk_led_on_eagle(myr,
                                              lat, lon, z_turbine, "IDW",
                                              power_estimate=False,
                                              start_time_idx=None,
                                              end_time_idx=None,
                                              time_stride=None))

        # ignore_index=True gives the same 0..n-1 index that
        # reset_index(drop=True) produced before.
        atmospheric_df = pd.concat(monthly_dfs, ignore_index=True)
        atmospheric_df['tid'] = tid
        wtk_dfs.append(atmospheric_df)
        done_tids.add(tid)

    return wtk_dfs

# Start from a fresh accumulator. fetch_dfs_wtk_led_2018() appends to the
# list it is given, so turbines fetched before a failure are still in
# `wtk_dfs` for the retry. A fresh list also keeps frames left in `wtk_dfs`
# by the WTK section above out of the WTK-LED results.
wtk_dfs = []

# in case of a connection error we will try a second time
try:
    wtk_dfs = fetch_dfs_wtk_led_2018(index, wtk_dfs)
except ConnectionError:
    print("Caught a connection error, trying to resume...")
    # The old handler passed the undefined name `indx` (typo for `index`),
    # so the retry raised a NameError instead of resuming.
    wtk_dfs = fetch_dfs_wtk_led_2018(index, wtk_dfs)

19it [30:21, 95.86s/it]


In [7]:
# Stack the per-turbine 2018 frames into one DataFrame and preview it.
# Each per-turbine frame was re-indexed from 0, so index labels repeat
# across turbines; the 'tid' column tells the turbines apart.
wtk_df_2018 = pd.concat(wtk_dfs)
wtk_df_2018.head()

Unnamed: 0,datetime,ws,wd,tid
0,2018-01-01 00:00:00+00:00,8.338811,296.537156,t007
1,2018-01-01 00:05:00+00:00,8.282,297.003766,t007
2,2018-01-01 00:10:00+00:00,8.18532,297.41583,t007
3,2018-01-01 00:15:00+00:00,8.10486,297.803657,t007
4,2018-01-01 00:20:00+00:00,8.058473,298.054636,t007


In [8]:
# Total row count across all turbines. The recorded run shows
# 1997280 = 19 turbines x 105120 rows each.
len(wtk_df_2018)

1997280

In [11]:
# Rows per turbine id; a complete fetch gives every turbine the same count.
wtk_df_2018["tid"].value_counts()

t007    105120
t139    105120
t207    105120
t192    105120
t183    105120
t182    105120
t170    105120
t169    105120
t140    105120
t135    105120
t024    105120
t133    105120
t114    105120
t083    105120
t074    105120
t041    105120
t034    105120
t028    105120
t221    105120
Name: tid, dtype: int64

In [10]:
# The 'datetime' column is used unchanged as 'packet_date'. Localizing it with
#     pd.to_datetime(wtk_df_2018["datetime"]).dt.tz_localize('UTC')
# fails here with: Already tz-aware, use tz_convert to convert.
wtk_df_2018['packet_date'] = wtk_df_2018["datetime"]

# Export only the four columns used downstream, as a bz2-compressed CSV.
# Keep exactly one destination active.

#Bergey
#wtk_df_2018.to_csv("01 Bergey Turbine Data/wtk_led_2018.csv.bz2",
#                   columns=['tid','packet_date','ws','wd'], index=False)

#One Energy
wtk_df_2018.to_csv("01 One Energy Turbine Data/wtk_led_2018.csv.bz2",
                   columns=['tid','packet_date','ws','wd'], index=False)

#### 2019 hourly data from WTK-LED

In [12]:
def fetch_dfs_wtk_led_2019(index, wtk_dfs=None):
    """Fetch the 2019 hourly WTK-LED series for every turbine in `index`.

    Parameters
    ----------
    index : pandas.DataFrame
        Turbine index. Fields are read by position from ``itertuples()``:
        row[1] is the turbine id, row[5] the latitude, row[6] the longitude
        and row[7] the hub height (row[0] is the frame index).
    wtk_dfs : list of DataFrame, optional
        Frames fetched by an earlier, interrupted run. Turbines whose id is
        already present are skipped. The list is appended to in place.
        When omitted, a new empty list is used.

    Returns
    -------
    list of DataFrame
        `wtk_dfs` with one frame (carrying a constant 'tid' column) appended
        for each turbine fetched during this call.
    """
    # Create the accumulator per call. The former `wtk_dfs=[]` default was
    # shared between all calls that did not pass a list.
    if wtk_dfs is None:
        wtk_dfs = []

    # Ids already fetched, collected once instead of once per row.
    done_tids = {df['tid'].iloc[0] for df in wtk_dfs if len(df)}

    # 2019 hourly file. It is opened once for all turbines and closed by the
    # context manager when the loop ends or fails; previously the handle was
    # never closed.
    with MultiYearWindX('/campaign/tap/CONUS/wtk/60min/2019/conus_2019.h5',
                        hsds=False) as myr:
        for row in tqdm(index.itertuples()):
            tid = row[1]
            lat = row[5]
            lon = row[6]
            z_turbine = row[7]
            # avoid re-running things that have already run
            if tid in done_tids:
                print("Skipping "+str(tid))
                continue

            atmospheric_df = get_data_wtk_led_on_eagle(myr,
                                              lat, lon, z_turbine, "IDW",
                                              power_estimate=False,
                                              start_time_idx=None,
                                              end_time_idx=None,
                                              time_stride=None)

            atmospheric_df['tid'] = tid
            wtk_dfs.append(atmospheric_df)
            done_tids.add(tid)

    return wtk_dfs

# Start from a fresh accumulator. fetch_dfs_wtk_led_2019() appends to the
# list it is given, so turbines fetched before a failure are still in
# `wtk_dfs` for the retry. A fresh list also keeps frames left in `wtk_dfs`
# by earlier sections of this notebook out of the 2019 results.
wtk_dfs = []

# in case of a connection error we will try a second time
try:
    wtk_dfs = fetch_dfs_wtk_led_2019(index, wtk_dfs)
except ConnectionError:
    print("Caught a connection error, trying to resume...")
    # The old handler passed the undefined name `indx` (typo for `index`),
    # so the retry raised a NameError instead of resuming.
    wtk_dfs = fetch_dfs_wtk_led_2019(index, wtk_dfs)

19it [02:27,  7.77s/it]


In [13]:
# Stack the per-turbine 2019 frames into one DataFrame and preview it;
# the 'tid' column tells the turbines apart.
wtk_df_2019 = pd.concat(wtk_dfs)
wtk_df_2019.head()

Unnamed: 0,datetime,ws,wd,tid
0,2019-01-01 00:00:00+00:00,13.028949,18.310087,t007
1,2019-01-01 01:00:00+00:00,13.251175,16.295881,t007
2,2019-01-01 02:00:00+00:00,11.943526,15.941548,t007
3,2019-01-01 03:00:00+00:00,12.862984,11.711439,t007
4,2019-01-01 04:00:00+00:00,11.938656,5.323972,t007


In [14]:
# Total row count across all turbines. The recorded run shows
# 166440 = 19 turbines x 8760 hourly rows each.
len(wtk_df_2019)

166440

In [15]:
# The 'datetime' column is used unchanged as 'packet_date'. Localizing it with
#     pd.to_datetime(wtk_df_2019["datetime"]).dt.tz_localize('UTC')
# fails here with: Already tz-aware, use tz_convert to convert.
wtk_df_2019['packet_date'] = wtk_df_2019["datetime"]

# Export only the four columns used downstream, as a bz2-compressed CSV.
# Keep exactly one destination active.

#Bergey
#wtk_df_2019.to_csv("01 Bergey Turbine Data/wtk_led_2019.csv.bz2",
#                   columns=['tid','packet_date','ws','wd'], index=False)

#One Energy
wtk_df_2019.to_csv("01 One Energy Turbine Data/wtk_led_2019.csv.bz2",
                   columns=['tid','packet_date','ws','wd'], index=False)

In [16]:
# Rows per turbine id; a complete fetch gives every turbine the same count.
wtk_df_2019["tid"].value_counts()

t007    8760
t139    8760
t207    8760
t192    8760
t183    8760
t182    8760
t170    8760
t169    8760
t140    8760
t135    8760
t024    8760
t133    8760
t114    8760
t083    8760
t074    8760
t041    8760
t034    8760
t028    8760
t221    8760
Name: tid, dtype: int64