## Run ANL's LOM: PILOWF

Jenna Ruzekowicz (jenna.ruzekowicz@nrel.gov), Caleb Phillips (caleb.phillips@nrel.gov), and Dmitry Duplyakin (dmitry.duplyakin@nrel.gov)

The purpose of this notebook is to read in inflow data, load obstacle data, and run the LOMs.

Output is saved into files named:

 > `bergey|oneenergy_anl|lanl_tid_windSource_obstacleMode.csv.bz2`, 
 
 > where `windSource` is one of: `wtk`, `wtk_led_2018`, `wtk_led_2019`, `wtk_bc`, `wtk_led_bc` (`bc` refers to bias-corrected versions) 
 
 > and `obstacleMode` is one of: `bldgsonly`, `treesasbldgs`, `bldgsandtrees`

### Work notes:

- Currently only ANL model
- Sites t007 and t074 are currently excluded because they don't have height data matching other sites
- Code looks for `"%s/%sv2.json" % (obstacle_data_dir, tid)` files for site obstacles, where `obstacle_data_dir` is `01 Bergey Turbine Data/3dbuildings_geojson`
- Site t207 takes a very long time! (slowest site)
- Notice that based on the plots at the end of the notebook ws-adjusted = ws (exactly) for quite a few sites. No negative ws-adjusted values observed yet for this set of inputs.
- `wind_sources = ["wtk", "wtk_led_2019", "wtk_bc", "wtk_led_bc"]` -- not including "wtk_led_2018" yet because the 5-minute data for 2018 is very large and processing will be slower than for the other options here; will add it for final/more complete processing
- When the `overwrite` flag is not set, previously studied site x wind_source combinations are skipped (if output files with matching names are found)

In [None]:
import numpy as np
import pandas as pd
import geopandas as gpd
from tqdm import tqdm
import matplotlib.pyplot as plt
from dw_tap.lom import run_lom
import os
import seaborn as sns
import glob
from dw_tap.data_processing import _LatLon_To_XY, filter_obstacles
import subprocess
import shutil

%matplotlib inline
%config InlineBackend.figure_format='retina'

In [None]:
# Site index table: one row per turbine site (later cells read the
# "APRS ID", "Latitude", "Longitude" and "Hub Height (m)" columns from it)
index_csv_path = "01 Bergey Turbine Data/bergey_sites.csv"
index = pd.read_csv(index_csv_path)

# Preview the first rows
index.head()

### Select which sites need to be processed, wind data sources, and obstacles modes

In [None]:
# Small test with several sites
#selected = ["t133", "t135"]
#selected = ["t034", "t133", "t135"]
#selected = ["t207"]

# Sites that currently don't have obstacle descriptions with the heights based on lidar data
sites_without_lidar_heights = {"t007", "t074"}

# Slowest site, left out for now
slowest_sites = {"t207"}

# Process all sites from the index except the excluded ones (index order is preserved)
excluded_sites = sites_without_lidar_heights | slowest_sites
selected = [site_id for site_id in index["APRS ID"].tolist() if site_id not in excluded_sites]
print(selected)

# Values handled by the wind-data loading cell:
# "wtk", "wtk_led_2018", "wtk_led_2019", "wtk_bc", "wtk_led_bc"
wind_sources = ["wtk", "wtk_bc"]

# Values handled by the PILOWF preparation cell:
# "bldgsonly", "bldgsandtrees", "treesasbldgs", "bldgsonly_100m", "treesasbldgs_100m"
obstacle_modes = ["bldgsonly_100m"]

### Load wind data

In [None]:
# Loading recipe per wind source:
#   (csv file, column copied into "datetime", column copied into "ws")
# A None entry means that column is left exactly as loaded.
# For the bias-corrected sources the original "ws" is overwritten with "ws_bc",
# so that further steps actually use the bias-corrected wind speeds.
wind_source_recipes = {
    "wtk": ("01 Bergey Turbine Data/wtk.csv.bz2", None, None),
    "wtk_led_2018": ("01 Bergey Turbine Data/wtk_led_2018.csv.bz2", "packet_date", None),
    "wtk_led_2019": ("01 Bergey Turbine Data/wtk_led_2019.csv.bz2", "packet_date", None),
    "wtk_bc": ("02 Bias Correction/wtk_bc.csv.bz2", None, "ws_bc"),
    "wtk_led_bc": ("02 Bias Correction/wtk_led_bc.csv.bz2", None, "ws_bc"),
}

# atmospheric_inputs[wind_source][tid] -> dataframe with the rows of that site
atmospheric_inputs = {}

for wind_source in wind_sources:

    recipe = wind_source_recipes.get(wind_source)
    if recipe is None:
        print("Unsupported wind_source selected:", wind_source)
        continue

    csv_path, datetime_from, ws_from = recipe
    source_df = pd.read_csv(csv_path)

    # Create dict with dataframes that correspond to selected tid's
    dfs_by_tid = {}
    for tid in selected:
        site_df = source_df[source_df["tid"] == tid].reset_index(drop=True)

        if datetime_from is not None:
            site_df["datetime"] = site_df[datetime_from]

        if ws_from is not None:
            site_df["ws"] = site_df[ws_from]

        dfs_by_tid[tid] = site_df
        #display(dfs_by_tid[tid].head(3))

    atmospheric_inputs[wind_source] = dfs_by_tid

### Load obstacle data

In [None]:
# Placeholder list; nothing in this cell appends to it
sites_with_tall_blgs = []

# Directory holding one obstacle file per site, named "<tid>v2.json"
obstacle_data_dir = "01 Bergey Turbine Data/3dbuildings_geojson"

# obstacle_inputs[tid] -> filtered obstacle dataframe of that site (trees included)
obstacle_inputs = {}
for tid in selected:
    #print("Processing tid: ", tid)

    # Hub height of this site, handed to the obstacle filter below
    index_row = index[index["APRS ID"] == tid].iloc[0]
    z_turbine = index_row["Hub Height (m)"]

    obstacle_data_file = "%s/%sv2.json" % (obstacle_data_dir, tid)

    # Sites without an obstacle file get no entry in obstacle_inputs
    if not os.path.exists(obstacle_data_file):
        print("Can't access: %s. Skipping" % obstacle_data_file)
        continue

    raw_obstacles = gpd.read_file(obstacle_data_file)
    #print("BEFORE filtering (%s):" % obstacle_data_file)
    #display(raw_obstacles)

    obstacle_df = filter_obstacles(
        tid,
        raw_obstacles,
        include_trees=True,
        turbine_height_for_checking=z_turbine,
    )
    # Tag every obstacle with its site so rows stay identifiable after concatenation
    obstacle_df["tid"] = tid
    obstacle_inputs[tid] = obstacle_df

    #print("AFTER filtering (%s):" % obstacle_data_file)
    #display(obstacle_df)

# Single table with the filtered obstacles of all sites
all_obstacle_inputs = pd.concat(obstacle_inputs.values())
display(all_obstacle_inputs)

In [None]:
# Save combined and filtered obstacles dataframe into a file
#obstacle_data_dir = "01 Bergey Turbine Data/3dbuildings_geojson"
#dest_file = "%s/all_obstacles.json" % (obstacle_data_dir)
#all_obstacle_inputs.to_file(dest_file, driver="GeoJSON", index=False)

In [None]:
# Quick vis: one small plot per site's filtered obstacle set
for site_obstacles in obstacle_inputs.values():
    site_obstacles.plot(figsize=(2, 2))

### Run ANL's LOM

In [None]:
# Working sequential version -- one site at a time

# # This flag allows overwriting previously saved files with results if they are found in the specified directory dest_dir 
# overwrite = False

# # Will be used in the filenames
# site_type = "bergey"

# # Will be used in the filenames
# model_type = "anl"

# dest_dir = "03 Model Outputs"
# if not os.path.exists(dest_dir):
#     os.makedirs(dest_dir)    
    
# for tid in tqdm(selected):
    
#     for wind_source in wind_sources:
        
#         for obstacle_mode in obstacle_modes:
        
#             dest_filename = "%s/%s_%s_%s_%s_%s.csv.bz2" % (dest_dir, site_type, model_type, tid, wind_source, obstacle_mode)
            
#             if (not overwrite) and (os.path.exists(dest_filename)):
#                 print("Found previously saved %s); overwrite flag is off. Skipping to next config." % (dest_filename))
#             else:
#                 row = index[index["APRS ID"] == tid].iloc[0]
#                 #print(row)
#                 lat = row["Latitude"]
#                 lon = row["Longitude"]
#                 z_turbine = row["Hub Height (m)"]
#                 xy_turbine = [np.array([lon, lat])]

#                 if obstacle_mode == "bldgsonly":
#                     obs_df = obstacle_inputs[tid]
#                     obs_df = obs_df[obs_df["feature_type"] == "building"].reset_index(drop=True)
#                 elif obstacle_mode == "bldgsandtrees":
#                     # Assume trees pass the filtering run above in this notebook 
#                     obs_df = obstacle_inputs[tid]
                
#                 predictions_df = run_lom(atmospheric_inputs[wind_source][tid], \
#                                          obs_df, \
#                                          xy_turbine, z_turbine, \
#                                          check_distance=True)

#                 # Add LOM output back to the more complete input dataframe
#                 atmospheric_inputs[wind_source][tid]["ws-adjusted"] = predictions_df["ws-adjusted"]   
#                 atmospheric_inputs[wind_source][tid].to_csv(dest_filename, index=False)

In [None]:
# Prepare files for PILOWF + multiprocessing & run LOM
#
# For each wind_source x obstacle_mode combination this cell:
#   1. recreates dest_dir from scratch,
#   2. picks/filters each site's obstacles according to obstacle_mode,
#   3. writes <tid>-obstacles.json and <tid>-atmospheric.csv.bz2 into dest_dir.
# Sites whose obstacle set is empty are not written into dest_dir; for them ws-adjusted = ws.
#
# NOTE(review): dest_dir is wiped at the start of every combination and the subprocess call
# that would consume it is commented out at the bottom of this cell, so once the loops finish
# only the inputs of the last combination remain on disk.

# When False, a result file that already exists in "03 Model Outputs" is reported and kept
# (in this cell the check only applies to sites with an empty obstacle set)
overwrite = True

# Will be used in the filenames
site_type = "bergey"

# Will be used in the filenames
model_type = "anl"

# Directory that receives the per-site PILOWF input files
dest_dir = "01 Bergey Turbine Data/pilowf_inputs/"

# Script with the multiprocessing LOM run (only referenced by the commented-out call below)
script_path = "./run_pilowf_mp.py"

# Modes handled by the branches below. Checked up front, before anything is deleted, so that
# a misspelled mode fails loudly instead of reusing obs_df left over from a previous site.
supported_obstacle_modes = ("bldgsonly", "bldgsandtrees", "treesasbldgs",
                            "bldgsonly_100m", "treesasbldgs_100m")
unsupported_modes = [mode for mode in obstacle_modes if mode not in supported_obstacle_modes]
if unsupported_modes:
    raise ValueError("Unsupported obstacle_mode selected: %s" % unsupported_modes)

for wind_source in wind_sources:

    for obstacle_mode in obstacle_modes:

        # Make sure to start from scratch and not reuse previously saved inputs
        if os.path.exists(dest_dir):
            shutil.rmtree(dest_dir, ignore_errors=True)
        os.makedirs(dest_dir)

        for tid in selected:
            row = index[index["APRS ID"] == tid].iloc[0]
            #print(row)
            lat = row["Latitude"]
            lon = row["Longitude"]
            z_turbine = row["Hub Height (m)"]
            # Not used further in this cell; the commented-out sequential version passed it to run_lom
            xy_turbine = [np.array([lon, lat])]

            if obstacle_mode == "bldgsonly":
                # Keep only the rows marked as buildings
                obs_df = obstacle_inputs[tid]
                obs_df = obs_df[obs_df["feature_type"] == "building"].reset_index(drop=True)
            elif obstacle_mode in ("bldgsandtrees", "treesasbldgs"):
                # Assume trees pass the filtering run above in this notebook
                obs_df = obstacle_inputs[tid]
            else:
                # "bldgsonly_100m" or "treesasbldgs_100m": re-filter within a 100 m radius of the
                # turbine; trees are kept only for the "treesasbldgs_100m" mode.
                # Presumably copied so that filtering cannot touch the frame stored in obstacle_inputs.
                obs_df = obstacle_inputs[tid].copy()

                print("# of obs (before 100m filtering):", len(obs_df))
                obs_df = filter_obstacles(tid,
                                          obs_df,
                                          include_trees=(obstacle_mode == "treesasbldgs_100m"),
                                          turbine_height_for_checking=z_turbine,
                                          limit_to_radius_in_m=100.0,
                                          turbine_lat_lon=(lat, lon))
                print("# of obs (after 100m filtering):", len(obs_df))

            if len(obs_df) == 0:
                print("tid=%s: Obstacle set is empty after filtering. ws-adjusted=ws for this cases." % tid)
                # obs_df.to_file() breaks if empty, so no PILOWF inputs are written for this site

                output_dest = "03 Model Outputs/%s_%s_%s_%s_%s.csv.bz2" % (site_type, model_type, tid, wind_source, obstacle_mode)

                if (not overwrite) and (os.path.exists(output_dest)):
                    # Report the file that was actually checked (output_dest)
                    print("Found previously saved %s); overwrite flag is off. Skipping to next config." % (output_dest))
                else:
                    # Without obstacles the adjusted wind speed equals the input wind speed
                    res = atmospheric_inputs[wind_source][tid].copy()
                    res["ws-adjusted"] = res["ws"]
                    # NOTE(review): the write below is commented out, so nothing is saved
                    # even though the message says so -- confirm whether this is intended
                    #res.to_csv(output_dest, index=False)
                    print("Saved output:", output_dest)

                # Skip the following and go to the next tid x obstacle_mode combination
                continue

            # Save inputs for PILOWF into separate files
            obs_df.to_file("%s/%s-obstacles.json" % (dest_dir, tid),
                           driver="GeoJSON", index=False)
            atmospheric_inputs[wind_source][tid].to_csv("%s/%s-atmospheric.csv.bz2" % (dest_dir, tid), index=False)

        # It is expected that dest_dir now has a set of individual inputs (atmospheric file and obstacle file for each tid)

        # Must use subprocess as a way of wrapping/calling the python script because that script uses multiprocessing
        # and other methods seem to break
        # "<TID>" in "--output_filename_pattern" will be replaced with actual TIDs inside the script with parallel processing

        # subprocess.run(["python", script_path,
        #             "--inputs_dir", dest_dir, \
        #             "--index_file", "01 Bergey Turbine Data/bergey_sites.csv", \
        #             "--output_filename_pattern", \
        #                 "03 Model Outputs/%s_%s_%s_%s_%s.csv.bz2" % (site_type, model_type, "<TID>", wind_source, obstacle_mode), \
        #             "--procs", "8"],
        #            #stdout=subprocess.DEVNULL, # This addition suppresses entire (lengthy) output
        #            cwd="./")

In [None]:
# Check one case and confirm that there is indeed difference between
# bldgsonly and bldgsandtrees

# f1 = "03 Model Outputs/bergey_anl_t034_wtk_bc_bldgsonly.csv.bz2"
# df1 = pd.read_csv(f1)
# df1

# f2 = "03 Model Outputs/bergey_anl_t034_wtk_bc_bldgsandtrees.csv.bz2"
# df2 = pd.read_csv(f2)
# df2

# Looking for non-zero mean here to see the difference between two outputs
# (df2["ws-adjusted"] - df1["ws-adjusted"]).mean()

In [None]:
# Quick vis of data in produced files

# for f in glob.iglob("%s/*" % dest_dir):
#     df = pd.read_csv(f)
#     fig = plt.gcf()
#     fig.set_size_inches(2.5,2.5)
#     sns.scatterplot(x=df["ws"], \
#                     y=df["ws-adjusted"], alpha=0.2).set(title=os.path.basename(f));
#     plt.show()