## Performance Analysis for Daily Energy Production

Caleb Phillips (caleb.phillips@nrel.gov) and Jenna Ruzekowicz (jenna.ruzekowicz@nrel.gov)

The purpose of this notebook is to read in computed predictions and compare them to actual observations, computing metrics. 

In [1]:
from common import *
import pandas as pd
import numpy as np
from tqdm import tqdm
import glob
import re
from dw_tap.power_output import estimate_power_output
import os.path
import importlib
import power_output
import site_index
import plotly.express as px

In [2]:
importlib.reload(site_index)
index = site_index.SiteIndex()

### Load Ground Truth (Bergey Daily Data)

In [3]:
daily_bergey = pd.read_csv("01 Bergey Turbine Data/daily_summaries.csv.bz2")
daily_bergey = daily_bergey.merge(index.index[['APRS ID','AID']].rename(columns={'APRS ID':'tid'}))
daily_bergey['date'] = pd.to_datetime(daily_bergey['date'],utc=True).dt.date
daily_bergey.head()

Unnamed: 0,date,n,energy_kwh,power_min_w,power_max_w,power_avg_w,soft_grid,faults,AID,tid
0,2020-02-19,6422,23.0,-54,4780,1333,No,,A2719,t007
1,2020-02-19,6422,23.0,-54,4780,1333,No,,A2719,t007
2,2020-02-18,8658,40.0,-49,11629,1735,No,,A2719,t007
3,2020-02-17,8657,21.0,-38,6625,928,No,,A2719,t007
4,2020-02-16,8659,1.0,-62,1668,75,No,,A2719,t007


#### Evaluate faults

In [30]:
pd.set_option('display.max_rows', 500)
daily_bergey[['soft_grid']].drop_duplicates().head(500)

Unnamed: 0,soft_grid
0,No
405,0.0%
992,1.4%
1037,0.6%
1049,0.5%
1066,0.2%
1100,0.3%
1175,0.1%
1207,0.4%
1208,13.4%


In [63]:
daily_bergey["fault"] = "None"
daily_bergey.loc[daily_bergey['faults'].str.contains("FAULT"),"fault"] = "Fault"
daily_bergey.loc[daily_bergey['faults'].str.contains("WAITING INITIALIZING"),"fault"] = "Reset"
daily_bergey.loc[daily_bergey['faults'].str.contains("MANUAL STOP"),"fault"] = "Stopped"
daily_bergey.loc[daily_bergey['soft_grid'].str.contains("%"),"fault"] = "Curtailed"
daily_bergey["fault"].value_counts()

None         56302
Reset         4471
Fault         3798
Curtailed     2482
Stopped        136
Name: fault, dtype: int64

### Load Prediction Data for all Models/Sites

In [4]:
providers = ["bergey"]
models = ["perera","anl"]
wind_sources = ["wtk","wtk_bc","wtk_led_2018","wtk_led_2019","wtk_led_bc"]

dfs = []

for provider in providers:
    for model in models:
        for wind_source in wind_sources:
            for tid in index.tids(True):
                fname = f"03 Model Outputs/{provider}_{model}_{tid}_{wind_source}.csv.bz2"
                if not os.path.exists(fname):
                    continue
                
                print(fname)
                
                d = pd.read_csv(fname)
                d["model"] = model
                d["wind_source"] = wind_source
                d["provider"] = provider
                d["tid"] = tid
                dfs.append(d[["model","wind_source","provider","tid","datetime","ws-adjusted"]])
                
                if model == "perera":
                    d2 = d.copy()
                    d2["ws-adjusted"] = d2["ws-adjusted-2"]
                    d2["model"] = "shelter"
                    dfs.append(d2[["model","wind_source","provider","tid","datetime","ws-adjusted"]])
                    
                    d3 = d.copy()
                    d3["ws-adjusted"] = d3["ws-adjusted-3"]
                    d3["model"] = "shelter+"
                    dfs.append(d3[["model","wind_source","provider","tid","datetime","ws-adjusted"]])

03 Model Outputs/bergey_perera_t024_wtk.csv.bz2
03 Model Outputs/bergey_perera_t028_wtk.csv.bz2
03 Model Outputs/bergey_perera_t034_wtk.csv.bz2
03 Model Outputs/bergey_perera_t041_wtk.csv.bz2
03 Model Outputs/bergey_perera_t083_wtk.csv.bz2
03 Model Outputs/bergey_perera_t114_wtk.csv.bz2
03 Model Outputs/bergey_perera_t133_wtk.csv.bz2
03 Model Outputs/bergey_perera_t135_wtk.csv.bz2
03 Model Outputs/bergey_perera_t139_wtk.csv.bz2
03 Model Outputs/bergey_perera_t140_wtk.csv.bz2
03 Model Outputs/bergey_perera_t169_wtk.csv.bz2
03 Model Outputs/bergey_perera_t170_wtk.csv.bz2
03 Model Outputs/bergey_perera_t182_wtk.csv.bz2
03 Model Outputs/bergey_perera_t183_wtk.csv.bz2
03 Model Outputs/bergey_perera_t192_wtk.csv.bz2
03 Model Outputs/bergey_perera_t207_wtk.csv.bz2
03 Model Outputs/bergey_perera_t221_wtk.csv.bz2
03 Model Outputs/bergey_perera_t034_wtk_bc.csv.bz2
03 Model Outputs/bergey_perera_t133_wtk_bc.csv.bz2
03 Model Outputs/bergey_perera_t140_wtk_bc.csv.bz2
03 Model Outputs/bergey_perera_

In [5]:
bigdf = pd.concat(dfs)
bigdf.head()

Unnamed: 0,model,wind_source,provider,tid,datetime,ws-adjusted
0,perera,wtk,bergey,t024,2007-01-01 00:00:00,3.995366
1,perera,wtk,bergey,t024,2007-01-01 01:00:00,3.408681
2,perera,wtk,bergey,t024,2007-01-01 02:00:00,4.416225
3,perera,wtk,bergey,t024,2007-01-01 03:00:00,4.88642
4,perera,wtk,bergey,t024,2007-01-01 04:00:00,4.23436


In [6]:
importlib.reload(power_output)
bigdf["power_kw"] = power_output.Bergey10.windspeed_to_kw(bigdf,'ws-adjusted')
bigdf['datetime'] = pd.to_datetime(bigdf['datetime'],format="%Y-%m-%d %H:%M:%S",utc=True) # this is slow
bigdf['date'] = bigdf['datetime'].dt.date
bigdf.head()

Unnamed: 0,model,wind_source,provider,tid,datetime,ws-adjusted,power_kw,date
0,perera,wtk,bergey,t024,2007-01-01 00:00:00+00:00,3.995366,0.397321,2007-01-01
1,perera,wtk,bergey,t024,2007-01-01 01:00:00+00:00,3.408681,0.201507,2007-01-01
2,perera,wtk,bergey,t024,2007-01-01 02:00:00+00:00,4.416225,0.559754,2007-01-01
3,perera,wtk,bergey,t024,2007-01-01 03:00:00+00:00,4.88642,0.785896,2007-01-01
4,perera,wtk,bergey,t024,2007-01-01 04:00:00+00:00,4.23436,0.486365,2007-01-01


In [17]:
daily_summaries = bigdf[['model','wind_source','provider','tid','date','power_kw','ws-adjusted']]\
    .groupby(['model','wind_source','provider','tid','date']).agg({ 'power_kw': ['count','sum'], 
                                                                    'ws-adjusted': ['mean','max','min','median'] })
daily_summaries.columns = ['_'.join(col) for col in daily_summaries.columns.values]
daily_summaries = daily_summaries.reset_index()
# scale the 5 min data so that it is kwh (too)
daily_summaries["power_kw_sum"] = daily_summaries["power_kw_sum"]*(daily_summaries["power_kw_count"]/24)
daily_summaries.head()

Unnamed: 0,model,wind_source,provider,tid,date,power_kw_count,power_kw_sum,ws-adjusted_mean,ws-adjusted_max,ws-adjusted_min,ws-adjusted_median
0,anl,wtk,bergey,t024,2007-01-01,24,2.540081,2.159771,4.88642,0.279416,1.821794
1,anl,wtk,bergey,t024,2007-01-02,24,20.799516,4.873958,6.1185,1.97474,5.135209
2,anl,wtk,bergey,t024,2007-01-03,24,105.467064,8.987273,23.404978,0.367416,7.612774
3,anl,wtk,bergey,t024,2007-01-04,24,226.216302,15.132936,24.304735,7.243944,14.919122
4,anl,wtk,bergey,t024,2007-01-05,24,125.548508,7.974929,13.008134,0.52971,8.504868


### Create Merged Dataframe

In [67]:
merged_df = daily_summaries.merge(daily_bergey[['date','tid','energy_kwh','fault']],on=['tid','date'])
merged_df.head()

Unnamed: 0,model,wind_source,provider,tid,date,power_kw_count,power_kw_sum,ws-adjusted_mean,ws-adjusted_max,ws-adjusted_min,ws-adjusted_median,energy_kwh,fault
0,anl,wtk,bergey,t024,2010-05-17,24,151.764014,9.221244,13.932734,2.822109,9.316758,0.0,
1,anl,wtk,bergey,t024,2010-05-17,24,151.764014,9.221244,13.932734,2.822109,9.316758,0.0,
2,perera,wtk,bergey,t024,2010-05-17,24,147.64746,9.090354,13.663213,2.814173,9.297537,0.0,
3,perera,wtk,bergey,t024,2010-05-17,24,147.64746,9.090354,13.663213,2.814173,9.297537,0.0,
4,shelter,wtk,bergey,t024,2010-05-17,24,150.489371,9.1878,13.927745,2.814173,9.297537,0.0,


In [135]:
# using 300 kwh as the maximum daily production
merged_df["relative_error_pct"] = ((merged_df["power_kw_sum"] - merged_df["energy_kwh"])/300)*100 

#### Over-predictions (impossibly high energy values)

Seems to imply something is wrong with the Perera model -- need to investigate...

In [95]:
# Percentage of predicted energy values that appear bogus (>300 kwh)
merged_df.loc[merged_df["power_kw_sum"] > 300,"model"].value_counts()*100/merged_df["model"].value_counts()

anl          1.182412
perera      17.694587
shelter     18.153174
shelter+    18.156294
Name: model, dtype: float64

In [104]:
merged_df.loc[(merged_df["power_kw_sum"] > 300) & (merged_df["model"] == "perera"),"tid"].value_counts()

t034    744
t170    691
t140    374
t135    373
t139    370
t192    368
t041    363
t114    363
t221    361
t182    357
t024    336
t207    242
t183    238
t083    220
t028    144
t133    126
t169      2
Name: tid, dtype: int64

In [105]:
merged_df.loc[(merged_df["power_kw_sum"] > 300) & (merged_df["model"] == "anl"),"tid"].value_counts()

t034    294
t083      1
Name: tid, dtype: int64

### Performance Plots

In [136]:
filtered_merged_df = merged_df[(merged_df["fault"] == "None") & (merged_df["power_kw_sum"] <= 300)]

In [None]:
fig = px.scatter(filtered_merged_df,x="energy_kwh",y="power_kw_sum",
                 facet_row="model",facet_col="wind_source",
                 labels={"power_kw_sum":"Pred(kwh)","energy_kwh":"Obs(kwh)"})
fig.show()

In [None]:
fig = px.scatter(merged_df,x="energy_kwh",y="power_kw_sum",
                 facet_row="model",facet_col="wind_source",color="fault",
                 labels={"power_kw_sum":"Pred(kwh)","energy_kwh":"Obs(kwh)"})
fig.update_yaxes(matches=None)
fig.update_xaxes(matches=None)
fig.show()

In [None]:
fig = px.density_contour(filtered_merged_df,x="energy_kwh",y="power_kw_sum",
                         facet_row="model",facet_col="wind_source",
                         labels={"power_kw_sum":"Pred(kwh)","energy_kwh":"Obs(kwh)"})
fig.update_xaxes(range=[0, 50])
fig.update_yaxes(range=[0, 50])
fig.update_traces(contours_coloring="fill", contours_showlabels = True)
fig.show()

In [147]:
# Defining function for 50th Percentile
p25 = lambda x: x.quantile(0.25)
p75 = lambda x: x.quantile(0.75)

perf_summary = filtered_merged_df.groupby(["model","wind_source"]).\
    agg({ "relative_error_pct": ["mean","median",p25,p75] }).\
    rename(columns={"<lambda_0>":'25%ile',"<lambda_1>":'75%ile'})
perf_summary

Unnamed: 0_level_0,Unnamed: 1_level_0,relative_error_pct,relative_error_pct,relative_error_pct,relative_error_pct
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,median,25%ile,75%ile
model,wind_source,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
anl,wtk,6.468869,4.02022,-1.009716,12.680447
anl,wtk_bc,5.877847,3.412627,-5.014758,14.418755
anl,wtk_led_2019,18.590453,12.545293,2.602122,30.649272
perera,wtk,3.825857,2.498482,-3.394311,10.122712
perera,wtk_bc,-0.997745,-0.838901,-9.658032,7.862026
perera,wtk_led_2018,36.994848,37.580526,8.562875,65.764929
perera,wtk_led_2019,14.655332,9.468049,0.878684,25.03286
perera,wtk_led_bc,-0.048104,0.55779,-8.534888,4.05208
shelter,wtk,5.728042,3.627468,-1.48916,11.781661
shelter,wtk_bc,2.433033,2.332348,-5.137235,10.570167


In [None]:
fig = px.box(filtered_merged_df,x="model",y="relative_error_pct",color="wind_source",
             labels={"relative_error_pct":"Relative Error (%)","model":"Model"},
             points=False)
fig.show()

In [None]:
fig = px.box(filtered_merged_df,x="wind_source",y="relative_error_pct",color="model",
             labels={"relative_error_pct":"Relative Error (%)","wind_source":"Wind Source"},
             points=False)
fig.show()

#### Summary

 - All models produce high errors sometimes, but the bulk of errors (IQR) are within +/- 10%
 - WTK LED 2018 has very high bias, 2019 appears better
 - Bias correction makes a very large difference, especially for WTK LED
 - The Perera suite of models produces extremely high estimates in some cases, which is likely a bug
 - Overall Perera outperforms ANL (!)