## Performance Analysis for Daily Energy Production

Caleb Phillips (caleb.phillips@nrel.gov)

The purpose of this notebook is to read in computed predictions, compare them to actual observations, and compute performance metrics.

In [None]:
from common import *
import pandas as pd
import numpy as np
from tqdm import tqdm
import glob
import re
from dw_tap.power_output import estimate_power_output
import os.path
import importlib
import power_output
import site_index
import plotly.express as px

In [None]:
# Site metadata lookup shared by the rest of the notebook: later cells read
# `index.index` (selecting its 'APRS ID' and 'AID' columns) and iterate over
# `index.tids()` to enumerate turbine ids.
index = site_index.SiteIndex()

### Load Ground Truth (Bergey Daily Data)

In [None]:
# Site lookup with 'APRS ID' renamed to `tid`; the merge below has no explicit
# `on=`, so it joins on whatever columns the two frames share.
site_ids = index.index[['APRS ID','AID']].rename(columns={'APRS ID':'tid'})

# Observed daily turbine summaries, tagged with site ids and with `date`
# reduced to a plain calendar date (parsed as UTC).
daily_bergey = (
    pd.read_csv("01 Bergey Turbine Data/daily_summaries.csv.bz2")
    .merge(site_ids)
    .assign(date=lambda frame: pd.to_datetime(frame['date'],utc=True).dt.date)
)
daily_bergey.head()

#### Evaluate faults

In [None]:
# List the distinct `soft_grid` values (first occurrence of each, up to 500 rows).
# NOTE: the display option is set globally and stays in effect for later cells.
pd.set_option('display.max_rows', 500)
daily_bergey.drop_duplicates(subset=['soft_grid'])[['soft_grid']].head(500)

In [None]:
# Collapse the raw status text into one coarse label per day.
# Rules are applied in order, so a later match overrides an earlier one
# (e.g. a day matching both "FAULT" and a '%' soft_grid ends up "Curtailed").
fault_rules = [
    # (source column, literal substring, label)
    ("faults", "FAULT", "Fault"),
    ("faults", "WAITING INITIALIZING", "Reset"),
    ("faults", "MANUAL STOP", "Stopped"),
    ("soft_grid", "%", "Curtailed"),
]

daily_bergey["fault"] = "None"
for source_col, needle, label in fault_rules:
    # regex=False: the needles are plain substrings, not patterns.
    # na=False: a missing status counts as "no match"; without it the mask
    # contains NaN and .loc refuses to index with it.
    matches = daily_bergey[source_col].str.contains(needle, regex=False, na=False)
    daily_bergey.loc[matches, "fault"] = label

daily_bergey["fault"].value_counts()

### Load Prediction Data for all Models/Sites

In [None]:
providers = ["bergey"]
models = ["perera","anl"]
wind_sources = ["wtk","wtk_bc","wtk_led_2018","wtk_led_2019","wtk_led_bc"]

# Every collected frame is trimmed to these columns so they can be concatenated later.
keep_cols = ["model","wind_source","provider","tid","datetime","ws-adjusted"]

# For "perera" files, two additional wind-speed columns are re-published as
# separate pseudo-models (column name -> model label).
perera_variants = {"ws-adjusted-2": "shelter", "ws-adjusted-3": "shelter+"}

dfs = []

for provider in providers:
    for model in models:
        for wind_source in wind_sources:
            for tid in index.tids(True):
                fname = f"03 Model Outputs/{provider}_{model}_{tid}_{wind_source}.csv.bz2"
                # Not every model/source/site combination has an output file; skip the gaps.
                if os.path.exists(fname):
                    print(fname)

                    d = pd.read_csv(fname).assign(
                        model=model,
                        wind_source=wind_source,
                        provider=provider,
                        tid=tid,
                    )
                    dfs.append(d[keep_cols])

                    if model == "perera":
                        for variant_col, variant_name in perera_variants.items():
                            variant = d.assign(**{"ws-adjusted": d[variant_col], "model": variant_name})
                            dfs.append(variant[keep_cols])

In [None]:
# No model data: the wind resource itself, labelled as model "none".
# For each wind source: (csv path, column renames needed to expose the
# `datetime` / `ws-adjusted` columns used by the rest of the notebook).
no_model_inputs = {
    "wtk": ("01 Bergey Turbine Data/wtk.csv.bz2",
            {"ws":"ws-adjusted"}),
    "wtk_led_2018": ("01 Bergey Turbine Data/wtk_led_2018.csv.bz2",
                     {"ws":"ws-adjusted","packet_date":"datetime"}),
    "wtk_led_2019": ("01 Bergey Turbine Data/wtk_led_2019.csv.bz2",
                     {"ws":"ws-adjusted","packet_date":"datetime"}),
    "wtk_bc": ("02 Bias Correction/wtk_bc.csv.bz2",
               {"ws_bc":"ws-adjusted"}),
    "wtk_led_bc": ("02 Bias Correction/wtk_led_bc.csv.bz2",
                   {"ws_bc":"ws-adjusted"}),
}

# NOTE: this cell appends to the `dfs` list created in the previous cell, so
# re-running it without re-running that cell adds the same frames twice.
for provider in providers:
    for wind_source in tqdm(wind_sources):
        if wind_source not in no_model_inputs:
            # Skip instead of falling through: there is no frame for this
            # source, and continuing would re-append the previous source's data
            # under the wrong label (or fail if nothing was loaded yet).
            print("Unsupported wind source")
            continue

        csv_path, renames = no_model_inputs[wind_source]
        d = pd.read_csv(csv_path).rename(columns=renames)

        d["wind_source"] = wind_source
        d["model"] = "none"
        d["provider"] = provider
        dfs.append(d[["model","wind_source","provider","tid","datetime","ws-adjusted"]])

In [None]:
# Stack every per-file frame into one long table. Each input frame carries its
# own 0..n-1 index, so renumber the rows: duplicated index labels would make any
# later index-aligned assignment ambiguous.
bigdf = pd.concat(dfs, ignore_index=True)
bigdf.head()

In [None]:
# Turn each adjusted wind speed into a power value (presumably via the Bergey
# 10 kW power curve -- see power_output.Bergey10 to confirm).
bigdf["power_kw"] = power_output.Bergey10.windspeed_to_kw(bigdf,'ws-adjusted')

# Parse the timestamps once (this is slow) and reuse the parsed series for the
# calendar-date column used as the daily grouping key.
parsed_times = pd.to_datetime(bigdf['datetime'],format="%Y-%m-%d %H:%M:%S",utc=True)
bigdf['datetime'] = parsed_times
bigdf['date'] = parsed_times.dt.date
bigdf.head()

In [None]:
# One row per (model, wind source, provider, site, day) with power and
# wind-speed statistics.
group_keys = ['model','wind_source','provider','tid','date']
daily_summaries = (
    bigdf[group_keys + ['power_kw','ws-adjusted']]
    .groupby(group_keys)
    .agg({ 'power_kw': ['count','sum'],
           'ws-adjusted': ['mean','max','min','median'] })
)
# Flatten the (column, statistic) MultiIndex into names such as `power_kw_sum`.
daily_summaries.columns = ['_'.join(col) for col in daily_summaries.columns.values]
daily_summaries = daily_summaries.reset_index()

# Convert summed power samples (kW) into daily energy (kWh). With `count`
# samples in a day, each sample stands for 24/count hours, so
#   energy = sum * 24 / count
# which is a factor of 1 for hourly data (24 samples) and 1/12 for 5-minute
# data (288 samples). Multiplying by count/24 instead would inflate 5-minute
# sources by a factor of 144.
hours_per_sample = 24/daily_summaries["power_kw_count"]
daily_summaries["power_kw_sum"] = daily_summaries["power_kw_sum"]*hours_per_sample
daily_summaries.head()

### Create Merged Dataframe

In [None]:
# Pair each predicted daily summary with the observed energy and fault label
# for the same site and day (default inner join: unmatched days drop out).
observed_daily = daily_bergey[['date','tid','energy_kwh','fault']]
merged_df = pd.merge(daily_summaries, observed_daily, on=['tid','date'])
merged_df.head()

In [None]:
# Error is expressed relative to a fixed 300 kwh maximum daily production
# rather than relative to each day's observation.
max_daily_kwh = 300
prediction_error = merged_df["power_kw_sum"] - merged_df["energy_kwh"]
merged_df["relative_error_pct"] = (prediction_error/max_daily_kwh)*100

# Promote `date` to a real datetime column so the .dt accessors work downstream.
merged_dates = pd.to_datetime(merged_df['date'])
merged_df['date'] = merged_dates
merged_df['month'] = merged_dates.dt.month

#### Over-predictions (impossibly high energy values)

Seems to imply something is wrong with the Perera model -- need to investigate...

In [None]:
# Percentage of predicted energy values that appear bogus (>300 kwh), per model.
# Averaging the boolean flag within each model gives the share directly and
# reports 0 for a model with no such days (dividing two value_counts() would
# leave NaN there, because the model is absent from the numerator).
(merged_df["power_kw_sum"] > 300).groupby(merged_df["model"]).mean()*100

In [None]:
# Which sites account for the >300 kwh daily predictions of the "perera" model.
over_limit = merged_df["power_kw_sum"].gt(300)
is_perera = merged_df["model"].eq("perera")
merged_df.loc[over_limit & is_perera, "tid"].value_counts()

In [None]:
# Which sites account for the >300 kwh daily predictions of the "anl" model.
over_limit = merged_df["power_kw_sum"].gt(300)
is_anl = merged_df["model"].eq("anl")
merged_df.loc[over_limit & is_anl, "tid"].value_counts()

### Performance Plots: Overall

In [None]:
# Keep only days with no fault label and a prediction at or below 300 kwh.
fault_free = merged_df["fault"].eq("None")
within_limit = merged_df["power_kw_sum"].le(300)
filtered_merged_df = merged_df[fault_free & within_limit]

In [None]:
# Observed vs. predicted daily energy, one panel per (model, wind source),
# using only the fault-free, plausible days.
axis_labels = {"power_kw_sum":"Pred(kwh)","energy_kwh":"Obs(kwh)"}
fig = px.scatter(
    filtered_merged_df,
    x="energy_kwh",
    y="power_kw_sum",
    facet_row="model",
    facet_col="wind_source",
    labels=axis_labels,
)
fig.show()

In [None]:
# Same scatter over the unfiltered data, coloured by fault label.
axis_labels = {"power_kw_sum":"Pred(kwh)","energy_kwh":"Obs(kwh)"}
fig = px.scatter(
    merged_df,
    x="energy_kwh",
    y="power_kw_sum",
    color="fault",
    facet_row="model",
    facet_col="wind_source",
    labels=axis_labels,
)
# Let each panel choose its own axis range instead of sharing one.
fig.update_yaxes(matches=None).update_xaxes(matches=None)


def _strip_facet_prefix(annotation):
    """Shorten a facet label such as 'model=anl' to just 'anl'."""
    annotation.update(text=annotation.text.split("=")[-1])


fig.for_each_annotation(_strip_facet_prefix)
fig.show()

In [None]:
# Density of observed vs. predicted daily energy, zoomed to 0-40 kwh on both axes.
axis_labels = {"power_kw_sum":"Pred(kwh)","energy_kwh":"Obs(kwh)"}
fig = px.density_contour(
    filtered_merged_df,
    x="energy_kwh",
    y="power_kw_sum",
    facet_row="model",
    facet_col="wind_source",
    labels=axis_labels,
)
fig.update_xaxes(range=[0, 40]).update_yaxes(range=[0, 40])
fig.update_traces(contours_coloring="fill", contours_showlabels = True)


def _strip_facet_prefix(annotation):
    """Shorten a facet label such as 'model=anl' to just 'anl'."""
    annotation.update(text=annotation.text.split("=")[-1])


fig.for_each_annotation(_strip_facet_prefix)
fig.show()

In [None]:
# Quartile helpers for the aggregation below (25th and 75th percentiles).
def p25(x):
    """Return the 25th percentile of a Series."""
    return x.quantile(0.25)


def p75(x):
    """Return the 75th percentile of a Series."""
    return x.quantile(0.75)


# pandas labels each aggregated column with the function's __name__, so set the
# display labels here instead of renaming auto-generated "<lambda_N>" names,
# whose numbering depends on the position of each lambda in the list.
p25.__name__ = '25%ile'
p75.__name__ = '75%ile'

# Distribution of the relative error for every (model, wind source) pair.
perf_summary = filtered_merged_df.groupby(["model","wind_source"]).\
    agg({ "relative_error_pct": ["mean","median",p25,p75] })
perf_summary

In [None]:
# Relative-error distribution per model, split by wind source (outlier points hidden).
fig = px.box(
    filtered_merged_df,
    x="model",
    y="relative_error_pct",
    color="wind_source",
    points=False,
    labels={"relative_error_pct":"Relative Error (%)","model":"Model"},
)
fig.show()

In [None]:
# Relative-error distribution per wind source, split by model (outlier points hidden).
fig = px.box(
    filtered_merged_df,
    x="wind_source",
    y="relative_error_pct",
    color="model",
    points=False,
    labels={"relative_error_pct":"Relative Error (%)","wind_source":"Wind Source"},
)
fig.show()

In [None]:
# Yearly totals of signed prediction error and observed energy per
# (model, wind source), then the total error as a percentage of observed energy.
byyear = (
    filtered_merged_df
    .assign(
        year=filtered_merged_df['date'].dt.year,
        error=filtered_merged_df['power_kw_sum'] - filtered_merged_df['energy_kwh'],
    )
    .groupby(['model','wind_source','year'], as_index=False)[['error','energy_kwh']]
    .sum()
)
byyear['pct'] = 100*byyear["error"]/byyear['energy_kwh']
byyear

In [None]:
# Spread of the yearly error percentage across years, per wind source and model.
px.box(
    byyear,
    x="wind_source",
    y="pct",
    color="model",
)

In [None]:
# Same yearly view restricted to the plain and bias-corrected WTK sources.
wtk_sources_only = byyear[byyear["wind_source"].isin(["wtk","wtk_bc"])]
px.box(
    wtk_sources_only,
    x="wind_source",
    y="pct",
    color="model",
    labels={"wind_source":"Wind Source","pct": "Annualized Error Percentage (%)","model":"Model"},
)

#### Performance Drill Down: ANL

In [None]:
# Median relative error of the "anl" model per wind source and calendar month.
anl_rows = filtered_merged_df[filtered_merged_df["model"].eq("anl")]
dc = anl_rows.groupby(["wind_source","month"], as_index=False)["relative_error_pct"].median()
px.line(
    dc,
    x="month",
    y="relative_error_pct",
    color="wind_source",
    markers=True,
    title="Median performance of ANL Model vs. Month of Year (all sites)",
    labels={"relative_error_pct":"Median Relative Error (%)","month":"Month"},
)

In [None]:
# Month-by-site heatmap of the median relative error for model "anl" on source "wtk".
selected = filtered_merged_df["model"].eq("anl") & filtered_merged_df["wind_source"].eq("wtk")
dc = (
    filtered_merged_df[selected]
    .groupby(["tid","month"], as_index=False)["relative_error_pct"]
    .median()
)
# Rows are months, columns are sites.
month_by_site = dc.pivot(index="month", columns="tid", values="relative_error_pct")
px.imshow(
    month_by_site,
    title="Median Error by Site and Month for ANL/WTK Corrected",
    color_continuous_scale='Portland',
)

In [None]:
# Month-by-site heatmap of the median relative error for model "anl" on source "wtk_bc".
selected = filtered_merged_df["model"].eq("anl") & filtered_merged_df["wind_source"].eq("wtk_bc")
dc = (
    filtered_merged_df[selected]
    .groupby(["tid","month"], as_index=False)["relative_error_pct"]
    .median()
)
# Rows are months, columns are sites.
month_by_site = dc.pivot(index="month", columns="tid", values="relative_error_pct")
px.imshow(
    month_by_site,
    title="Median Error by Site and Month for ANL/WTK-Bias Corrected",
    color_continuous_scale='Portland',
)

#### Performance Drill Down: Perera

In [None]:
# Median relative error of the "perera" model per wind source and calendar month.
perera_rows = filtered_merged_df[filtered_merged_df["model"].eq("perera")]
dc = perera_rows.groupby(["wind_source","month"], as_index=False)["relative_error_pct"].median()
px.line(
    dc,
    x="month",
    y="relative_error_pct",
    color="wind_source",
    markers=True,
    title="Median performance of Perera vs. Month of Year (all sites)",
    labels={"relative_error_pct":"Median Relative Error (%)","month":"Month"},
)

In [None]:
# Same monthly view for "perera", drawn without the 'wtk_led_2018' source
# (presumably so its large bias does not compress the other curves).
perera_rows = filtered_merged_df[filtered_merged_df["model"].eq("perera")]
dc = perera_rows.groupby(["wind_source","month"], as_index=False)["relative_error_pct"].median()
without_led_2018 = dc[dc["wind_source"].ne('wtk_led_2018')]
px.line(
    without_led_2018,
    x="month",
    y="relative_error_pct",
    color="wind_source",
    markers=True,
    title="Median performance of Perera vs. Month of Year (all sites)",
    labels={"relative_error_pct":"Median Relative Error (%)","month":"Month"},
)

In [None]:
# Month-by-site heatmap of the median relative error for model "perera" on source "wtk".
selected = filtered_merged_df["model"].eq("perera") & filtered_merged_df["wind_source"].eq("wtk")
dc = (
    filtered_merged_df[selected]
    .groupby(["tid","month"], as_index=False)["relative_error_pct"]
    .median()
)
# Rows are months, columns are sites.
month_by_site = dc.pivot(index="month", columns="tid", values="relative_error_pct")
px.imshow(
    month_by_site,
    title="Median Error by Site and Month for Perera/WTK Corrected",
    color_continuous_scale='Portland',
)

In [None]:
# Month-by-site heatmap of the median relative error for model "perera" on the
# bias-corrected "wtk_bc" source.
selected = (filtered_merged_df["model"] == "perera") & (filtered_merged_df["wind_source"] == "wtk_bc")
dc = filtered_merged_df[selected].groupby(["tid","month"]).agg({"relative_error_pct": "median"}).reset_index()
# The title names the bias-corrected source so this figure is distinguishable
# from the plain-WTK Perera heatmap.
px.imshow(dc.pivot(index="month",columns="tid")["relative_error_pct"],
          title="Median Error by Site and Month for Perera/WTK-Bias Corrected",
          color_continuous_scale='Portland')

#### Performance Drill Down: No Model

In [None]:
# Median relative error of the raw wind resource (model "none") per wind source and month.
no_model_rows = filtered_merged_df[filtered_merged_df["model"].eq("none")]
dc = no_model_rows.groupby(["wind_source","month"], as_index=False)["relative_error_pct"].median()
px.line(
    dc,
    x="month",
    y="relative_error_pct",
    color="wind_source",
    markers=True,
    title="Median performance of Wind Resource (no Model) vs. Month of Year (all sites)",
    labels={"relative_error_pct":"Median Relative Error (%)","month":"Month"},
)

In [None]:
# Month-by-site heatmap of the median relative error for model "none" on source "wtk".
selected = filtered_merged_df["model"].eq("none") & filtered_merged_df["wind_source"].eq("wtk")
dc = (
    filtered_merged_df[selected]
    .groupby(["tid","month"], as_index=False)["relative_error_pct"]
    .median()
)
# Rows are months, columns are sites.
month_by_site = dc.pivot(index="month", columns="tid", values="relative_error_pct")
px.imshow(
    month_by_site,
    title="Median Error by Site and Month for No-Model/WTK",
    color_continuous_scale='Portland',
)

In [None]:
# Month-by-site heatmap of the median relative error for model "none" on source "wtk_bc".
selected = filtered_merged_df["model"].eq("none") & filtered_merged_df["wind_source"].eq("wtk_bc")
dc = (
    filtered_merged_df[selected]
    .groupby(["tid","month"], as_index=False)["relative_error_pct"]
    .median()
)
# Rows are months, columns are sites.
month_by_site = dc.pivot(index="month", columns="tid", values="relative_error_pct")
px.imshow(
    month_by_site,
    title="Median Error by Site and Month for No-Model/WTK-Bias Corrected",
    color_continuous_scale='Portland',
)

#### Summary

 - All models produce high errors sometimes, but the bulk of errors (IQR) are within +/- 10%
 - WTK LED 2018 has very high bias, 2019 appears better
 - Bias correction makes a very large difference, especially for WTK LED
 - The Perera suite of models produces extremely high estimates in some cases, which is likely a bug
 - Overall Perera outperforms ANL (!)
 - When the resource is not bias corrected, the resource data error dominates; when it is corrected, the obstacle model error dominates. Residual errors in the resource tend to propagate through the models.
 - Sites 34 and 183 have a high residual bias (resource is overestimated), sites 170 and 28 have low residual bias (resource is underestimated). Curiously, ANL manages to "fix" 183, but not 34. Perera fixes neither. For underestimates, the models seem ill-equipped to fix the problem.
   - Site 34: on the southern coast of Lake Ontario, many trees (wind from SW and SE)
   - Site 183: by Lakeland College, large building to the NE, otherwise open (wind primarily from S, sometimes NW)
   - Site 170: Jackson College, built environment to the S (wind varies, predominantly SW)
   - Site 28: Stagecoach Rd, rural/suburban but many mature trees around the turbine (wind mostly W and NE)

#### Wind Roses

In [None]:
# Display Plotly's bundled sample wind dataset -- presumably kept as a reference
# for the long-format layout that px.bar_polar is given in the cells below.
px.data.wind()

In [None]:
# Reload the WTK series for the wind roses; the cells below use its `tid`,
# `ws` and `wd` columns.
wtk_csv = "01 Bergey Turbine Data/wtk.csv.bz2"
wtk = pd.read_csv(wtk_csv)
wtk.head()

In [None]:
# Bin every sample: wind speed to the nearest whole unit, direction into
# 10-degree sectors (truncated towards the sector's lower edge).
wtk["ws_rounded"] = wtk["ws"].round()
wtk["wd_rounded"] = 10*(wtk["wd"]/10).astype(int)

# Number of samples per (site, direction sector, speed bin).
dc = (
    wtk.groupby(["tid","wd_rounded","ws_rounded"])
    .size()
    .reset_index(name="n")
)

# Share (in %) of each site's samples that fall into each bin. The denominator
# is that site's own sample count, so the shares sum to ~100 per site regardless
# of how many rows the site has (no hard-coded series length).
samples_per_site = dc.groupby("tid")["n"].transform("sum")
dc['p'] = np.round(100*dc['n']/samples_per_site,1)

In [None]:
# Inspect the binned table: one row per (tid, wd_rounded, ws_rounded) with the
# sample count `n` and its percentage `p`.
dc

In [None]:
# One wind rose per site: bar length is the percentage of samples in each
# direction sector, coloured by the rounded wind speed.
rose_labels = {"ws_rounded": "Windspeed (m/s)","p": "Percent (%)"}
for tid in index.tids():
    site_bins = dc[dc["tid"] == tid]
    fig = px.bar_polar(
        site_bins,
        r="p",
        theta="wd_rounded",
        color="ws_rounded",
        color_continuous_scale= 'Jet',
        labels=rose_labels,
        title=tid,
    )
    fig.show()