# Bias Correction

Caleb Phillips (caleb.phillips@nrel.gov), Lindsay Sheridan (lindsay.sheridan@pnnl.gov), Jenna Ruzekowicz (jenna.ruzekowicz@nrel.gov) and Dmitry Duplyakin (dmitry.duplyakin@nrel.gov)

This notebook reads resource data and reference observation data and uses them to compute a bias-corrected version of the resource data (by multiple linear regression) for those sites where reference data have been identified.

In [None]:
import pandas as pd
import h5pyd
from dw_tap.data_fetching import getData
from tqdm import tqdm
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import plotly.express as px
import numpy as np
import site_index

# Set to True to re-download WTK data through h5pyd and rewrite the cached CSV;
# leave False to read the cached "02 Bias Correction/wtk_met.csv.bz2" instead.
fetch_wtk_data = False

In [None]:
# Site metadata index; used below via sites.index and sites.lookup_by_tid().
sites = site_index.SiteIndex()

### Download WTK Data at Each Bias Location (or read from cache)

In [None]:
if fetch_wtk_data:
    # Open the wind data "file"
    # server endpoint, username, password are found in ~/.hscfg
    f = h5pyd.File("/nrel/wtk-us.h5", 'r', bucket="nrel-pds-hsds")

    def fetch_dfs(index,wtk_dfs=None,debug=False):
        """Fetch WTK data for every met tower location/height listed in ``index``.

        Args:
            index: Table-like object supporting ``to_dict(orient="records")``
                whose rows provide 'APRS ID', 'Met Tower Latitude',
                'Met Tower Longitude' and 'Measurement Height (m)'.
            wtk_dfs: Optional list to append results to. A fresh list is
                created when omitted (a mutable default would be shared
                between calls and accumulate results from earlier calls).
            debug: When true, print one line per fetch.

        Returns:
            The list of per-site, per-height dataframes, each tagged with
            'tid' and 'h' columns.
        """
        if wtk_dfs is None:
            wtk_dfs = []
        for row in tqdm(index.to_dict(orient="records")):
            tid = row['APRS ID']
            lat = row['Met Tower Latitude']
            lon = row['Met Tower Longitude']
            # Heights may be a single number or a comma-separated list.
            heights = str(row['Measurement Height (m)'])
            # Rows without met tower coordinates are skipped.
            if np.isnan(lat) or np.isnan(lon):
                continue
            for h in heights.split(","):
                h = int(float(h))
                if debug:
                    print("Fetching data for turbine %s (%f,%f) at height %d" % (tid,lat,lon,h))
                atmospheric_df = getData(f, lat, lon, h, "IDW",
                                         power_estimate=False,
                                         inverse_monin_obukhov_length=True)
                atmospheric_df['tid'] = tid
                atmospheric_df['h'] = h
                wtk_dfs.append(atmospheric_df)

        return wtk_dfs

    # NOTE(review): an earlier comment here mentioned retrying after an HSDS
    # connection error, but no retry is implemented; re-run this cell if the
    # fetch fails.
    wtk_dfs = fetch_dfs(sites.index)
    wtk_dfs = pd.concat(wtk_dfs)
    wtk_dfs.to_csv("02 Bias Correction/wtk_met.csv.bz2")
    wtk_dfs.head()

else:
    # Read the cached copy; 'datetime' comes back as text and is parsed here.
    wtk_dfs = pd.read_csv("02 Bias Correction/wtk_met.csv.bz2")
    wtk_dfs["datetime"] = pd.to_datetime(wtk_dfs["datetime"])

### Read in the met tower data, align with WTK and fit models - example of t034

Note that the code below is needlessly verbose: it repeats the same steps for each site and would be much cleaner in a loop. I've done it this way so we can look at the fit and plots for each site, but may clean it up in the future.

In [None]:
bc_dfs = [] # list of per-site bias-corrected dataframes; concatenated and saved at the end

In [None]:
def prepare_dataframe(tid,wtk_dfs,sites):
    """Load a site's met tower CSV and join it to that site's WTK rows.

    The met file name and measurement height are looked up through
    ``sites.lookup_by_tid(tid)``. The 'Time', 'Spd<height>m' and
    'Dir<height>m' columns are renamed to 'datetime', 'ws_obs' and 'wd_obs'.
    The result is left-merged with the ``wtk_dfs`` rows whose 'tid' matches,
    rows containing any missing value are dropped, and 'hour' and 'month'
    columns derived from 'datetime' are added.

    Args:
        tid: Site identifier matched against ``wtk_dfs['tid']``.
        wtk_dfs: Dataframe with at least 'tid' and 'datetime' columns.
        sites: Object providing ``lookup_by_tid``.

    Returns:
        The merged dataframe.
    """
    site_info = sites.lookup_by_tid(tid)
    met_file = site_info["Met Tower"]
    height = int(site_info["Measurement Height (m)"])

    column_map = {
        'Time': 'datetime',
        "Spd%dm" % height: 'ws_obs',
        "Dir%dm" % height: 'wd_obs',
    }
    obs = pd.read_csv("02 Bias Correction/%s" % met_file).rename(columns=column_map)
    obs['datetime'] = pd.to_datetime(obs['datetime'])

    first_stamp = obs['datetime'].min()
    last_stamp = obs['datetime'].max()
    print("Met data runs from %s to %s" % (first_stamp,last_stamp))

    # Left merge followed by dropna keeps only timestamps with complete data.
    site_wtk = wtk_dfs[wtk_dfs['tid'] == tid]
    joined = obs.merge(site_wtk,on='datetime',how='left').dropna()

    stamps = joined['datetime'].dt
    joined['hour'] = stamps.hour
    joined['month'] = stamps.month
    return joined

In [None]:
# Build the merged met-tower/WTK frame for site t034 and preview it.
mdf = prepare_dataframe('t034',wtk_dfs,sites)
mdf.head()

In [None]:
# statsmodels OLS of 'ws_obs' on the 'ws', 'wd', 'hour' and 'month' columns
# plus a constant (intercept) term; prints the full fit summary.
mod = sm.OLS(mdf["ws_obs"],sm.add_constant(mdf[["ws","wd","hour","month"]]))
res = mod.fit()
print(res.summary())

In [None]:
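# scikit-learn is used here because statsmodels doesn't have NNLS.
# NOTE(review): LinearRegression() with default arguments is ordinary least
# squares; NNLS would need LinearRegression(positive=True) — confirm intent.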
def regression_results(y_true, y_pred):
    """Print goodness-of-fit metrics for a set of predictions.

    Prints R^2, mean absolute error (MAE), mean squared error (MSE) and
    root mean squared error (RMSE), each rounded to four decimal places.
    R^2 is not symmetric in its arguments, so the order matters.

    Args:
        y_true: Observed (reference) values.
        y_pred: Predicted values to score against ``y_true``.

    Returns:
        None. The metrics are only printed.
    """
    r2 = metrics.r2_score(y_true, y_pred)
    mae = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)

    print('r2: ', round(r2,4))
    print('MAE: ', round(mae,4))
    print('MSE: ', round(mse,4))
    # RMSE is derived from the MSE computed above.
    print('RMSE: ', round(np.sqrt(mse),4))

fit = LinearRegression().fit(mdf[["ws","hour","month","wd"]],mdf["ws_obs"])
# regression_results takes (y_true, y_pred): observations first, predictions
# second. R^2 is not symmetric, so swapping them reports a wrong r2.
regression_results(mdf["ws_obs"],fit.predict(mdf[["ws","hour","month","wd"]]))

There may be some value in exploring nonlinear models (MARS, RF, etc.).

In [None]:
#from pyearth import Earth
#mars = Earth()
#mars.fit(mdf[["ws","hour","month","wd"]],mdf["ws_obs"])

In [None]:
def plot_bc_pointcloud(mdf):
    """Show a scatter plot of the 'ws' column against the 'ws_obs' column.

    Both axes are fixed to the range [-5, 25] and a corner-to-corner
    reference line is drawn across the plot.

    Args:
        mdf: Dataframe containing 'ws' and 'ws_obs' columns.
    """
    axis_labels = {"ws":"WTK Windspeed (m/s)","ws_obs":"Observed Windspeed (m/s)"}
    low, high = -5, 25
    # Diagonal in paper coordinates; with identical axis ranges this should
    # coincide with the y = x line.
    diagonal = {'type': 'line', 'yref': 'paper', 'xref': 'paper', 'y0': 0, 'y1': 1, 'x0': 0, 'x1': 1}

    fig = px.scatter(mdf,x='ws',y='ws_obs',labels=axis_labels)
    fig.update_xaxes(range=[low,high])
    fig.update_yaxes(range=[low,high])
    fig.update_layout(shapes=[diagonal])
    fig.show()
    
# Scatter of 'ws' vs 'ws_obs' for the site currently held in mdf.
plot_bc_pointcloud(mdf)

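Apply the fitted model to the WTK data at the site locations. (Note: this text originally referred to an NNLS model, but the fit above uses `LinearRegression()` with default arguments, i.e. ordinary least squares; NNLS would require `positive=True`.)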

In [None]:
# WTK data at the turbine sites; filtered per site by its 'tid' column in do_correction.
sitewtk = pd.read_csv("01 Bergey Turbine Data/wtk.csv.bz2")
sitewtk.head()

In [None]:
def do_correction(sitewtk,tid,fit):
    """Apply a fitted model to one site's WTK rows and return the result.

    Selects the rows of ``sitewtk`` whose 'tid' equals ``tid``, resets the
    index, parses 'datetime', adds 'hour' and 'month' columns, and stores
    ``fit.predict`` on the ["ws","hour","month","wd"] columns in 'ws_bc'.
    Negative predictions are replaced with 0.

    Args:
        sitewtk: Dataframe with 'tid', 'datetime', 'ws' and 'wd' columns.
        tid: Site identifier to select.
        fit: Object with a ``predict`` method accepting the feature frame.

    Returns:
        The per-site dataframe including the new 'ws_bc' column.
    """
    site_rows = sitewtk['tid'] == tid
    chunk = sitewtk[site_rows].reset_index()

    stamps = pd.to_datetime(chunk['datetime'])
    chunk['datetime'] = stamps
    chunk['hour'] = stamps.dt.hour
    chunk['month'] = stamps.dt.month

    features = chunk[["ws","hour","month","wd"]]
    chunk["ws_bc"] = fit.predict(features)

    # Negative predicted values are set to zero.
    below_zero = chunk["ws_bc"] < 0
    chunk.loc[below_zero,"ws_bc"] = 0
    return chunk

# Apply the t034 fit to the t034 rows of sitewtk and keep the result for the final save.
chunk = do_correction(sitewtk,'t034',fit)
bc_dfs.append(chunk)
chunk.head()

In [None]:
# Sanity check: do_correction sets negative 'ws_bc' values to 0, so this selection should be empty.
chunk[chunk['ws_bc'] < 0].describe()

In [None]:
# Rows with negative 'ws_bc'; expected to be empty after the clamp in do_correction.
chunk[chunk['ws_bc'] < 0]

In [None]:
def plot_correction(chunk):
    """Show a scatter plot of 'ws' against 'ws_bc', coloured by 'wd'.

    Both axes are fixed to the range [-5, 22] and a corner-to-corner
    reference line is drawn across the plot.

    Args:
        chunk: Dataframe containing 'ws', 'ws_bc' and 'wd' columns.
    """
    # The 'ws_bc' label previously read "Winspeed" (typo).
    fig = px.scatter(chunk,x='ws', y='ws_bc',color="wd",labels={'ws':"WTK Windspeed (mps)",'ws_bc':"Bias-Corrected Windspeed"})
    fig.update_xaxes(range=[-5,22])
    fig.update_yaxes(range=[-5,22])
    # Diagonal in paper coordinates; with identical axis ranges this should
    # coincide with the y = x line.
    fig.update_layout(shapes = [{'type': 'line', 'yref': 'paper', 'xref': 'paper', 'y0': 0, 'y1': 1, 'x0': 0, 'x1': 1}])
    fig.show()
    
# Scatter of 'ws' vs 'ws_bc' for the site currently held in chunk.
plot_correction(chunk)

In [None]:
# Smallest corrected value; should not be below 0 because do_correction clamps negatives.
chunk['ws_bc'].min()

### Site 83 (California)

There is no overlap between this site's data and the Wind Toolkit data, so the Wind Toolkit LED dataset is needed for this site.

In [None]:
# NOTE(review): if the t083 met record shares no timestamps with wtk_dfs, the
# left merge + dropna in prepare_dataframe would presumably leave mdf empty — confirm.
mdf = prepare_dataframe('t083',wtk_dfs,sites)
mdf.head()

### Site 133 (Illinois)

In [None]:
# Build the merged met-tower/WTK frame for site t133 and preview it.
mdf = prepare_dataframe('t133',wtk_dfs,sites)
mdf.head()

In [None]:
fit = LinearRegression().fit(mdf[["ws","hour","month","wd"]],mdf["ws_obs"])
# regression_results takes (y_true, y_pred): observations first, predictions
# second. R^2 is not symmetric, so swapping them reports a wrong r2.
regression_results(mdf["ws_obs"],fit.predict(mdf[["ws","hour","month","wd"]]))

In [None]:
# Scatter of 'ws' vs 'ws_obs' for the site currently held in mdf.
plot_bc_pointcloud(mdf)

In [None]:
# Apply the t133 fit to the t133 rows of sitewtk and keep the result for the final save.
chunk = do_correction(sitewtk,'t133',fit)
bc_dfs.append(chunk)
chunk.head()

In [None]:
# Scatter of 'ws' vs 'ws_bc' for the site currently held in chunk.
plot_correction(chunk)

### Site 140 (New York)

In [None]:
# Build the merged met-tower/WTK frame for site t140 and preview it.
mdf = prepare_dataframe('t140',wtk_dfs,sites)
mdf.head()

In [None]:
fit = LinearRegression().fit(mdf[["ws","hour","month","wd"]],mdf["ws_obs"])
# regression_results takes (y_true, y_pred): observations first, predictions
# second. R^2 is not symmetric, so swapping them reports a wrong r2.
regression_results(mdf["ws_obs"],fit.predict(mdf[["ws","hour","month","wd"]]))

In [None]:
# Scatter of 'ws' vs 'ws_obs' for the site currently held in mdf.
plot_bc_pointcloud(mdf)

In [None]:
# Apply the t140 fit to the t140 rows of sitewtk and keep the result for the final save.
chunk = do_correction(sitewtk,'t140',fit)
bc_dfs.append(chunk)
chunk.head()

In [None]:
# Scatter of 'ws' vs 'ws_bc' for the site currently held in chunk.
plot_correction(chunk)

### Site 170 (Ohio)

In [None]:
# Build the merged met-tower/WTK frame for site t170 and preview it.
mdf = prepare_dataframe('t170',wtk_dfs,sites)
mdf.head()

In [None]:
fit = LinearRegression().fit(mdf[["ws","hour","month","wd"]],mdf["ws_obs"])
# regression_results takes (y_true, y_pred): observations first, predictions
# second. R^2 is not symmetric, so swapping them reports a wrong r2.
regression_results(mdf["ws_obs"],fit.predict(mdf[["ws","hour","month","wd"]]))

In [None]:
# Scatter of 'ws' vs 'ws_obs' for the site currently held in mdf.
plot_bc_pointcloud(mdf)

In [None]:
# Apply the t170 fit to the t170 rows of sitewtk and keep the result for the final save.
chunk = do_correction(sitewtk,'t170',fit)
bc_dfs.append(chunk)
chunk.head()

In [None]:
# Scatter of 'ws' vs 'ws_bc' for the site currently held in chunk.
plot_correction(chunk)

### Site 183 (Illinois)

In [None]:
# Build the merged met-tower/WTK frame for site t183 and preview it.
mdf = prepare_dataframe('t183',wtk_dfs,sites)
mdf.head()

In [None]:
fit = LinearRegression().fit(mdf[["ws","hour","month","wd"]],mdf["ws_obs"])
# regression_results takes (y_true, y_pred): observations first, predictions
# second. R^2 is not symmetric, so swapping them reports a wrong r2.
regression_results(mdf["ws_obs"],fit.predict(mdf[["ws","hour","month","wd"]]))

In [None]:
# Scatter of 'ws' vs 'ws_obs' for the site currently held in mdf.
plot_bc_pointcloud(mdf)

In [None]:
# Apply the t183 fit to the t183 rows of sitewtk and keep the result for the final save.
chunk = do_correction(sitewtk,'t183',fit)
bc_dfs.append(chunk)
chunk.head()

In [None]:
# Scatter of 'ws' vs 'ws_bc' for the site currently held in chunk.
plot_correction(chunk)

### Site 192 (Vermont)

In [None]:
# Build the merged met-tower/WTK frame for site t192 and preview it.
mdf = prepare_dataframe('t192',wtk_dfs,sites)
mdf.head()

In [None]:
fit = LinearRegression().fit(mdf[["ws","hour","month","wd"]],mdf["ws_obs"])
# regression_results takes (y_true, y_pred): observations first, predictions
# second. R^2 is not symmetric, so swapping them reports a wrong r2.
regression_results(mdf["ws_obs"],fit.predict(mdf[["ws","hour","month","wd"]]))

In [None]:
# Scatter of 'ws' vs 'ws_obs' for the site currently held in mdf.
plot_bc_pointcloud(mdf)

In [None]:
# Apply the t192 fit to the t192 rows of sitewtk and keep the result for the final save.
chunk = do_correction(sitewtk,'t192',fit)
bc_dfs.append(chunk)
chunk.head()

In [None]:
# Scatter of 'ws' vs 'ws_bc' for the site currently held in chunk.
plot_correction(chunk)

### Site 207 (Illinois)

In [None]:
# Build the merged met-tower/WTK frame for site t207 and preview it.
mdf = prepare_dataframe('t207',wtk_dfs,sites)
mdf.head()

In [None]:
fit = LinearRegression().fit(mdf[["ws","hour","month","wd"]],mdf["ws_obs"])
# regression_results takes (y_true, y_pred): observations first, predictions
# second. R^2 is not symmetric, so swapping them reports a wrong r2.
regression_results(mdf["ws_obs"],fit.predict(mdf[["ws","hour","month","wd"]]))

In [None]:
# Scatter of 'ws' vs 'ws_obs' for the site currently held in mdf.
plot_bc_pointcloud(mdf)

In [None]:
# Apply the t207 fit to the t207 rows of sitewtk and keep the result for the final save.
chunk = do_correction(sitewtk,'t207',fit)
bc_dfs.append(chunk)
chunk.head()

In [None]:
# Scatter of 'ws' vs 'ws_bc' for the site currently held in chunk.
plot_correction(chunk)

## Save Bias Corrected Version

In [None]:
# Stack the per-site corrected frames into one table and discard the
# 'packet_date' column before previewing the result.
bcdf = pd.concat(bc_dfs).drop(columns=['packet_date'])
bcdf.head()

In [None]:
# Write the combined bias-corrected table to CSV (the .bz2 suffix is part of the path as written).
bcdf.to_csv("02 Bias Correction/wtk_bc.csv.bz2")