In [29]:
import os

import bayes_net_utils as bn
import pandas as pd

pd.options.display.width = None

# Bayesian network predictions for the hindcast period

Notebook to generate predictions from:

* Bayesian network (GBN) with or without met nodes
* Seasonal naive forecast

The GBNs were fit in the previous notebook and saved as RData. This notebook is in Python, to make it easier for me to manipulate the data and results. It basically provides simple wrappers around the R code  required to generate predictions using the BNs. The R code required to run the Bayesian network and generate predictions has been refactored into the R function bayes_net_predict in bayes_net_utils.R. There is also a Python function of the same name in bayes_net_utils.py, which provides a simple "wrapper" around the R fucntion and some minor additional calculations.

Currently this notebook **needs running twice if there are any updates to the data**, once with met_source = "metno" (to include wind speed and rain sum nodes), and again with met_source = "nomet" to exclude these met nodes. This should be tidied.

# User options

In [30]:
# User options

met_source = 'nomet'  # Source of met data used to create data for driving predictions? 'metno', or 'nomet'

# Set start and end years (of data used to fit the network and evidence data used when making predictions)
# Available years vary by met source type (e.g. met.no data only goes to 2018, nomet currently to 2020)
st_yr = 1981
end_yr = 2018

# Standard deviation of the box cox transformed cyanobacteria observations
# For use when back-transforming cyanobacteria predictions (with bias-adjustment)
# Mean of period 1981-2018 is 1.32 (with lambda of 0.1)
sigma_obs_cyano = pd.read_csv(r"../Data/standard_deviation_boxcox_cyano_obs.csv", index_col=0).squeeze().item()

# Fitted bnlearn objects
rfile_fpath = "../Data/RData/Vansjo_fitted_GaussianBN_%s_%s-%s.rds" % (met_source, st_yr, end_yr)

# Pre-calculated standard deviations (from fitting of GBN in NB 02)
sd_fpath = "../Data/FittedNetworkDiagnostics/GBN_%s_%s-%s_stdevs.csv" % (met_source, st_yr, end_yr)

# The 'evidence' csv (data that will be used to drive the predictions)
ev_path = r'../Data/DataForPrediction/%s/DataForPrediction_GBN_%s_%s-%s.csv' % (met_source, met_source, st_yr, end_yr)

# Filepath to observations, for calculating seasonally naive forecast
obs_fpath = '../Data/Observations_GBN_target_nodes/seasonal_obs_GBN_1980-2019.csv'

# Outfolder to save predictions in
out_folder = r'../Data/Predictions'
out_fname = 'GBN_prediction_%s_%s-%s_biasadjusted-cyano.csv' % (met_source, st_yr, end_yr)

# Filepath for saving predictions from seasonal naive forecast
out_naive_fname = 'Prediction_naive_%s-%s.csv' % (st_yr, end_yr)

# Function to predict multiple years at once

If you are just predicting for one season, you can use bn.bayes_net_predict by itself. The function below works too, but is particularly useful for producing predictions for all years in a historic test period.

In [31]:
def bn_predict_multipleyears(rfile_fpath, sd_fpath, ev_df, sigma_obs_cyano, met_source):
    """
    Loop over rows in evidence dataframe and make predictions for each row (year)
    using the fitted BN object supplied, and concatenate results into a single df.

    Inputs:
    - filepath to fitted BN object
    - filepath to standard deviations of fitted bn
    - dataframe containing evidence to use when making predictions
      (column names must match BN node names)
    - sigma_obs_cyano: standard deviation of box cox transformed cyanobacteria
      observations, for use when back-transforming GBN cyanobacteria predictions
    - met_source: whether to use a GBN with met nodes or not

    Returns: dataframe of predictions for each year in ev_df, including expected value,
             probability of being above or below WFD-relevant thresholds
    """
    df_list = []
    for idx, row in ev_df.iterrows():

        if met_source == 'nomet':
            # Run Bayesian network in R, no met nodes
            df = bn.bayes_net_predict_nomet(rfile_fpath,
                                            sd_fpath,
                                            float(row['year']),
                                            float(row['chla_prevSummer']),
                                            float(row['colour_prevSummer']),
                                            float(row['TP_prevSummer']),
                                            sigma_obs_cyano
                                            )
        else:
            # Run Bayesian network in R, with met nodes
            df = bn.bayes_net_predict(rfile_fpath,
                                      sd_fpath,
                                      float(row['year']),
                                      float(row['chla_prevSummer']),
                                      float(row['colour_prevSummer']),
                                      float(row['TP_prevSummer']),
                                      float(row['wind_speed']),
                                      float(row['rain']),
                                      sigma_obs_cyano
                                      )
        df_list.append(df)

    # Merge results from all years
    df = pd.concat(df_list, sort=True)
    df.reset_index(drop=True, inplace=True)

    # Re-order cols
    df = df[['year', 'node', 'threshold', 'prob_below_threshold',
             'prob_above_threshold', 'expected_value', 'sd', 'WFD_class']]

    return df

# Generate GBN predictions

In [32]:
# Output filename for GBN predictions
out_path = os.path.join(out_folder, out_fname)

# Read in evidence
ev_df = pd.read_csv(ev_path)

# Predict and save to csv
df = bn_predict_multipleyears(rfile_fpath, sd_fpath, ev_df, sigma_obs_cyano, met_source)

df.to_csv(out_path, index=False)

df

Unnamed: 0,year,node,threshold,prob_below_threshold,prob_above_threshold,expected_value,sd,WFD_class
0,1981,chla,20.0,0.54,0.46,19.600,3.850,0
1,1981,colour,47.3,0.99,0.01,27.100,9.200,0
2,1981,cyano,1.0,0.20,0.80,3.830,0.736,1
3,1981,TP,29.5,0.02,0.98,37.300,3.820,1
4,1982,chla,20.0,0.97,0.03,11.900,3.850,0
...,...,...,...,...,...,...,...,...
147,2017,TP,29.5,0.97,0.03,22.200,3.820,0
148,2018,chla,20.0,0.98,0.02,11.300,3.850,0
149,2018,colour,47.3,0.76,0.24,40.800,9.200,0
150,2018,cyano,1.0,0.80,0.20,0.785,0.736,0


# Predictions from simple benchmark model (seasonal naive forecast)

Simplest possible model: target season = previous season

In [33]:
# Read in observations
seasonal_obs_df = pd.read_csv(obs_fpath, index_col=0)

# Predict
naive_sim_df_wide = seasonal_obs_df.shift(+1).loc[st_yr:]

# Reformat to long format
naive_sim_df_wide = naive_sim_df_wide.reset_index()
sim_naive = pd.melt(
    naive_sim_df_wide,
    id_vars=["year"],
    value_vars=["TP", "chla", "cyano", "colour"],
    var_name="node",
    value_name="expected_value",
)

# Add predicted class
# Dictionary of thresholds to use. N.B. Also defined in bayes_net_utils.R (as boundaries_list)
boundaries_dict = {
    "TP": 29.5,  # Middle of 'Moderate' class
    "chla": 20.0,  # M-P boundary. WFD boundaries: [10.5, 20.0]. Only 6 observed points under 10.5 so merge G & M
    "colour": 47.3,  # 66th percentile (i.e. upper tercile). No management implications
    "cyano": 1.0,  # M-P boundary is 2.0, but there were only 2 values in this class. Plenty above 2 tho
}

sim_naive["WFD_class"] = sim_naive[["node", "expected_value"]].apply(
    lambda x: bn.discretize([boundaries_dict[x.node]], x.expected_value), axis=1
)

# Save to csv
out_path = os.path.join(out_folder, out_naive_fname)

sim_naive.to_csv(out_path)

sim_naive.tail()

Unnamed: 0,year,node,expected_value,WFD_class
151,2015,colour,41.863636,0.0
152,2016,colour,52.833333,1.0
153,2017,colour,52.0,1.0
154,2018,colour,42.0,0.0
155,2019,colour,36.333333,0.0
