In [1]:
# Import packages
library(tidyverse)
library(bnlearn)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.2.1     [32m✔[39m [34mpurrr  [39m 0.3.3
[32m✔[39m [34mtibble [39m 2.1.3     [32m✔[39m [34mdplyr  [39m 0.8.3
[32m✔[39m [34mtidyr  [39m 1.0.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.4.0

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


Attaching package: ‘bnlearn’


The following object is masked from ‘package:stats’:

    sigma




# Intro

Notebook for making water quality predictions for the next season using a pre-fitted Bayesian network, saved as an R object (fitted in notebook Fit_BN_1Season).

We want to make predictions for the following network nodes:
* TP (mean TP concentration over the growing season, mg/l)
* chla (mean chl-a concentration over the growing season, mg/l)
* colour (mean colour over the growing season)
* cyano (maximum cyanobacterial biovolume observed during the growing season, mg/l)

Part of WATExR project. Leah JB, Jan 2020.

# Set up

In [4]:
# Configuration: meteorological data source and all input/output file paths

# Which met dataset the network was trained with: 'metno' or 'era5'
met_source = 'metno'

# Last year of training data available for each met source
# (check BayesianNetwork/Data/BN_TrainingData folder for available date ranges)
end_yr_li = list(metno = 2018,
                 era5 = 2019)

# End year matching the chosen met source; used in both file names below
end_yr = end_yr_li[[met_source]]

# Input: fitted Bayesian network, stored as an R object
rfile_fpath = sprintf("../Data/RData/Vansjo_fitted_GaussianBN_%s_1981-%s.rds", met_source, end_yr)

# Input: standard deviation info from the fitted BN
sd_fpath = sprintf("../Data/FittedNetworkDiagnostics/GBN_%s_1981-%s_stdevs.csv", met_source, end_yr)

# Input: data to set as evidence when making predictions
driving_data_fpath = "../Data/DataForPrediction/GaussianBN_DataForPrediction_1Season.csv"

# Output: where the dataframe of predictions is written
out_fpath = "../Data/BN_output_prediction/GaussianBN_Prediction_for_1_season.csv"

In [5]:
# Load the pre-fitted Bayesian network object from the .rds file set in rfile_fpath
fitted_BN = readRDS(file = rfile_fpath)

In [6]:
# Read in the data used as evidence when making predictions.
# The first CSV column is used as row names; the remaining columns are the evidence variables.
driving_data = read.csv(file=driving_data_fpath, header=TRUE, sep=",", row.names = 1)

# Every evidence column must already be numeric (integer or double). Without this
# check, a column that was read as a factor or as text would be silently turned into
# level codes / NA by as.numeric() below and then used as evidence.
non_numeric_cols = names(driving_data)[!vapply(driving_data, is.numeric, logical(1))]
if (length(non_numeric_cols) > 0)
    {stop(sprintf("Non-numeric column(s) in driving data: %s",
                  paste(non_numeric_cols, collapse=", ")))}

# Convert any integer cols to numeric (double). Assigning into driving_data[] keeps
# the data frame structure and also works when there are no columns.
driving_data[] = lapply(driving_data, as.numeric)

driving_data

Unnamed: 0_level_0,chla_prevSummer,colour_prevSummer,TP_prevSummer,wind_speed,rain
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
2020,12.86667,42,19.66667,3.968478,312.809


# Make prediction

For each node we want predictions for (TP, chla, colour, cyano):
- Expected value
- Append standard error info (read from file)
- Probability of being in different WFD-relevant classes
- Save df

In [7]:
# Names of the nodes we want predictions for; each must exist in the fitted BN.
# Kept in sorted order because the final dataframe is assembled assuming
# alphabetical ordering of the nodes.
nodes_to_predict = c('chla', 'colour', 'cyano', 'TP')
nodes_to_predict = sort(nodes_to_predict)

## Expected value

In [8]:
# Fix the random number generator state before the sampling-based predictions
set.seed(1)

# Lambda value chosen in the Box-Cox transformation of cyano
cyano_lambda = 0.1

# Predicted value for each node, keyed by node name (filled in by the loop)
expectedValue_li = list()

for (node in nodes_to_predict) {

    # Predict this node from the driving data using method 'bayes-lw'
    node_pred = predict(fitted_BN,
                        node=node,
                        data=driving_data,
                        method='bayes-lw',
                        n=10000)

    # cyano is predicted on the Box-Cox scale, so undo the transformation
    # before storing the value
    if (node == "cyano") {
        node_pred = (node_pred*cyano_lambda + 1)^(1/cyano_lambda)
    }

    expectedValue_li[[node]] = node_pred
}

# Put the list into alphabetical order of node name
name_order = order(names(expectedValue_li))
expectedValue_li = expectedValue_li[name_order]
expectedValue_li

## Standard deviation (standard error) info

This is particularly relevant for the operational tool, as extra info to accompany the predictions.

In [9]:
# Add pre-saved standard deviation (error) information for each node
# Load the pre-saved standard deviation (error) table; the rendered output shows
# one row per node with columns `node` and `sd`
sds = read.csv(sd_fpath)
sds

node,sd
<fct>,<dbl>
TP,3.7958656
TP_prevSummer,5.1533997
chla,3.7389697
chla_prevSummer,5.5868686
wind_speed,0.2482299
cyano,0.7381779
colour_prevSummer,17.2734354
colour,8.95971
rain,101.273552


In [10]:
# Keep only the standard deviations of the nodes being predicted.
# dplyr::filter is named explicitly because it masks stats::filter.
sd_predictedNodes = sds %>% dplyr::filter(node %in% nodes_to_predict)

# Reorder the remaining rows by node
node_order = order(sd_predictedNodes$node)
sd_predictedNodes = sd_predictedNodes[node_order, ]
sd_predictedNodes

Unnamed: 0_level_0,node,sd
Unnamed: 0_level_1,<fct>,<dbl>
2,chla,3.7389697
4,colour,8.95971
3,cyano,0.7381779
1,TP,3.7958656


## Probability of being within WFD classes

In [11]:
# Threshold for each predicted node; the probability of falling below it is
# computed later. Values are on the original (untransformed) scale of each node.
boundaries_list = list(
    'chla'   = 20.0, # M-P boundary. WFD boundaries: [10.5, 20.0]. Only 6 observed points under 10.5 so merge G & M
    'colour' = 48.0, # 66th percentile (i.e. upper tercile). No management implications
    'cyano'  = 1.0,  # M-P boundary is 2.0, but there were only 2 values in this class. Plenty above 2 tho
    'TP'     = 29.5  # Middle of 'Moderate' class
)

# Arrange entries in alphabetical order of node name
boundary_order = order(names(boundaries_list))
boundaries_list = boundaries_list[boundary_order]

In [12]:
# Evidence for the queries: the driving data columns as a named list
# (evidence must be provided as a named list)
evidence_li = as.list(driving_data)

# Lambda of the Box-Cox transformation applied to cyano
cyano_lambda = 0.1

# Probability of each node being below its boundary, keyed by node name
prob_li = list()

for (node in nodes_to_predict) {

    boundary = boundaries_list[[node]]

    # The cyano boundary is on the original scale, so apply the Box-Cox
    # transformation to it before querying
    if (node == 'cyano') {
        boundary = (boundary^cyano_lambda - 1)/cyano_lambda
    }

    # P(node < boundary | evidence)
    p_below = cpquery(fitted_BN,
                      event = (eval(as.name(node)) < boundary),
                      evidence=evidence_li,
                      method='lw')

    # Store rounded to 2 d.p.: below this, cpquery returns variable results over
    # different calls (and some variability remains even with rounding)
    prob_li[[node]] = round(p_below, digits=2)
}

# Make sure the list is in alphabetical order of node name
prob_order = order(names(prob_li))
prob_li = prob_li[prob_order]
prob_li

# Add all info to a dataframe and save

In [13]:
# Assemble one row per predicted node (threshold, probability below/above it,
# expected value, standard deviation) and write the table to out_fpath.
#
# Every column is looked up by node name, so the rows stay correctly labelled
# even if the input lists / sd table are not all in the same order.

# Each node to predict must have an entry in every input
stopifnot(all(nodes_to_predict %in% names(boundaries_list)),
          all(nodes_to_predict %in% names(prob_li)),
          all(nodes_to_predict %in% names(expectedValue_li)))

# Row of the sd table belonging to each node (node may be a factor or character column)
sd_row = match(nodes_to_predict, as.character(sd_predictedNodes$node))
stopifnot(!anyNA(sd_row))

prob_below = unlist(prob_li[nodes_to_predict], use.names=FALSE)

prob_df = data.frame(node = nodes_to_predict,
                     threshold = unlist(boundaries_list[nodes_to_predict], use.names=FALSE),
                     prob_below_threshold = prob_below,
                     prob_above_threshold = 1 - prob_below,
                     expected_value = signif(unlist(expectedValue_li[nodes_to_predict], use.names=FALSE), 3), # Round to 3 s.f.
                     sd = signif(sd_predictedNodes$sd[sd_row], 3)) # Round to 3 s.f.; column is named `sd` to keep the existing output schema

write.csv(prob_df, out_fpath, row.names=FALSE)

prob_df

Unnamed: 0_level_0,node,threshold,prob_below_threshold,prob_above_threshold,expected_value,sd
Unnamed: 0_level_1,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
2,chla,20.0,0.99,0.01,9.61,3.74
4,colour,48.0,0.93,0.07,35.2,8.96
3,cyano,1.0,0.84,0.16,0.338,0.738
1,TP,29.5,0.96,0.04,22.8,3.8
