In [1]:
# Import packages
library(tidyverse)
library(bnlearn)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.2.1 ──
[32m✔[39m [34mggplot2[39m 3.2.1     [32m✔[39m [34mpurrr  [39m 0.3.3
[32m✔[39m [34mtibble [39m 2.1.3     [32m✔[39m [34mdplyr  [39m 0.8.3
[32m✔[39m [34mtidyr  [39m 1.0.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.4.0
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Attaching package: ‘bnlearn’

The following object is masked from ‘package:stats’:

    sigma



# Create a fitted Bayesian network

Define the network structure manually, fit its parameters to historic data, and save the fitted network to file for use elsewhere.

In [2]:
# Hand-specified structure of the Bayesian network

# Names of the variables (nodes) in the network
nodes <- c(
    'TP_ES',
    'TP_LS',
    'TP_prevSummer',
    'chla_ES',
    'chla_LS',
    'chla_prevSummer',
    'cyano_ES',
    'cyano_LS',
    'cyano_prevSummer',
    'rainy_days_winter',
    'windDays_over_Q0.6_LS',
    'windDays_under_Q0.4_LS'
)

# Directed arcs, one (parent, child) pair per row
arcs_matrix <- rbind(
    c('TP_prevSummer',          'TP_ES'),
    c('TP_prevSummer',          'TP_LS'),
    c('TP_ES',                  'chla_ES'),
    c('chla_prevSummer',        'chla_ES'),
    c('rainy_days_winter',      'chla_ES'),
    c('chla_ES',                'chla_LS'),
    c('TP_LS',                  'chla_LS'),
    c('windDays_over_Q0.6_LS',  'chla_LS'),
    c('cyano_prevSummer',       'cyano_ES'),
    c('chla_ES',                'cyano_ES'),
    c('chla_LS',                'cyano_LS'),
    c('windDays_under_Q0.4_LS', 'cyano_LS')
)
colnames(arcs_matrix) <- c("from", "to")

# Start from a graph that holds the nodes but no arcs, then attach the arcs defined above
dag <- empty.graph(nodes)
arcs(dag) <- arcs_matrix

# Display a summary of the resulting network structure
dag


  Random/Generated Bayesian network

  model:
   [TP_prevSummer][chla_prevSummer][cyano_prevSummer][rainy_days_winter]
   [windDays_over_Q0.6_LS][windDays_under_Q0.4_LS][TP_ES|TP_prevSummer]
   [TP_LS|TP_prevSummer][chla_ES|TP_ES:chla_prevSummer:rainy_days_winter]
   [chla_LS|TP_LS:chla_ES:windDays_over_Q0.6_LS]
   [cyano_ES|chla_ES:cyano_prevSummer][cyano_LS|chla_LS:windDays_under_Q0.4_LS]
  nodes:                                 12 
  arcs:                                  12 
    undirected arcs:                     0 
    directed arcs:                       12 
  average markov blanket size:           3.33 
  average neighbourhood size:            2.00 
  average branching factor:              1.00 

  generation algorithm:                  Empty 


In [24]:
# List the node names held by the DAG. The call is namespace-qualified so it is
# not confused with the `nodes` character vector used to build the graph.
bnlearn::nodes(dag)

In [3]:
# Read in the historic (already discretized) data used to fit the network parameters.
# row.names = 1 uses the first CSV column as the row names of the data frame.
data_discretized_all <- read.csv(file = "../data/DataMatrices/Vansjo_Seasonal_Discretized_RegTree_all.csv",
                                 header = TRUE, sep = ",", row.names = 1)

# Convert every column to an ordered factor. All columns share the master ordering
# VL < L < M < H < VH, and droplevels() then removes the classes a column does not use,
# so the levels end up depending on how many classes the column has, e.g.:
#     2 levels: L < H
#     3 levels: L < M < H
#     4 levels: VL < L < M < H
#     5 levels: VL < L < M < H < VH
# Values that are not one of the five class labels become NA.
#
# n.b. assigning into data_discretized_all[] replaces the columns in place and so keeps
# the row names of the data frame.
# `ordered = TRUE` is spelled out in full rather than relying on partial matching of `order`.
data_discretized_all[] <- lapply(
    data_discretized_all,
    function(column) {
        droplevels(factor(column, levels = c("VL", "L", "M", "H", "VH"), ordered = TRUE))
    }
)

# Keep only the columns which correspond to nodes in the network.
# drop = FALSE guarantees a data frame is returned even if only one column matches.
training_data <- data_discretized_all[, names(data_discretized_all) %in% nodes, drop = FALSE]

# Check: every node in the network must have a matching column in the training data
missing_nodes <- setdiff(nodes, names(training_data))
if (length(missing_nodes) > 0) {
    stop("Training data has no column for node(s): ", paste(missing_nodes, collapse = ", "))
}

# Quick look at output
str(training_data)
head(training_data)

'data.frame':	37 obs. of  12 variables:
 $ chla_prevSummer       : Ord.factor w/ 2 levels "L"<"H": 1 1 1 1 1 1 1 1 1 1 ...
 $ cyano_prevSummer      : Ord.factor w/ 2 levels "L"<"H": NA NA NA NA NA NA NA NA NA NA ...
 $ rainy_days_winter     : Ord.factor w/ 2 levels "L"<"H": 1 1 1 1 1 1 1 2 1 1 ...
 $ TP_prevSummer         : Ord.factor w/ 3 levels "L"<"M"<"H": 3 2 2 2 2 3 3 1 2 2 ...
 $ chla_ES               : Ord.factor w/ 2 levels "L"<"H": 1 1 1 1 1 2 1 2 1 1 ...
 $ cyano_ES              : Ord.factor w/ 2 levels "L"<"H": NA NA NA NA NA NA NA NA NA NA ...
 $ TP_ES                 : Ord.factor w/ 2 levels "L"<"H": 1 2 2 1 2 2 1 2 1 1 ...
 $ chla_LS               : Ord.factor w/ 2 levels "L"<"H": 1 1 1 1 1 1 1 1 2 1 ...
 $ cyano_LS              : Ord.factor w/ 2 levels "L"<"H": NA NA NA NA NA NA NA NA NA NA ...
 $ TP_LS                 : Ord.factor w/ 2 levels "L"<"H": 2 1 1 1 1 2 1 1 1 1 ...
 $ windDays_under_Q0.4_LS: Ord.factor w/ 2 levels "L"<"H": 1 1 1 1 1 1 1 1 1 1 ...
 $ windDays_o

Unnamed: 0_level_0,chla_prevSummer,cyano_prevSummer,rainy_days_winter,TP_prevSummer,chla_ES,cyano_ES,TP_ES,chla_LS,cyano_LS,TP_LS,windDays_under_Q0.4_LS,windDays_over_Q0.6_LS
Unnamed: 0_level_1,<ord>,<ord>,<ord>,<ord>,<ord>,<ord>,<ord>,<ord>,<ord>,<ord>,<ord>,<ord>
1981,L,,L,H,L,,L,L,,H,L,L
1982,L,,L,M,L,,H,L,,L,L,H
1983,L,,L,M,L,,H,L,,L,L,H
1984,L,,L,M,L,,L,L,,L,L,L
1985,L,,L,M,L,,H,L,,L,L,H
1986,L,,L,H,H,,H,L,,H,L,H


In [16]:
# Estimate the network parameters from the training data, given the fixed structure in `dag`.
# The result is a bn.fit object holding a fitted conditional probability table for each node.
# method = 'bayes' selects bnlearn's Bayesian parameter estimation; iss is the imaginary
# sample size that method uses (see the bnlearn bn.fit documentation).
fitted_BN <- bn.fit(
    dag,
    data = training_data,
    method = 'bayes',
    iss = 1
)

# Display the fitted conditional probability tables
fitted_BN


  Bayesian network parameters

  Parameters of node TP_ES (ordinal distribution)

Conditional probability table:
 
     TP_prevSummer
TP_ES         L         M         H
    L 0.8409091 0.6224490 0.2209302
    H 0.1590909 0.3775510 0.7790698

  Parameters of node TP_LS (ordinal distribution)

Conditional probability table:
 
     TP_prevSummer
TP_LS          L          M          H
    L 0.97727273 0.62244898 0.15116279
    H 0.02272727 0.37755102 0.84883721

  Parameters of node TP_prevSummer (ordinal distribution)

Conditional probability table:
         L         M         H 
0.1929825 0.4298246 0.3771930 

  Parameters of node chla_ES (ordinal distribution)

Conditional probability table:
 
, , chla_prevSummer = L, rainy_days_winter = L

       TP_ES
chla_ES           L           H
      L 0.500000000 0.597560976
      H 0.500000000 0.402439024

, , chla_prevSummer = H, rainy_days_winter = L

       TP_ES
chla_ES           L           H
      L 0.944444444 0.055555556
      H 0.05

In [18]:
# Save the bn.fit object to file so it can be read back later (e.g. with readRDS)
# and used to make predictions
rfile_fpath <- "../data/RData/Vansjo_fitted_seasonal_BN_1981-2017.rds"

# saveRDS() does not create missing folders, so make sure the output folder exists first.
# showWarnings = FALSE keeps this silent when the folder is already there.
dir.create(dirname(rfile_fpath), recursive = TRUE, showWarnings = FALSE)

saveRDS(fitted_BN, file = rfile_fpath)