In [18]:
import pandas as pd, os
import bayes_net_utils as bn
pd.options.display.width=None

# Bayesian network predictions

The R code required to run the Bayesian network and generate predictions has been refactored into the R function `bayes_net_predict` in `bayes_net_utils.R`. There is also a Python function of the same name in `bayes_net_utils.py`, which provides a simple "wrapper" around the R function and performs some minor additional calculations. This should make it easy to make predictions from the Bayesian network via Python.

**Note:** There is some computational overhead involved in interfacing between Python and R, but this isn't a major problem.

# User options

In [19]:
# User options

# Run mode? 'Historic' for period 1981-2018/19, or 'NextSeason' for future
# (operational, or one historic test season)
run_mode = 'Historic'

# Fail early on a mistyped run mode. The filename logic in the prediction cells only
# tests for 'NextSeason', so any other value would silently take the 'Historic' branch
# while ev_folder/out_folder (below) point at folders named after the mistyped value.
if run_mode not in ('Historic', 'NextSeason'):
    raise ValueError("run_mode must be 'Historic' or 'NextSeason', not %r" % (run_mode,))

if run_mode == 'NextSeason': # If making predictions for the next season, for which year? For file reading
    target_yr = 2020

# Also run the combination of models selected as 'best' for each variable?
# If set to True, you MUST also set met_evidence = 'era5-av'
run_operational = True # Boolean (True or False)

met_evidence = 'era5-av'  # Source of met data used to create data for driving predictions? 'metno', 'era5', 'era5-av', or 's5'

# Enforce the constraint stated above rather than relying on the user remembering it
if run_operational and met_evidence != 'era5-av':
    raise ValueError("run_operational = True requires met_evidence = 'era5-av', not %r" % (met_evidence,))

# Use dictionary to automatically set the met data used in network training based on the source of data used to drive predictions.
# If met data for predictions is not s5, should be the same as met_evidence. If 's5', should be 'era5' as that was used in bias correcting s5
# N.B. an unrecognised met_evidence value raises a KeyError at the lookup below
met_training_dict = {'metno':'metno',
                    'era5':'era5',
                    's5':'era5',
                    'era5-av':'era5'}
met_training = met_training_dict[met_evidence]

# Start and end years of data used to fit network (used in the .rds filepath) and, for 'Historic' run_mode,
# in generating the data for prediction (and in the filepaths to these csvs)
st_end_yr_dict = {'metno': [1981,2018],
               'era5': [1981,2019],
               'era5-av': [1981,2019],
               's5': [1993,2019]}

# Fitted bnlearn object
rfile_fpath = "../Data/RData/Vansjo_fitted_GaussianBN_%s_%s-%s.rds" %(met_training, st_end_yr_dict[met_training][0], st_end_yr_dict[met_training][1])

# Pre-calculated standard deviations
sd_fpath = "../Data/FittedNetworkDiagnostics/GBN_%s_%s-%s_stdevs.csv" %(met_training, st_end_yr_dict[met_training][0], st_end_yr_dict[met_training][1])

# The 'evidence' (data that will be used to drive the predictions) folder
ev_folder = r'../Data/DataForPrediction/%s/%s' %(run_mode, met_evidence)

# Outfolder to save predictions in
out_folder = r'../Data/Predictions/%s' %run_mode

# Function to predict multiple years at once

If you are just predicting for one season, you can use bn.bayes_net_predict by itself. The function below works too, but is particularly useful for producing predictions for all years in a historic test period.

In [20]:
def bn_predict_multipleyears(rfile_fpath, sd_fpath, ev_df):
    """
    Make Bayesian network predictions for every row (year) of an evidence dataframe
    and stack the per-row results into one dataframe.

    Args:
        rfile_fpath: Passed straight through to bn.bayes_net_predict (first argument)
        sd_fpath:    Passed straight through to bn.bayes_net_predict (second argument)
        ev_df:       Dataframe with one row per prediction. Must contain the columns
                     'year', 'chla_prevSummer', 'colour_prevSummer', 'TP_prevSummer',
                     'wind_speed' and 'rain'

    Returns:
        Dataframe with a fresh 0..n-1 index and columns, in this order: 'year', 'node',
        'threshold', 'prob_below_threshold', 'prob_above_threshold', 'expected_value',
        'sd', 'WFD_class'
    """
    # Evidence columns, in the positional order bn.bayes_net_predict receives them
    evidence_cols = ['year', 'chla_prevSummer', 'colour_prevSummer',
                     'TP_prevSummer', 'wind_speed', 'rain']

    # One call to the R-backed prediction function per evidence row; every value is
    # passed as a plain Python float
    yearly_preds = [
        bn.bayes_net_predict(rfile_fpath,
                             sd_fpath,
                             *(float(ev_row[col]) for col in evidence_cols))
        for _, ev_row in ev_df.iterrows()
    ]

    # Stack results from all rows and renumber the index from zero
    combined = pd.concat(yearly_preds, sort=True).reset_index(drop=True)

    # Fixed output column order
    col_order = ['year', 'node', 'threshold', 'prob_below_threshold',
                 'prob_above_threshold', 'expected_value', 'sd', 'WFD_class']

    return combined[col_order]

# Predictions for 'deterministic' met data (e.g. met.no or ERA5)

Where there is just a single 'evidence' datafile

In [21]:
if met_evidence !='s5':
    
    # Sort out filepaths for the evidence data to read in and the output file.
    # 'NextSeason' files are labelled with the single target year; otherwise files are
    # labelled with the start-end years of the historic period for this met source
    if run_mode == 'NextSeason':
        ev_fname = 'DataForPrediction_GBN_%s_%s.csv' %(met_evidence, target_yr)
        out_fname = 'GBN_prediction_%s_%s.csv' %(met_evidence, target_yr)
    else:
        ev_fname = 'DataForPrediction_GBN_%s_%s-%s.csv' %(met_evidence, st_end_yr_dict[met_evidence][0], st_end_yr_dict[met_evidence][1])
        out_fname = 'GBN_prediction_%s_%s-%s.csv' %(met_evidence, st_end_yr_dict[met_evidence][0], st_end_yr_dict[met_evidence][1])
        
    ev_path = os.path.join(ev_folder, ev_fname)
    out_path = os.path.join(out_folder, out_fname)
    
    # Create the output folder up front (no-op if it already exists), so that a missing
    # folder cannot make the run fail at the save step, after all predictions were made
    os.makedirs(out_folder, exist_ok=True)
    
    # Read in evidence (one row per year to predict)
    ev_df = pd.read_csv(ev_path)
    
    # Predict and save to csv
    df = bn_predict_multipleyears(rfile_fpath, sd_fpath, ev_df)
    
    df.to_csv(out_path, index=False)

    display(df)

Unnamed: 0,year,node,threshold,prob_below_threshold,prob_above_threshold,expected_value,sd,WFD_class
0,1981,chla,20.0,0.60,0.40,19.000,3.760,0
1,1981,colour,48.0,0.98,0.02,28.900,9.040,0
2,1981,cyano,1.0,0.26,0.74,1.970,0.719,1
3,1981,TP,29.5,0.02,0.98,37.000,3.790,1
4,1982,chla,20.0,0.97,0.03,12.300,3.760,0
...,...,...,...,...,...,...,...,...
151,2018,TP,29.5,0.95,0.05,23.200,3.790,0
152,2019,chla,20.0,0.99,0.01,11.000,3.760,0
153,2019,colour,48.0,0.89,0.11,36.800,9.040,0
154,2019,cyano,1.0,0.80,0.20,0.413,0.719,0


# Predictions using evidence derived from seasonal forecast data

Where there may be multiple seasons and members. Currently set up for System5.

In [22]:
if met_evidence == 's5':
    
    member_li = ["%.2d" % i for i in range(1,26)] # List of S5 member numbers in format '01','02'... Should be present in s5 met data folder
    season_li = ['summer','late_summer'] # Seasons of interest (must match filenames in s5 met data folder)

    # All s5 predictions are written to an 's5' sub-folder of out_folder. Create it once,
    # up front (no-op if it already exists), so a missing folder cannot make the first
    # save fail after predictions have already been computed
    s5_out_folder = os.path.join(out_folder, 's5')
    os.makedirs(s5_out_folder, exist_ok=True)

    # One evidence file in, one prediction file out, per season and ensemble member
    for season in season_li:
        for member in member_li:

            # Sort out filepaths for the evidence data to read in and the output file
            if run_mode == 'NextSeason':
                ev_fname = 'DataForPrediction_GBN_%s_%s_%s_%s.csv' %(met_evidence, target_yr, season, member)
                out_fname = 'GBN_prediction_%s_%s_%s_%s.csv' %(met_evidence, target_yr, season, member)
            else:
                ev_fname = 'DataForPrediction_GBN_%s_%s-%s_%s_%s.csv' %(met_evidence, st_end_yr_dict[met_evidence][0], st_end_yr_dict[met_evidence][1], season, member)
                out_fname = 'GBN_prediction_%s_%s-%s_%s_%s.csv' %(met_evidence, st_end_yr_dict[met_evidence][0], st_end_yr_dict[met_evidence][1], season, member)

            ev_path = os.path.join(ev_folder, ev_fname)
            out_path = os.path.join(s5_out_folder, out_fname)

            # Read in evidence
            ev_df = pd.read_csv(ev_path)

            # Predict and save to csv
            df = bn_predict_multipleyears(rfile_fpath, sd_fpath, ev_df)
            df.to_csv(out_path, index=False)

    # Display output for the last season and member for checking
    display(df)

# Simplest possible model: target season = previous season

In [23]:
obs_fpath = '../Data/DataMatrices/Seasonal_BN_obs/seasonal_obs_GBN_1980-2019.csv'

# Seasonal observations; the first csv column is used as the index
seasonal_obs_df = pd.read_csv(obs_fpath, index_col=0)

# Naive prediction: move every observation down one row, so that each row holds the
# values from the row before it. Keep index labels from 1981 onwards, then turn the
# index back into an ordinary column ready for melting
naive_sim_df_wide = (
    seasonal_obs_df
    .shift(+1)
    .loc[1981:]
    .reset_index()
)

# Wide -> long: one row per (year, node) pair, with the prediction in 'expected_value'
sim_naive = naive_sim_df_wide.melt(
    id_vars=['year'],
    value_vars=['TP','chla','cyano','colour'],
    var_name='node',
    value_name='expected_value',
)

# Thresholds used to assign a predicted class. N.B. Also defined in bayes_net_utils.R (as boundaries_list)
boundaries_dict = {'TP': 29.5,     # Middle of 'Moderate' class
                   'chla': 20.0,   # M-P boundary. WFD boundaries: [10.5, 20.0]. Only 6 observed points under 10.5 so merge G & M
                   'colour': 48.0, # 66th percentile (i.e. upper tercile). No management implications
                   'cyano': 1.0    # M-P boundary is 2.0, but there were only 2 values in this class. Plenty above 2 tho
                  }


def _naive_wfd_class(pred_row):
    """Classify one long-format prediction row against the single threshold for its node."""
    return bn.discretize([boundaries_dict[pred_row['node']]], pred_row['expected_value'])


# Add predicted class, row by row
sim_naive['WFD_class'] = sim_naive[['node','expected_value']].apply(_naive_wfd_class, axis=1)

# Output filename is labelled with the target year ('NextSeason') or the start-end years
out_fname = (
    'Prediction_naive_%s.csv' %(target_yr)
    if run_mode == 'NextSeason'
    else 'Prediction_naive_%s-%s.csv' %(st_end_yr_dict[met_evidence][0], st_end_yr_dict[met_evidence][1])
)
out_path = os.path.join(out_folder, out_fname)

# Save to csv (the dataframe index is written too)
sim_naive.to_csv(out_path)

display(sim_naive.tail())

Unnamed: 0,year,node,expected_value,WFD_class
151,2015,colour,41.863636,0.0
152,2016,colour,52.833333,1.0
153,2017,colour,52.0,1.0
154,2018,colour,42.0,0.0
155,2019,colour,36.333333,0.0


# Historic predictions from chosen 'operational' models

All forecasts were originally going to be based on a Bayesian Belief Network (BBN) which included several weather-related nodes (mean seasonal wind speed and seasonal precipitation sum). However, the results of cross validation of the Bayesian Network and different versions of the network (notebook BN_CV_PythonPostProcess), and a comparison of different models for the hindcast period (notebook Hindcast_stats_and_plots), led to the following choices for models to use in operational forecasting:

- TP: BBN (no met included anyway in BN)
- chla: Naive seasonal forecast
- colour: BBN, no met (stats were the same for BBN with met, without met, or seasonal naive. Choose this for consistency with cyano)
- cyano: BBN, no met

Here, produce forecasts for the historic period using this model choice

**Note**: For now I decided to leave any NaNs in the seasonal naive forecast (think there's just one in 2000, due to no chla obs in 1999). There aren't corresponding NaNs in the BN though, despite missing data... Should perhaps interpolate for consistency? Or drop BN predictions in years with missing lake chem/ecol data?

In [24]:
def bn_predict_multipleyears_operational(rfile_fpath, ev_df):
    """
    Make 'operational' Bayesian network predictions for every row (year) of an evidence
    dataframe and stack the per-row results into one dataframe.

    Counterpart of bn_predict_multipleyears that calls bn.bayes_net_predict_operational
    instead of bn.bayes_net_predict. The results are almost, but not exactly, the same,
    so this version is used for consistency across prediction notebooks.

    Args:
        rfile_fpath: Passed straight through to bn.bayes_net_predict_operational
        ev_df:       Dataframe with one row per prediction. Must contain the columns
                     'year', 'chla_prevSummer', 'colour_prevSummer' and 'TP_prevSummer'

    Returns:
        Dataframe with a fresh 0..n-1 index and columns, in this order: 'year', 'node',
        'threshold', 'prob_below_threshold', 'prob_above_threshold', 'expected_value',
        'WFD_class'
    """
    # Evidence columns, in the positional order the prediction function receives them
    evidence_cols = ['year', 'chla_prevSummer', 'colour_prevSummer', 'TP_prevSummer']

    # One call to the R-backed prediction function per evidence row; every value is
    # passed as a plain Python float
    yearly_preds = [
        bn.bayes_net_predict_operational(rfile_fpath,
                                         *(float(ev_row[col]) for col in evidence_cols))
        for _, ev_row in ev_df.iterrows()
    ]

    # Stack results from all rows and renumber the index from zero
    combined = pd.concat(yearly_preds, sort=True).reset_index(drop=True)

    # Fixed output column order
    col_order = ['year', 'node', 'threshold', 'prob_below_threshold',
                 'prob_above_threshold', 'expected_value', 'WFD_class']

    return combined[col_order]

In [25]:
if run_operational:
    
    # Re-generate era5-av predictions, using the operational predict function, which returns ever-so-slightly
    # different results (not sure why - to investigate later...)
    if met_evidence !='s5':
    
        # Sort out filepaths for the evidence data to read in and the output file
        if run_mode == 'NextSeason':
            ev_fname = 'DataForPrediction_GBN_%s_%s.csv' %(met_evidence, target_yr)
            out_fname = 'GBN_prediction_operational_%s_%s.csv' %(met_evidence, target_yr)
        else:
            ev_fname = 'DataForPrediction_GBN_%s_%s-%s.csv' %(met_evidence, st_end_yr_dict[met_evidence][0], st_end_yr_dict[met_evidence][1])
            out_fname = 'GBN_prediction_operational_%s_%s-%s.csv' %(met_evidence, st_end_yr_dict[met_evidence][0], st_end_yr_dict[met_evidence][1])

        ev_path = os.path.join(ev_folder, ev_fname)
        out_path = os.path.join(out_folder, out_fname)

        # Create the output folder up front (no-op if it already exists), so that a missing
        # folder cannot make the run fail at the save step, after predictions were made
        os.makedirs(out_folder, exist_ok=True)

        # Read in evidence
        ev_df = pd.read_csv(ev_path)

        # Predict and save to csv
        df = bn_predict_multipleyears_operational(rfile_fpath, ev_df)
        df.to_csv(out_path, index=False)

        # Take TP, colour and cyano predictions from the Bayesian network run just above
        # Replace chl-a predictions with the naive seasonal forecast
        sim_naive_chla = sim_naive.loc[sim_naive['node']=='chla']
        df_operational = df.loc[df['node']!='chla']

        # Stack naive chl-a rows on top of the remaining network rows, keyed on (year, node).
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0. Columns that
        # only exist in the network output (threshold and the two probabilities) are NaN for
        # the naive chl-a rows; sort=False keeps columns in order of first appearance
        df_operational = pd.concat([sim_naive_chla.set_index(['year','node']),
                                    df_operational.set_index(['year','node'])],
                                   sort=False)
        df_operational = df_operational.reset_index()

        # The naive forecast has no threshold column, so fill it in for the chl-a rows
        df_operational.loc[df_operational['node']=='chla','threshold'] = boundaries_dict['chla']

        # Save to csv
        if run_mode == 'NextSeason':
            out_fname = 'Prediction_operational_%s.csv' %(target_yr)
        else:
            out_fname = 'Prediction_operational_%s-%s.csv' %(st_end_yr_dict[met_evidence][0], st_end_yr_dict[met_evidence][1])
        out_path = os.path.join(out_folder, out_fname)

        df_operational.to_csv(out_path, index=False)

        display(df_operational)

Unnamed: 0,year,node,expected_value,WFD_class,threshold,prob_below_threshold,prob_above_threshold
0,1981,chla,16.08013,0.0,20.0,,
1,1982,chla,8.33125,0.0,20.0,,
2,1983,chla,5.97500,0.0,20.0,,
3,1984,chla,6.05000,0.0,20.0,,
4,1985,chla,11.09000,0.0,20.0,,
...,...,...,...,...,...,...,...
151,2018,cyano,0.44700,0.0,1.0,0.78,0.22
152,2018,TP,23.10000,0.0,29.5,0.95,0.05
153,2019,colour,36.80000,0.0,48.0,0.88,0.12
154,2019,cyano,0.40800,0.0,1.0,0.80,0.20
