In [1]:
import os
import bayes_net_utils as bn
import numpy as np
import pandas as pd
from sklearn.metrics import matthews_corrcoef, roc_auc_score

# Introduction

In this notebook we read in the cross validation results from the previous notebook and calculate a wider range of model performance statistics than are available within bnearn's cross validation function, in particular by discretizing the simulated output and calculating classification error, Matthew's correlation coefficient and the ROC_AUC score. This allows the continuous network to be compared to predictive performance of a discrete network.

In addition, we re-calculate correlation coefficients and mse. These can be calculated within bnlearn's CV function, but it was easier to automate the calculation here for all the CV runs done. Results match those obtained within bnlearn.

# Set up for processing

In [2]:
# USER INPUT

# Folder containing files of observed and predicted values from each cross validation run. Produced in notebook BN_development_1Season_R
CV_obs_sim_folder = "../Data/CrossValidation/LOOCV_predictions"

# Boundaries dictionary, copied from notebook B_seasonal_data_matrix_1Season
bound_dict = {
             'TP': [29.5], # No data below 20, so drop this class boundary. 29.5 is middle of 'Mod' class   
             'chla': [20.0],  # WFD boundaries: [10.5, 20.0]. But only 6 d.p. under 10.5 so merge G and M classes and use 20.
                              # For predicting cyano, would be better 17.4.   
             'colour': [48.0], # 66th percentile
             'cyano': [1.0], # M-P boundary is 2.0, but there were only 2 values in this class, rest above
             }

# Alter the boundaries in the boundaries dict for cyano, to take account of the box-cox transformation applied to the continuous
# data:  y* = (y^L - 1)/L, where we used lambda = 0.1 when transforming original cyano data
# bound_dict['cyano'] = [bound_dict['cyano'][0]**0.1 - 1]

met_source = 'metno' #'metno' or 'era5'

var_li = ['TP', 'chla', 'cyano', 'colour'] # What do you want to produce stats for? Need to have corresponding files in 'Data/CrossValidation/%s' %met_source folder

# Pre-calculated standard deviations (from fitting of GBN in a later notebook)
sd_fpath = "../Data/FittedNetworkDiagnostics/GBN_%s_1981-2018_stdevs.csv" %(met_source)

# Define functions

In [3]:
def boxcox_backtransform(x, lambda_param, bias_adj=True, sd_cyano=None):
    """
    x: value to back transform
    lambda_param: lambda used in box cox transform
    bias_adj: bias adjust the values when back-transforming? True or False
    sigma: if bias adjusting, this is the standard deviation of the box-cox transformed observations

    returns: back-transformed value
    """

    if bias_adj is True:
        backtransformed_value = ((x * lambda_param + 1) ** (1 / lambda_param)) * (
            1 + (((sd_cyano**2) * (1 - lambda_param)) / (2 * (lambda_param * x + 1) ** 2))
        )

    else:
        backtransformed_value = (x * 0.1 + 1) ** (1 / 0.1)

    return backtransformed_value


def xval_postprocess(var, fpath, sd_cyano=None):
    """
    Function to read in a csv of observed and predicted values from a continuous
    Bayesian belief network produced in BNLearn R notebook, calculate correlation
    coefficient and mean squared error, and then classify according to WFD and
    work out classification error, Matthew's correlation coefficient and ROC_AUC.
    If the cross validation input file is stochastic (i.e. contains multiple runs
    for the same variable), model performance statistics are averaged.

    Inputs:
        var: string, one of 'TP','chla','cyano','colour'
        fpath: string giving location of csv to be read in. csv should have columns:
                'obs_1','pred_1','obs_2','pred_2',... where _1, _2, etc. is the cross
                validation run number.
        sd_cyano: standard deviation of the box-cox transformed cyanobacteria observations,
                  for use when doing bias-adjusted back transformation

    Returns a series of model performance statistics for the variable.
    """

    # ---------------------------------------------------------------------------------
    # Read in data
    df = pd.read_csv(fpath, index_col=0)

    # ---------------------------------------------------------------------------------
    # Split into separate dataframes for each cross validation run
    cont_dict = {}  # Key: run number, returns df with obs and pred
    for i, col_name in enumerate(df.columns):
        if i % 2 == 0:  # If even, i.e. only do this for half the cols
            run_no = int(col_name.split("_", 1)[1])
            if run_no == 1:
                temp_df = df.iloc[:, [0, 1]]
            else:
                temp_df = df.iloc[:, [2 * run_no - 2, 2 * run_no - 1]]

            temp_df.columns = ["obs", "pred"]

            # If variable is cyanobacteria, transform observed and predicted to original data scale
            if var == "cyano":

                transformed_df = pd.DataFrame()
                transformed_df["obs"] = temp_df["obs"].apply(
                    boxcox_backtransform,
                    lambda_param=0.1,
                    bias_adj=False
                )

                # With bias-adjusted back transformation, to estimate the mean
                transformed_df["pred"] = temp_df["pred"].apply(
                    boxcox_backtransform,
                    lambda_param=0.1,
                    bias_adj=True,
                    sd_cyano=sd_cyano
                )

                cont_dict[run_no] = transformed_df

            else:
                # Add to dict
                cont_dict[run_no] = temp_df
    # ---------------------------------------------------------------------------------
    # Calculate statistics (both continuous and discrete)

    cc_dict = {}  # key: run_no, returns correlation coeff
    rmse_dict = {}  # key: run_no, returns mse (pred - obs)
    classified_dict = (
        {}
    )  # Key: run number. Returns df with cols 'obs' and 'pred', discrete data
    class_error_dict = {}
    mcc_dict = {}
    roc_auc_dict = {}

    for run_no in cont_dict.keys():

        cont_df = cont_dict[run_no]  # df with continuous data

        # Correlation coefficients
        cc_dict[run_no] = cont_df["obs"].corr(cont_df["pred"], method="pearson")

        # Root mean square error
        rmse = np.sqrt(np.mean(((cont_df["pred"] - cont_df["obs"]) ** 2)))
        rmse_dict[run_no] = rmse

        # Classify obs and pred into WFD (or related) class boundaries
        disc_df = pd.DataFrame(
            index=cont_df.index, columns=cont_df.columns
        )  # New empty df to be populated
        for col in cont_df.columns:
            disc_df[col] = cont_df[col].apply(
                lambda x: bn.discretize(bound_dict[var], x)
            )
        classified_dict[run_no] = disc_df

        # Calculate classification error (proportion of time model predicted class correctly)
        error = bn.classification_error(disc_df["obs"], disc_df["pred"])
        class_error_dict[run_no] = error

        # Calculate matthew's correlation coefficient and ROC_AUC score
        mcc_dict[run_no] = matthews_corrcoef(
            disc_df["obs"].values, disc_df["pred"].values
        )
        roc_auc_dict[run_no] = roc_auc_score(
            disc_df["obs"].values, disc_df["pred"].values
        )

    # ---------------------------------------------------------------------------------
    # Aggregate results over repeat CV runs

    corr_coeffs = pd.Series(cc_dict)  # These match those calculated within bnlearn's CV function
    rmses = pd.Series(rmse_dict)
    errors = pd.Series(class_error_dict)
    mccs = pd.Series(mcc_dict)
    roc_aucs = pd.Series(roc_auc_dict)

    # Combine all stats into one series
    results_series = pd.Series(
        data=np.array(
            [
                corr_coeffs.mean(),
                rmses.mean(),
                errors.mean(),
                mccs.mean(),
                roc_aucs.mean(),
            ]
        ),
        index=["mean_CC", "mean_rmse", "mean_class_error", "mean_mcc", "mean_ROC_AUC"],
    )

    return results_series

# Calculate stats

In [4]:
# Setup: loop through CV results files and add to a dict
fpaths = os.listdir(CV_obs_sim_folder)

fpath_dict = {}
for file in fpaths:
    var = file.split('_')[0]
    if var in var_li:
        if var in fpath_dict.keys():
            fpath_dict[var].append(file)
        else:
            fpath_dict[var] = [file]

fpath_dict

{'chla': ['chla_cont_LOOCV_fromPredictableNodes_metno.csv',
  'chla_cont_LOOCV_fromPredictableNodes_nomet.csv'],
 'cyano': ['cyano_cont_LOOCV_fromPredictableNodes_nomet.csv',
  'cyano_cont_LOOCV_fromPredictableNodes_metno.csv'],
 'colour': ['colour_cont_LOOCV_fromPredictableNodes_nomet.csv',
  'colour_cont_LOOCV_fromPredictableNodes_metno.csv'],
 'TP': ['TP_cont_LOOCV_fromPredictableNodes_nomet.csv',
  'TP_cont_LOOCV_fromPredictableNodes_metno.csv']}

In [5]:
# Set the standard deviation of the box cox transformed observations, for use in
# back-transforming the cyanobacteria predictions
sd_cyano = 1.29  # period 1981-2018, box cox transformed with lambda = 0.1

# Start looping through the CV results files and carry out post-processing
series_li = []
for var in var_li:
    for file in fpath_dict[var]:
        in_fpath = os.path.join(CV_obs_sim_folder, file)

        # Calculate stats
        stats_series = xval_postprocess(var, in_fpath, sd_cyano=sd_cyano)

        # Tidy
        stats_series.name = file.split('.')[0]
        series_li.append(stats_series)

# Combine stats for all variables into one dataframe and tidy
df = pd.concat(series_li, axis=1, keys=[s.name for s in series_li]).transpose()
df['Variable'] = [i.split('_')[0] for i in list(df.index)]
df['met_data'] = [i.split('_')[-1] for i in list(df.index)]
df = df.set_index(['Variable', 'met_data']).sort_index()

# df['nodes_used'] = [i.split('_')[2][4:-5] for i in list(df.index)] # Comment out if just using predictable nodes
# df['wind_yn'] = [i.split('_')[-2] for i in list(df.index)]
# df.loc[df['wind_yn']!='noWind','wind_yn'] = 'Wind'
# df = df.set_index(['Variable','met_data','nodes_used','wind_yn']).sort_index()

df = df.reset_index()

# Write to csv
df.to_csv('../Data/CrossValidation/Stats/LOOCV_results_bias-adj-cyano.csv', index=False)

df

Unnamed: 0,Variable,met_data,mean_CC,mean_rmse,mean_class_error,mean_mcc,mean_ROC_AUC
0,TP,metno,0.574923,3.958973,0.330263,0.335971,0.66625
1,TP,nomet,0.574923,3.958973,0.330263,0.335971,0.66625
2,chla,metno,0.546171,4.764127,0.338158,0.049825,0.519613
3,chla,nomet,0.541435,4.758443,0.318421,0.082392,0.530808
4,colour,metno,0.846322,8.780196,0.244737,0.436296,0.706
5,colour,nomet,0.823366,9.352747,0.236842,0.464775,0.727692
6,cyano,metno,0.374754,1.905897,0.313043,0.491434,0.7
7,cyano,nomet,0.473509,1.755389,0.31087,0.494643,0.702083
