In [1]:
# Import packages
%matplotlib inline
import pandas as pd, numpy as np, matplotlib.pyplot as plt, calendar, os
import seaborn as sns; sns.set(style="ticks", color_codes=True)
from scipy.stats import boxcox

import bayes_net_utils as bn # Custom functions

# Intro

Make the "evidence" dataset for the historic period, i.e. the dataset used to drive predictions. It can't contain NaNs, and only includes columns which would be used in generating an operational seasonal forecast.

Derive this using the training data produced by notebook 04_MakeHistoricTrainingData, by selecting just the columns that are forecastable/measurable, and by filling NaNs.

Filling NaNs: there are more sophisticated ways this could be done (e.g. using the "impute" function in bnlearn?), but for now start off by just linearly interpolating or backwards filling missing values, exactly as I did to generate data matrices for use in cross validation of the network.

# Set up

In [2]:
# User-configurable settings

# Meteorological data source: one of 'metno', 'era5' or 's5' (System5)
met_source = 'era5'

# First and last year of data available for each met source
st_end_yr_dict = {
    'metno': [1981, 2018],
    'era5': [1981, 2019],
    's5': [1993, 2019],
}

# Input data (produced by notebook 04_MakeHistoricTrainingData)
if met_source == 's5':
    # For s5, read in the era5-based csv instead; its met cols are dropped later,
    # leaving just the lake and ecol data. The s5 met data is read from its own folder
    training_data_fpath = r'../Data/BN_TrainingData/TrainingData_GaussianBN_era5_1981-%s.csv' % (st_end_yr_dict['era5'][1])
    s5_met_folder = r'../../Data/Meteorological/07_s5_seasonal'
else:
    training_data_fpath = r'../Data/BN_TrainingData/TrainingData_GaussianBN_%s_1981-%s.csv' % (met_source, st_end_yr_dict[met_source][1])

# Folder in which the output csvs are stored
out_fold = r'../Data/DataForPrediction/Historic/%s' % met_source

# Read in and process historic training data, and write evidence if based on ERA5 or met.no

In [5]:
# Read in the historic training data, using the first csv column as the index
evidence_df = pd.read_csv(training_data_fpath, index_col=0)

# Fill NaNs in water chemistry and ecology. First linearly interpolate, filling at most
# one consecutive missing value (assignment rather than inplace=True, so the step is explicit)
evidence_df = evidence_df.interpolate(method='linear', limit=1)

# Then backwards fill up to two consecutive remaining NaNs in the columns listed below.
# N.B. .bfill() is used because the 'method' argument of fillna() is deprecated in pandas
for col in ['colour_prevSummer']:
    evidence_df[col] = evidence_df[col].bfill(limit=2)

# Just select measurable/predictable columns that will be used when generating forecast

if met_source in ['era5','metno']:
    # Drop the current-season columns that are not used as evidence
    evidence_df = evidence_df.drop(['TP','colour','chla','cyano'], axis=1)
    # Save to csv
    out_fname = 'DataForPrediction_GBN_%s_%s-%s.csv' %(met_source, st_end_yr_dict[met_source][0], st_end_yr_dict[met_source][1])
    evidence_df.to_csv(os.path.join(out_fold, out_fname))
    display(evidence_df.head())

    # Also generate prediction data based on the climate average of the chosen met source
    # (i.e. rain and wind_speed replaced by their mean over the whole period)
    evidence_df_metav = evidence_df.copy()
    evidence_df_metav['rain'] = evidence_df['rain'].mean()
    evidence_df_metav['wind_speed'] = evidence_df['wind_speed'].mean()
    # Save to csv
    out_fname_av = 'DataForPrediction_GBN_%s-av_%s-%s.csv' %(met_source, st_end_yr_dict[met_source][0], st_end_yr_dict[met_source][1])
    evidence_df_metav.to_csv(os.path.join(out_fold, out_fname_av))
    display(evidence_df_metav.head())

else: # If s5 data, drop all but the previous summer's values, and rename df for use below
    lake_df = evidence_df[['colour_prevSummer','TP_prevSummer','chla_prevSummer']]
    display(lake_df.head())


Unnamed: 0_level_0,rain,wind_speed,colour_prevSummer,TP_prevSummer,chla_prevSummer
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1981,508.556005,2.471487,26.666667,43.573016,16.08013
1982,557.770616,2.6735,26.666667,28.833333,8.33125
1983,459.499582,2.735913,26.666667,26.988095,5.975
1984,599.917631,2.415682,17.625,29.78125,6.05
1985,533.917587,2.390617,26.014881,26.5275,11.09


Unnamed: 0_level_0,rain,wind_speed,colour_prevSummer,TP_prevSummer,chla_prevSummer
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1981,513.939122,2.516122,26.666667,43.573016,16.08013
1982,513.939122,2.516122,26.666667,28.833333,8.33125
1983,513.939122,2.516122,26.666667,26.988095,5.975
1984,513.939122,2.516122,17.625,29.78125,6.05
1985,513.939122,2.516122,26.014881,26.5275,11.09


# Create training data using System5 (and ERA5 for late summer) data

Use the filled chemistry and ecology training data for the historic period, combined with seasonal met stats calculated from System5 data, to create csvs with training data for the historic period. Seasonal data for deriving predictions are derived twice per year (summer, and late summer), and for all 25 members of S5. The two seasons considered include the following data:

* Summer: Forecast comes out in April for the period May-October, based solely on S5 data
* Late summer: update to the summer forecast. Forecast issued in July, and uses ERA5 data for May and June, and S5 data for July-Oct

In [12]:
if met_source == 's5':

    # S5 member numbers in format '01','02'... (should be present in s5 met data folder)
    member_li = ["%.2d" % i for i in range(1,26)]
    # Seasons of interest (must match filenames in s5 met data folder)
    season_li = ['summer','late_summer']

    # First and last year of the met data; used to clip the evidence and to name the output
    first_yr = st_end_yr_dict[met_source][0]
    last_yr = st_end_yr_dict[met_source][1]

    for season in season_li:

        # For the late_summer forecasting period, era5 data is read in so that the early
        # summer months are populated with observed rather than forecasted data
        is_late_summer = (season == 'late_summer')
        if is_late_summer:
            era5_fpath = r'../../Data/Meteorological/06_era5/era5_morsa_1980-2019_daily.csv'
            era5_df = bn.read_era5_csv(era5_fpath) # Read in, calculate wind and reformat

        for member in member_li:

            # Daily system5 data for this season and member (read in, calculate wind and reformat)
            s5_df = bn.read_s5_csv(s5_met_folder, season, member)

            # Late summer: add values for May and June from ERA5 to the s5 data. Otherwise use s5 as is
            met_df = bn.late_summer_met_data(era5_df, s5_df) if is_late_summer else s5_df

            # Daily -> seasonal aggregation of the met data
            summer_met_df = bn.daily_to_summer_season(met_df)

            # Evidence df: previous summer's lake chem and ecol data joined to the seasonal met data,
            # clipped to the start and end years of the met data
            evidence_df = lake_df.join(summer_met_df).loc[first_yr:last_yr]

            # Save to csv (one file per season and member)
            out_fname = 'DataForPrediction_GBN_%s_%s-%s_%s_%s.csv' % (met_source, first_yr, last_yr, season, member)
            out_fpath = os.path.join(out_fold, out_fname)
            evidence_df.to_csv(out_fpath)

In [13]:
# # Some test output for checking (needs era5_df, s5_df and met_df from the s5 cell above, i.e. met_source = 's5'). All looks good.
# era5 = era5_df.loc['1994-04-28':'1994-11-10','wind_speed']
# s5 = s5_df.loc['1994-04-28':'1994-11-10','wind_speed']
# joined = met_df.loc['1994-04-28':'1994-11-10','wind_speed']
# df = pd.concat({'era5':era5,'s5':s5,'joined':joined}, axis=1)
# df.to_csv('test.csv')
# df.plot()