Import the required packages and enable the autoreload extension

In [3]:
from ninolearn.IO.read_processed import data_reader
from ninolearn.IO.read_raw import ZC_simple_read
import numpy as np
import pandas as pd

from ninolearn.learn.fit import n_decades, lead_times, decade_color, decade_name
from ninolearn.learn.evaluation import evaluation_correlation, evaluation_decadal_correlation, evaluation_seasonal_correlation, evaluation_decadal_correlation_ZC
from ninolearn.learn.fit import cross_hindcast_dem
from ninolearn.plot.evaluation import plot_seasonal_skill_ZC
import matplotlib.pyplot as plt
from ninolearn.learn.fit import cross_training
from ninolearn.learn.fit import cross_hindcast_dem
from ninolearn.learn.models import DEM

# Fixed 365-day offset used as a "one year" step (leap days are not accounted for).
oneyear = pd.Timedelta(days=365)

%load_ext autoreload

%autoreload 2

Then determine the start and end times of the training period

In [5]:
import sys

version = 'mu28v4'
name = f'dem_{version}'

# The network analysis data starts in the last month of its start year
# (1951-12), so the usable period has to start in 1952, with some months added
# for the values lost in interpolation. That is why t_start/t_end are derived
# with fixed day offsets from the first/last time stamp of the ZC output.
# TODO: fix this by backwards interpolating the first values of the year and
# finding out what is happening with the nms
times = np.unique(ZC_simple_read(version)['time'])

t_start = times[0] + pd.Timedelta(2*365 + 90, 'D')
t_end = times[-1] - pd.Timedelta(90, 'D')

print(f'tstart = {t_start} and tend = {t_end} (train)')

# Only a single time series is read in this cell, so one comparison suffices
# (the original tested the same condition twice).
if t_end < pd.Timestamp('1990-01-01'):
    raise ValueError('the timeseries is too short!')
    

tstart = 1952-04-07 14:17:30 and tend = 1994-08-26 03:33:20 (train)


Read the raw ZC data and preprocess it

In [None]:
from ninolearn.IO.read_raw import ZC_raw, ZC_h, ZC_oni
from ninolearn.preprocess.prepare import prep_nms
from ninolearn.plot.ZC_dem_plots import nms_plots

# Read the raw ZC data and save it to a 1x1 grid file in the processed
# directory; this also makes the fields of h and sst.
# `version` (defined together with t_start/t_end) is used here: the
# previously referenced `train_version` is not defined in this notebook.
ZC_raw(version)

ZC_h(version)
ZC_oni(version)

# NOTE(review): 0.99 is presumably a threshold used when computing the network
# metrics -- confirm against prep_nms.
prep_nms(version, 0.99, t_start, t_end)


Define a pipeline for getting all the data


In [6]:
import numpy as np
from sklearn.preprocessing import StandardScaler

from ninolearn.utils import include_time_lag
from ninolearn.IO.read_processed import data_reader


def pipeline(lead_time, timelag=False):
    """
    Data pipeline for the processing of the data before the Deep Ensemble
    is trained.

    The time window, the data set version and the one-year offset are taken
    from the notebook-level names ``t_start``, ``t_end``, ``oneyear`` and
    ``version``.

    :type lead_time: int
    :param lead_time: The lead time in month. Only used when ``timelag`` is
    True.

    :type timelag: bool
    :param timelag: If False (default), the scaled features and the ONI label
    are returned on the full time axis of the reader, without any lead-time
    shift and without lagged copies of the features. If True, the features
    are truncated and expanded with ``include_time_lag`` and the label is
    offset by ``lead_time`` plus the internal time lag and shift.

    :returns: The feature "X" (at observation time), the label "y" (at lead
    time when ``timelag`` is True) and the time axis "timey" of the label
    (last month of the target season).
    """
    reader = data_reader(startdate=(t_start + oneyear),
                         enddate=(t_end - 2*oneyear),
                         lon_min=124, lon_max=280,
                         lat_min=-19, lat_max=19)

    # indices (IOD is unavailable in the ZC87 model)
    oni = reader.read_csv('oni_ZC_' + version)
    h = reader.read_csv('h_mean_ZC_' + version)

    # network metrics of the SST anomalies
    network_sst = reader.read_statistic('network_metrics', variable='sst',
                                        dataset='ZC_25x25_' + version,
                                        processed="anom")
    c2 = network_sst['fraction_clusters_size_2']
    H = network_sst['corrected_hamming_distance']

    # one column per feature, then scale each feature
    feature_unscaled = np.stack((oni, h, c2, H), axis=1)
    Xorg = StandardScaler().fit_transform(feature_unscaled)

    # set nans to 0.
    Xorg = np.nan_to_num(Xorg)

    yorg = oni.values

    if not timelag:
        # No lagging requested: hand back the full, unshifted arrays.
        return Xorg, yorg, oni.index

    # number of lagged months that are appended to each sample
    time_lag = 12

    # shift such that lead time corresponds to the definition of lead time
    shift = 3

    offset = lead_time + shift

    # Chop off the end of the features because the matching labels are offset
    # by this amount: e.g. if the data runs until 2012, the X values for 2012
    # are removed because december 2011 is used to predict december 2012.
    # include_time_lag then staggers the data with 1 month shifts so that at
    # each moment of input the n_lags months before are available as well.
    X = include_time_lag(Xorg[:-offset, :], n_lags=time_lag)

    # Labels are offset by lead_time to predict into the future and by
    # time_lag because include_time_lag shifts the X values forward by
    # n_lags=time_lag.
    y = yorg[offset + time_lag:]

    # time axis of the label
    timey = oni.index[offset + time_lag:]

    return X, y, timey