In [1]:
import os
from sklearn.metrics import mean_absolute_error
import pandas as pd
import tensorflow as tf
import datetime
import numpy as np


In [2]:
zip_path = tf.keras.utils.get_file(
origin = 'https://storage.googleapis.com/tensorflow/tf-keras-datasets/jena_climate_2009_2016.csv.zip',
fname = 'jena_climate_2009_2016.csv.zip',
extract = True)

In [3]:
print(zip_path)

/Users/rachel.lund/.keras/datasets/jena_climate_2009_2016.csv.zip


In [4]:
csv_path, _ = os.path.splitext(zip_path)
print(csv_path)

/Users/rachel.lund/.keras/datasets/jena_climate_2009_2016.csv


In [5]:
df = pd.read_csv(csv_path)

In [6]:
#--- slice [start:stop:step, starting from index 5 take every 6th record]
df = df[5::6]


In [7]:
df.columns

Index(['Date Time', 'p (mbar)', 'T (degC)', 'Tpot (K)', 'Tdew (degC)',
       'rh (%)', 'VPmax (mbar)', 'VPact (mbar)', 'VPdef (mbar)', 'sh (g/kg)',
       'H2OC (mmol/mol)', 'rho (g/m**3)', 'wv (m/s)', 'max. wv (m/s)',
       'wd (deg)'],
      dtype='object')

In [8]:
#feature preparation

wv = df['wv (m/s)']
bad_wv = wv == -9999.0
wv[bad_wv] = 0.0
max_wv = df['max. wv (m/s)']
bad_max_wv = max_wv == -9999.0
max_wv[bad_max_wv] = 0.0

In [9]:
date_time = pd.to_datetime(df.pop('Date Time'), format = '%d.%m.%Y %H:%M:%S')

In [10]:
timestamp_s = date_time.map(datetime.datetime.timestamp)
day = 24*60*60
year = (365.2425)*day

In [11]:
df['Day sin'] = np.sin(timestamp_s * (2*np.pi/day))
df['Day cos'] = np.cos(timestamp_s * (2*np.pi/day))
df['Year sin'] = np.sin(timestamp_s * (2*np.pi/year))
df['Year sin'] = np.cos(timestamp_s * (2*np.pi/year))

In [12]:
#--- split into train and test
column_indices = {name: i for i, name in enumerate(df.columns)}
n = len(df)

In [13]:
train_df = df[0:int(n*0.7)]
val_df = df[int(n*0.7):int(n*0.9)]
test_df = df[int(n*0.9):]

num_features = df.shape[1]
num_features

17

In [14]:
train_mean = train_df.mean()
train_std = train_df.std()

train_df = (train_df-train_mean)/train_std
val_df = (val_df-train_mean)/train_std
test_df = (test_df-train_mean)/train_std

In [27]:
#--- windowing in tensorflow

class WindowGenerator():
    def __init__(self, 
                 input_width,
                 label_width,
                 shift,
                 train_df,
                 val_df,
                 test_df, 
                 label_columns = None):
        
        #--store the raw data
        self.train_df = train_df
        self.val_df = val_df
        self.test_df = test_df
        
        #--workout label column indices
        self.label_columns = label_columns
        if label_columns is not None:
            self.label_columns_indices = {name: i for i, name in enumerate(label_columns)}
        self.column_indices = {name: i for i, name in enumerate(train_df.columns)}
        
        #Work out the window parameters
        
        self.input_width = input_width
        self.label_width = label_width
        self.shift = shift
        
        self.total_window_size = input_width + shift
        
        self.input_slice = slice(0, input_width)
        self.input_indices = np.arange(self.total_window_size)[self.input_slice]
        
        self.label_start = self.total_window_size - self.label_width
        self.label_slice = slice(self.label_start, None)
        self.label_indices = np.arange(self.total_window_size)[self.label_slice]
                                 
                                 
    def split_window(self, features):
        inputs = features[:,self.input_slice,:]
        labels = features[:,self.label_slice,:]
        if self.label_columns is not None:
            labels = tf.stack(
            [labels[:, :, self.column_indices[name]] for name in self.label_columns], axis = -1)
        inputs.set_shape([None, self.input_width, None])
        labels.set_shape([None, self.label_width, None])
        return inputs, labels

    def make_dataset(self, data):
        data = np.array(data, dtype = np.float32)
        ds = tf.keras.preprocessing.timeseries_dataset_from_array(
        data = data,
        targets = None,
        sequence_length = self.total_window_size,
        sequence_stride = 1,
        shuffle = False,
        batch_size = 32,
        )

        ds = ds.map(self.split_window)
        return ds

    @property
    def train(self):
        return self.make_dataset(self.train_df)

    @property
    def val(self):
        return self.make_dataset(self.val_df)

    @property
    def test(self):
        return self.make_dataset(self.test_df)                         

In [28]:
single_step_window = WindowGenerator(
input_width = 1,
label_width = 1,
shift = 1,
train_df = train_df,
val_df = val_df,
test_df = test_df,
label_columns = ['T (degC)']
)

In [29]:
dir(single_step_window)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'column_indices',
 'input_indices',
 'input_slice',
 'input_width',
 'label_columns',
 'label_columns_indices',
 'label_indices',
 'label_slice',
 'label_start',
 'label_width',
 'make_dataset',
 'shift',
 'split_window',
 'test',
 'test_df',
 'total_window_size',
 'train',
 'train_df',
 'val',
 'val_df']

In [30]:
class Baseline(tf.keras.Model):
    def __init__(self, label_index = None):
        super().__init__()
        self.label_index = label_index
    def call(self, inputs):
        if self.label_index is None:
            return inputs
        result = inputs[:,:,self.label_index]
        return result[:,:, tf.newaxis]

In [31]:
baseline = Baseline(label_index = column_indices['T (degC)'])
baseline.compile(loss = tf.losses.MeanSquaredError(),
                metrics = [tf.metrics.MeanAbsoluteError()])
val_performance = {}
val_performance['Baseline'] = baseline.evaluate(single_step_window.val)

val_performance



{'Baseline': [0.012845635414123535, 0.07846628874540329]}

Unnamed: 0,p (mbar),T (degC),Tpot (K),Tdew (degC),rh (%),VPmax (mbar),VPact (mbar),VPdef (mbar),sh (g/kg),H2OC (mmol/mol),rho (g/m**3),wv (m/s),max. wv (m/s),wd (deg),Day sin,Day cos,Year sin
5,0.945308,-1.982473,-2.041888,-1.918973,1.117102,-1.302851,-1.477323,-0.790424,-1.480036,-1.482697,2.218524,-1.266853,-1.263871,0.221831,0.000047,1.414246,1.428455
11,0.959770,-2.078372,-2.138166,-2.060964,1.044617,-1.330143,-1.534354,-0.786272,-1.536190,-1.539035,2.325708,-1.240790,-1.263871,0.194434,0.366088,1.366061,1.428445
17,0.986284,-2.070284,-2.132435,-2.045187,1.062738,-1.328843,-1.527225,-0.788348,-1.528703,-1.531992,2.323998,-1.286400,-1.263871,-0.069265,0.707185,1.224790,1.428434
23,1.004362,-2.098014,-2.161090,-2.096820,1.008375,-1.336641,-1.546235,-0.782121,-1.547420,-1.553119,2.358913,-1.338526,-1.371630,0.761784,1.000091,1.000059,1.428423
29,1.061006,-2.165028,-2.232152,-2.187178,0.984214,-1.353535,-1.579503,-0.782121,-1.581113,-1.585982,2.446320,-1.143054,-1.156113,-0.185703,1.224845,0.707185,1.428411
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294353,0.169167,0.904892,0.886564,1.571997,0.947972,0.776492,1.984916,-0.493604,1.979016,1.977352,-0.882480,0.485880,0.412852,0.890779,-1.348771,0.425358,-1.155769
294359,0.266787,0.879473,0.853325,1.405624,0.621793,0.742703,1.680752,-0.283962,1.672043,1.672192,-0.822662,0.322986,0.068025,0.438725,-1.192740,0.759931,-1.155220
294365,0.217374,0.740824,0.719224,1.425703,1.086900,0.559461,1.716396,-0.605690,1.709479,1.707403,-0.716211,-0.817267,-0.992320,-0.416296,-0.955423,1.042724,-1.154670
294371,0.213759,0.710783,0.689423,1.412795,1.147303,0.521772,1.692633,-0.645127,1.687018,1.683929,-0.689842,-0.888941,-0.776803,0.582561,-0.652992,1.254466,-1.154119


In [22]:
df = df.loc[:, ['T (degC)']]
df.head()df = df.loc[:, ['T (degC)']]
df.head()

prediction_periods = 10
y_hats = []

for i in reversed(range(prediction_periods)):
    print(i)
    h = i + 1
    window_index = (len(df) - h)
    print(f'window_index: {window_index}')
    y_win = df[:window_index].tail(10)
    print(f'y_win: {y_win}')
    y_hats.append(y_win.mean())
    print(f'y_hat: {y_win.mean()}')

9
window_index: 11671
y_win:         T (degC)
419831      3.55
419867      4.27
419903      5.80
419939      6.19
419975      6.27
420011      4.75
420047      6.03
420083      4.40
420119      3.40
420155      2.53
y_hat: T (degC)    4.719
dtype: float64
8
window_index: 11672
y_win:         T (degC)
419867      4.27
419903      5.80
419939      6.19
419975      6.27
420011      4.75
420047      6.03
420083      4.40
420119      3.40
420155      2.53
420191      3.45
y_hat: T (degC)    4.709
dtype: float64
7
window_index: 11673
y_win:         T (degC)
419903      5.80
419939      6.19
419975      6.27
420011      4.75
420047      6.03
420083      4.40
420119      3.40
420155      2.53
420191      3.45
420227      3.23
y_hat: T (degC)    4.605
dtype: float64
6
window_index: 11674
y_win:         T (degC)
419939      6.19
419975      6.27
420011      4.75
420047      6.03
420083      4.40
420119      3.40
420155      2.53
420191      3.45
420227      3.23
420263     -1.82
y_hat: T (degC) 

In [24]:
df.tail(prediction_periods)

Unnamed: 0,T (degC)
420191,3.45
420227,3.23
420263,-1.82
420299,-3.73
420335,-0.92
420371,0.38
420407,-4.17
420443,-5.17
420479,-0.71
420515,-0.42


In [25]:
y_hats

[T (degC)    4.719
 dtype: float64,
 T (degC)    4.709
 dtype: float64,
 T (degC)    4.605
 dtype: float64,
 T (degC)    3.843
 dtype: float64,
 T (degC)    2.851
 dtype: float64,
 T (degC)    2.132
 dtype: float64,
 T (degC)    1.695
 dtype: float64,
 T (degC)    0.675
 dtype: float64,
 T (degC)   -0.282
 dtype: float64,
 T (degC)   -0.693
 dtype: float64]

In [23]:
#--- this model isn't great
mean_absolute_error(df.tail(prediction_periods), y_hats)

3.468