# Dataset Preparation

In [25]:
import urllib.request
import zipfile
import pandas as pd
import numpy as np
import pickle
import os
import numpy as np
from tqdm.notebook import tqdm
from random import shuffle
import torch
from torch.utils.data import Dataset

class WeatherJenaDataset(Dataset):
    MEAN = [ 9.88656343e+02,  9.10820659e+00,  2.83194958e+02,  4.59960541e+00,
        7.59060084e+01,  1.33550981e+01,  9.35695962e+00,  3.99805597e+00,
        5.91355033e+00,  9.46637099e+00,  1.21699436e+03, -5.94181630e-01,
       -3.91512714e-01, -9.62158759e-01, -7.09400721e-01, -5.43022767e-05,
       -7.24215306e-05,  5.28237873e-02, -1.62425716e-02]
    STD = [ 8.29746565,  8.65494994,  8.72474584,  6.97227477, 16.55533649,
        7.69473767,  4.20825963,  4.8177406 ,  2.67125215,  4.26005455,
       40.95770444,  2.0129306 ,  1.56150746,  3.12732207,  2.61966312,
        0.70709063,  0.70713733,  0.70062267,  0.71140285]

    def download_dataset(self, root, download):
        path = os.path.join(*[root, 'data.pkl'])
        if not os.path.exists(path) and download:
            # download dataset and import with pandas
            url='https://storage.googleapis.com/tensorflow/tf-keras-datasets/jena_climate_2009_2016.csv.zip'
            print('Downloading dataset...')
            filehandle, _ = urllib.request.urlretrieve(url)
            zip_file_object = zipfile.ZipFile(filehandle, 'r')
            first_file = zip_file_object.namelist()[0]
            df = pd.read_csv(zip_file_object.open(first_file, 'r'))
            df = self.prepare_dataset(df)
            os.makedirs(root, exist_ok=True)
            pd.to_pickle(df, path)
            print('Download complete!')
        else:
            assert os.path.exists(path)
            df = pd.read_pickle(path)
            print('Files already downloaded and verified')
        return df        

    def prepare_dataset(self, df):
        # subsample
        print(df.shape, self.__dir__())
        df = df.iloc[5::self.subsample_rate]
        date_time = pd.to_datetime(df.pop('Date Time'), format='%d.%m.%Y %H:%M:%S')
        
        # decompose wind speed
        wv = df['wv (m/s)']
        bad_wv = wv == -9999.0
        wv.loc[bad_wv] = 0.0
        max_wv = df['max. wv (m/s)']
        bad_max_wv = max_wv == -9999.0
        max_wv.loc[bad_max_wv] = 0.0
        # df['wv (m/s)'].min()
        wv = df.pop('wv (m/s)')
        max_wv = df.pop('max. wv (m/s)')
        wd_rad = df.pop('wd (deg)')*np.pi / 180
        df.loc['Wx'] = wv*np.cos(wd_rad)
        df.loc['Wy'] = wv*np.sin(wd_rad)
        df.loc['max Wx'] = max_wv*np.cos(wd_rad)
        df.loc['max Wy'] = max_wv*np.sin(wd_rad)

        # decompose day/year signal
        day = 24*60*60
        year = (365.2425)*day
        timestamp_s = date_time.map(pd.Timestamp.timestamp)
        df.loc['Day sin'] = np.sin(timestamp_s * (2 * np.pi / day))
        df.loc['Day cos'] = np.cos(timestamp_s * (2 * np.pi / day))
        df.loc['Year sin'] = np.sin(timestamp_s * (2 * np.pi / year))
        df.loc['Year cos'] = np.cos(timestamp_s * (2 * np.pi / year))

        return df

    def split_dataset(self, df, train):
        n = len(df)
        if train:
            return df[0:int(n*0.7)]
        else:
            return df[int(n*0.7):]

    def __init__(self, root, input_width=12, label_shift=2, train=True, download=True, subsample_rate=6):
        super().__init__()
        self.subsample_rate = subsample_rate
        self.label_shift = label_shift
        self.input_width = input_width
        self.ds = self.split_dataset(self.download_dataset(root, download), train)
    
    def __len__(self):
        return len(self.ds) - self.input_width - self.label_shift

    def __getitem__(self, idx):
        x = self.ds[idx:idx+self.input_width]
    y = self.ds[idx+self.input_width+self.label_shift-1]['T (degC)']
    x = (x - np.array(self.mean).reshape((1,1,-1))) / np.array(self.std).reshape((1,1,-1))  # normalize input
    x = np.transpose(x, [2, 0, 1])  # transpose to (channels, sequence length, input dimension)
    return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)


In [18]:
WeatherJenaDataset('data', train=False).ds.head()

Files already downloaded and verified


Unnamed: 0,p (mbar),T (degC),Tpot (K),Tdew (degC),rh (%),VPmax (mbar),VPact (mbar),VPdef (mbar),sh (g/kg),H2OC (mmol/mol),rho (g/m**3)
294419,991.45,14.24,288.11,14.05,98.8,16.26,16.06,0.2,10.14,16.2,1194.38
294425,991.42,16.58,290.45,16.21,97.7,18.9,18.46,0.43,11.66,18.62,1183.61
294431,991.46,17.57,291.44,16.02,90.6,20.12,18.23,1.89,11.52,18.39,1179.74
294437,991.32,19.23,293.12,16.03,81.7,22.33,18.24,4.09,11.53,18.4,1172.87
294443,991.14,20.19,294.1,16.17,77.7,23.7,18.41,5.28,11.64,18.58,1168.74


In [None]:
from torch.utils.data import DataLoader

train_ds = WeatherJenaDataset('data')
test_ds  = WeatherJenaDataset('data', train=False)

BATCH_SIZE = 32

train_dl = DataLoader(dataset=train_ds, batch_size=BATCH_SIZE, 
                num_workers=0, drop_last=True, shuffle=True)
test_dl  = DataLoader(dataset=test_ds, batch_size=BATCH_SIZE, 
                num_workers=0, shuffle=False)

Files already downloaded and verified
Files already downloaded and verified


In [None]:
import matplotlib.pyplot as plt

(x, y) = next(iter(train_dl))
plt.figure(figsize=(20,3))
for i in range(len(x[::12])):
    plt.plot(np.arange(len(x[i])) + i * 14, x[i][:, 1]*train_ds.STD[1] + train_ds.MEAN[1], c='blue', marker='o')
    plt.scatter([13 + i * 14], [y[i]], color='red', marker='x')

plt.ylabel('Temp')
plt.xlabel('Timestep')

# Model Definition

In [23]:
import torch
from torch import nn

class WeatherLSTM(nn.Module):

    def __init__(self, num_features_in: int, hidden_dim: int, 
                    num_features_out: int):

        super().__init__()
        self.hidden_dim = hidden_dim

        self.lstm = nn.LSTM(num_features_in, hidden_dim, batch_first=True)

        self.net = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_features_out)
        )

    def forward(self, X: torch.Tensor):
        _, (h_n, _) = self.lstm(X)
        h_n = h_n[0]
        output = self.net(h_n)[:, 0]
        return output

# Training Loop

In [22]:
def eval_mae(net: nn.Module, data_loader: torch.utils.data.DataLoader, 
             device: torch.device):
    err, tot = 0, 0
    with torch.no_grad():
        for x, y in tqdm(data_loader):
            x, y = x.to(device), y.to(device)
            y_pred = model(x)
            err += (y-y_pred).abs().sum() 
            total += y_pred.size(0)

    return err/total # TODO return the MAE

In [None]:
from torch.optim import SGD
from datetime import datetime


num_hidden      = 20
num_epochs      = 3
learning_rate   = 0.005
num_features_in = 19
num_features_out= 1

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = WeatherLSTM(num_features_in=num_features_in, hidden_dim=num_hidden, 
                       num_features_out=num_features_out).to(device)

loss_fun = nn.MSELoss() # what loss function are we gonna use here?
opt = SGD(model.parameters(), lr=learning_rate)

now = datetime.now()
  
for e in tqdm(range(num_epochs)):

    model.eval()
    
    train_err = eval_mae(model, train_dl, device) # evaluate the error on the train set
    test_err = eval_mae(model, test_dl, device)  # evaluate the error on the test set

    print(f'Epoch {e:03d} - Train MAE {train_err:.3f}\tTest MAE {test_err:.3f}')

    model.train()
    for i, (x, y) in enumerate(train_dl):
        x, y = x.to(device), y.to(device)

        # prepare for gradient computation
        opt.zero_grad()
        # perform forward step
        y_pred = model(x)
        loss = loss_fun(y, y_pred) # something
        if i % 200 == 0:
            print(f'loss {loss.cpu().item():.3f}')
        
        # perform backward step
        loss.backward()
        

        # adjust weights
        opt.step()
        


# Inspect Results

In [None]:
data = next(iter(test_dl))
y = data[1]
y_pred = model(data[0].to(device)).cpu().detach()
y, y_pred = y[:64], y_pred[:64]
for i, (yi, yi_pred) in enumerate(zip(y, y_pred)):
    plt.plot([i, i], [yi, yi_pred], color='red', alpha=.5, ls=":")
plt.plot(np.arange(len(y)), y, marker='.', lw=0, color='k', label='ground truth')
plt.plot(np.arange(len(y)), y_pred, marker='x', lw=0, color='red', label='guess')
plt.xticks([])
plt.ylabel('Temperature')
plt.legend(loc='lower right')