In [22]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np

In [3]:
class Model(nn.Module):
    def __init__(self, in_features, out_features):
        super(Model, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(in_features, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, out_features)
        )
        
    def forward(self, x):
        return self.layers(x)

In [11]:
df = pd.read_feather('data/river_wear_lagged.feather')
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 590201 entries, 2007-01-01 02:00:00 to 2023-11-01 00:00:00
Columns: 164 entries, Level Chester Le Street to Rainfall Tunstall -7d
dtypes: float16(164)
memory usage: 189.1 MB


In [43]:
targets = ['Level Durham New Elvet Bridge +15min',
           'Level Durham New Elvet Bridge +30min',
           'Level Durham New Elvet Bridge +60min',
           'Level Durham New Elvet Bridge +90min',
           'Level Durham New Elvet Bridge +120min']

df['day_of_year'] = df.index.dayofyear
df['day_of_year_sin'] = np.sin(2 * np.pi * df['day_of_year'] / 365)
df['day_of_year_cos'] = np.cos(2 * np.pi * df['day_of_year'] / 365)
df = df.drop(columns=['day_of_year'])

X = df.drop(targets, axis=1)
y = df[targets]

In [44]:
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer, make_column_selector

# Normalize the data
X_pipeline = ColumnTransformer(
    [
        ('Normalise level and flow', preprocessing.StandardScaler(), make_column_selector(pattern='Level|Flow')),
        ('Normalise rainfall', preprocessing.MinMaxScaler(), make_column_selector(pattern='Rainfall')),
    ],
    remainder='passthrough'
)

X = X_pipeline.fit_transform(X)


In [45]:
y_pipeline = preprocessing.StandardScaler()

y = y_pipeline.fit_transform(y)

In [46]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, shuffle=True, test_size=0.2)

train_loader = torch.utils.data.DataLoader(
    torch.utils.data.TensorDataset(
        torch.tensor(X_train, dtype=torch.float32), 
        torch.tensor(y_train, dtype=torch.float32)
    ), 
    batch_size=256,
    shuffle=True,
)

test_loader = torch.utils.data.DataLoader(
    torch.utils.data.TensorDataset(
        torch.tensor(X_val, dtype=torch.float32), 
        torch.tensor(y_val, dtype=torch.float32)
    ), 
    batch_size=256,
    shuffle=True
)

## Model training

In [47]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Model(X_train.shape[1], y_train.shape[1]).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

metrics = {
    'RMSE +15min': lambda y_pred, y_true: nn.functional.mse_loss(y_pred[:, 0], y_true[:, 0]),
    'RMSE +30min': lambda y_pred, y_true: nn.functional.mse_loss(y_pred[:, 1], y_true[:, 1]),
    'RMSE +60min': lambda y_pred, y_true: nn.functional.mse_loss(y_pred[:, 2], y_true[:, 2]),
    'RMSE +90min': lambda y_pred, y_true: nn.functional.mse_loss(y_pred[:, 3], y_true[:, 3]),
    'RMSE +120min': lambda y_pred, y_true: nn.functional.mse_loss(y_pred[:, 4], y_true[:, 4]),
}

In [48]:
import wandb
from tqdm.autonotebook import tqdm

def cycle(iterable):
    while True:
        for x in iterable:
            yield x

def validate_model(model):
    with torch.no_grad():
        model.eval()
        val_loss = 0
        val_metrics = {k: 0 for k in metrics}
        for x, y in test_loader:
            x, y = x.to(device), y.to(device)
            y_pred = model(x)
            val_loss += criterion(y_pred, y).item()
            for k in metrics:
                val_metrics[k] += metrics[k](y_pred, y).item()
                
        val_loss /= len(test_loader)
        for k in metrics:
            val_metrics[k] /= len(test_loader)
            
        wandb.log(
            {
                'val_loss': val_loss, 
                **{'val_' + k: v for k, v in val_metrics.items()}
            }
        )
        
        return val_loss
    

In [49]:
run = wandb.init(project='river-levels')

train_iter = cycle(train_loader)

train_steps = 100_000
val_freq = 1000
train_loss_smoothing = 100

train_losses = torch.zeros(train_loss_smoothing)

with tqdm(total=train_steps, desc="Train", unit="batch") as pbar:
    for i in range(train_steps):
        model.train()
        x, y = next(train_iter)
        x, y = x.to(device), y.to(device)
        y_pred = model(x)
        loss = criterion(y_pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        train_losses[i % train_loss_smoothing] = loss.item()
        if i % train_loss_smoothing == 0:
            wandb.log({'train_loss': train_losses.mean().item()})
            pbar.postfix['train_loss'] = train_losses.mean().item()
            pbar.update(train_loss_smoothing)
        
        if i % val_freq == 0:
            val_loss = validate_model(model)
            pbar.postfix['val_loss'] = val_loss
            pbar.update(val_freq) 

VBox(children=(Label(value='0.001 MB of 0.013 MB uploaded\r'), FloatProgress(value=0.09923719578641482, max=1.…

0,1
train_loss,▁█▃▃▂▁▂▂▁▁▂▁▂▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_RMSE +120min,█▁▁▁▁
val_RMSE +15min,█▁▁▁▁
val_RMSE +30min,█▁▁▁▁
val_RMSE +60min,█▁▁▁▁
val_RMSE +90min,█▁▁▁▁
val_loss,█▁▁▁▁

0,1
train_loss,0.00301
val_RMSE +120min,0.00545
val_RMSE +15min,0.00229
val_RMSE +30min,0.00269
val_RMSE +60min,0.00404
val_RMSE +90min,0.00435
val_loss,0.00376


Train:   0%|          | 0/100000 [00:00<?, ?batch/s]

KeyboardInterrupt: 

In [51]:
import pickle

torch.save(model.state_dict(), 'model.pt')

with open('X_pipeline.pkl', 'wb') as f:
    pickle.dump(X_pipeline, f)
with open('y_pipeline.pkl', 'wb') as f:
    pickle.dump(y_pipeline, f)
    
run.log_artifact('model.pt')
run.log_artifact('X_pipeline.pkl')
run.log_artifact('y_pipeline.pkl')
run.finish()