In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np

In [3]:
class Model(nn.Module):
    def __init__(self, in_features, out_features):
        super(Model, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(in_features, 256),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(64, out_features)
        )
        
    def forward(self, x):
        return self.layers(x)

In [6]:
df = pd.read_feather('data/river_wear_lagged.feather')
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 590641 entries, 2007-01-01 12:00:00 to 2023-11-06 00:00:00
Columns: 324 entries, Level Chester Le Street to Rainfall Tunstall -7d
dtypes: float16(324)
memory usage: 369.5 MB


In [7]:
targets = [
    col 
    for col 
    in df.columns 
    if col.startswith('Level Durham New Elvet Bridge +')
]

df['day_of_year'] = df.index.dayofyear
df['day_of_year_sin'] = np.sin(2 * np.pi * df['day_of_year'] / 365)
df['day_of_year_cos'] = np.cos(2 * np.pi * df['day_of_year'] / 365)
df = df.drop(columns=['day_of_year'])

X = df.drop(targets, axis=1)
y = df[targets]

In [8]:
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer, make_column_selector

# Normalize the data
X_pipeline = ColumnTransformer(
    [
        ('Normalise level and flow', preprocessing.StandardScaler(), make_column_selector(pattern='Level|Flow')),
        ('Normalise rainfall', preprocessing.MinMaxScaler(), make_column_selector(pattern='Rainfall')),
    ],
    remainder='passthrough'
)

X = X_pipeline.fit_transform(X)


In [9]:
y_pipeline = preprocessing.StandardScaler()

y = y_pipeline.fit_transform(y)

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X, y, shuffle=True, test_size=0.2)

In [18]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)
model.score(X_val, y_val)

  scl = avg_as_array.dtype.type(a.size/avg_as_array.size)


0.9358188895852896

In [10]:


# train_loader = torch.utils.data.DataLoader(
#     torch.utils.data.TensorDataset(
#         torch.tensor(X_train, dtype=torch.float32), 
#         torch.tensor(y_train, dtype=torch.float32)
#     ), 
#     batch_size=8192,
#     num_workers=4
# )

# test_loader = torch.utils.data.DataLoader(
#     torch.utils.data.TensorDataset(
#         torch.tensor(X_val, dtype=torch.float32), 
#         torch.tensor(y_val, dtype=torch.float32)
#     ), 
#     batch_size=8192,
#     num_workers=4
# )

## Model training

In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = Model(X_train.shape[1], y_train.shape[1]).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

target_time_offsets = list(range(15, 12*60+15, 15))

metrics = {
   f"RMSE +{offset} mins": lambda y_pred, y_true: torch.sqrt(criterion(y_pred[:,i], y_true[:, i]))
    for i, offset
    in enumerate(target_time_offsets)
}

In [14]:
import wandb
from tqdm.autonotebook import tqdm

def cycle(iterable):
    while True:
        for x in iterable:
            yield x

def validate_model(model):
    with torch.no_grad():
        model.eval()
        val_loss = 0
        val_metrics = {k: 0 for k in metrics}
        # for x, y in test_loader:
        #     x, y = x.to(device), y.to(device)
        #     y_pred = model(x)
        #     val_loss += criterion(y_pred, y).item()
        #     for k in metrics:
        #         val_metrics[k] += metrics[k](y_pred, y).item()
                
        # val_loss /= len(test_loader)
        # for k in metrics:
        #     val_metrics[k] /= len(test_loader)
        x,y = torch.tensor(X_val, dtype=torch.float32), torch.tensor(y_val, dtype=torch.float32)
        x, y = x.to(device), y.to(device)
        y_pred = model(x)
        val_loss = criterion(y_pred, y).item()
        for k in metrics:
            val_metrics[k] = metrics[k](y_pred, y).item()
            
        wandb.log(
            {
                'val_loss': val_loss, 
                **{'val_' + k: v for k, v in val_metrics.items()}
            }
        )
        
        return val_loss
    

  from tqdm.autonotebook import tqdm


In [17]:
run = wandb.init(project='river-levels')

# train_iter = cycle(train_loader)

train_steps = 100_000
val_freq = 1000
train_loss_smoothing = 100

train_losses = torch.zeros(train_loss_smoothing)
x, y = X_train, y_train
x, y = torch.tensor(x, dtype=torch.float32).to(device), torch.tensor(y, dtype=torch.float32).to(device)

with tqdm(total=train_steps, desc="Train", unit="batch") as pbar:
    for i in range(train_steps):
        y_pred = model(x)
        loss = criterion(y_pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        train_losses[i % train_loss_smoothing] = loss.item()
        if i % train_loss_smoothing == 0:
            wandb.log({'train_loss': train_losses.mean().item()})
            pbar.set_postfix({'train_loss': train_losses.mean().item()})
            pbar.update(train_loss_smoothing)
        
        # if i % val_freq == 0:
        #     val_loss = validate_model(model)
        #     model.train()
        #     pbar.set_postfix({'val_loss': val_loss,})
        #     pbar.update(val_freq) 

VBox(children=(Label(value='0.001 MB of 0.015 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.088945…

0,1
train_loss,▁█
val_RMSE +105 mins,▁
val_RMSE +120 mins,▁
val_RMSE +135 mins,▁
val_RMSE +15 mins,▁
val_RMSE +150 mins,▁
val_RMSE +165 mins,▁
val_RMSE +180 mins,▁
val_RMSE +195 mins,▁
val_RMSE +210 mins,▁

0,1
train_loss,0.12493
val_RMSE +105 mins,0.55245
val_RMSE +120 mins,0.55245
val_RMSE +135 mins,0.55245
val_RMSE +15 mins,0.55245
val_RMSE +150 mins,0.55245
val_RMSE +165 mins,0.55245
val_RMSE +180 mins,0.55245
val_RMSE +195 mins,0.55245
val_RMSE +210 mins,0.55245


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

Train:   0%|          | 0/100000 [00:00<?, ?batch/s]

KeyboardInterrupt: 

In [None]:
import pickle

torch.save(model.state_dict(), 'model.pt')

with open('X_pipeline.pkl', 'wb') as f:
    pickle.dump(X_pipeline, f)
with open('y_pipeline.pkl', 'wb') as f:
    pickle.dump(y_pipeline, f)
    
run.log_artifact('model.pt')
run.log_artifact('X_pipeline.pkl')
run.log_artifact('y_pipeline.pkl')
run.finish()