In [16]:
from functools import partial

import numpy as np
import plotly.express as px
import torch
from torch import Tensor
from torch.utils import data as torch_data
from torch.amp import autocast, GradScaler
from torch.utils.data import DataLoader as DL
from torch.optim.lr_scheduler import ExponentialLR, LRScheduler, OneCycleLR

from models import UNet
from datasets import PreprocessedOpenFWI

In [17]:
# Build the U-Net (5 input channels -> 1 output channel, 32 starting features,
# depth 4), then move it to the GPU as a separate step.
model = UNet(in_channels=5, out_channels=1, start_features=32, depth=4)
model = model.cuda()

In [18]:
# Training split. NOTE(review): `nb_files_to_load=100` presumably caps how many
# preprocessed files are read -- confirm against PreprocessedOpenFWI.
train_dataset = PreprocessedOpenFWI(train=True, norm_output=True, nb_files_to_load=100)
train_loader = torch_data.DataLoader(train_dataset, batch_size=128, shuffle=True)

# Held-out split. The loader must wrap `test_dataset`: it previously wrapped
# `train_dataset`, so any evaluation would silently have run on training data.
# Shuffling is pointless for evaluation, so keep a deterministic order.
test_dataset = PreprocessedOpenFWI(train=False, norm_output=True)
test_loader = torch_data.DataLoader(test_dataset, batch_size=128, shuffle=False)

Output()

Output()

Output()

Output()

In [19]:
mk_optimizer = partial(torch.optim.AdamW, lr=0.001)
scaler = GradScaler(device="cuda")

In [20]:
from typing import Callable

from rich.progress import track
from pandas import DataFrame as DF


def fit(epochs:int,
        model: torch.nn.Module,
        scheduler: LRScheduler,
        optimizer: torch.optim.Optimizer,
        train_loader: DL,
        # evaluation_func,
        scaler: "GradScaler | None" = None,
    ) -> DF:
    """Train `model` with CUDA mixed precision and return per-step metrics.

    Each step runs the forward pass under autocast, computes the mean absolute
    error between prediction and target, and applies a scaled backward pass
    and optimizer step through a gradient scaler.

    Args:
        epochs: Number of passes over `train_loader`.
        model: Network to train; its inputs are moved to the GPU with `.cuda()`.
        scheduler: Learning-rate scheduler, stepped once per batch except on
            the first batch of every epoch.
        optimizer: Optimizer updating `model`'s parameters.
        train_loader: Iterable yielding `(x, y)` batches.
        scaler: Gradient scaler to use. When omitted, a fresh CUDA scaler is
            created for this call, so a previous diverged run (which keeps
            shrinking the loss scale) cannot leak its state into this one.

    Returns:
        A DataFrame with one row per training step holding `step`, `epoch`,
        `batch_train_loss` and every entry of the optimizer's last param group
        (e.g. `lr`). `train_epoch_loss` is filled only on the last row of each
        epoch and is the sample-weighted mean of the batch losses.
    """
    if scaler is None:
        scaler = GradScaler(device="cuda")

    metrics: list[dict] = []
    for epoch in range(epochs):
        model.train()
        weighted_loss_sum = 0.0
        nb_samples = 0
        nb_records_before = len(metrics)
        # The description is formatted once, before the loop starts, so it can
        # only carry the epoch number (a loss value here would never refresh).
        step_it = enumerate(track(train_loader, description=f"epoch {epoch}"))
        for step, (x, y) in step_it:
            # forward
            x = x.cuda()
            y = y.cuda()
            batch_size = len(x)
            nb_samples += batch_size
            optimizer.zero_grad()
            with autocast(device_type="cuda"):
                y_pred = model(x)
            # Reduce the loss in float32: the autocast output may be
            # half precision, which is too coarse for a mean over a batch.
            loss_value = (y_pred.float() - y.float()).abs().mean()
            # backward
            scaler.scale(loss_value).backward()
            # scaler.unscale_(optimizer)
            # optional grad clipping ?
            scaler.step(optimizer)
            scaler.update()
            # NOTE(review): the scheduler is not stepped on the first batch of
            # each epoch -- presumably to avoid stepping it before the first
            # (possibly skipped) optimizer step; confirm this is intended.
            if step:
                scheduler.step()
            # metrics
            # Read the loss back from the GPU once; every .item() call syncs.
            batch_loss = loss_value.item()
            weighted_loss_sum += batch_loss * batch_size
            metrics.append({
                "step": step,
                "epoch": epoch,
                "batch_train_loss": batch_loss,
                # Recorded after scheduler.step(), i.e. the lr of the next step.
                **optimizer.state_dict()["param_groups"][-1],
            })

        if len(metrics) == nb_records_before:
            # Empty loader: there is no row to attach an epoch loss to.
            continue
        # Weight by batch size so a smaller final batch does not bias the mean.
        metrics[-1]["train_epoch_loss"] = weighted_loss_sum / nb_samples
        # metrics[-1].update(evaluation_func())
        print(metrics[-1]["train_epoch_loss"])

    return DF.from_records(metrics)

In [21]:
from copy import deepcopy

# Short trial run on a copy of the model, so `model` itself keeps its current
# weights. The exponential scheduler uses gamma > 1, so the learning rate
# grows by 1% on every scheduler step.
MOCK_TRAINING_EPOCHS = 3
mock_model = deepcopy(model)
mock_optim = mk_optimizer(mock_model.parameters())
mock_training_metrics = fit(
    MOCK_TRAINING_EPOCHS,
    mock_model,
    ExponentialLR(mock_optim, gamma=1.01),
    mock_optim,
    train_loader,
)
mock_training_metrics

Output()

Output()

0.30282894086234174


Output()

0.6666890204707279


nan


Unnamed: 0,step,epoch,batch_train_loss,lr,betas,eps,weight_decay,amsgrad,foreach,maximize,capturable,differentiable,fused,initial_lr,params,train_epoch_loss
0,0,0,1.173828,0.001000,"(0.9, 0.999)",1.000000e-08,0.01,False,,False,False,False,,0.001,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",
1,1,0,1.124023,0.001010,"(0.9, 0.999)",1.000000e-08,0.01,False,,False,False,False,,0.001,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",
2,2,0,0.815430,0.001020,"(0.9, 0.999)",1.000000e-08,0.01,False,,False,False,False,,0.001,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",
3,3,0,0.703125,0.001030,"(0.9, 0.999)",1.000000e-08,0.01,False,,False,False,False,,0.001,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",
4,4,0,0.749512,0.001041,"(0.9, 0.999)",1.000000e-08,0.01,False,,False,False,False,,0.001,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1180,390,2,,123.190811,"(0.9, 0.999)",1.000000e-08,0.01,False,,False,False,False,,0.001,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",
1181,391,2,,124.422720,"(0.9, 0.999)",1.000000e-08,0.01,False,,False,False,False,,0.001,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",
1182,392,2,,125.666947,"(0.9, 0.999)",1.000000e-08,0.01,False,,False,False,False,,0.001,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",
1183,393,2,,126.923616,"(0.9, 0.999)",1.000000e-08,0.01,False,,False,False,False,,0.001,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",


In [22]:
# Columns recorded by fit(): the per-step fields plus every entry of the
# optimizer's last param group.
mock_training_metrics.columns

Index(['step', 'epoch', 'batch_train_loss', 'lr', 'betas', 'eps',
       'weight_decay', 'amsgrad', 'foreach', 'maximize', 'capturable',
       'differentiable', 'fused', 'initial_lr', 'params', 'train_epoch_loss'],
      dtype='object')

In [23]:
# Number of non-null per-batch losses (NaN losses are not counted).
mock_training_metrics["batch_train_loss"].count()

np.int64(827)

In [24]:
# (rows, columns) of the metrics frame; fit() appends one row per training step.
mock_training_metrics.shape

(1185, 16)

In [25]:
# Distribution of the per-batch training loss, with every point overlaid.
px.violin(
    data_frame=mock_training_metrics,
    y="batch_train_loss",
    points="all",
)

In [None]:
# NOTE(review): exploratory lookup of plotly's trendline functions; this cell
# has no execution count -- consider removing it from the final notebook.
px.trendline_functions

In [32]:
# Per-batch loss against the recorded learning rate on log-log axes, smoothed
# with a 50-point rolling trendline.
px.scatter(
    data_frame=mock_training_metrics,
    x="lr",
    y="batch_train_loss",
    log_x=True,
    log_y=True,
    trendline="rolling",
    trendline_options=dict(window=50),
)

In [27]:
# Visual check on one training batch: targets, predictions and their absolute
# difference for IMGS_TO_SHOW samples starting at index OFFSET.
OFFSET = 50
IMGS_TO_SHOW = 10

x, y = next(iter(train_loader))
# Fail loudly instead of silently plotting a truncated (or empty) slice.
assert len(x) >= OFFSET + IMGS_TO_SHOW, "batch too small for OFFSET + IMGS_TO_SHOW"

# Keep channel 0 of the selected samples.
y_true_to_display = (
    y
    [OFFSET:OFFSET+IMGS_TO_SHOW, 0, ...]
    .cpu()
    .numpy()
)

# NOTE(review): this uses `model`, which fit() above did not train (it trained
# the deepcopy `mock_model`) -- confirm this is the model meant to be inspected.
# Inference only: use eval mode and skip autograd bookkeeping.
model.eval()
with torch.no_grad(), autocast("cuda"):
    y_pred = model(x.cuda())
# Cast to float32 before leaving torch: the autocast output may be half
# precision, which is too coarse for the error map below.
y_pred_to_display = (
    y_pred
    [OFFSET:OFFSET+IMGS_TO_SHOW, 0, ...]
    .float()
    .cpu()
    .numpy()
)

# Three groups of IMGS_TO_SHOW facets, in this order: targets, predictions,
# absolute error. facet_col_wrap puts each group on its own row.
px.imshow(
    np.concatenate((
        y_true_to_display,
        y_pred_to_display,
        np.abs(y_pred_to_display - y_true_to_display), # Absolute loss
    )),
    facet_col=0,
    facet_col_wrap=IMGS_TO_SHOW,
    color_continuous_scale="Rainbow",
    height=600,
)