# Training Playground

In [1]:
import torch
import torch.nn as nn
import utils
import neptune.new as neptune

from constants import *
from torch.utils.data import DataLoader
from torchmetrics import Accuracy, F1
from messenger_dataset import ClassificationDataset, PredictionDataset
from models import BaseNet

In [2]:
run = neptune.init(
    project="nelorth/freddie",
    api_token="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiIwYWE3NzA2NS0yMTMwLTQ4YzMtYmYzYy0zYjEyNmVmNTBjMGMifQ==",
)

https://app.neptune.ai/nelorth/Freddie/e/FREDDIE-6
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.


Hyperparameters

In [3]:
MODEL_NAME = "baseline"
EPOCHS = 10
BATCH_SIZE = 4096
LEARNING_RATE = 1e-3
NUM_WORKERS = 32  # 4 per GPU seems to be a rule of thumb
DROPOUT = 0.2

SEED = 42
EVAL_SPLIT = 0.2
WINDOW_SIZE = 10

Setup device

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
parallelized = device != "cpu" and torch.cuda.device_count() > 1
print(f"Device: {device}", f"#GPUs: {torch.cuda.device_count()}", sep="\n")

Device: cuda
#GPUs: 8


Ensure reproducability

In [5]:
utils.apply_global_seed(SEED)

Load training data

In [6]:
ds = ClassificationDataset(utils.resolve_path(DATA_DIR, TRAIN_FILE), window_size=WINDOW_SIZE, partial=True)
train_ds, test_ds = ds.split(EVAL_SPLIT)

In [7]:
print(len(train_ds))
print(len(test_ds))

825663
173824


In [8]:
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, pin_memory=True)
test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, pin_memory=True)

Prepare model

In [9]:
model = BaseNet(num_bands=2)
if parallelized:
    model = nn.DataParallel(model)
print(model)
model.to(device, non_blocking=True)

DataParallel(
  (module): BaseNet(
    (flatten): Flatten(start_dim=-2, end_dim=-1)
    (linear1): Linear(in_features=6, out_features=16, bias=True)
    (relu): ReLU()
    (linear2): Linear(in_features=16, out_features=5, bias=True)
  )
)


DataParallel(
  (module): BaseNet(
    (flatten): Flatten(start_dim=-2, end_dim=-1)
    (linear1): Linear(in_features=6, out_features=16, bias=True)
    (relu): ReLU()
    (linear2): Linear(in_features=16, out_features=5, bias=True)
  )
)

Count model parameters

In [10]:
utils.count_trainable_parameters(model)

197

Verify the model resides on GPU

In [11]:
utils.is_model_on_gpu(model)

True

View a sample

In [12]:
sample, label = ds[16604]
sample = sample.to(device, non_blocking=True)
with torch.no_grad():
    out = model(torch.stack([sample]))
print(out)
print(out.shape)

tensor([[[-0.0026,  0.0324,  0.0293,  0.0196,  0.0189, -0.0191, -0.0354,
          -0.0241,  0.0010,  0.0006],
         [ 0.2105,  0.2236,  0.2518,  0.2659,  0.2673,  0.2684,  0.2389,
           0.2026,  0.2001,  0.2092],
         [ 0.1028,  0.1042,  0.1030,  0.1067,  0.1336,  0.1417,  0.1097,
           0.1011,  0.1006,  0.1045],
         [ 0.0757,  0.0679,  0.0654,  0.0688,  0.0673,  0.0743,  0.0897,
           0.0832,  0.0770,  0.0754],
         [ 0.1413,  0.1204,  0.1199,  0.1302,  0.1507,  0.1756,  0.1694,
           0.1552,  0.1371,  0.1417]]], device='cuda:0')
torch.Size([1, 5, 10])


Define optimization evironment

In [13]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
accuracy = Accuracy(num_classes=len(CLASSES)).to(device, non_blocking=True)
f1_macro = F1(num_classes=len(CLASSES), average="macro", mdmc_average="global").to(device, non_blocking=True)

In [14]:
params = {
    "batch_size": BATCH_SIZE,
    "dropout": DROPOUT,
    "learning_rate": LEARNING_RATE,
    "optimizer": type(optimizer).__name__,
    "criterion": type(criterion).__name__
}
run["params"] = params

Training loop

In [15]:
def save_model(model, filename):
    module = model.module if parallelized else model
    path = utils.resolve_path(MODELS_DIR, MODEL_NAME, filename + ".pth")
    torch.save(module.state_dict(), path)
    return path

In [16]:
model.train()
console_log = lambda loss, acc, f1: f"loss: {loss:.8f} | acc: {acc:.8f} | f1: {f1:.8f} [{batch + 1}/{size}]"

for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}\n" + 16 * "-")
    size = len(train_dl)

    for batch, (X, y) in enumerate(train_dl):
        # move tensors to device
        X = X.to(device, non_blocking=True)
        y = y.to(device, non_blocking=True)
  
        # forward propagation
        pred = model(X)
        
        # metric calculation
        loss = criterion(pred, y)
        accuracy(pred, y)
        f1_macro(pred, y)

        # backward propagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # intermediate logging
        if (batch + 1) % 10 == 0:
            loss = loss.item()
            acc = accuracy.compute()
            f1 = f1_macro.compute()
            print(console_log(loss, acc, f1), end="\r")

    # calculate metrics and log them 
    loss = loss.item()
    acc = accuracy.compute()
    
    run["train/loss"].log(loss)
    run["train/accuracy"].log(acc)
    run["train/f1_macro"].log(f1)
    
    print(console_log(loss, acc, f1), end="\n\n")
    
    # save model checkpoint
    path = save_model(model, f"epoch_{epoch:02d}")
    run[f"model_weights/epoch{epoch:02d}"].upload(path)
    
print("DONE.")

Epoch 1/10
----------------
loss: 0.96698725 | acc: 0.45311597 | f1: 0.45154366 [201/201]

Epoch 2/10
----------------
loss: 0.25658378 | acc: 0.61422706 | f1: 0.61367327 [201/201]

Epoch 3/10
----------------
loss: 0.06480440 | acc: 0.68827516 | f1: 0.68797708 [201/201]

Epoch 4/10
----------------
loss: 0.02657936 | acc: 0.73196423 | f1: 0.73177189 [201/201]

Epoch 5/10
----------------
loss: 0.01521456 | acc: 0.76125270 | f1: 0.76111567 [201/201]

Epoch 6/10
----------------
loss: 0.01051522 | acc: 0.78182513 | f1: 0.78172082 [201/201]

Epoch 7/10
----------------
loss: 0.00812231 | acc: 0.79728860 | f1: 0.79720551 [201/201]

Epoch 8/10
----------------
loss: 0.00671726 | acc: 0.80953068 | f1: 0.80946249 [201/201]

Epoch 9/10
----------------
loss: 0.00582639 | acc: 0.81947660 | f1: 0.81941897 [201/201]

Epoch 10/10
----------------
loss: 0.00523459 | acc: 0.82763577 | f1: 0.82758629 [201/201]

DONE.


In [17]:
path = save_model(model, MODEL_NAME + f"{epoch:02d}")
run[f"checkpoints/epoch{epoch:02d}"].upload(path)

Evaluate the model

Save the model

In [18]:
model.eval()
accuracy.reset()
f1_macro.reset()

with torch.no_grad():
    for X, y in test_dl:
        X = X.to(device, non_blocking=True)
        y = y.to(device, non_blocking=True)
        accuracy(model(X), y)
        f1_macro(model(X), y)

    # calculate metrics
    acc = accuracy.compute()
    f1 = f1_macro.compute()
    
    # log metrics to Neptune
    run["eval/accuracy"] = acc
    run["eval/f1_macro"] = f1
    
    # log metrics to console
    print(f"accuracy: {acc}")
    print(f"f1_macro: {f1}")

accuracy: 0.8639888763427734
f1_macro: 0.8639888763427734


In [19]:
run.stop()

Shutting down background jobs, please wait a moment...
Done!


Waiting for the remaining 7 operations to synchronize with Neptune. Do not kill this process.


All 7 operations synced, thanks for waiting!
