In [1]:
!pip install wandb
!wandb login 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
# ~/.netrc

[34m[1mwandb[0m: [wandb.login()] Using explicit session credentials for https://api.wandb.ai.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/jiawei-wang/.netrc
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


```
import wandb

NUM_EPOCHS = 100
BATCH_SIZE = 1000
NUM_TOKENS = 10
LR = 1e-5
KL_FACTOR = 6000
WANDB = False

if WANDB:
    run = wandb.init(
        project="tinycatstories",
        config={
            "epochs": NUM_EPOCHS,
            "batch_size": BATCH_SIZE,
            "num_tokens": NUM_TOKENS,
            "learning_rate": LR,
            "kl_factor": KL_FACTOR,
        },
    )
```

## basics

In [None]:
import wandb
print(wandb.__file__)

In [2]:
import argparse
import random # to set the python random seed
import numpy # to set the numpy random seed
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torchvision import models
from datetime import datetime
# Ignore excessive warnings
import logging
logging.propagate = False 
logging.getLogger().setLevel(logging.ERROR)

# WandB – Import the wandb library
import wandb

## summary

```
!pip install wandb
wandb login
# api_key
~/.netrc
```

- WandB: weights & biases

```
wandb.init(project="wandb-demo-0423")
# 字典（dict）
config = wandb.config
config[k] = v

# 实例化模型
model = Net().to(device)
train_dataset
test_dataset
train_dataloader
test_dataloader

# 监控模型，histogram weights and biases
wandb.watch(model, log="all")


for epoch in range(n_epochs):
    train_loss, train_acc = train(model, train_dataloader)
    # 字典的形式
    wandb.log({"train_loss": train_loss, "train_acc": train_acc})
    # 评估，不进行参数的更新
    test_loss, test_acc = test(model, test_dataloader)
    wandb.log({"test_loss": test_loss, "test_acc": train_acc})
```

## model, train & test

In [3]:
def train(train_dataloader, model, criterion, optimizer, device):
    total_loss = 0
    total_correct = 0
    total_batch = len(train_dataloader)
    for batch_idx, (images, labels) in enumerate(train_dataloader):
        images = images.to(device)
        labels = labels.to(device)

        # forward
        out = model(images)
        loss = criterion(out, labels)

        # 标准的处理，用 validate data；这个过程是监督训练过程，用于 early stop
        n_corrects = (out.argmax(axis=1) == labels).sum().item()
        acc = n_corrects/labels.size(0)

        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()   # 更细 模型参数
        
        total_loss += loss.item()
        total_correct += n_corrects
        
        if (batch_idx+1) % 200 == 0:
            print(f'{datetime.now()}, {batch_idx+1}/{total_batch}: {loss.item():.4f}, acc: {acc}')
    total_errors = len(train_dataloader.dataset) - total_correct
    return total_loss, total_correct/len(train_dataloader.dataset), total_errors

In [22]:
import numpy as np
def test(test_dataloader, model, criterion, device, classes):
    total_loss = 0
    total_correct = 0
    example_images = []
    model.eval()
    mean = torch.tensor([0.4914, 0.4822, 0.4465]).view(3, 1, 1).to(device)
    std = torch.tensor([0.2023, 0.1994, 0.2010]).view(3, 1, 1).to(device)
    for images, labels in test_dataloader:
        images = images.to(device)
        labels = labels.to(device)
        out = model(images)
        loss = criterion(out, labels)
        total_loss += loss.item()
        preds = torch.argmax(out, dim=1)
        total_correct += (preds == labels).sum().item()
        
        mis_preds_indice = torch.flatten((preds != labels).nonzero())
        mis_preds = preds[mis_preds_indice]
        mis_labels = labels[mis_preds_indice]
        mis_images = images[mis_preds_indice]
        
        # 13*8 + 4 == 108
        for idx in range(len(mis_preds)):
            if len(example_images) < 32: 
                        
                        # --- 关键修改：反归一化 ---
                        img = images[idx] * std + mean # 变回 [0, 1] 范围
                        img = img.clamp(0, 1)          # 强制截断，防止溢出导致的噪点
                        
                        # --- 关键修改：维度转换 (C, H, W) -> (H, W, C) ---
                        img_np = img.cpu().numpy().transpose(1, 2, 0)
                        
                        # 3. 转换为 uint8 [0, 255]（这是最保险的格式）
                        img_uint8 = (img_np * 255).astype(np.uint8)
                        
                        example_images.append(wandb.Image(
                            img_uint8, 
                            caption=f"Pred: {classes[preds[idx]]}, Truth: {classes[labels[idx]]}"
                        ))
            else:
                break
    total_errors = len(test_loader.dataset) - total_correct
    return example_images, total_loss, total_correct / len(test_loader.dataset), total_errors


## wandb config & dataset

In [6]:
import os
#os.environ["WANDB_API_KEY"] = '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'
os.environ["WANDB_MODE"] = "online"

# WandB – Initialize a new run
# 一个 project 可以 run 多次
wandb.init(project="Lab-DL")
wandb.watch_called = False # Re-run the model without restarting the runtime, unnecessary after our next release

[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /home/jiawei-wang/.netrc.
[34m[1mwandb[0m: Currently logged in as: [33mjw4807[0m ([33mlione-wang[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [7]:
# empty dict
wandb.config

{}

In [8]:
# WandB – Config is a variable that holds and saves hyperparameters and inputs
config = wandb.config          # Initialize config
config.batch_size = 64          # input batch size for training (default: 64)
config.test_batch_size = 32    # input batch size for testing (default: 1000)
config.epochs = 30             # number of epochs to train (default: 10)
config.lr = 1e-3              # learning rate (default: 0.01)
config.momentum = 0.9         # SGD momentum (default: 0.5) 
config.weight_decay = 5e-4
config.no_cuda = False         # disables CUDA training
config.seed = 42               # random seed (default: 42)
config.log_interval = 10     # how many batches to wait before logging training status

In [9]:
use_cuda = not config.no_cuda and torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(device)
kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
print(kwargs)

cuda
{'num_workers': 1, 'pin_memory': True}


In [10]:
model = models.resnet18(pretrained=False)
in_features = model.fc.in_features
model.fc = nn.Linear(in_features, 10)
model = model.to(device)



In [11]:
transform = transforms.Compose([
    transforms.Resize(size=(224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.4914, 0.4822, 0.4465), std=(0.2023, 0.1994, 0.2010))
])
    
# Now we load our training and test datasets and apply the transformations defined above
train_dataset = datasets.CIFAR10(root='./data', train=True,
                                 download=True, transform=transform)
test_dataset = datasets.CIFAR10(root='./data', train=False,
                                download=True, transform=transform)

train_loader = torch.utils.data.DataLoader(train_dataset, 
                                           batch_size=config.batch_size,
                                           shuffle=True, 
                                           **kwargs)
test_loader = torch.utils.data.DataLoader(test_dataset, 
                                          batch_size=config.test_batch_size,
                                          shuffle=False, 
                                          **kwargs)

classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

In [12]:
print(len(train_dataset))
print(len(train_dataset)//config.batch_size)
print(len(train_loader))

50000
781
782


In [13]:
print(len(test_dataset))
print(len(test_dataset)//config.test_batch_size)
print(len(test_loader))

10000
312
313


## training pipeline

In [23]:

# Set random seeds and deterministic pytorch for reproducibility
# random.seed(config.seed)      
torch.manual_seed(config.seed) 
# numpy.random.seed(config.seed)
torch.backends.cudnn.deterministic = True

# Load the dataset: We're training our CNN on CIFAR10 (https://www.cs.toronto.edu/~kriz/cifar.html)
optimizer = torch.optim.SGD(model.parameters(), lr=config.lr, momentum=config.momentum, weight_decay=config.weight_decay)
criterion = nn.CrossEntropyLoss()

# WandB – wandb.watch() automatically fetches all layer dimensions, gradients, model parameters and logs them automatically to your dashboard.
# Using log="all" log histograms of parameter values in addition to gradients
wandb.watch(model, log="all")

for epoch in range(1, config.epochs + 1):
    train_loss, train_acc, train_errors = train(train_loader, model, criterion, optimizer, device)
    #字典的形式放在wandb的log里
    wandb.log({"train_loss": train_loss, "train_acc": train_acc, "train_errors": train_errors})
    example_images, test_loss, test_acc, test_errors = test(test_loader, model, criterion, device, classes)
    wandb.log({'example_images': example_images, 'test_loss': test_loss, 'test_acc': test_acc, 'test_errors': test_errors})
    print()
    print(f'{datetime.now()}, epoch: {epoch}, train_loss: {train_loss:.4f}, train_acc: {train_acc:.2f}, test_loss: {test_loss:.4f}, test_acc: {test_acc:.2f}')
    print()


2026-01-29 13:13:14.132820, 200/782: 1.0483, acc: 0.6875
2026-01-29 13:13:23.292310, 400/782: 1.1377, acc: 0.625
2026-01-29 13:13:32.437868, 600/782: 1.1427, acc: 0.5625

2026-01-29 13:13:48.737307, epoch: 1, train_loss: 843.1986, train_acc: 0.61, test_loss: 329.1142, test_acc: 0.62

2026-01-29 13:13:58.131501, 200/782: 0.9349, acc: 0.640625
2026-01-29 13:14:07.450962, 400/782: 1.3221, acc: 0.515625
2026-01-29 13:14:16.778518, 600/782: 0.9524, acc: 0.703125

2026-01-29 13:14:32.843156, epoch: 2, train_loss: 769.8788, train_acc: 0.65, test_loss: 314.0742, test_acc: 0.65

2026-01-29 13:14:42.141791, 200/782: 0.9358, acc: 0.6875
2026-01-29 13:14:51.352657, 400/782: 0.8662, acc: 0.65625
2026-01-29 13:15:00.556041, 600/782: 0.8683, acc: 0.65625

2026-01-29 13:15:16.633403, epoch: 3, train_loss: 691.4147, train_acc: 0.68, test_loss: 295.2364, test_acc: 0.66

2026-01-29 13:15:25.964212, 200/782: 0.8586, acc: 0.640625
2026-01-29 13:15:35.170838, 400/782: 0.5662, acc: 0.765625
2026-01-29 13:15:

In [24]:
# WandB – Save the model checkpoint. This automatically saves a file to the cloud and associates it with the current run.
torch.save(model.state_dict(), "model.ckpt")
wandb.save('model.ckpt')



['/home/jiawei-wang/workspace/Lab/DL/wandb/run-20260129_130650-z7mn4jfu/files/model.ckpt']