## p1. 拆解"缓存准备.py"文件

In [1]:
import sys

sys.path.append("../../src/")

In [2]:
import argparse

import numpy as np

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.optim import SGD
from torch.utils.data import DataLoader

from util.util import enumerateWithEstimate
from util.logconf import logging

from p2ch11.dsets import LunaDataset
from p2ch11.model import LunaModel

In [3]:
log = logging.getLogger(__name__)
# log.setLevel(logging.WARN)
log.setLevel(logging.INFO)
# log.setLevel(logging.DEBUG)

注意需要修改p2ch11中dsets.py中的路径问题

In [4]:
batch_size = 128
num_workers = 8

prep_dl = DataLoader(
    LunaDataset(
        sortby_str="series_uid",
    ),
    batch_size=batch_size,
    num_workers=num_workers,
)

2024-06-28 17:43:13,535 INFO     pid:28724 p2ch11.dsets:197:__init__ <p2ch11.dsets.LunaDataset object at 0x0000020CC2292C90>: 110143 training samples


In [5]:
import datetime

In [6]:
# print(f"Started at {datetime.datetime.now()}")
# batch_iter = enumerateWithEstimate(
#     prep_dl,
#     "Stuffing cache",
#     start_ndx=prep_dl.num_workers,
# )
# for _ in batch_iter:
#     pass
# print(f"Finished at {datetime.datetime.now()}")

## p2. 拆解train.py文件

In [7]:
num_workers = 8
batch_size = 32
epochs = 1
tb_prefix = "p2ch11"
comment = "dlwpt"

time_str = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
trn_writer = None
val_writer = None
total_training_samples = 0

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

device

device(type='cuda')

### 1. 创建模型

In [8]:
def initModel():
    model = LunaModel()
    if use_cuda:
        log.info(f"Using CUDA; {torch.cuda.device_count()} devices.")
        if torch.cuda.device_count() > 1:
            model = nn.DataParallel(model)
        model = model.to(device)
    return model

In [9]:
def initOptimizer():
    return SGD(model.parameters(), lr=0.001, momentum=0.99)
    # return Adam(model.parameters())

In [13]:
def initTrainDl(batch_size=64):
    train_ds = LunaDataset(
        val_stride=10,
        isValSet_bool=False,
    )

    if use_cuda:
        batch_size *= torch.cuda.device_count()

    train_dl = DataLoader(
        train_ds,
        batch_size=batch_size,
        num_workers=num_workers,
        pin_memory=use_cuda,
    )

    return train_dl

In [14]:
def initValDl(batch_size=64):
    val_ds = LunaDataset(
        val_stride=10,
        isValSet_bool=True,
    )

    if use_cuda:
        batch_size *= torch.cuda.device_count()

    val_dl = DataLoader(
        val_ds,
        batch_size=batch_size,
        num_workers=num_workers,
        pin_memory=use_cuda,
    )

    return val_dl

In [15]:
model = initModel()
optimizer = initOptimizer()
train_dl = initTrainDl()
val_dl = initValDl()

2024-06-28 18:04:28,279 INFO     pid:28724 __main__:004:initModel Using CUDA; 1 devices.
2024-06-28 18:04:28,337 INFO     pid:28724 p2ch11.dsets:197:__init__ <p2ch11.dsets.LunaDataset object at 0x0000020CC046E480>: 99128 training samples
2024-06-28 18:04:28,344 INFO     pid:28724 p2ch11.dsets:197:__init__ <p2ch11.dsets.LunaDataset object at 0x0000020CCD423E30>: 11015 validation samples


### 2.设计训练方法

In [20]:
log = logging.getLogger(__name__)
# log.setLevel(logging.WARN)
log.setLevel(logging.INFO)
log.setLevel(logging.DEBUG)

# Used for computeBatchLoss and logMetrics to index into metrics_t/metrics_a
METRICS_LABEL_NDX = 0
METRICS_PRED_NDX = 1
METRICS_LOSS_NDX = 2
METRICS_SIZE = 3

In [16]:
def computeBatchLoss(batch_ndx, batch_tup, batch_size, metrics_g):
    input_t, label_t, _series_list, _center_list = batch_tup

    input_g = input_t.to(device, non_blocking=True)
    label_g = label_t.to(device, non_blocking=True)

    logits_g, probability_g = model(input_g)

    loss_func = nn.CrossEntropyLoss(reduction="none")
    loss_g = loss_func(
        logits_g,
        label_g[:, 1],
    )
    start_ndx = batch_ndx * batch_size
    end_ndx = start_ndx + label_t.size(0)

    metrics_g[METRICS_LABEL_NDX, start_ndx:end_ndx] = label_g[:, 1].detach()
    metrics_g[METRICS_PRED_NDX, start_ndx:end_ndx] = probability_g[:, 1].detach()
    metrics_g[METRICS_LOSS_NDX, start_ndx:end_ndx] = loss_g.detach()

    return loss_g.mean()

In [17]:
def doTraining(epoch_ndx, train_dl, totalTrainingSamples_count=0):
    model.train()
    trnMetrics_g = torch.zeros(
        METRICS_SIZE,
        len(train_dl.dataset),
        device=device,
    )

    batch_iter = enumerateWithEstimate(
        train_dl,
        "E{} Training".format(epoch_ndx),
        start_ndx=train_dl.num_workers,
    )
    for batch_ndx, batch_tup in batch_iter:
        optimizer.zero_grad()

        loss_var = computeBatchLoss(
            batch_ndx, batch_tup, train_dl.batch_size, trnMetrics_g
        )

        loss_var.backward()
        optimizer.step()

        # # This is for adding the model graph to TensorBoard.
        # if epoch_ndx == 1 and batch_ndx == 0:
        #     with torch.no_grad():
        #         model = LunaModel()
        #         trn_writer.add_graph(model, batch_tup[0], verbose=True)
        #         trn_writer.close()

    totalTrainingSamples_count += len(train_dl.dataset)

    return trnMetrics_g.to("cpu")

In [18]:
def doValidation(epoch_ndx, val_dl):
    with torch.no_grad():
        model.eval()
        valMetrics_g = torch.zeros(
            METRICS_SIZE,
            len(val_dl.dataset),
            device=device,
        )

        batch_iter = enumerateWithEstimate(
            val_dl,
            "E{} Validation ".format(epoch_ndx),
            start_ndx=val_dl.num_workers,
        )
        for batch_ndx, batch_tup in batch_iter:
            computeBatchLoss(batch_ndx, batch_tup, val_dl.batch_size, valMetrics_g)

    return valMetrics_g.to("cpu")

In [21]:
for epoch_ndx in range(1, epochs + 1):

    log.info(
        "Epoch {} of {}, {}/{} batches of size {}*{}".format(
            epoch_ndx,
            epochs,
            len(train_dl),
            len(val_dl),
            batch_size,
            (torch.cuda.device_count() if use_cuda else 1),
        )
    )

    trnMetrics_t = doTraining(epoch_ndx, train_dl)
    # logMetrics(epoch_ndx, 'trn', trnMetrics_t)

    valMetrics_t = doValidation(epoch_ndx, val_dl)
    # logMetrics(epoch_ndx, 'val', valMetrics_t)

2024-06-28 18:10:57,106 INFO     pid:28724 __main__:003:<module> Epoch 1 of 1, 1549/173 batches of size 32*1
2024-06-28 18:11:41,463 INFO     pid:28724 util.util:236:enumerateWithEstimate E1 Training   64/1549, done at 2024-06-28 18:15:32, 0:03:59
2024-06-28 18:12:10,971 INFO     pid:28724 util.util:236:enumerateWithEstimate E1 Training  256/1549, done at 2024-06-28 18:15:30, 0:03:57
2024-06-28 18:14:11,504 INFO     pid:28724 util.util:236:enumerateWithEstimate E1 Training 1024/1549, done at 2024-06-28 18:15:33, 0:04:00
2024-06-28 18:16:07,871 INFO     pid:28724 util.util:236:enumerateWithEstimate E1 Validation    64/173, done at 2024-06-28 18:16:11, 0:00:05
