## summary

In [1]:
import gc
import importlib.util
import sys
import time

import matplotlib.pyplot as plt

import albumentations as albu
import torch.cuda.nccl
import wandb
import albumentations.pytorch as albuPy
# import ToTensorV2
from torch.optim import lr_scheduler as LRS

sys.path.append("..")

import model.cv_metric as module_metric
import model.loss as module_loss

from logger.Loggers import *
from model.model import VesuviusModel as CustomModel
from utils import *
from utils.env_detect import *


  warn(f"Failed to load image Python extension: {e}")


## Configs
### Env detection

In [2]:


Logger = get_logger(__name__)


### Auto settings

In [3]:


# ============== augmentation =============
def get_aug_list(size, in_channels, mode='train'):
    """
    type: train, valid
    return: list of albumentations

    in case of any further modification,
    one should use albu.Compose by themselves
    """

    train_aug_list = [
        # A.RandomResizedCrop(
        #     size, size, scale=(0.85, 1.0)),
        albu.Resize(size, size),
        albu.HorizontalFlip(p=0.5),
        albu.VerticalFlip(p=0.5),
        albu.RandomBrightnessContrast(p=0.75),
        albu.ShiftScaleRotate(p=0.75),
        albu.OneOf([
            albu.GaussNoise(var_limit=(10.0, 50.0)),
            albu.GaussianBlur(),
            albu.MotionBlur(),
        ], p=0.4),
        albu.GridDistortion(num_steps=5, distort_limit=0.3, p=0.5),
        albu.CoarseDropout(max_holes=1, max_width=int(size * 0.3), max_height=int(size * 0.3),
                           mask_fill_value=0, p=0.5),
        # A.Cutout(max_h_size=int(size * 0.6),
        #          max_w_size=int(size * 0.6), num_holes=1, p=1.0),

        albu.Normalize(
            mean=[0] * in_channels,
            std=[1] * in_channels,
            # max_pixel_value=1.0,
            always_apply=True,
        ),
        # pad is placed last
        # for not interfering with other aug,i.e.Normalize
        albu.PadIfNeeded(size, size,
                         position='top_left'),
        albuPy.ToTensorV2(transpose_mask=True),
    ]

    valid_aug_list = [
        albu.Resize(size, size),
        albu.Normalize(
            mean=[0] * in_channels,
            std=[1] * in_channels,
            # max_pixel_value=1.0,
            always_apply=True,
        ),
        albu.PadIfNeeded(size, size,
                         position='top_left'),
        albuPy.ToTensorV2(transpose_mask=True),
    ]

    if mode == 'train':
        return train_aug_list
    else:
        return valid_aug_list


In [4]:

def get_config():
    HOST, PATHS = decide_paths()
    _cfg = yaml.safe_load(open(f"{PATHS['ROOT_DIR']}/dev/default_config.yaml", "r"))

    util.seed_everything(_cfg['seed'])

    PATHS = {k: str(v) for k, v in PATHS.items()}
    _cfg["PATHS"] = PATHS

    _cfg['dataset']['data_dir'] = PATHS['DATA_DIR']
    _cfg['dataset']['cache_dir'] = PATHS['CACHE_DIR']

    with open("config.yaml", "w") as f:
        yaml.dump(_cfg, f)

    # update in running cfgs
    device = torch.device("cuda" if torch.cuda.is_available()
                          else "mps" if torch.backends.mps.is_available() else "cpu")
    _cfg["device"] = device

    _cfg["kaggle_run_type"] = os.getenv("KAGGLE_KERNEL_RUN_TYPE")
    _cfg['HOST'] = HOST

    _cfg['data_loader']['num_workers'] = max(
        _cfg['data_loader']['num_workers'], os.cpu_count() - 2)
    # model id consist of settings of model,
    # exp_id add on the settings for dataset
    # run_id add on time
    cfg_model = _cfg['model']
    _cfg['model_id'] = f"{cfg_model['Proto']}_{cfg_model['in_channels']}_{cfg_model['model_type']}"

    _cfg['model_id'] += f"_fold_{'_'.join(_cfg['dataset']['image_sets'])}"
    _cfg['exp_id'] = f"{_cfg['model_id']}_{cfg_model['tile_size']}"
    _cfg['run_id'] = f"{_cfg['exp_id']}_{time.strftime('%m%d_%H%M%S')}"

    if not os.path.exists(f"{PATHS['LOG_DIR']}/{_cfg['exp_id']}"):
        os.makedirs(f"{PATHS['LOG_DIR']}/{_cfg['exp_id']}")
    setup_logging(f"{PATHS['LOG_DIR']}/{_cfg['exp_id']}")

    import wandb
    Logger.info('wandb imported')

    t_resume = (_cfg['model']["resume_path"] is not None) and os.path.exists(
        get_saved_model_path(_cfg['PATHS']['CP_DIR'], _cfg['model_id']))
    t_mode = 'online' if _cfg["kaggle_run_type"] != "batch" and _cfg['wandb_mode'] == 'online' else 'offline'
    Logger.info(f'resuming: {t_resume}; wandb mode: {t_mode}')

    wandb.init(project=_cfg['comp_name'],
               name=_cfg['run_id'],
               config=_cfg,
               dir=_cfg['PATHS']['LOG_DIR'],
               resume=t_resume,
               tags=[
                   _cfg['model']['Proto'],
                   _cfg['model']['model_type'],
               ],
               notes='resumed',
               mode=t_mode)
    del t_resume, t_mode

    wandb.config['train_aug_list'] = albu.to_dict(albu.Compose(
        get_aug_list(cfg_model['tile_size'], cfg_model['in_channels'], mode='train')))
    wandb.config['valid_aug_list'] = albu.to_dict(albu.Compose(
        get_aug_list(cfg_model['tile_size'], cfg_model['in_channels'], mode='valid')))

    _cfg = wandb.config

    return _cfg

### Global cfg


In [5]:

g_cfg = get_config()
if g_cfg["device"] == "cpu":
    Logger.warning("No accelarater enabled, Start testing")
    testing = True
else:
    Logger.warning(f"accelarater: {g_cfg['device']}")
    testing = False

wandb imported
resuming: False; wandb mode: online


[34m[1mwandb[0m: Currently logged in as: [33mhernando[0m ([33mlzhen-ntu[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668611816567137, max=1.0…

accelarater: cuda


### External models

In [6]:

if not g_cfg.kaggle_run_type == "batch":
    # https://github.com/Cadene/pretrained-models.pytorch/issues/222
    import ssl

    ssl._create_default_https_context = ssl._create_unverified_context

if g_cfg.HOST == 'kaggle':
    EXTERNAL_MODELS_DIR = Path(g_cfg.PATHS.EXTERNAL_MODELS_DIR)
    sys.path.append(str(EXTERNAL_MODELS_DIR / "segmentation-models-pytorch" / "segmentation_models.pytorch-master"))
    sys.path.append(str(EXTERNAL_MODELS_DIR / "pretrainedmodels" / "pretrainedmodels-0.7.4"))
    sys.path.append(str(EXTERNAL_MODELS_DIR / "efficientnet-pytorch" / "EfficientNet-PyTorch-master"))
    del EXTERNAL_MODELS_DIR

    # noinspection PyUnresolvedReferences
    import segmentation_models_pytorch as smp

else:
    if importlib.util.find_spec("segmentation_models_pytorch") is None:
        # %pip install -y segmentation-models-pytorch
        pass
    # %%conda install -y -c conda-forge segmentation-models-pytorch

In [7]:
g_cfg["HOST"]

'Vincint'

## Helper functions

In [8]:
# ref.: https://www.kaggle.com/stainsby/fast-tested-rle
def rle(img):
    '''
    img: numpy array, 1 - mask, 0 - background
    Returns run length as string formated
    '''
    pixels = img.flatten()
    # pixels = (pixels >= thr).astype(np.uint8)

    pixels = np.concatenate([[0], pixels, [0]])
    runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
    runs[1::2] -= runs[::2]
    return ' '.join(str(x) for x in runs)

# def np_rle(img):
#     flat_img = np.where(img.flatten() , 1, 0).astype(np.uint8)
#     starts = np.array((flat_img[:-1] == 0) & (flat_img[1:] == 1))
#     ends = np.array((flat_img[:-1] == 1) & (flat_img[1:] == 0))
#     starts_ix = np.where(starts)[0] + 2
#     ends_ix = np.where(ends)[0] + 2
#     lengths = ends_ix - starts_ix
#     predicted_arr = np.stack([starts_ix, lengths]).T.flatten()
#     rle_str=np.array2string(predicted_arr.reshape(-1), separator=' ')
#     return rle_str[1:-1]

## Model

In [9]:
# class CustomModel(nn.Module):
#     def __init__(self, cfg):
#         super().__init__()
#         self.cfg = cfg
#         self.th = None
#         # if cfg["model_name"] == "Unet++":
#         #     model_Conductor=getattr(smp, "UnetPlusPlus")
#         # else:
#         #     model_Conductor=getattr(smp, cfg["model_name"])
#         # self.encoder = model_Conductor(
#         self.encoder = getattr(smp, cfg["Proto"])(**cfg['args'])
#
#     def forward(self, image):
#         output = self.encoder(image)
#         # output = output.squeeze(-1)
#         return output


def build_model(cfg, cp_dir=g_cfg['PATHS']['CP_DIR'],
                model_id=g_cfg['model_id'], weight=None):
    """
    DO notice that this needs a global config
    for paths and model_id
    """
    Logger.info(f"model_cfg: {cfg}")
    # if cfg['resume'] is not None:
    model_path = get_saved_model_path(cp_dir, model_id)
    if os.path.exists(model_path):
        Logger.info(f'load model from: {model_path}')
        _model = CustomModel(cfg)
        loaded_model = torch.load(model_path)
        # print(loaded_model)
        _model.load_state_dict(loaded_model['model'])
        # best_loss = loaded_model['best_loss']
        # best_loss = None if loaded_model['best_loss'] is None else loaded_model['best_loss']
        # best_loss = loaded_model['best_loss']
        th = loaded_model['th'] if 'th' in loaded_model else 0.5
        _model.th = th
        return _model
    Logger.info(f'trained model not found')

    # if cfg['HOST']=='kaggle':
    # #     weight = None
    _model = CustomModel(cfg)
    _model.th = 0.5
    return _model

## DataSet

In [10]:



def get_transforms(mode, cfg):
    return albu.Compose(get_aug_list(cfg['tile_size'], cfg['in_channels'], mode=mode))


from data_loader import datasets, data_loaders


available memory size is 25.53173065185547 GB


In [11]:
def make_dataset(cfg, trfs_mode='train'):
    imgset_paths = [f"train/{i}" for i in cfg['image_sets']]
    imgs = []
    masks = []
    labels = []
    for path in imgset_paths:
        imgs.append(f"{path}/surface_volume")
        masks.append(f"{path}/mask.png")
        labels.append(f"{path}/inklabels.png")

    dataset = getattr(datasets, cfg['type'])(
        image_sets=imgs,
        cfg=cfg,
        masks=masks,
        labels=labels,
        transform=get_transforms(mode=trfs_mode, cfg=cfg))
    return dataset


def make_data_loader(cfg, dataset):
    loader = getattr(data_loaders, cfg['type'])(
        dataset,
        **cfg['args'],
        pin_memory=True,
        drop_last=False)

    return loader

In [12]:
# dataset=make_dataset(g_cfg['dataset'])
# train_loader=make_data_loader(g_cfg['data_loader'],dataset)
# valid_lodaer=train_loader.split_validation()

## optimizer and Scheduler

In [13]:
def get_optimizer(cfg, model):
    _optimizer = getattr(torch.optim, cfg['type'])(
        model, **cfg['args'])
    return _optimizer


def get_scheduler(cfg, optimizer):
    _scheduler = getattr(LRS, cfg['type'])(
        optimizer, **cfg['args'])
    return _scheduler


def scheduler_step(_scheduler):
    _scheduler.step()

In [14]:
# model,loss =build_model(g_cfg['model'],model_id=g_cfg['model_id'],cp_dir=g_cfg['PATHS']['CP_DIR'])
# optimizer=get_optimizer(g_cfg['optimizer'],model)
# scheduler=get_scheduler(g_cfg['scheduler'],optimizer)

## Assessments

## LOSS

In [15]:
# testing = True

use trainer

In [16]:
if testing:
    importlib.reload(datasets)
    g_cfg['data_loader']['args']['num_workers'] = 0
    from trainer.VesuviusTrainer import VesuviusTrainer as Trainer

    # def main(config):
    """
    Take global config and train model
    """
    logger = get_logger('train')
    config = g_cfg
    # setup data_loader instances
    # trms=get_aug_list(config['model']['tile_size'],config['model']['in_channels'],'train')

    full_dataset = make_dataset(config['dataset'])
    data_loader = make_data_loader(config['data_loader'], full_dataset)

    # data_loader = config.init_obj('data_loader', module_data)

    valid_data_loader = data_loader.split_validation()

    # build model architecture, then print to console
    # model = config.init_obj('arch', module_arch)
    model = build_model(config['model'])
    # logger.info(model)

    # prepare for (multi-device) GPU training
    device, device_ids = prepare_device(config['n_gpu'])
    model = model.to(device)
    if len(device_ids) > 1:
        model = torch.nn.DataParallel(model, device_ids=device_ids)

    # get function handles of loss and metrics
    # criterion = module_loss.config['loss'] which should be a function
    criterion = getattr(module_loss, config['loss'])
    # metrics is module_metric.__dict__[config['metrics']] which should be a list of functions
    metrics = [getattr(module_metric, met) for met in config['metrics']]

    # build optimizer, learning rate scheduler. delete every lines containing lr_scheduler for disabling scheduler
    trainable_params = filter(lambda p: p.requires_grad, model.parameters())

    optimizer = get_optimizer(config['optimizer'], trainable_params)
    lr_scheduler = get_scheduler(config['scheduler'], optimizer)

    trainer = Trainer(model, criterion, metrics, optimizer,
                      config=config,
                      device=device,
                      data_loader=data_loader,
                      valid_data_loader=valid_data_loader,
                      lr_scheduler=lr_scheduler,
                      len_epoch=1000)


In [17]:
# testing_img_label={
#     "img":img,
#     "label":label
# }
# torch.save(testing_img_label,'testing_img_label.pt')

In [18]:
# testing_var = torch.load('testing_img_label.pt')
# img, label = testing_var['img'], testing_var['label']

In [19]:

if testing:
    importlib.reload(module_metric)
    metrics = [getattr(module_metric, met) for met in g_cfg['metrics']]
    #1195
    i = np.random.randint(0, len(data_loader.dataset))
    # i=1195
    print(f"i={i}")
    img, label = data_loader.dataset[i]

    fig, axs = plt.subplots(1, 2)
    axs[0].imshow(img.mean(0), cmap='Greys')
    axs[1].imshow(label)
    plt.show()
    model.eval()
    with torch.no_grad():
        out = model(img.unsqueeze(0).to(device))

    # print(img.max(), img.min(), img.mean())
    # print(out.max(), out.min(), out.mean())
    # print(label.max(), label.min(), label.mean())

    print(f"input max={img.max()}, min={img.min()}, mean={img.mean()}")
    print(f"out max={out.max()}, min={out.min()}, mean={out.mean()}")
    print(f"label max={label.max()}, min={label.min()}, mean={label.mean()}")

    sig_out = out.squeeze().sigmoid()
    print(f"sig_out max={sig_out.max()}, min={sig_out.min()}, mean={sig_out.mean()}")
    out, label = sig_out.cpu(), label.cpu()
    # plt.imshow(out.squeeze().cpu(),cmap='gray')
    # out, label = out.squeeze().sigmoid().cpu(), label.cpu()
    print(criterion(out, label))
    fbeta = None
    for m in metrics:
        if m.__name__ == 'fbeta':
            fbeta = m
        dice = m(out, label)
        print(f'{m.__name__}: {dice}')
        assert dice <=1
    fig, axs = plt.subplots(1, 3)
    axs[0].imshow(out)
    axs[1].imshow(out > 0.25)
    axs[2].imshow(label)

    plt.show()

In [20]:
if testing:
    fb = metrics[0]

    output = out
    target = label
    print(f'out max={out.max()}, min={out.min()}, mean={out.mean()}')
    print(f'output max={output.max()}, min={output.min()}, mean={output.mean()}')
    print(f'target max={target.max()}, min={target.min()}, mean={target.mean()}')

    smooth = 1e-7
    beta = 0.5
    fnorm = module_metric.normalize
    output = fnorm(output)
    target = fnorm(target)
    print(f"to max={output.max()}, min={output.min()}")
    print(f"tt max={target.max()}, min={target.min()}")
    # Calculate true positives, false positives, and false negatives
    true_positives = (output & target).sum().item()
    false_positives = (output & ~target).sum().item()
    true_negatives = (~output & ~target).sum().item()
    false_negatives = (~output & target).sum().item()
    sensitivity = true_positives / (true_positives + false_negatives + smooth)
    specificity = true_negatives / (true_negatives + false_positives + smooth)

    print(f"true_positives={true_positives}")
    print(f"false_positives={false_positives}")
    print(f"true_negatives={true_negatives}")
    print(f"false_negatives={false_negatives}")
    print(f"sensitivity={sensitivity}")
    print(f"specificity={specificity}")


    # Calculate precision and recall
    precision = true_positives / (true_positives + false_positives + smooth)
    recall = true_positives / (true_positives + false_negatives + smooth)

    # Calculate F-beta score
    f_beta = (1 + beta ** 2) * (precision * recall) / ((beta ** 2 * precision) + recall + smooth)



In [21]:
# torch.allclose(output, output)

In [22]:


from trainer.VesuviusTrainer import VesuviusTrainer as Trainer


def main(config):
    """
    Take global config and train model
    """
    # logger = get_logger('train')
    # config = g_cfg
    # setup data_loader instances
    # trms=get_aug_list(config['model']['tile_size'],config['model']['in_channels'],'train')

    full_dataset = make_dataset(config['dataset'])
    data_loader = make_data_loader(config['data_loader'], full_dataset)

    # data_loader = config.init_obj('data_loader', module_data)

    valid_data_loader = data_loader.split_validation()

    # build model architecture, then print to console
    # model = config.init_obj('arch', module_arch)
    model = build_model(config['model'])
    # logger.info(model)

    # prepare for (multi-device) GPU training
    device, device_ids = prepare_device(config['n_gpu'])
    model = model.to(device)
    if len(device_ids) > 1:
        model = torch.nn.DataParallel(model, device_ids=device_ids)

    # get function handles of loss and metrics
    # criterion = module_loss.config['loss'] which should be a function
    criterion = getattr(module_loss, config['loss'])
    # metrics is module_metric.__dict__[config['metrics']] which should be a list of functions
    metrics = [getattr(module_metric, met) for met in config['metrics']]

    # build optimizer, learning rate scheduler. delete every lines containing lr_scheduler for disabling scheduler
    trainable_params = filter(lambda p: p.requires_grad, model.parameters())

    optimizer = get_optimizer(config['optimizer'], trainable_params)
    lr_scheduler = get_scheduler(config['scheduler'], optimizer)

    trainer = Trainer(model, criterion, metrics, optimizer,
                      config=config,
                      device=device,
                      data_loader=data_loader,
                      valid_data_loader=valid_data_loader,
                      lr_scheduler=lr_scheduler,
                      len_epoch=1000)

    trainer.train()
    del trainer, model, optimizer, lr_scheduler, data_loader, valid_data_loader
    gc.collect()
    torch.cuda.empty_cache()


In [23]:
if not testing:
    # importlib.reload(datasets)
    # reload all
    for module in [datasets, module_metric, module_loss]:
        importlib.reload(module)
    main(g_cfg)

model_cfg: {'Proto': 'Unet', 'resume_path': None, 'pretrained': True, 'model_type': 'Multi', 'tile_size': 224, 'in_channels': 6, 'args': {'encoder_name': 'se_resnext50_32x4d', 'encoder_weights': 'imagenet', 'in_channels': 6, 'classes': 1, 'activation': None}}
load model from: /home/lz/Codes/Vesuvius/dev/../saved/checkpoints/Unet_6_Multi_fold_2_best.pth


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 1
    train/loss     : 0.9415499313175678
    train/fbeta    : 0.4405196011066437
    train/accuracy : 0.09060244937807588
    train/precision: 0.49099355604551037
    train/recall   : 0.3246543490525192
    train/roc_auc  : 0.6621432206272684
    valid/loss     : 0.8760857670203499
    valid/fbeta    : 0.4591566026210785
    valid/accuracy : 0.09593589261439754
    valid/precision: 0.519911524237322
    valid/recall   : 0.3293214505967711
    valid/roc_auc  : 0.664490213093875
Early stop count: 0


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 2
    train/loss     : 0.7191191166043281
    train/fbeta    : 0.47055795788764954
    train/accuracy : 0.11137259969892041
    train/precision: 0.4983655739066006
    train/recall   : 0.4016802649572922
    train/roc_auc  : 0.7006200133946643
    valid/loss     : 0.6369723006435063
    valid/fbeta    : 0.49956226348876953
    valid/accuracy : 0.12399727957573166
    valid/precision: 0.5316572842288009
    valid/recall   : 0.42873946537448304
    valid/roc_auc  : 0.714159388154533
Early stop count: 0


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

[34m[1mwandb[0m: Network error resolved after 0:00:11.377007, resuming normal operation.


    epoch          : 3
    train/loss     : 0.5920579836666584
    train/fbeta    : 0.473591685295105
    train/accuracy : 0.14253116230849577
    train/precision: 0.47036799694193393
    train/recall   : 0.5120753350417921
    train/roc_auc  : 0.7557207368414499
    valid/loss     : 0.5721952624942945
    valid/fbeta    : 0.48863524198532104
    valid/accuracy : 0.1609446054234822
    valid/precision: 0.4781474117934875
    valid/recall   : 0.5532615593436813
    valid/roc_auc  : 0.7762889268127907
Early stop count: 0


  0%|          | 0/1000 [00:00<?, ?it/s]

[34m[1mwandb[0m: Network error resolved after 0:00:00.336141, resuming normal operation.


  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 4
    train/loss     : 0.5431244969964028
    train/fbeta    : 0.4593495726585388
    train/accuracy : 0.1816784777580645
    train/precision: 0.4313422062470287
    train/recall   : 0.6490682136266296
    train/roc_auc  : 0.8240617501703177
    valid/loss     : 0.5325259812500166
    valid/fbeta    : 0.4725158214569092
    valid/accuracy : 0.19381538939430462
    valid/precision: 0.4425703958099605
    valid/recall   : 0.6689637304422491
    valid/roc_auc  : 0.8340118782340853
Early stop count: 1


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 5
    train/loss     : 0.5238362985849381
    train/fbeta    : 0.4502207934856415
    train/accuracy : 0.20021728216652337
    train/precision: 0.41459991556872283
    train/recall   : 0.716926805143984
    train/roc_auc  : 0.8579076783214251
    valid/loss     : 0.5200912776200668
    valid/fbeta    : 0.4725761413574219
    valid/accuracy : 0.20892806235068914
    valid/precision: 0.4376242868857129
    valid/recall   : 0.7177996897408095
    valid/roc_auc  : 0.8583770342914743
Early stop count: 2
Saving checkpoint: /home/lz/Codes/Vesuvius/dev/../saved/checkpoints/Unet_6_Multi_fold_2_checkpoints.pth ...


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 6
    train/loss     : 0.5145070714950561
    train/fbeta    : 0.45222750306129456
    train/accuracy : 0.20815378193732034
    train/precision: 0.4140114887329605
    train/recall   : 0.7459872263005601
    train/roc_auc  : 0.8724143121667177
    valid/loss     : 0.5070541887179665
    valid/fbeta    : 0.46845561265945435
    valid/accuracy : 0.21042990629479663
    valid/precision: 0.4462548194563575
    valid/recall   : 0.728436741728886
    valid/roc_auc  : 0.8636966570034967
Early stop count: 3


  0%|          | 0/1000 [00:00<?, ?it/s]

wandb: Network error (ReadTimeout), entering retry loop.


  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 7
    train/loss     : 0.5054103604257106
    train/fbeta    : 0.46199682354927063
    train/accuracy : 0.21351718725061022
    train/precision: 0.42309466679936797
    train/recall   : 0.7619371703741802
    train/roc_auc  : 0.8803958342457439
    valid/loss     : 0.49393302228139796
    valid/fbeta    : 0.4747372269630432
    valid/accuracy : 0.21600542178461152
    valid/precision: 0.4428877734524969
    valid/recall   : 0.7481447237436846
    valid/roc_auc  : 0.8735472409828414
Early stop count: 4


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 8
    train/loss     : 0.5006593522727489
    train/fbeta    : 0.46141740679740906
    train/accuracy : 0.2120157645086643
    train/precision: 0.4224802214659839
    train/recall   : 0.7619956588161998
    train/roc_auc  : 0.8804282393854681
    valid/loss     : 0.49674351241277614
    valid/fbeta    : 0.4853588342666626
    valid/accuracy : 0.21824478921125945
    valid/precision: 0.44812264534012547
    valid/recall   : 0.7498487786842786
    valid/roc_auc  : 0.8744047828135808
Early stop count: 5


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 9
    train/loss     : 0.49440337550640107
    train/fbeta    : 0.47251221537590027
    train/accuracy : 0.21539510149247693
    train/precision: 0.4334866074919312
    train/recall   : 0.7707523310901159
    train/roc_auc  : 0.8848227273141275
    valid/loss     : 0.4913627653018288
    valid/fbeta    : 0.49523237347602844
    valid/accuracy : 0.21364797001484567
    valid/precision: 0.4691229741225642
    valid/recall   : 0.7348396949223099
    valid/roc_auc  : 0.8669452490017864
Early stop count: 6


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 10
    train/loss     : 0.49238809365034103
    train/fbeta    : 0.4762728810310364
    train/accuracy : 0.2146411802950928
    train/precision: 0.4377188827905093
    train/recall   : 0.7676118438267966
    train/roc_auc  : 0.8832632133755417
    valid/loss     : 0.4854782254799553
    valid/fbeta    : 0.5064945816993713
    valid/accuracy : 0.21028490582687812
    valid/precision: 0.47827859492623503
    valid/recall   : 0.7270214103479057
    valid/roc_auc  : 0.8630583774850512
Early stop count: 0
Saving checkpoint: /home/lz/Codes/Vesuvius/dev/../saved/checkpoints/Unet_6_Multi_fold_2_checkpoints.pth ...
Saving current best: model_best.pth ...


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 11
    train/loss     : 0.4854202296435833
    train/fbeta    : 0.48442545533180237
    train/accuracy : 0.21379134297246943
    train/precision: 0.4463152908030973
    train/recall   : 0.7665808654731202
    train/roc_auc  : 0.8827685227269212
    valid/loss     : 0.47987494546434156
    valid/fbeta    : 0.5004803538322449
    valid/accuracy : 0.21732437688873227
    valid/precision: 0.4723081159303922
    valid/recall   : 0.7528202826097775
    valid/roc_auc  : 0.875924643261876
Early stop count: 0


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 12
    train/loss     : 0.4804711996614933
    train/fbeta    : 0.4863983988761902
    train/accuracy : 0.2122818456530359
    train/precision: 0.44858042925746566
    train/recall   : 0.766022545027723
    train/roc_auc  : 0.8824978815367022
    valid/loss     : 0.48115757651951
    valid/fbeta    : 0.5036228895187378
    valid/accuracy : 0.21523334601306898
    valid/precision: 0.4742113598941514
    valid/recall   : 0.7422694931881054
    valid/roc_auc  : 0.8706654223363245
Early stop count: 0


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 13
    train/loss     : 0.47760133096575735
    train/fbeta    : 0.49548760056495667
    train/accuracy : 0.21607215800355725
    train/precision: 0.4575648434468025
    train/recall   : 0.7727029134190679
    train/roc_auc  : 0.8858476159114209
    valid/loss     : 0.47008308794187464
    valid/fbeta    : 0.5037664175033569
    valid/accuracy : 0.22590353694097964
    valid/precision: 0.46529430675075734
    valid/recall   : 0.776372689141544
    valid/roc_auc  : 0.8876878994242736
Early stop count: 0


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 14
    train/loss     : 0.47850022998452185
    train/fbeta    : 0.49610909819602966
    train/accuracy : 0.2162650363218764
    train/precision: 0.4585338197206631
    train/recall   : 0.770839262130091
    train/roc_auc  : 0.8849165699500255
    valid/loss     : 0.46601046945737756
    valid/fbeta    : 0.5251246690750122
    valid/accuracy : 0.21881698161573546
    valid/precision: 0.4902911069549061
    valid/recall   : 0.7588838790645536
    valid/roc_auc  : 0.8789988103798967
Early stop count: 0


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 15
    train/loss     : 0.47147711396217346
    train/fbeta    : 0.501961886882782
    train/accuracy : 0.2148941568177325
    train/precision: 0.4645089918544362
    train/recall   : 0.7729987227883928
    train/roc_auc  : 0.8860122655267122
    valid/loss     : 0.4653571286927099
    valid/fbeta    : 0.5167585611343384
    valid/accuracy : 0.23064314699185307
    valid/precision: 0.47677310635917
    valid/recall   : 0.7951541244813116
    valid/roc_auc  : 0.897082500993031
Early stop count: 0
Saving checkpoint: /home/lz/Codes/Vesuvius/dev/../saved/checkpoints/Unet_6_Multi_fold_2_checkpoints.pth ...
Saving current best: model_best.pth ...


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 16
    train/loss     : 0.4686525530517101
    train/fbeta    : 0.49992549419403076
    train/accuracy : 0.21531401641196077
    train/precision: 0.4620308786034381
    train/recall   : 0.7757738342421558
    train/roc_auc  : 0.8873938575826101
    valid/loss     : 0.46155125680177106
    valid/fbeta    : 0.5376814603805542
    valid/accuracy : 0.21812298968472393
    valid/precision: 0.504017141624902
    valid/recall   : 0.7555247743858787
    valid/roc_auc  : 0.8773404455981864
Early stop count: 1


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 17
    train/loss     : 0.4682412500679493
    train/fbeta    : 0.5093827247619629
    train/accuracy : 0.2159513950890169
    train/precision: 0.4723208224785784
    train/recall   : 0.7705641726959842
    train/roc_auc  : 0.8848075206727769
    valid/loss     : 0.46018312962158864
    valid/fbeta    : 0.5277730226516724
    valid/accuracy : 0.22651485250151038
    valid/precision: 0.49058533655413744
    valid/recall   : 0.7810165420351797
    valid/roc_auc  : 0.8900496044497233
Early stop count: 0


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 18
    train/loss     : 0.4636262955367565
    train/fbeta    : 0.5116411447525024
    train/accuracy : 0.21529073735424406
    train/precision: 0.4748364867474112
    train/recall   : 0.7741603230895457
    train/roc_auc  : 0.8866107398579609
    valid/loss     : 0.4576805576034214
    valid/fbeta    : 0.5338080525398254
    valid/accuracy : 0.21212719271615663
    valid/precision: 0.5021140413994181
    valid/recall   : 0.7349740661738269
    valid/roc_auc  : 0.8670882507383416
Early stop count: 0


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 19
    train/loss     : 0.4614975233972073
    train/fbeta    : 0.5190209746360779
    train/accuracy : 0.21732299829572735
    train/precision: 0.4824879302731683
    train/recall   : 0.7760114614861131
    train/roc_auc  : 0.887546723468618
    valid/loss     : 0.4601719594520071
    valid/fbeta    : 0.5291035771369934
    valid/accuracy : 0.22512293682729362
    valid/precision: 0.4931511598801111
    valid/recall   : 0.7701847170190885
    valid/roc_auc  : 0.8846429461562084
Early stop count: 0


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 20
    train/loss     : 0.4594893101155758
    train/fbeta    : 0.514713704586029
    train/accuracy : 0.21463680992878242
    train/precision: 0.478290296546684
    train/recall   : 0.7753273465494007
    train/roc_auc  : 0.8872025844730361
    valid/loss     : 0.45120868294135386
    valid/fbeta    : 0.527040958404541
    valid/accuracy : 0.2325955462854741
    valid/precision: 0.48708516219964376
    valid/recall   : 0.8042837797829548
    valid/roc_auc  : 0.9016636148031034
Early stop count: 1
Saving checkpoint: /home/lz/Codes/Vesuvius/dev/../saved/checkpoints/Unet_6_Multi_fold_2_checkpoints.pth ...


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 21
    train/loss     : 0.4598429578244686
    train/fbeta    : 0.5200366377830505
    train/accuracy : 0.21664827806095463
    train/precision: 0.4836153874147491
    train/recall   : 0.7741251027099505
    train/roc_auc  : 0.8866075589430318
    valid/loss     : 0.45450524843257406
    valid/fbeta    : 0.5413647294044495
    valid/accuracy : 0.21984516675188107
    valid/precision: 0.507436489619003
    valid/recall   : 0.7596372300577744
    valid/roc_auc  : 0.8794058803157186
Early stop count: 0


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 22
    train/loss     : 0.45611393615603446
    train/fbeta    : 0.5214056372642517
    train/accuracy : 0.21761167839178885
    train/precision: 0.484905137804111
    train/recall   : 0.7795269178399307
    train/roc_auc  : 0.889308106181956
    valid/loss     : 0.45715029913446176
    valid/fbeta    : 0.5459586977958679
    valid/accuracy : 0.22227450677932808
    valid/precision: 0.5120930009457281
    valid/recall   : 0.7607253505274008
    valid/roc_auc  : 0.8799517499953841
Early stop count: 0


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 23
    train/loss     : 0.4560739820599556
    train/fbeta    : 0.5230035185813904
    train/accuracy : 0.21769275624773549
    train/precision: 0.48662048047382717
    train/recall   : 0.77871188634074
    train/roc_auc  : 0.8889042661444365
    valid/loss     : 0.448684772460357
    valid/fbeta    : 0.5528507232666016
    valid/accuracy : 0.21567221424038646
    valid/precision: 0.524231635222985
    valid/recall   : 0.7440006354855306
    valid/roc_auc  : 0.8716205905448452
Early stop count: 0


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 24
    train/loss     : 0.4526621988117695
    train/fbeta    : 0.5255234241485596
    train/accuracy : 0.21808059854285627
    train/precision: 0.4890644077940917
    train/recall   : 0.7824894927021281
    train/roc_auc  : 0.890795865677696
    valid/loss     : 0.4423544253991998
    valid/fbeta    : 0.5423738956451416
    valid/accuracy : 0.22370307604160963
    valid/precision: 0.5068598158803117
    valid/recall   : 0.7715802581169727
    valid/roc_auc  : 0.8853717741804721
Early stop count: 0


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 25
    train/loss     : 0.44749303394556045
    train/fbeta    : 0.5279857516288757
    train/accuracy : 0.215311159219279
    train/precision: 0.49183883289839514
    train/recall   : 0.7804602312687755
    train/roc_auc  : 0.8897922366208944
    valid/loss     : 0.44491531460181527
    valid/fbeta    : 0.5406741499900818
    valid/accuracy : 0.22520694546471573
    valid/precision: 0.504437619978042
    valid/recall   : 0.7784315493711919
    valid/roc_auc  : 0.8887870594645172
Early stop count: 0
Saving checkpoint: /home/lz/Codes/Vesuvius/dev/../saved/checkpoints/Unet_6_Multi_fold_2_checkpoints.pth ...
Saving current best: model_best.pth ...


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 26
    train/loss     : 0.452562680721283
    train/fbeta    : 0.5347034931182861
    train/accuracy : 0.21989363091889008
    train/precision: 0.4991667852632094
    train/recall   : 0.7790518938671745
    train/roc_auc  : 0.8890913827226019
    valid/loss     : 0.444948738295099
    valid/fbeta    : 0.5518642663955688
    valid/accuracy : 0.2196550966681453
    valid/precision: 0.5182915704794927
    valid/recall   : 0.7658111086137214
    valid/roc_auc  : 0.8825059126508081
Early stop count: 0


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 27
    train/loss     : 0.4460615194141865
    train/fbeta    : 0.5356138944625854
    train/accuracy : 0.21960800855959572
    train/precision: 0.49958562465230444
    train/recall   : 0.785330328579131
    train/roc_auc  : 0.8922317142483184
    valid/loss     : 0.444155309252117
    valid/fbeta    : 0.5669931769371033
    valid/accuracy : 0.2107326016766105
    valid/precision: 0.5455354620409264
    valid/recall   : 0.7275220280384241
    valid/roc_auc  : 0.8634151426310573
Early stop count: 0


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 28
    train/loss     : 0.444606995254755
    train/fbeta    : 0.5350733399391174
    train/accuracy : 0.21732409618117074
    train/precision: 0.4990506439039461
    train/recall   : 0.7832000518823655
    train/roc_auc  : 0.8911701409001329
    valid/loss     : 0.4400371800298276
    valid/fbeta    : 0.556434690952301
    valid/accuracy : 0.22401206448923064
    valid/precision: 0.5221274400370939
    valid/recall   : 0.7734323532345314
    valid/roc_auc  : 0.8863179432402586
Early stop count: 1


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 29
    train/loss     : 0.44360693073272706
    train/fbeta    : 0.5371735692024231
    train/accuracy : 0.21825169901917937
    train/precision: 0.5011887903837093
    train/recall   : 0.7849731387417427
    train/roc_auc  : 0.8920592711147844
    valid/loss     : 0.4390696463377579
    valid/fbeta    : 0.5529382824897766
    valid/accuracy : 0.22118225368414562
    valid/precision: 0.524638763239909
    valid/recall   : 0.7614726806911217
    valid/roc_auc  : 0.8803440241031771
Early stop count: 0


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 30
    train/loss     : 0.4429683577120304
    train/fbeta    : 0.5396526455879211
    train/accuracy : 0.21932869001088753
    train/precision: 0.5037277060809472
    train/recall   : 0.7853715776090431
    train/roc_auc  : 0.892260745586878
    valid/loss     : 0.4359336155912151
    valid/fbeta    : 0.5670566558837891
    valid/accuracy : 0.21889630024026405
    valid/precision: 0.5358392181866571
    valid/recall   : 0.758733503045271
    valid/roc_auc  : 0.8789964141122799
Early stop count: 0
Saving checkpoint: /home/lz/Codes/Vesuvius/dev/../saved/checkpoints/Unet_6_Multi_fold_2_checkpoints.pth ...
Saving current best: model_best.pth ...


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 31
    train/loss     : 0.4464000116288662
    train/fbeta    : 0.5380975604057312
    train/accuracy : 0.21984942178504155
    train/precision: 0.5024962823753985
    train/recall   : 0.7833477247903065
    train/roc_auc  : 0.8912451090885445
    valid/loss     : 0.4367635055728581
    valid/fbeta    : 0.5579383373260498
    valid/accuracy : 0.221521418519573
    valid/precision: 0.5245539807335944
    valid/recall   : 0.7649050239804751
    valid/roc_auc  : 0.8820638958347378
Early stop count: 1


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 32
    train/loss     : 0.44106312251091
    train/fbeta    : 0.5373453497886658
    train/accuracy : 0.21930020278591955
    train/precision: 0.501018223287507
    train/recall   : 0.7884856864441453
    train/roc_auc  : 0.893813272928032
    valid/loss     : 0.43422382810841437
    valid/fbeta    : 0.5579530000686646
    valid/accuracy : 0.22134860291745104
    valid/precision: 0.5245110559287877
    valid/recall   : 0.7668184899790494
    valid/roc_auc  : 0.8830259315293015
Early stop count: 2


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 33
    train/loss     : 0.4398650369346142
    train/fbeta    : 0.5416689515113831
    train/accuracy : 0.21938883579771754
    train/precision: 0.5058634776516662
    train/recall   : 0.7871875550836247
    train/roc_auc  : 0.893171997545779
    valid/loss     : 0.4322739419729813
    valid/fbeta    : 0.567844569683075
    valid/accuracy : 0.22480187132568807
    valid/precision: 0.5335359248585059
    valid/recall   : 0.7833650657013658
    valid/roc_auc  : 0.8912976894464817
Early stop count: 0


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 34
    train/loss     : 0.43998144748806955
    train/fbeta    : 0.5427682399749756
    train/accuracy : 0.22051342050953038
    train/precision: 0.5068694477691169
    train/recall   : 0.78805025185844
    train/roc_auc  : 0.8936028464274937
    valid/loss     : 0.4379340086294257
    valid/fbeta    : 0.5480602383613586
    valid/accuracy : 0.22981833294600063
    valid/precision: 0.5105830845462774
    valid/recall   : 0.7928755174933906
    valid/roc_auc  : 0.8960104682669281
Early stop count: 0


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 35
    train/loss     : 0.44046700271964073
    train/fbeta    : 0.5426768660545349
    train/accuracy : 0.22015230289353957
    train/precision: 0.5069123371850508
    train/recall   : 0.7870671744448374
    train/roc_auc  : 0.8931125781782915
    valid/loss     : 0.4354399066904317
    valid/fbeta    : 0.5682790875434875
    valid/accuracy : 0.22249269527755192
    valid/precision: 0.5360342698268866
    valid/recall   : 0.7653030070850254
    valid/roc_auc  : 0.882277967809773
Early stop count: 1
Saving checkpoint: /home/lz/Codes/Vesuvius/dev/../saved/checkpoints/Unet_6_Multi_fold_2_checkpoints.pth ...


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 36
    train/loss     : 0.43653131094574926
    train/fbeta    : 0.5446746349334717
    train/accuracy : 0.2205119118201177
    train/precision: 0.5084626923418963
    train/recall   : 0.7911020030287818
    train/roc_auc  : 0.8951315490798248
    valid/loss     : 0.43088751176129214
    valid/fbeta    : 0.5478842854499817
    valid/accuracy : 0.23460045639319718
    valid/precision: 0.5092754889625104
    valid/recall   : 0.8092393652598613
    valid/roc_auc  : 0.9041847006813982
Early stop count: 0


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 37
    train/loss     : 0.43775225907564164
    train/fbeta    : 0.5435855984687805
    train/accuracy : 0.21998858941498461
    train/precision: 0.5076314004370674
    train/recall   : 0.7899425155688143
    train/roc_auc  : 0.8945515452538509
    valid/loss     : 0.4322747925053472
    valid/fbeta    : 0.5706078410148621
    valid/accuracy : 0.22319537402229278
    valid/precision: 0.5455785686426021
    valid/recall   : 0.762363992014662
    valid/roc_auc  : 0.8808173191487968
Early stop count: 1


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 38
    train/loss     : 0.43851340940594674
    train/fbeta    : 0.5453927516937256
    train/accuracy : 0.22130035375450724
    train/precision: 0.5096786027656046
    train/recall   : 0.7902418614332569
    train/roc_auc  : 0.8947023877023731
    valid/loss     : 0.4295998003171838
    valid/fbeta    : 0.5738142728805542
    valid/accuracy : 0.2250139617240071
    valid/precision: 0.5410055890351421
    valid/recall   : 0.7745824359843574
    valid/roc_auc  : 0.8869196135499853
Early stop count: 0


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 39
    train/loss     : 0.4382579994797707
    train/fbeta    : 0.5436946749687195
    train/accuracy : 0.2201126280489922
    train/precision: 0.5080705449728646
    train/recall   : 0.788633356992932
    train/roc_auc  : 0.8938976906066923
    valid/loss     : 0.42762351994929104
    valid/fbeta    : 0.5585880875587463
    valid/accuracy : 0.22849152504892573
    valid/precision: 0.5219417017224774
    valid/recall   : 0.7934182704411415
    valid/roc_auc  : 0.8963019771585706
Early stop count: 1


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 40
    train/loss     : 0.43810002475976945
    train/fbeta    : 0.5435982346534729
    train/accuracy : 0.2212847300001228
    train/precision: 0.507458150654613
    train/recall   : 0.7915772089488247
    train/roc_auc  : 0.8953665256304696
    valid/loss     : 0.4311482934848122
    valid/fbeta    : 0.5418797135353088
    valid/accuracy : 0.2360731529361423
    valid/precision: 0.5017436329745546
    valid/recall   : 0.8126359571620787
    valid/roc_auc  : 0.90586472473623
Early stop count: 2
Saving checkpoint: /home/lz/Codes/Vesuvius/dev/../saved/checkpoints/Unet_6_Multi_fold_2_checkpoints.pth ...


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 41
    train/loss     : 0.43521021914482116
    train/fbeta    : 0.549074649810791
    train/accuracy : 0.22067508669455932
    train/precision: 0.5133837184243127
    train/recall   : 0.7920295832164748
    train/roc_auc  : 0.8956020871680208
    valid/loss     : 0.42744816049285556
    valid/fbeta    : 0.5724714398384094
    valid/accuracy : 0.21958620871894277
    valid/precision: 0.5409498912281843
    valid/recall   : 0.7646409750511349
    valid/roc_auc  : 0.8819592837710172
Early stop count: 0


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 42
    train/loss     : 0.43652654692530635
    train/fbeta    : 0.5444798469543457
    train/accuracy : 0.21961106629636182
    train/precision: 0.5087682775409658
    train/recall   : 0.7883406280511419
    train/roc_auc  : 0.8937532126685452
    valid/loss     : 0.4285862990047621
    valid/fbeta    : 0.573408305644989
    valid/accuracy : 0.2245175135693313
    valid/precision: 0.5402688125502765
    valid/recall   : 0.7764880449929642
    valid/roc_auc  : 0.8878703461639139
Early stop count: 1


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 43
    train/loss     : 0.43530007234215734
    train/fbeta    : 0.545931875705719
    train/accuracy : 0.2210260079517335
    train/precision: 0.5100562691214628
    train/recall   : 0.7927444634222762
    train/roc_auc  : 0.8959550752119095
    valid/loss     : 0.4290157986723858
    valid/fbeta    : 0.5794219970703125
    valid/accuracy : 0.22308557572966833
    valid/precision: 0.54844860186454
    valid/recall   : 0.7661889776260649
    valid/roc_auc  : 0.8827388500823853
Early stop count: 2


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 44
    train/loss     : 0.4351047618985176
    train/fbeta    : 0.5516246557235718
    train/accuracy : 0.22307155064146297
    train/precision: 0.5159764507937383
    train/recall   : 0.7929185812696465
    train/roc_auc  : 0.8960476562760595
    valid/loss     : 0.4335307268992714
    valid/fbeta    : 0.5693354606628418
    valid/accuracy : 0.22709519014777654
    valid/precision: 0.5359601243772156
    valid/recall   : 0.7764233022729261
    valid/roc_auc  : 0.8878337269902998
Early stop count: 0


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 45
    train/loss     : 0.4340011690557003
    train/fbeta    : 0.5484480857849121
    train/accuracy : 0.22115407316219238
    train/precision: 0.512543746975884
    train/recall   : 0.7923391663938085
    train/roc_auc  : 0.8957556472256003
    valid/loss     : 0.4271228126857592
    valid/fbeta    : 0.5610885620117188
    valid/accuracy : 0.2278289659526035
    valid/precision: 0.525078221567392
    valid/recall   : 0.7894649466496305
    valid/roc_auc  : 0.894335229308215
Early stop count: 1
Saving checkpoint: /home/lz/Codes/Vesuvius/dev/../saved/checkpoints/Unet_6_Multi_fold_2_checkpoints.pth ...


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 46
    train/loss     : 0.433392739713192
    train/fbeta    : 0.5485802292823792
    train/accuracy : 0.22107914939385745
    train/precision: 0.5127390440484425
    train/recall   : 0.7913434848552994
    train/roc_auc  : 0.8952584602595673
    valid/loss     : 0.43402625348256985
    valid/fbeta    : 0.5629740953445435
    valid/accuracy : 0.2248419259854592
    valid/precision: 0.5292705081830317
    valid/recall   : 0.7732810016129875
    valid/roc_auc  : 0.8862545693327519
Early stop count: 2


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 47
    train/loss     : 0.43094925528764727
    train/fbeta    : 0.5490582585334778
    train/accuracy : 0.22148797632306547
    train/precision: 0.5129416891690185
    train/recall   : 0.7957101775544221
    train/roc_auc  : 0.8974417842217253
    valid/loss     : 0.4266823040402454
    valid/fbeta    : 0.560027003288269
    valid/accuracy : 0.22993629814328206
    valid/precision: 0.5236119658419888
    valid/recall   : 0.7963734475397437
    valid/roc_auc  : 0.8977814633740493
Early stop count: 3


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 48
    train/loss     : 0.43076925465464594
    train/fbeta    : 0.5508603453636169
    train/accuracy : 0.22043009058087967
    train/precision: 0.5150371080699905
    train/recall   : 0.7921067969428022
    train/roc_auc  : 0.8956450726505224
    valid/loss     : 0.4285955771155979
    valid/fbeta    : 0.5549342036247253
    valid/accuracy : 0.2337865821002673
    valid/precision: 0.5170661320836837
    valid/recall   : 0.8058765737854314
    valid/roc_auc  : 0.9025144819273884
Early stop count: 4


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 49
    train/loss     : 0.43421961277723314
    train/fbeta    : 0.5516061782836914
    train/accuracy : 0.22178909289274004
    train/precision: 0.51635651487771
    train/recall   : 0.7911800843455691
    train/roc_auc  : 0.8951821372216207
    valid/loss     : 0.4270902605160423
    valid/fbeta    : 0.5683935880661011
    valid/accuracy : 0.22951481437483687
    valid/precision: 0.5333646300857211
    valid/recall   : 0.7877990861140599
    valid/roc_auc  : 0.8935141923277955
Early stop count: 5


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/115 [00:00<?, ?it/s]

    epoch          : 50
    train/loss     : 0.4307818140387535
    train/fbeta    : 0.5485549569129944
    train/accuracy : 0.21972966358390977
    train/precision: 0.5125859959338123
    train/recall   : 0.7935454032447695
    train/roc_auc  : 0.8963619247559836
    valid/loss     : 0.42642823535463087
    valid/fbeta    : 0.5590143203735352
    valid/accuracy : 0.23003558994050938
    valid/precision: 0.5219706838829494
    valid/recall   : 0.8019061050032019
    valid/roc_auc  : 0.9005428833665243
Early stop count: 6
Saving checkpoint: /home/lz/Codes/Vesuvius/dev/../saved/checkpoints/Unet_6_Multi_fold_2_checkpoints.pth ...




0,1
train/accuracy,▁▂▃▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇█▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█▇▇▇▇▇
train/fbeta,▁▃▃▂▂▂▂▃▄▄▄▄▄▅▅▅▆▅▇▆▆▆▅▇▆▆▆▆▇▇▇▇▇▇█▇▇▇▇▇
train/loss,█▅▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/precision,▅▆▅▁▁▂▁▂▃▃▄▃▄▄▄▅▅▅▆▅▆▅▅▆▆▆▆▆▆▆▆▇▇▆█▆▆▆▇▆
train/recall,▁▂▃▇▇▇▇█████████████████████████████████
train/roc_auc,▁▂▃▇▇▇▇█████████████████████████████████
valid/accuracy,▁▂▄▆▇▇▇▇▇▇█▇▇█▇█▇▇▇▇▇▇▇▇▇▇███▇██▇▇▇█████
valid/fbeta,▁▃▃▂▂▂▃▃▃▄▄▅▆▅▅▅▆▆▆▆▆▇▇▆▇▇▇▆▆▇█▇███▇▇▇▇▇
valid/loss,█▄▃▃▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid/precision,▆▇▃▁▁▁▁▃▃▃▃▄▅▄▅▄▅▆▆▅▆█▆▆▆▆▇▅▅██▆█▇█▇▇▆▆▆

0,1
train/accuracy,0.21973
train/fbeta,0.54855
train/loss,0.43078
train/precision,0.51259
train/recall,0.79355
train/roc_auc,0.89636
valid/accuracy,0.23004
valid/fbeta,0.55901
valid/loss,0.42643
valid/precision,0.52197


In [24]:
# data_ori, target_ori = next(iter(data_loader))
# output_ori = model(data_ori.to(device)).squeeze(1)
wandb.finish()

In [25]:

# for met in metrics:
#     print('train/' + met.__name__, met(output_ori, target_ori))



In [26]:
# from torchvision.utils import make_grid
# from matplotlib import pyplot as plt
# target=target_ori.unsqueeze(1)
# output=output_ori.unsqueeze(1)
#
# fig,axs=plt.subplots(2,1)
# axs[0].imshow(make_grid(output, nrow=8)[0].cpu().detach().numpy(),cmap='gray')
# axs[1].imshow(make_grid(target, nrow=8)[0].cpu().detach().numpy(),cmap='gray')
#
# plt.show()


In [27]:
# ctp = output_norm[target_norm == 1].sum()
# cfp = output_norm[target_norm == 0].sum()
# ctn = (~output_norm)[target_norm == 0].sum()
# cfn = (~output_norm)[target_norm == 1].sum()
# assert ctp + cfp + ctn + cfn == output_norm.numel()

In [28]:


# output_grid = make_grid(output_tensor, nrow=8)[0].cpu().numpy()
# target_grid = make_grid(target.unsqueeze(1), nrow=8)[0].cpu().numpy()
# fig,axs=plt.subplots(1,2)
# axs[0].imshow(output_grid, cmap='gray')
# axs[1].imshow(target_grid, cmap='gray')
#
# plt.show()


In [29]:
# if __name__ == '__main__':
#     main(g_cfg)


In [30]:
# data, target = next(iter(data_loader))
# output = model(data.to(device))
# a=output[0,:,:].cpu().detach().numpy()

In [31]:

# from matplotlib import pyplot as plt
# plt.imshow(a.astype(np.float32), cmap='gray')

In [32]:
# for met in metrics:
#     print('train/' + met.__name__, met(output, target))

In [33]:
# import torch
# import wandb
# import torchvision.utils as vutils
#
# # Create a random input tensor and target tensor
# B, C, H, W = 4, 8, 256, 256
# input_tensor = torch.randint(low=0, high=2, size=(B, C, H, W))
# output_tensor = torch.randint(low=0, high=2, size=(B, H, W))
# target_tensor = torch.randint(low=0, high=2, size=(B, H, W))
#
# # Convert the target tensor to an RGB image
# output_tensor = output_tensor.unsqueeze(1)
# target_tensor = target_tensor.unsqueeze(1)
# # target_tensor[:, 0, :, :][target_tensor[:, 0, :, :] == 1] = 255
#
# # Create a grid of input images and a grid of target images
# input_grid = vutils.make_grid(input_tensor, nrow=B).numpy()
# output_grid = vutils.make_grid(input_tensor, nrow=B).numpy()
# target_grid = vutils.make_grid(target_tensor, nrow=B).numpy()
#
#
#
# a = wandb.Image(
#     input_grid[0], masks={
#         "predictions": {"mask_data": output_grid[0]},
#         "ground_truth": {"mask_data": target_grid[0]},
#     })
#
# # a.image.show()
# a._masks['predictions']['mask_data'].show()
# a._masks['ground_truth']['mask_data'].show()
# # fig,axs=plt.subplots(2,1)
# # # ax[0].imshow(input_grid[0,:,:])
# # axs[0].imshow(input_grid, cmap='gray')
# # axs[1].imshow(target_grid, cmap='gray')
# # # Set the x-axis limits of all subplots
# # # xmin = min(axs[0].get_xlim()[0], axs[1].get_xlim()[0])
# # # xmax = max(axs[0].get_xlim()[1], axs[1].get_xlim()[1])
# # # for ax in axs:
# # #     ax.set_aspect('equal')
# #     # ax.set_xlim(xmin, xmax)
# #     # ax.set_axis_off()
# # plt.savefig('test.png')

In [34]:

# with tqdm(enumerate(full_dataset), total=len(full_dataset)) as pbar:
#     for idx, (img, tar) in pbar:
#         assert img.shape[1:] == tar.shape, f"img.shape:{img.shape} != tar.shape:{tar.shape}"
#         assert img.shape[-1] == 224, f"img.shape:{img.shape}"
#         assert img.dtype == tar.dtype, f"img.dtype:{img.dtype} != tar.dtype:{tar.dtype}"
# break

 ## Train and valid

In [35]:

# from torch.cuda.amp import GradScaler, autocast
#
#
# def train_fn(cfg):
#     model.train()
#
#     scaler = GradScaler(enabled=cfg["use_amp"])
#     losses = AverageMeter()
#
#     full_dataset = make_dataset(g_cfg['dataset'])
#     train_loader = make_data_loader(g_cfg['data_loader'], full_dataset)
#     valid_loader = train_loader.split_validation()
#
#     with tqdm(enumerate(train_loader), total=len(train_loader)) as pbar:
#         for step, data in pbar:
#             images, masks, labels, positions = data
#
#             images = images.to(cfg['device'])
#             labels = labels.to(cfg['device'])
#             batch_size = labels.size(0)
#
#             with autocast(cfg["use_amp"]):
#                 y_preds = model(images).squeeze()
#                 labels = labels.squeeze()
#
#                 loss = criterion(y_preds, labels)
#                 assert loss > 0, f'input should be 0-1, but got: labels: {labels.min()}-{labels.max()}, y_preds: {y_preds.min()}-{y_preds.max()}'
#
#             losses.update(loss.item(), batch_size)
#             scaler.scale(loss).backward()
#
#             grad_norm = torch.nn.utils.clip_grad_norm_(
#                 model.parameters(), cfg["max_grad_norm"])
#
#             scaler.step(optimizer)
#             scaler.update()
#             optimizer.zero_grad()
#
#     return losses.avg
#
#
# def valid_fn(cfg, valid_loader):
#     label_pred = np.zeros(valid_mask_gt.shape)
#     label_count = np.zeros(valid_mask_gt.shape)
#
#     model.eval()
#     losses = AverageMeter(mode='valid')
#
#     with tqdm(enumerate(valid_loader), total=len(valid_loader)) as pbar:
#         for step, (images, masks, labels, positions) in pbar:
#
#             images = images.to(cfg['device'])
#             labels = labels.to(cfg['device'])
#             batch_size = labels.size(0)
#
#             with torch.no_grad():
#                 y_preds = model(images).squeeze()
#                 labels = labels.squeeze()
#                 loss = criterion(y_preds, labels)
#                 assert loss > 0, f'input should be 0-1, but got: labels: {labels.min()}-{labels.max()}, y_preds: {y_preds.min()}-{y_preds.max()}'
#             losses.update(loss.item(), batch_size)
#
#             # make whole mask
#             y_preds = torch.sigmoid(y_preds).to('cpu').numpy()
#             # start_idx = step*CONFIG["valid_batch_size"]             # end_idx = start_idx + batch_size
#             # print(positions)
#             for i, (x1, y1, x2, y2) in enumerate(zip(*positions)):
#                 label_pred[y1:y2, x1:x2] += y_preds[i]
#                 label_count[y1:y2, x1:x2] += np.ones((y2 - y1, x2 - x1))
#
#     Logger.info(f'mask_count_min: {label_count.min()}')
#     label_pred /= label_count + 1e-8
#     return losses.avg, label_pred

In [36]:
from matplotlib import pyplot as plt

results = []
for id in ['a', 'b']:
    test_img = ImgLoader.load_from_path_static(
        cache_dir=g_cfg['cache_dir'],
        data_dir=g_cfg['data_dir'],
        file_path=f"test/{id}/surface_volume",
    )

    mask = ImgLoader.load_from_path_static(
        cache_dir=g_cfg['cache_dir'],
        data_dir=g_cfg['data_dir'],
        file_path=f"test/{id}/mask.png",
    )
    test_img = np.moveaxis(test_img, -1, 0)

    model, best_loss = build_model(g_cfg)
    model.output(device)
    label_pred = test_fn(test_img, model, device, mask, g_cfg['tile_size'])

    fig, ax = plt.subplots(1, 3, figsize=(10, 5))
    ax[0].imshow(test_img)
    ax[1].imshow(label_pred)
    ax[2].imshow(mask)
    ax[0].set_title('test_img')
    ax[1].set_title('label_pred')
    ax[2].set_title('mask')

    plt.show()
    results.append((id, label_pred))


NameError: name 'ImgLoader' is not defined

In [None]:
sub = pd.DataFrame(results, columns=['Id', 'Predicted'])
sub.Id = sub.Id.asytpe(str)
sub

In [None]:
sample_sub = pd.read_csv(g_cfg['comp_dataset_path'] / 'sample_submission.csv')
sample_sub = pd.merge(sample_sub[['Id']], sub, on='Id', how='left')
sample_sub.to_csv('submission.csv', index=False)