# train

In [1]:
import yaml
import os

# Load the experiment configuration that every later cell reads through `config`.
with open("./config.yaml", 'rb') as f:
    # safe_load only builds plain Python objects (dict/list/str/numbers).
    # Calling yaml.load() without a Loader is deprecated in PyYAML 5.x and
    # raises TypeError in PyYAML >= 6.
    config = yaml.safe_load(f)

  """


In [2]:
# Root directory for everything this notebook writes, taken from the config.
OUTPUT_ROOT = config['IO_OPTION']['OUTPUT_ROOT']
# Create it up front (no error if it already exists) so the log file set up
# later in the notebook has a directory to live in.
os.makedirs(OUTPUT_ROOT, exist_ok=True)

## load library

In [3]:
# python default library
import os
import shutil
import datetime
import sys
import pickle

# general analysis tool-kit
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# pytorch
import torch
from torch import optim, nn
from torch.utils.tensorboard import SummaryWriter

from torchsummary import summary

# etc
import yaml
yaml.warnings({'YAMLLoadWarning': False})
import mlflow
from collections import defaultdict
from scipy.stats import zscore

# original library

import common as com
import pytorch_modeler as modeler
from pytorch_model import LSTM_AutoEncoder as Model
from pytorch_utils import filtered_load_model
import models

import librosa
import IPython
import librosa.display

## load config and set logger

In [4]:
# `config` is already loaded by the first cell, so it is not read again here.

# NOTE: despite its name, `log_folder` is the path of the log *file*:
# <OUTPUT_ROOT>/<today's date>.log
log_folder = '{0}/{1}.log'.format(config['IO_OPTION']['OUTPUT_ROOT'],
                                  datetime.date.today())
logger = com.setup_logger(log_folder, '00_train.py')

  and should_run_async(code)


## Setting

In [5]:
# Setting seed
# Fixed seed (42) passed to the project helper for reproducible runs.
# NOTE(review): which RNGs set_seed() covers (python / numpy / torch / CUDA)
# is defined in pytorch_modeler and not visible here - confirm.
modeler.set_seed(42)

In [6]:
############################################################################
# Setting I/O path
############################################################################
# input dirs
INPUT_ROOT = config['IO_OPTION']['INPUT_ROOT']
dev_path = INPUT_ROOT + "/dev_data"
add_dev_path = INPUT_ROOT + "/add_dev_data"
# machine type
MACHINE_TYPE = config['IO_OPTION']['MACHINE_TYPE']
# One sub-directory of dev_path per machine type.
# os.listdir() returns entries in arbitrary order and also returns plain
# files, so keep directories only and sort case-insensitively. This makes the
# training order (and therefore the seeded RNG stream each machine sees)
# reproducible across runs and machines.
machine_types = sorted(
    (entry for entry in os.listdir(dev_path)
     if os.path.isdir(os.path.join(dev_path, entry))),
    key=str.lower,
)
# output dirs (all derived from the single OUTPUT_ROOT value)
OUTPUT_ROOT = config['IO_OPTION']['OUTPUT_ROOT']
MODEL_DIR = OUTPUT_ROOT + '/models'
TB_DIR = OUTPUT_ROOT + '/tb'
PKL_DIR = OUTPUT_ROOT + '/pkl'
# makedirs also creates OUTPUT_ROOT itself when it is missing
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(TB_DIR, exist_ok=True)
os.makedirs(PKL_DIR, exist_ok=True)
# copy config next to the results so the run can be reproduced later
shutil.copy('./config.yaml', OUTPUT_ROOT)

'/media/hiroki/working/research/dcase2020/result/2D/LSTM_VAE/config.yaml'

## make path list and train/valid split

In [7]:
############################################################################
# make path set and train/valid split
############################################################################
# Layout of the mapping that the following loop fills in (the bare string
# below is a no-op expression used as a note):
'''
train_paths[machine_type]['train' or 'valid'] = path
'''
# NOTE: dev_train_paths and add_train_paths are rebound to plain lists for
# each machine type inside the loop that fills train_paths, so the empty
# dicts created here only act as placeholders.
dev_train_paths = {}
add_train_paths = {}
# machine_type -> {'train': [...], 'valid': [...]}
train_paths = {}

In [8]:
for machine_type in machine_types:
    # training files shipped with the development set
    dev_train_paths = sorted(
        "{}/{}/train/".format(dev_path, machine_type) + name
        for name in os.listdir("{}/{}/train".format(dev_path, machine_type))
    )
    # training files shipped with the additional development set
    add_train_paths = sorted(
        "{}/{}/train/".format(add_dev_path, machine_type) + name
        for name in os.listdir("{}/{}/train".format(add_dev_path, machine_type))
    )
    # the development set's test split is used for validation
    dev_valid_paths = sorted(
        "{}/{}/test/".format(dev_path, machine_type) + name
        for name in os.listdir("{}/{}/test".format(dev_path, machine_type))
    )

    # train = dev + additional training files, valid = dev test files
    train_paths[machine_type] = {
        'train': dev_train_paths + add_train_paths,
        'valid': dev_valid_paths,
    }

## training

In [9]:
#############################################################################
# run
#############################################################################
def run(machine_type):
    """Train one model for ``machine_type`` and save its weights.

    Builds the data loaders with ``modeler.make_dataloader``, trains ``Model``
    with Adam, a OneCycleLR schedule and an MSE criterion for
    ``config['fit']['num_epochs']`` epochs, logs to TensorBoard under
    ``TB_DIR/<machine_type>`` and writes the trained ``state_dict`` to
    ``MODEL_DIR/<machine_type>_model.pth``.

    Parameters
    ----------
    machine_type : str
        Key of ``train_paths`` selecting the machine to train on.

    Returns
    -------
    The object returned by ``modeler.train_net``. It is indexed with
    ``'model'`` below to obtain the trained network.

    Notes
    -----
    The TensorBoard writer is closed even when training or saving raises
    (for example on CUDA out-of-memory), so its event file is always flushed
    and released.
    """
    com.tic()
    logger.info('TARGET MACHINE_TYPE: {0}'.format(machine_type))
    logger.info('MAKE DATA_LOADER')
    # dev_train_paths
    dataloaders_dict = modeler.make_dataloader(train_paths, machine_type)
    # define writer for tensorboard
    tb_log_dir = TB_DIR + '/' + machine_type
    os.makedirs(tb_log_dir, exist_ok=True)
    writer = SummaryWriter(log_dir=tb_log_dir)
    try:
        logger.info('TRAINING')
        # parameter setting
        net = Model(sample_rate=config['preprocessing']['sample_rate'],
                    window_size=config['preprocessing']['window_size'],
                    hop_size=config['preprocessing']['hop_size'],
                    mel_bins=config['preprocessing']['mel_bins'],
                    fmin=config['preprocessing']['fmin'],
                    fmax=config['preprocessing']['fmax'])
        #pretrained_dict = torch.load(config['IO_OPTION']['PREMODEL_PATH'])
        #net = filtered_load_model(net, pretrained_dict)
        optimizer = optim.Adam(net.parameters())
        criterion = nn.MSELoss()
        num_epochs = config['fit']['num_epochs']
        # One-cycle schedule stepped per batch: the learning rate starts at
        # max_lr / div_factor (1e-5), rises to max_lr (1e-2) over the first
        # 10% of all steps and is annealed afterwards.
        scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3,
                                                  max_lr=1e-2, epochs=num_epochs,
                                                  steps_per_epoch=len(dataloaders_dict['train']))
        history = modeler.train_net(net, dataloaders_dict, criterion, optimizer, scheduler, num_epochs, writer)
        # output
        model = history['model']
        model_out_path = MODEL_DIR+'/{}_model.pth'.format(machine_type)
        torch.save(model.state_dict(), model_out_path)
        logger.info('\n success:{0} \n'.format(machine_type) + \
                        'model_out_path ==> \n {0}'.format(model_out_path))
    finally:
        # close writer for tensorboard, also on failure
        writer.close()
    #modeler.mlflow_log(history, config, machine_type, model_out_path, tb_log_dir)
    com.toc()
    return history

In [10]:
# Show the machine types found under dev_path; the training loop iterates them in this order.
machine_types

['fan', 'pump', 'slider', 'ToyCar', 'ToyConveyor', 'valve']

In [11]:
# Train every machine type in turn and pickle each training history.
for machine_type in machine_types:
    # Release the previous run's history before starting the next one.
    # The object returned by run() carries the trained model
    # (history['model']), and the global name would otherwise keep it alive
    # for the whole of the next run.
    # NOTE(review): the recorded run hit "CUDA out of memory" on the second
    # machine type; a model still referenced from the previous iteration is
    # the presumed cause - confirm on the target GPU.
    history = None
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    history = run(machine_type)
    with open('{}/{}_history.pkl'.format(PKL_DIR, machine_type), 'wb') as file:
        pickle.dump(history , file)

2020-11-06 01:36:31,440 - 00_train.py - INFO - TARGET MACHINE_TYPE: fan
2020-11-06 01:36:31,441 - 00_train.py - INFO - MAKE DATA_LOADER
2020-11-06 01:36:32,320 - 00_train.py - INFO - TRAINING


use: cuda:0


100%|██████████| 102/102 [01:06<00:00,  1.53it/s]
100%|██████████| 1803/1803 [00:22<00:00, 80.74it/s]
2020-11-06 01:38:04,131 - pytorch_modeler.py - INFO - Epoch 1/200:train_loss:2613.871860, valid_AUC:0.519841, valid_pAUC:0.503179
100%|██████████| 102/102 [01:06<00:00,  1.52it/s]
100%|██████████| 1803/1803 [00:22<00:00, 80.47it/s]
2020-11-06 01:39:33,518 - pytorch_modeler.py - INFO - Epoch 2/200:train_loss:972.887104, valid_AUC:0.517432, valid_pAUC:0.502964
100%|██████████| 102/102 [01:06<00:00,  1.52it/s]
100%|██████████| 1803/1803 [00:22<00:00, 80.19it/s]
2020-11-06 01:41:02,969 - pytorch_modeler.py - INFO - Epoch 3/200:train_loss:1082.674046, valid_AUC:0.519416, valid_pAUC:0.503057
100%|██████████| 102/102 [01:07<00:00,  1.52it/s]
100%|██████████| 1803/1803 [00:22<00:00, 80.48it/s]
2020-11-06 01:42:32,395 - pytorch_modeler.py - INFO - Epoch 4/200:train_loss:944.254542, valid_AUC:0.519549, valid_pAUC:0.502945
100%|██████████| 102/102 [01:06<00:00,  1.52it/s]
100%|██████████| 1803/18

100%|██████████| 102/102 [01:06<00:00,  1.53it/s]
100%|██████████| 1803/1803 [00:21<00:00, 81.98it/s]
2020-11-06 03:22:51,886 - pytorch_modeler.py - INFO - Epoch 72/200:train_loss:575.938532, valid_AUC:0.520809, valid_pAUC:0.502448
100%|██████████| 102/102 [01:06<00:00,  1.53it/s]
100%|██████████| 1803/1803 [00:21<00:00, 82.14it/s]
2020-11-06 03:24:20,331 - pytorch_modeler.py - INFO - Epoch 73/200:train_loss:570.638957, valid_AUC:0.519457, valid_pAUC:0.502345
100%|██████████| 102/102 [01:06<00:00,  1.53it/s]
100%|██████████| 1803/1803 [00:22<00:00, 81.91it/s]
2020-11-06 03:25:48,960 - pytorch_modeler.py - INFO - Epoch 74/200:train_loss:561.935944, valid_AUC:0.519072, valid_pAUC:0.502842
100%|██████████| 102/102 [01:06<00:00,  1.53it/s]
100%|██████████| 1803/1803 [00:21<00:00, 82.42it/s]
2020-11-06 03:27:17,466 - pytorch_modeler.py - INFO - Epoch 75/200:train_loss:558.060288, valid_AUC:0.515857, valid_pAUC:0.502335
100%|██████████| 102/102 [01:06<00:00,  1.53it/s]
100%|██████████| 1803/

100%|██████████| 1803/1803 [00:21<00:00, 82.09it/s]
2020-11-06 05:06:03,315 - pytorch_modeler.py - INFO - Epoch 142/200:train_loss:284.539438, valid_AUC:0.517662, valid_pAUC:0.502570
100%|██████████| 102/102 [01:06<00:00,  1.53it/s]
100%|██████████| 1803/1803 [00:21<00:00, 82.38it/s]
2020-11-06 05:07:31,685 - pytorch_modeler.py - INFO - Epoch 143/200:train_loss:281.489957, valid_AUC:0.519211, valid_pAUC:0.502560
100%|██████████| 102/102 [01:06<00:00,  1.53it/s]
100%|██████████| 1803/1803 [00:22<00:00, 81.95it/s]
2020-11-06 05:09:00,145 - pytorch_modeler.py - INFO - Epoch 144/200:train_loss:278.486117, valid_AUC:0.517985, valid_pAUC:0.502795
100%|██████████| 102/102 [01:06<00:00,  1.53it/s]
100%|██████████| 1803/1803 [00:21<00:00, 82.16it/s]
2020-11-06 05:10:28,595 - pytorch_modeler.py - INFO - Epoch 145/200:train_loss:275.258054, valid_AUC:0.517810, valid_pAUC:0.502260
100%|██████████| 102/102 [01:06<00:00,  1.53it/s]
100%|██████████| 1803/1803 [00:21<00:00, 81.98it/s]
2020-11-06 05:11

elapsed time: 17702.297129393 [sec]


2020-11-06 06:31:35,525 - 00_train.py - INFO - TARGET MACHINE_TYPE: pump
2020-11-06 06:31:35,526 - 00_train.py - INFO - MAKE DATA_LOADER
2020-11-06 06:31:35,529 - 00_train.py - INFO - TRAINING
  0%|          | 0/91 [00:00<?, ?it/s]

use: cuda:0


  1%|          | 1/91 [00:02<03:49,  2.55s/it]


RuntimeError: CUDA out of memory. Tried to allocate 256.00 MiB (GPU 0; 5.93 GiB total capacity; 4.46 GiB already allocated; 96.19 MiB free; 4.65 GiB reserved in total by PyTorch)
Exception raised from malloc at /opt/conda/conda-bld/pytorch_1595629427478/work/c10/cuda/CUDACachingAllocator.cpp:272 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x4d (0x7fbaf3d6477d in /home/hiroki/anaconda3/lib/python3.7/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x20626 (0x7fbaf3fbc626 in /home/hiroki/anaconda3/lib/python3.7/site-packages/torch/lib/libc10_cuda.so)
frame #2: <unknown function> + 0x214f4 (0x7fbaf3fbd4f4 in /home/hiroki/anaconda3/lib/python3.7/site-packages/torch/lib/libc10_cuda.so)
frame #3: <unknown function> + 0x21b81 (0x7fbaf3fbdb81 in /home/hiroki/anaconda3/lib/python3.7/site-packages/torch/lib/libc10_cuda.so)
frame #4: at::native::empty_cuda(c10::ArrayRef<long>, c10::TensorOptions const&, c10::optional<c10::MemoryFormat>) + 0x249 (0x7fbaf6ebee39 in /home/hiroki/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #5: <unknown function> + 0xd15c49 (0x7fbaf4edfc49 in /home/hiroki/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #6: <unknown function> + 0xd2fa77 (0x7fbaf4ef9a77 in /home/hiroki/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #7: <unknown function> + 0xe450dd (0x7fbb2b3040dd in /home/hiroki/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #8: <unknown function> + 0xe453f7 (0x7fbb2b3043f7 in /home/hiroki/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #9: at::empty(c10::ArrayRef<long>, c10::TensorOptions const&, c10::optional<c10::MemoryFormat>) + 0xfa (0x7fbb2b40ee7a in /home/hiroki/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #10: at::native::empty_like(at::Tensor const&, c10::TensorOptions const&, c10::optional<c10::MemoryFormat>) + 0x49e (0x7fbb2b08d09e in /home/hiroki/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #11: <unknown function> + 0xfe3521 (0x7fbb2b4a2521 in /home/hiroki/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #12: <unknown function> + 0x101ecc3 (0x7fbb2b4ddcc3 in /home/hiroki/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #13: at::empty_like(at::Tensor const&, c10::TensorOptions const&, c10::optional<c10::MemoryFormat>) + 0x101 (0x7fbb2b3f1f91 in /home/hiroki/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #14: at::native::zeros_like(at::Tensor const&, c10::TensorOptions const&, c10::optional<c10::MemoryFormat>) + 0x46 (0x7fbb2b086b56 in /home/hiroki/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #15: <unknown function> + 0xfe6894 (0x7fbb2b4a5894 in /home/hiroki/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #16: <unknown function> + 0x101ecc3 (0x7fbb2b4ddcc3 in /home/hiroki/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #17: at::zeros_like(at::Tensor const&, c10::TensorOptions const&, c10::optional<c10::MemoryFormat>) + 0x101 (0x7fbb2b3f2231 in /home/hiroki/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #18: at::native::avg_pool2d_backward_cuda(at::Tensor const&, at::Tensor const&, c10::ArrayRef<long>, c10::ArrayRef<long>, c10::ArrayRef<long>, bool, bool, c10::optional<long>) + 0x6d (0x7fbaf5a1a91d in /home/hiroki/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #19: <unknown function> + 0xd0fd19 (0x7fbaf4ed9d19 in /home/hiroki/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #20: <unknown function> + 0xd31f9d (0x7fbaf4efbf9d in /home/hiroki/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_cuda.so)
frame #21: at::avg_pool2d_backward(at::Tensor const&, at::Tensor const&, c10::ArrayRef<long>, c10::ArrayRef<long>, c10::ArrayRef<long>, bool, bool, c10::optional<long>) + 0x1e7 (0x7fbb2b401e07 in /home/hiroki/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #22: <unknown function> + 0x2ba17ce (0x7fbb2d0607ce in /home/hiroki/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #23: <unknown function> + 0xe6fbbd (0x7fbb2b32ebbd in /home/hiroki/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #24: at::avg_pool2d_backward(at::Tensor const&, at::Tensor const&, c10::ArrayRef<long>, c10::ArrayRef<long>, c10::ArrayRef<long>, bool, bool, c10::optional<long>) + 0x1e7 (0x7fbb2b401e07 in /home/hiroki/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #25: torch::autograd::generated::AvgPool2DBackward::apply(std::vector<at::Tensor, std::allocator<at::Tensor> >&&) + 0x1a9 (0x7fbb2cf5fc59 in /home/hiroki/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #26: <unknown function> + 0x30d1017 (0x7fbb2d590017 in /home/hiroki/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #27: torch::autograd::Engine::evaluate_function(std::shared_ptr<torch::autograd::GraphTask>&, torch::autograd::Node*, torch::autograd::InputBuffer&, std::shared_ptr<torch::autograd::ReadyQueue> const&) + 0x1400 (0x7fbb2d58b860 in /home/hiroki/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #28: torch::autograd::Engine::thread_main(std::shared_ptr<torch::autograd::GraphTask> const&) + 0x451 (0x7fbb2d58c401 in /home/hiroki/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #29: torch::autograd::Engine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 0x89 (0x7fbb2d584579 in /home/hiroki/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so)
frame #30: torch::autograd::python::PythonEngine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 0x4a (0x7fbb318b399a in /home/hiroki/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_python.so)
frame #31: <unknown function> + 0xc819d (0x7fbb75ca819d in /home/hiroki/anaconda3/lib/python3.7/site-packages/zmq/backend/cython/../../../../.././libstdc++.so.6)
frame #32: <unknown function> + 0x76db (0x7fbb78c3a6db in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #33: clone + 0x3f (0x7fbb78963a3f in /lib/x86_64-linux-gnu/libc.so.6)


# Re-run a single machine type on its own.
# Bind machine_type explicitly: otherwise the file name below would use
# whatever value the earlier loop left behind, and the ToyCar history would
# overwrite another machine's pickle.
machine_type = 'ToyCar'
history = run(machine_type)
with open('{}/{}_history.pkl'.format(PKL_DIR, machine_type), 'wb') as file:
    pickle.dump(history , file)