### Train model in notebook (inefficient; for testing and debugging only)

In [1]:
import os

In [2]:
# NCCL / torch.distributed environment settings, applied ahead of the
# process-group initialisation further down.
# Optional verbose diagnostics -- uncomment while debugging communication:
# os.environ['NCCL_DEBUG'] = 'INFO'
# os.environ['NCCL_DEBUG_SUBSYS'] = 'ALL'
# os.environ['TORCH_DISTRIBUTED_DEBUG'] = 'INFO'
os.environ.update({
    'NCCL_IB_DISABLE': '1',               # turn off the InfiniBand transport
    'NCCL_SOCKET_IFNAME': 'enp2s0f1np1',  # network interface name; machine-specific
    'NCCL_P2P_DISABLE': '1',              # turn off direct GPU peer-to-peer transport
    'NCCL_BLOCKING_WAIT': '1',
})

In [2]:
import sys
from os.path import join
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch import optim, nn
import torch.multiprocessing as mp
from torch.distributed import init_process_group, destroy_process_group
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader
from astropy.io import fits
import pyxis.torch as pxt

from networks import *
from train import *
import config

# Locations of the training / test databases read below.
train_dir, test_dir = (
    '/data/wxs0703/kl-nn/databases/train_database_5m',
    '/data/wxs0703/kl-nn/databases/test_database_5m',
)
# Output locations for figures and saved models.
fig_dir, model_dir = (
    '/data/wxs0703/kl-nn/figures/',
    '/data/wxs0703/kl-nn/models/',
)

In [3]:
# Report how many CUDA devices PyTorch can see on this machine.
torch.cuda.device_count()

8

In [4]:
# Notebook-local settings: world_size is the process count handed to mp.spawn
# at the bottom of the notebook; save_every is passed to CNNTrainer.
world_size, save_every = 4, 1
# Training hyper-parameters read from the project's config.train mapping.
nepochs, batch_size, nfeatures = (
    config.train[key]
    for key in ('epoch_number', 'batch_size', 'feature_number')
)

In [3]:
# Rendezvous settings for a single-process "distributed" run on this machine.
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "12355"
torch.cuda.set_device(0)
# init_process_group raises if the default group already exists, so guard the
# call to keep this cell safe to re-run in a live kernel.
if not torch.distributed.is_initialized():
    init_process_group(backend='nccl', rank=0, world_size=1)

In [4]:
# Open the training and validation databases as torch-style datasets
# (train first, then test, same order as before).
train_ds, valid_ds = (
    pxt.TorchDataset(db_path) for db_path in (train_dir, test_dir)
)

In [5]:
# Buffer for the fiducial parameters of the first 100 training samples.
# Zero-initialised instead of torch.empty: the fill loop in the next cell
# writes columns 0-2 and 4-8 only, so column 3 would otherwise hold
# arbitrary uninitialised memory.
fid_train = torch.zeros((100, 9), dtype=torch.float, device='cuda')

In [6]:
# Copy each sample's 'fid_pars' into its row of fid_train: the first three
# values go to columns 0-2 and the remainder to columns 4 onward.
# Column 3 is not written by this loop.
for i in range(100):
    # Fetch the sample once instead of indexing the dataset twice per row.
    fid_pars = train_ds[i]['fid_pars']
    fid_train[i, :3] = fid_pars[:3]
    fid_train[i, 4:] = fid_pars[3:]

In [None]:
# Display the assembled parameter tensor for a visual sanity check.
fid_train

In [7]:
def _make_loader(dataset):
    """Wrap `dataset` in a DataLoader driven by its own DistributedSampler.

    Uses the notebook-level `batch_size` and pinned host memory, exactly as
    both loaders were configured before.
    """
    return DataLoader(
        dataset,
        batch_size=batch_size,
        pin_memory=True,
        sampler=DistributedSampler(dataset),
    )


# Same construction for both splits; only the dataset differs.
train_dl = _make_loader(train_ds)
valid_dl = _make_loader(valid_ds)

In [8]:
# Instantiate the network, move it to device 0, then wrap it in
# DistributedDataParallel pinned to that same device.
network = ForkCNN(batch_size).to(0)
model = DDP(network, device_ids=[0])

In [9]:
# SGD over the DDP-wrapped model's parameters; learning rate and momentum
# come from the project's config.train mapping.
sgd_lr = config.train['initial_learning_rate']
sgd_momentum = config.train['momentum']
optimizer = optim.SGD(model.parameters(), lr=sgd_lr, momentum=sgd_momentum)

In [10]:
# Build the trainer from the wrapped model, feature count, both loaders and
# the optimizer. All arguments are positional; the literal 0 is presumably the
# GPU / rank id -- TODO confirm against CNNTrainer's signature in train.py.
trainer = CNNTrainer(model, nfeatures, train_dl, valid_dl, optimizer, 0, save_every, batch_size)

In [11]:
# Start training in-process.
# NOTE(review): the argument is a hard-coded 50 (presumably the epoch count --
# confirm in train.py), while `nepochs` loaded from config above is never
# used; decide which value is intended.
trainer.train(50)

Batch 0 complete
Batch 100 complete
Batch 200 complete
Batch 300 complete
Batch 400 complete


KeyboardInterrupt: 

In [12]:
# Tear down the single-process group created above before spawning workers.
# Guarded so the cell can be re-run (or run after a failed init) without
# raising because no default process group exists.
if torch.distributed.is_initialized():
    destroy_process_group()

In [None]:
# Launch one training process per rank. Each worker runs train_nn, which
# mp.spawn calls with the process index followed by these extra arguments.
spawn_args = (world_size, ForkCNN, CNNTrainer)
mp.spawn(train_nn, args=spawn_args, nprocs=world_size)

INFO:Setup:Initializing
INFO:Setup:[rank: 0] Successfully set up device
INFO:Setup:[rank: 0] Successfully loaded training objects
INFO:Setup:[rank: 0] Successfully prepared dataloader
INFO:Setup:[rank: 0] Successfully initialized Trainer
INFO:Trainer:Batch 0 complete
