### Train model in notebook (inefficient, for testing and debug only)

In [1]:
import os

In [None]:
#os.environ['NCCL_DEBUG'] = 'INFO'
#os.environ['NCCL_DEBUG_SUBSYS'] = 'ALL'
#os.environ['TORCH_DISTRIBUTED_DEBUG'] = 'INFO'
os.environ['NCCL_IB_DISABLE'] = '1'
os.environ['NCCL_SOCKET_IFNAME'] = 'enp2s0f1np1'
os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_BLOCKING_WAIT'] = '1'

In [1]:
import sys
from os.path import join
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch import optim, nn
import torch.multiprocessing as mp
from torch.distributed import init_process_group, destroy_process_group
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader
from astropy.io import fits
import pyxis.torch as pxt
import normflows as nf

from networks import *
from train import *
import config

train_dir = '/ocean/projects/phy250048p/shared/datasets/small'
test_dir = '/ocean/projects/phy250048p/shared/datasets/small'
fig_dir = '/ocean/projects/phy250048p/shared/figures/'
model_dir = '/ocean/projects/phy250048p/shared/models/'

In [2]:
torch.cuda.device_count()

1

In [2]:
world_size = 1
save_every = 1
nepochs = config.train['epoch_number']
batch_size = config.train['batch_size']
nfeatures = config.train['feature_number']

In [4]:
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "12355"
torch.cuda.set_device(0)
init_process_group(backend='nccl', rank=0, world_size=1)

In [5]:
train_ds = pxt.TorchDataset(train_dir)
valid_ds = pxt.TorchDataset(test_dir)

In [6]:
train_dl = DataLoader(
        train_ds,
        batch_size=batch_size,
        pin_memory=True,
        sampler=DistributedSampler(train_ds),
    )
valid_dl = DataLoader(
        valid_ds,
        batch_size=batch_size,
        pin_memory=True,
        sampler=DistributedSampler(valid_ds),
    )

In [7]:
# Define flows
K = 4

latent_size = 2
hidden_units = 64
num_blocks = 2
context_size = 1024

flows = []
for i in range(K):
    flows += [nf.flows.MaskedAffineAutoregressive(latent_size, hidden_units, 
                                                  context_features=context_size, 
                                                  num_blocks=num_blocks,
                                                  use_batch_norm=True,)]
    flows += [nf.flows.LULinearPermute(latent_size)]

# Set base distribution
q0 = nf.distributions.DiagGaussian(2, trainable=False)

In [8]:
model = ForkCNN(mode=1, base=q0, flows=flows)
model.to(0)
model = DDP(model, device_ids=[0])

In [9]:
print(model)

DistributedDataParallel(
  (module): ForkCNN(
    (cnn_spec): Sequential(
      (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (4): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (5): ReLU(inplace=True)
      (6): MaxPool2d(kernel_size=(1, 2), stride=(1, 2), padding=0, dilation=1, ceil_mode=False)
      (7): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (8): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (9): ReLU(inplace=True)
      (10): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (11): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (12): ReLU(in

In [10]:
# optimizer = optim.SGD(model.parameters(), 
#                       lr=config.train['initial_learning_rate'],
#                       momentum=config.train['momentum'])
optimizer = optim.AdamW(model.parameters(),
                        lr=config.train['initial_learning_rate'], 
                        weight_decay=config.train['weight_decay'])

In [11]:
trainer = CNNTrainer(world_size, model, nfeatures, train_ds, valid_ds, optimizer, 0, save_every, batch_size)

In [12]:
trainer.train(2)



torch.Size([100, 1024]) -4.150607585906982 6.034210681915283
torch.Size([100, 1024]) -2.6189422607421875 4.951674461364746
torch.Size([100, 1024]) -3.665984869003296 6.31422233581543
torch.Size([100, 1024]) -2.3218109607696533 5.557764530181885
torch.Size([100, 1024]) -2.5004327297210693 3.6089398860931396
torch.Size([100, 1024]) -2.646090030670166 4.563873767852783
torch.Size([100, 1024]) -2.7121989727020264 3.794757604598999
torch.Size([100, 1024]) -2.8538455963134766 3.733776092529297
torch.Size([100, 1024]) -2.933443069458008 3.7106428146362305
torch.Size([100, 1024]) -3.033552646636963 3.330029249191284
torch.Size([100, 1024]) -3.0799570083618164 3.3456454277038574
torch.Size([100, 1024]) -3.0903546810150146 3.5749006271362305
torch.Size([100, 1024]) -3.094247341156006 3.8736019134521484
torch.Size([100, 1024]) -3.1074163913726807 3.615888833999634
torch.Size([100, 1024]) -3.0984911918640137 4.084143161773682
torch.Size([100, 1024]) -3.091672897338867 4.279745578765869
torch.Size(

In [5]:
destroy_process_group()

AssertionError: 

In [4]:
mp.spawn(train_nn, args=(world_size, ForkCNN, CNNTrainer), nprocs=world_size)

INFO:Setup:Initializing
INFO:Setup:[rank: 0] Successfully set up device
INFO:Setup:Setting up for density estimation


Loaded new model


INFO:Setup:[rank: 0] Successfully loaded training objects
INFO:Setup:[rank: 0] Successfully initialized Trainer
INFO:Trainer:Setting up tensors on GPU
INFO:Trainer:Uploading training set to GPU...
INFO:Trainer:10% complete
INFO:Trainer:20% complete
INFO:Trainer:30% complete
INFO:Trainer:40% complete
INFO:Trainer:50% complete
INFO:Trainer:60% complete
INFO:Trainer:70% complete
INFO:Trainer:80% complete
INFO:Trainer:90% complete
INFO:Trainer:Uploading validation set to GPU...
INFO:Trainer:10% complete
INFO:Trainer:20% complete
INFO:Trainer:30% complete
INFO:Trainer:40% complete
INFO:Trainer:50% complete
INFO:Trainer:60% complete
INFO:Trainer:70% complete
INFO:Trainer:80% complete
INFO:Trainer:90% complete
INFO:Trainer:Training start
INFO:Trainer:Starting epoch 0
INFO:Trainer:Randomized SNR and noise for epoch 0
INFO:Trainer:Batch 0 complete
INFO:Trainer:[TRAIN] Epoch: 1 Loss: 1.3587103580317554 Time: 0:11
INFO:Trainer:Batch 0 complete
INFO:Trainer:[VALID] Epoch: 1 Loss: 1.373456243098524