
Anyone encounter loss nan? #11

Open

kuanzi opened this issue May 27, 2019 · 2 comments

kuanzi commented May 27, 2019

Training works fine for small-scale models like an MLP or LeNet-5. However, with VGG16/ResNet-18 it always produces a NaN loss.
The model structure configuration is below:

```python
import numpy as np
import torch.nn as nn

import BayesianLayer

cfg = {
    'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
    'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
}


class VGG_CIFAR10_BAY(nn.Module):
    # Collects every Bayesian layer so the KL terms can be summed.
    # (Note: this is a class-level list, shared across instances.)
    kl_list = []

    def __init__(self, vgg_name):
        super(VGG_CIFAR10_BAY, self).__init__()
        self.features = self._make_layers(cfg[vgg_name])
        linear_index = BayesianLayer.LinearGroupNJ(512, 10, clip_var=0.04, cuda=True)
        self.classifier = linear_index
        self.kl_list.append(linear_index)

    def forward(self, x):
        out = self.features(x)
        out = out.view(out.size(0), -1)
        out = self.classifier(out)
        return out

    def _make_layers(self, cfg):
        layers = []
        in_channels = 3
        for x in cfg:
            if x == 'M':
                layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
            else:
                conv_index = BayesianLayer.Conv2dGroupNJ(in_channels, x, kernel_size=3,
                                                         padding=1, clip_var=0.04, cuda=True)
                layers += [conv_index,
                           nn.BatchNorm2d(x),
                           nn.ReLU(inplace=True)]
                self.kl_list.append(conv_index)
                in_channels = x
        layers += [nn.AvgPool2d(kernel_size=1, stride=1)]
        return nn.Sequential(*layers)

    def get_masks(self, thresholds):
        weight_masks = []
        mask = None
        layers = self.kl_list
        for i, (layer, threshold) in enumerate(zip(layers, thresholds)):
            # compute dropout mask
            if len(layer.weight_mu.shape) > 2:
                # convolutional layer
                if mask is None:
                    mask = [True] * layer.in_channels
                else:
                    mask = np.copy(next_mask)

                log_alpha = layers[i].get_log_dropout_rates().cpu().data.numpy()
                next_mask = log_alpha <= thresholds[i]

                weight_mask = np.expand_dims(mask, axis=0) * np.expand_dims(next_mask, axis=1)
                weight_mask = weight_mask[:, :, None, None]
            else:
                # fully connected layer
                if mask is None:
                    log_alpha = layer.get_log_dropout_rates().cpu().data.numpy()
                    mask = log_alpha <= threshold
                elif len(weight_mask.shape) > 2:
                    temp = next_mask.repeat(layer.in_features // next_mask.shape[0])
                    log_alpha = layer.get_log_dropout_rates().cpu().data.numpy()
                    mask = log_alpha <= threshold
                    # mask = mask | temp  # upper bound on weights at first fully connected layer
                    mask = mask & temp    # lower bound on weights at fully connected layer
                else:
                    mask = np.copy(next_mask)

                try:
                    log_alpha = layers[i + 1].get_log_dropout_rates().cpu().data.numpy()
                    next_mask = log_alpha <= thresholds[i + 1]
                except IndexError:
                    # must be the last mask
                    next_mask = np.ones(10)

                weight_mask = np.expand_dims(mask, axis=0) * np.expand_dims(next_mask, axis=1)
            weight_masks.append(weight_mask.astype(float))
        return weight_masks

    def model_kl_div(self):
        KLD = 0
        for layer in self.kl_list:
            KLD += layer.layer_kl_div()
        return KLD
```
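For context, the KL term enters my training objective roughly like this (a sketch following the tutorial's variational bound; `compute_loss` and `N`, the training-set size, are my own names, not from the repo):

```python
import torch.nn.functional as F

def compute_loss(model, output, target, N):
    # Cross-entropy discrimination loss plus the variational KL term,
    # scaled by 1/N so the KL is weighted per training example.
    discrimination_loss = F.cross_entropy(output, target)
    return discrimination_loss + model.model_kl_div() / N
```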

Is it caused by high variance? I have tried clipping the variance, but it doesn't help...
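In case it helps others reproduce, this is roughly how I try to locate where the NaNs first appear (a minimal sketch using PyTorch anomaly detection and forward hooks; the `nn.Sequential` model here is just a stand-in for `VGG_CIFAR10_BAY('VGG16')`):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

# Stand-in model; in practice, model = VGG_CIFAR10_BAY('VGG16').
model = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 2))

# Make backward() fail loudly at the first op producing NaN/Inf.
torch.autograd.set_detect_anomaly(True)

# Forward hooks that name the first layer whose output goes non-finite.
def make_hook(name):
    def hook(module, inputs, output):
        if isinstance(output, torch.Tensor) and not torch.isfinite(output).all():
            print(f"non-finite output at layer {name} ({module.__class__.__name__})")
    return hook

for name, module in model.named_modules():
    if name:  # skip the root module itself
        module.register_forward_hook(make_hook(name))

x = torch.randn(4, 8)
target = torch.tensor([0, 1, 0, 1])
loss = F.cross_entropy(model(x), target)
loss.backward()  # anomaly mode pinpoints the offending op if the loss is NaN
```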

kuanzi commented May 27, 2019

Sorry, I couldn't get the formatting to come out right... Has anyone run into a similar NaN situation?


gullalc commented Jul 6, 2019

Unfortunately, with a larger network like VGG the loss becomes NaN because of the vanishing gradients problem. I believe only the authors can help in this case.
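To check whether it really is vanishing gradients (rather than exploding ones, which also end in NaN), you can log per-layer gradient norms after `backward()`. A minimal sketch, again with a stand-in model in place of the VGG above:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

def log_grad_norms(model):
    # Norms collapsing toward 0 suggest vanishing gradients; norms blowing
    # up (or turning NaN) suggest exploding gradients instead.
    for name, p in model.named_parameters():
        if p.grad is not None:
            print(f"{name}: grad norm = {p.grad.norm().item():.3e}")

# Stand-in model and batch; in practice, model = VGG_CIFAR10_BAY('VGG16').
model = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 2))
loss = F.cross_entropy(model(torch.randn(4, 8)), torch.tensor([0, 1, 0, 1]))
loss.backward()
log_grad_norms(model)

# If the norms explode rather than vanish, clipping before optimizer.step()
# is a common mitigation:
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
```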
