In [1]:
import torch
import torch.utils.data
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as tr
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torch.autograd import Variable
import torchvision.utils as vutils
import itertools

import os
import numpy as np
from tqdm import tqdm
from sklearn.mixture import GaussianMixture

# Model

In [2]:
nc = 1
ndf = 64
ngf = 64

class MyModel(nn.Module):
    def __init__(self,latent_dim,nClusters):
        super(MyModel,self).__init__()
        
        self.latent_dim = latent_dim
        self.nClusters = nClusters
        self.pi_=nn.Parameter(torch.FloatTensor(self.nClusters,).fill_(1)/self.nClusters,requires_grad=True)
        self.mu_c=nn.Parameter(torch.FloatTensor(self.nClusters,self.latent_dim).fill_(0),requires_grad=True)
        self.log_var_c=nn.Parameter(torch.FloatTensor(self.nClusters,self.latent_dim).fill_(0),requires_grad=True)
        
        self.encoder = nn.Sequential(
            # input is (nc) x 28 x 28
            nn.Conv2d(nc, ndf, 4, 2, 1, bias=False),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf) x 14 x 14
            nn.Conv2d(ndf, ndf * 2, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 2),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf*2) x 7 x 7
            nn.Conv2d(ndf * 2, ndf * 4, 3, 2, 1, bias=False),
            nn.BatchNorm2d(ndf * 4),
            nn.LeakyReLU(0.2, inplace=True),
            # state size. (ndf*4) x 4 x 4
            nn.Conv2d(ndf * 4, 1024, 4, 1, 0, bias=False),
            # nn.BatchNorm2d(1024),
            nn.LeakyReLU(0.2, inplace=True),
            # nn.Sigmoid()
        )
        
        self.decoder = nn.Sequential(
            # input is Z, going into a convolution
            nn.ConvTranspose2d(     1024, ngf * 8, 4, 1, 0, bias=False),
            nn.BatchNorm2d(ngf * 8),
            nn.ReLU(True),
            # state size. (ngf*8) x 4 x 4
            nn.ConvTranspose2d(ngf * 8, ngf * 4, 3, 2, 1, bias=False),
            nn.BatchNorm2d(ngf * 4),
            nn.ReLU(True),
            # state size. (ngf*4) x 8 x 8
            nn.ConvTranspose2d(ngf * 4, ngf * 2, 4, 2, 1, bias=False),
            nn.BatchNorm2d(ngf * 2),
            nn.ReLU(True),
            # state size. (ngf*2) x 16 x 16
            nn.ConvTranspose2d(ngf * 2,     nc, 4, 2, 1, bias=False),
            # nn.BatchNorm2d(ngf),
            # nn.ReLU(True),
            # state size. (ngf) x 32 x 32
            # nn.ConvTranspose2d(    ngf,      nc, 4, 2, 1, bias=False),
            # nn.Tanh()
            nn.Sigmoid()
            # state size. (nc) x 64 x 64
        )

        self.fc1 = nn.Linear(1024, 512)
        self.fc21 = nn.Linear(512, latent_dim)
        self.fc22 = nn.Linear(512, latent_dim)

        self.fc3 = nn.Linear(latent_dim, 512)
        self.fc4 = nn.Linear(512, 1024)

        self.lrelu = nn.LeakyReLU()
        self.relu = nn.ReLU()
        # self.sigmoid = nn.Sigmoid()
    def encode(self, x):
        conv = self.encoder(x);
        # print("encode conv", conv.size())
        h1 = self.fc1(conv.view(-1, 1024))
        # print("encode h1", h1.size())
        return self.fc21(h1), self.fc22(h1)

    def decode(self, z):
        h3 = self.relu(self.fc3(z))
        deconv_input = self.fc4(h3)
        # print("deconv_input", deconv_input.size())
        deconv_input = deconv_input.view(-1,1024,1,1)
        # print("deconv_input", deconv_input.size())
        return self.decoder(deconv_input)

    #def reparameterize(self, mu, logvar):
        #std = torch.nan_to_num(logvar.mul(0.5).exp_(),nan=0)  ## Reparameterize nan??
        #std = logvar.mul(0.5).exp_()
        #eps = Variable(std.data.new(std.size()).normal_())
          # num = np.array([[ 1.096506  ,  0.3686553 , -0.43172026,  1.27677995,  1.26733758,
          #       1.30626082,  0.14179629,  0.58619505, -0.76423112,  2.67965817]], dtype=np.float32)
          # num = np.repeat(num, mu.size()[0], axis=0)
          # eps = Variable(torch.from_numpy(num))
        #return eps.mul(std).add_(mu)

    def reparameterize(self, mu, logvar):
        std = logvar.mul(0.5).exp_()
        if torch.cuda.is_available():
            eps = torch.cuda.FloatTensor(std.size()).normal_()
        else:
            eps = torch.FloatTensor(std.size()).normal_()
        eps = Variable(eps)
        return eps.mul(std).add_(mu)
    
    def gaussian_pdf_log(self,x,mu,log_sigma2):
        return -0.5*(torch.sum(np.log(np.pi*2)+log_sigma2+(x-mu).pow(2)/torch.exp(log_sigma2),1))
    
    def gaussian_pdfs_log(self,x,mus,log_sigma2s):
    #def gaussian_pdfs_log(x,mus,log_sigma2s):
        G=[]
        for c in range(self.nClusters):
            G.append(self.gaussian_pdf_log(x,mus[c:c+1,:],log_sigma2s[c:c+1,:]).view(-1,1))
        return torch.cat(G,1)
    
    def forward(self, x):
        # print("x", x.size())
        mu, logvar = self.encode(x)
        #decoded = self.decode(mu)
        z = self.reparameterize(mu,logvar)
        z = torch.nan_to_num(z,nan=0)
        #z = torch.randn_like(mu)*torch.exp(logvar/2)+mu
        decoded = self.decode(z)
        # print("decoded", decoded.size())
        return decoded, mu, logvar
    
    def predict(self,x):
        z_mu, z_sigma2_log = self.encode(x)
        z = torch.randn_like(z_mu) * torch.exp(z_sigma2_log / 2) + z_mu
        z = torch.nan_to_num(z,nan=0)
        pi = self.pi_
        log_sigma2_c = self.log_var_c
        mu_c = self.mu_c
        yita_c = torch.exp(torch.log(pi.unsqueeze(0))+self.gaussian_pdfs_log(z,mu_c,log_sigma2_c))

        yita=yita_c.detach().cpu().numpy()
        #print(yita)
        return np.argmax(yita,axis=1)
        
    def RE(self,recon_x,x):
        return torch.nn.functional.binary_cross_entropy(recon_x.view(-1,784),x.view(-1,784),size_average=False)

    def KLD(self,mu,log_var):
        det=1e-10

        pi=self.pi_
        log_var_c = self.log_var_c
        mu_c = self.mu_c

        z = torch.randn_like(mu) * torch.exp(log_var/2) + mu
        z = torch.nan_to_num(z,nan=0)

        yita_c = torch.exp(torch.log(pi.unsqueeze(0))+self.gaussian_pdfs_log(z,mu_c,log_var_c))+det
        yita_c = yita_c/(yita_c.sum(1).view(-1,1))
        loss = 0.5*torch.mean(torch.sum(yita_c*torch.sum(log_var_c.unsqueeze(0)+
                                                    torch.exp(log_var.unsqueeze(1)-log_var_c.unsqueeze(0))+
                                                    (mu.unsqueeze(1)-mu_c.unsqueeze(0)).pow(2)/torch.exp(log_var_c.unsqueeze(0)),2),1))
        loss -= torch.mean(torch.sum(yita_c*torch.log(pi.unsqueeze(0)/(yita_c)),1))+0.5*torch.mean(torch.sum(1+log_var,1))
        return loss

    def loss_function(self,recon_x,x,mu,log_var):
        return self.RE(recon_x,x)+self.KLD(mu,log_var)

# Input Parameter

In [3]:
latent_dim = 10
nClusters = 13
batch = 100

In [4]:
kwargs = {'num_workers': 1, 'pin_memory': True} if torch.cuda.is_available else {}

# DataLoader

In [5]:
import h5py

In [6]:
data_list = ['../data/normal_dataset.h5','../data/Anomaly_dataset.h5']

In [7]:
imageList=[]
labelList=[]
for file_path in data_list:
    print('Loading data from ', file_path)
    dataset = h5py.File(file_path,'r',libver='latest',swmr=True)
    FimageList=[]
    FlabelList=[]
    for gName,group in dataset.items():
        for dName,data in group.items():
            if dName == 'images':
                FimageList.append(data)
            elif dName == 'labels':
                FlabelList.append(data)

    if len(FimageList) >= 2:
        #print("More than 2 gropus in File")
        image_concat = []
        for i in range(0,len(FimageList)):
            image_concat.append(FimageList[i][:])
        imageList.append(np.concatenate(image_concat))
        label_concat = []
        for i in range(0,len(FlabelList)):
            label_concat.append(FlabelList[i][:])
        labelList.append(np.concatenate(label_concat))
    else:
        imageList.append(FimageList[0][:])
        labelList.append(FlabelList[0][:])
imageList = np.concatenate(imageList)
labelList = np.concatenate(labelList)
print('input image shape : ',imageList.shape)
print('input label shape : ',labelList.shape)
ds = TensorDataset(torch.tensor(imageList),torch.tensor(labelList))
length = [int(len(ds)*0.7),int(len(ds)*0.2)]
length.append(len(ds)-sum(length))

trnSet,valSet,tstSet=torch.utils.data.random_split(ds,length)

#train Loader
train_loader = DataLoader(trnSet, batch_size=batch, shuffle=True, **kwargs)
#test Loader
test_loader = DataLoader(valSet, batch_size=batch, shuffle=False, **kwargs)

Loading data from  ../data/normal_dataset.h5
Loading data from  ../data/Anomaly_dataset.h5
input image shape :  (72100, 1, 28, 28)
input label shape :  (72100,)


# Initial $\gamma$-training

In [8]:
# Initial gamma-training parameter
gamma = 1e-5
gamma_step = 200

In [9]:
def gamma_training(gamma,steps,dataloader):
    optimizer = optim.Adam(model.parameters(),lr=1e-3)
    print('Starting initial gamma training for ',steps,'steps.')
    for batch_idx, (data,_) in tqdm(enumerate(train_loader)):

        #print(batch_idx+1,'step trained')
        data = Variable(data)
        data = data.cuda()
        
        optimizer.zero_grad()
        
        recon_batch, mu, logvar = model(data)
        
        loss = model.RE(recon_batch,data) + gamma * model.KLD(mu,logvar)
        loss.backward()
        #print(loss.item())
        optimizer.step()
        
        if batch_idx +1 == steps:
            print('Training completed')
            break

In [10]:
net = MyModel(latent_dim=latent_dim,nClusters=nClusters)

In [11]:
model = net.cuda()

In [12]:
model.state_dict()

OrderedDict([('pi_',
              tensor([0.0769, 0.0769, 0.0769, 0.0769, 0.0769, 0.0769, 0.0769, 0.0769, 0.0769,
                      0.0769, 0.0769, 0.0769, 0.0769], device='cuda:0')),
             ('mu_c',
              tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
                      [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
                      [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
                      [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
                      [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
                      [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
                      [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
                      [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
                      [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
                      [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
                      [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
                      [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
                     

In [13]:
gamma_training(gamma=gamma,steps=gamma_step,dataloader=train_loader)

Starting initial gamma training for  200 steps.


199it [00:12, 16.55it/s]


Training completed


In [14]:
model.state_dict()

OrderedDict([('pi_',
              tensor([0.2307, 0.2307, 0.2307, 0.2307, 0.2307, 0.2307, 0.2307, 0.2307, 0.2307,
                      0.2307, 0.2307, 0.2307, 0.2307], device='cuda:0')),
             ('mu_c',
              tensor([[-0.1900,  0.2083,  0.2084,  0.1877,  0.2000,  0.1864, -0.1845,  0.1993,
                        0.1943,  0.0088],
                      [-0.1900,  0.2083,  0.2084,  0.1877,  0.2000,  0.1864, -0.1845,  0.1993,
                        0.1943,  0.0088],
                      [-0.1900,  0.2083,  0.2084,  0.1877,  0.2000,  0.1864, -0.1845,  0.1993,
                        0.1943,  0.0088],
                      [-0.1900,  0.2083,  0.2084,  0.1877,  0.2000,  0.1864, -0.1845,  0.1993,
                        0.1943,  0.0088],
                      [-0.1900,  0.2083,  0.2084,  0.1877,  0.2000,  0.1864, -0.1845,  0.1993,
                        0.1943,  0.0088],
                      [-0.1900,  0.2083,  0.2084,  0.1877,  0.2000,  0.1864, -0.1845,  0.1993,
         

In [15]:
t_im,t_la = next(iter(train_loader))
t_re,t_mu_,t_logvar = model(t_im.cuda())
t_mu2,t_logvar2 = model.encode(t_im.cuda())
t_z = model.reparameterize(t_mu2,t_logvar2)
t_re2 = model.decode(t_z)

In [16]:
t_re[0]

tensor([[[1.1669e-02, 2.3968e-03, 4.5017e-03, 1.9883e-03, 2.6831e-03,
          2.3788e-03, 2.9266e-03, 2.3113e-03, 2.1498e-03, 2.0018e-03,
          2.7471e-03, 1.9373e-03, 2.9702e-03, 2.3909e-03, 3.5000e-03,
          2.8128e-03, 2.3933e-03, 2.3798e-03, 3.1876e-03, 2.1258e-03,
          2.8895e-03, 2.2610e-03, 3.3376e-03, 2.4162e-03, 1.7148e-03,
          1.2221e-03, 1.7029e-03, 8.7763e-03],
         [1.4209e-03, 1.7616e-04, 2.6462e-04, 2.4719e-04, 4.5653e-04,
          6.3763e-04, 5.6104e-04, 3.7330e-04, 5.9346e-04, 9.5894e-04,
          4.9644e-04, 4.3958e-04, 8.0210e-04, 9.0577e-04, 7.8358e-04,
          6.0775e-04, 9.7101e-04, 1.1025e-03, 6.5782e-04, 6.1080e-04,
          7.7979e-04, 7.5478e-04, 6.7138e-04, 5.0522e-04, 3.7668e-04,
          4.0578e-04, 8.0300e-05, 1.0837e-03],
         [2.8009e-03, 2.6366e-04, 1.7700e-03, 1.2262e-03, 1.5367e-03,
          1.7914e-03, 4.5704e-03, 1.7670e-03, 2.4128e-03, 3.1524e-03,
          4.7484e-03, 2.3042e-03, 3.6695e-03, 4.5805e-03, 8.7286e-

In [17]:
model.RE(t_im.cuda(),t_re)

tensor(527560.3750, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)

In [18]:
model.KLD(t_mu_,t_logvar)

tensor(1438.9792, device='cuda:0', grad_fn=<SubBackward0>)

# mini-Batch GMM initialization

In [19]:
len(train_loader)

505

In [20]:
from sklearn.mixture import GaussianMixture

k=250
Loss=nn.MSELoss()
optimizer = optim.Adam(itertools.chain(model.encoder.parameters(),model.decoder.parameters()))

In [21]:
subset = np.random.randint(len(train_loader),size=k*batch)

In [22]:
train_subset_loader = DataLoader(torch.utils.data.Subset(trnSet,subset))

In [23]:
print(k*batch,' Monte Carlo Sample training')
for batch_idx, (data,y) in tqdm(enumerate(train_subset_loader)):
    data = data.cuda()
    
    mu,logvar = model.encode(data)
    recon = model.decode(mu)
    
    loss = Loss(data,recon)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

model.fc22.load_state_dict(model.fc21.state_dict())

Z=[]
Y=[]
print('GMM Initializing with ',k*batch, 'MC samples')
with torch.no_grad():
    for batch_idx, (data,y) in tqdm(enumerate(train_subset_loader)):
        data = data.cuda()
        
        mu,logvar = model.encode(data)
        
        assert nn.functional.mse_loss(mu,logvar)==0
        Z.append(mu)
        Y.append(y)
    Z=torch.cat(Z,0).detach().cpu().numpy()
    Y=torch.cat(Y,0).detach().cpu().numpy()
    
    gmm = GaussianMixture(n_components=nClusters,covariance_type='diag',max_iter=int(1e+04))
    
    pre = gmm.fit_predict(Z)
    
    model.pi_.data = torch.from_numpy(gmm.weights_).cuda().float()
    model.mu_c.data = torch.from_numpy(gmm.means_).cuda().float()
    model.log_var_c.data = torch.log(torch.from_numpy(gmm.covariances_).cuda().float())

25000  Monte Carlo Sample training


25000it [02:54, 143.20it/s]


GMM Initializing with  25000 MC samples


25000it [00:32, 771.59it/s]


In [24]:
model.state_dict()

OrderedDict([('pi_',
              tensor([0.0420, 0.1081, 0.0848, 0.0771, 0.1183, 0.0593, 0.0931, 0.0632, 0.0749,
                      0.0871, 0.0528, 0.0850, 0.0543], device='cuda:0')),
             ('mu_c',
              tensor([[  96.7025,  -48.8457,  199.3880,   20.9107, -113.3362,  148.9146,
                        -73.5483,  -37.9001,  -71.4437,   57.6443],
                      [  35.4629,  -46.7273,  -83.9129,   -9.6973,   24.3818,  -36.1425,
                         28.8227,  -15.7118,  -22.2052,  -92.7677],
                      [ -82.4508,  -10.9163,  -45.5627,  134.2726,  -22.4699, -107.1788,
                         29.0955,  -13.2616, -169.9713,   41.3483],
                      [-217.8853, -267.1485,  -20.3915,   52.7830,   93.7232,   -4.7489,
                       -131.3195,   47.1489,  -45.4940,   95.4881],
                      [ -98.7428, -101.7929,    7.7619,  -56.1147,  -94.5015,   75.1096,
                        -39.1587,  138.7993, -138.8608,   82.6086],
    

In [25]:
t_im,t_la = next(iter(train_loader))
t_re,t_mu_,t_logvar = model(t_im.cuda())
t_mu2,t_logvar2 = model.encode(t_im.cuda())
t_z = model.reparameterize(t_mu2,t_logvar2)
t_re2 = model.decode(t_z)

In [26]:
t_logvar

tensor([[-1.7910e+02, -2.5831e+02, -5.4095e+01,  3.5193e+00, -1.4625e+00,
          1.0825e+02, -5.0355e+01,  1.2033e+02, -1.4867e+02,  1.3488e+02],
        [ 5.9783e+01,  1.3572e+01,  2.8041e+02,  9.2861e+01, -1.8223e+02,
          6.0181e+01, -8.1286e+01, -1.1445e+02, -1.9225e+02,  1.1262e+02],
        [ 2.8171e+02, -5.3082e+01,  7.1941e+00, -7.9738e+00,  8.8380e+01,
         -2.9508e+02,  8.1247e+01, -1.0028e+02,  6.1102e+01, -7.1107e+00],
        [ 1.1729e+02, -7.2196e+01, -1.5712e+02, -1.2286e+02,  9.4105e+01,
          1.2806e+02, -1.0061e+01,  6.8617e-01,  2.6923e+01, -7.7981e+01],
        [-1.7294e+02, -1.3932e+02, -1.2917e+02,  1.9196e+01,  9.9721e+01,
          6.8531e+01, -1.2765e+02,  9.5426e+01,  6.6821e+01,  1.0673e+02],
        [-1.4876e+02, -7.2810e+01, -1.0229e+02,  5.9234e+01,  9.4088e+01,
         -1.2033e+02, -7.2956e+01,  2.4117e+01,  5.1895e+01, -8.4157e+00],
        [-2.1747e+02,  1.3728e+02,  1.0843e+01,  2.9454e+02, -2.0113e+01,
          8.3392e+01,  1.5144e+0

In [27]:
t_mu_

tensor([[-1.7910e+02, -2.5831e+02, -5.4095e+01,  3.5193e+00, -1.4625e+00,
          1.0825e+02, -5.0355e+01,  1.2033e+02, -1.4867e+02,  1.3488e+02],
        [ 5.9783e+01,  1.3572e+01,  2.8041e+02,  9.2861e+01, -1.8223e+02,
          6.0181e+01, -8.1286e+01, -1.1445e+02, -1.9225e+02,  1.1262e+02],
        [ 2.8171e+02, -5.3082e+01,  7.1941e+00, -7.9738e+00,  8.8380e+01,
         -2.9508e+02,  8.1247e+01, -1.0028e+02,  6.1102e+01, -7.1107e+00],
        [ 1.1729e+02, -7.2196e+01, -1.5712e+02, -1.2286e+02,  9.4105e+01,
          1.2806e+02, -1.0061e+01,  6.8617e-01,  2.6923e+01, -7.7981e+01],
        [-1.7294e+02, -1.3932e+02, -1.2917e+02,  1.9196e+01,  9.9721e+01,
          6.8531e+01, -1.2765e+02,  9.5426e+01,  6.6821e+01,  1.0673e+02],
        [-1.4876e+02, -7.2810e+01, -1.0229e+02,  5.9234e+01,  9.4088e+01,
         -1.2033e+02, -7.2956e+01,  2.4117e+01,  5.1895e+01, -8.4157e+00],
        [-2.1747e+02,  1.3728e+02,  1.0843e+01,  2.9454e+02, -2.0113e+01,
          8.3392e+01,  1.5144e+0

In [28]:
t_re[0]

tensor([[[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
          nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
          nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
          nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
          nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
          nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
          nan, nan, nan, nan, nan],
         [nan, nan, nan, nan, nan,

In [29]:
model.RE(t_im.cuda(),t_re)



tensor(nan, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)

In [30]:
model.KLD(t_mu_,t_logvar)

tensor(inf, device='cuda:0', grad_fn=<SubBackward0>)

In [31]:
model.loss_function(t_im.cuda(),t_re,t_mu_,t_logvar)

tensor(nan, device='cuda:0', grad_fn=<AddBackward0>)

# Periodic $\beta$ - Annealing

# Inverse Min-Max Transform : Avoid NaN Losses

In [32]:
epochs = 300
batch_size = 100 
optimizer=optim.Adam(model.parameters())
outf = './test'

In [33]:
def train(epoch):
    model.train()
    #print('train')
    train_loss = 0
    for batch_idx, (data, _) in enumerate(train_loader):
        data = Variable(data)
        #print(data.shape)
        #print(data)
        if torch.cuda.is_available():
            data = data.cuda()
        optimizer.zero_grad()

        recon_batch, mu, logvar = model(data)
        
        #print(recon_batch.shape,mu.shape,logvar.shape)
        #print(recon_batch.view(-1,784))
        #print(recon_batch.shape)
        print(recon_batch)
        #print(data.shape)
        print(model.RE(recon_batch,data))
        print(model.KLD(mu,logvar))
        loss = model.loss_function(recon_batch,data,mu,logvar)
        loss.backward()
        
        train_loss += loss.data
        #print(loss.data)
        optimizer.step()
        if batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader),
                loss.data / len(data)))
    print('====> Epoch: {} Average loss: {:.4f}'.format(epoch, train_loss / len(train_loader.dataset)))

    #if epoch % 5 == 0:
        #torch.save(model.state_dict(),'./model_%d.pth' % (epoch))
    return (train_loss/len(train_loader.dataset)).cpu()
    #return train_loss/len(train_loader.dataset)

In [34]:
def test(epoch):
    model.eval()
    #print('validation')
    test_loss = 0
    for i, (data, _) in enumerate(test_loader):
        if torch.cuda.is_available():
            data = data.cuda()
        data = Variable(data, volatile=True)
        recon_batch, mu, logvar = model(data)
        test_loss += loss_function(recon_batch, data, mu, logvar).data
        if i == 0:
            n = min(data.size(0), 16)
            comparison = torch.cat([data[:n],
                                  recon_batch.view(batch_size, 1, 28, 28)[:n]])
    test_loss /= len(test_loader.dataset)
    print('====> Test set loss: {:.4f}'.format(test_loss))
    return test_loss.cpu()
    #return test_loss

In [35]:
best_=100
best_epoch=0
train_loss_arr=[]
test_loss_arr=[]
for epoch in range(1,epochs+1):
    train_loss_arr.append(train(epoch))
    val_loss = test(epoch)
    test_loss_arr.append(val_loss)
    if val_loss.cpu() < best_:
    #if val_loss < best_:
        best_=val_loss.cpu()
        #best_ = val_loss
        best_epoch = epoch
        torch.save(model.state_dict(),'./s3vdc_test1.pth')

tensor([[[[nan, nan, nan,  ..., nan, nan, nan],
          [nan, nan, nan,  ..., nan, nan, nan],
          [nan, nan, nan,  ..., nan, nan, nan],
          ...,
          [nan, nan, nan,  ..., nan, nan, nan],
          [nan, nan, nan,  ..., nan, nan, nan],
          [nan, nan, nan,  ..., nan, nan, nan]]],


        [[[nan, nan, nan,  ..., nan, nan, nan],
          [nan, nan, nan,  ..., nan, nan, nan],
          [nan, nan, nan,  ..., nan, nan, nan],
          ...,
          [nan, nan, nan,  ..., nan, nan, nan],
          [nan, nan, nan,  ..., nan, nan, nan],
          [nan, nan, nan,  ..., nan, nan, nan]]],


        [[[nan, nan, nan,  ..., nan, nan, nan],
          [nan, nan, nan,  ..., nan, nan, nan],
          [nan, nan, nan,  ..., nan, nan, nan],
          ...,
          [nan, nan, nan,  ..., nan, nan, nan],
          [nan, nan, nan,  ..., nan, nan, nan],
          [nan, nan, nan,  ..., nan, nan, nan]]],


        ...,


        [[[nan, nan, nan,  ..., nan, nan, nan],
          [nan, 

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.