In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import copy

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms

from vae import VAE
from loss_function import loss_function

## Hyperparmeters

In [3]:
batch_size = 128
input_dim = 31

## Load Data

In [4]:
# file located at '/data' in parent directory
path = os.path.abspath(os.path.join(os.getcwd(), '..', 'data', 'creditcard.csv')) 
df = pd.read_csv(path)
df['Time_diff_from_previous'] = df['Time'].diff()
df['Time_diff_from_previous'].iloc[0] = 0
df['Time_diff_from_last'] = df['Time'].diff().iloc[1:].reset_index(drop=True)
df['Time_diff_from_last'].iloc[-1] = 0
df['Amount'] = df['Amount']/df['Amount'].max()
df = df.drop(['Time'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [5]:
display(df.head())

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V23,V24,V25,V26,V27,V28,Amount,Class,Time_diff_from_previous,Time_diff_from_last
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0.005824,0,0.0,0.0
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0.000105,0,0.0,1.0
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0.014739,0,1.0,0.0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0.004807,0,0.0,1.0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0.002724,0,1.0,0.0


# Split Data

In [6]:
split_ratio = [0.6, 0.2, 0.2]

In [7]:
class CCFDataset(Dataset) : 
    def __init__(self, df, transform=transforms.ToTensor()) :
        self.df = df.reset_index(drop=True)
        
    def __len__(self) :
        return self.df.shape[0]
    
    def __getitem__(self, idx) :
        # x, y
        x = torch.from_numpy(self.df.drop(['Class'], axis=1).iloc[idx].values).type('torch.FloatTensor')
        #y = torch.from_numpy(self.df['Class'].iloc[idx]).type('torch.FloatTensor')
        y = torch.tensor([self.df['Class'].iloc[idx]], dtype=torch.float64)
        return x, y 

In [8]:
def split_data(seed, split_ratio) : 
    train_ratio = split_ratio[0]
    valid_ratio = split_ratio[1]
    test_ratio = split_ratio[2]
    # normal
    normal_data = df[df['Class']==0]
    normal_data = normal_data.reset_index(drop=True)
    # novelty
    novel_data = df[df['Class']==1]
    novel_data = novel_data.reset_index(drop=True)
    
    train_size = int(normal_data.shape[0]*train_ratio) # 60% train
    valid_size = int(normal_data.shape[0]*valid_ratio) # 20% valid
    test_size = int(normal_data.shape[0]*test_ratio) # 20% test
  
    # split train, valid, test
    train_data = normal_data[:train_size]
    valid_data = normal_data[train_size:train_size+valid_size]
    test_data = normal_data[train_size+valid_size:]
    
    test_data = pd.concat([test_data, novel_data])
    
    train_dataset = CCFDataset(train_data)
    valid_dataset = CCFDataset(valid_data)
    test_dataset = CCFDataset(test_data)
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
    
    return train_loader, valid_loader, test_loader

## input_dim=31 (except 'Time', 'Class'), hnum =1, z=5


In [None]:
seed = 1
zdims = 5
h_num = 1
epochs = 1
split_ratio = [0.6, 0.2, 0.2]


model = VAE(input_dim, zdims, h_num)
train_loader, valid_loader, test_loader = split_data(seed, split_ratio)

# train, validate the architecture
train_loss_list = []
valid_loss_list = []
test_loss_list = []
optimizer = optim.Adam(model.parameters(), lr=1e-3)
    
print((len(train_loader.dataset) / batch_size))    

for idx_epoch, epoch in enumerate(range(1, epochs+1)) :
    # toggle train mode, intilaize loss for current epoch
    model.train()
    train_loss = 0
    valid_loss = 0
    test_loss = 0
        
    # training set 
    for idx_batch, (data, _) in enumerate(train_loader) :
        # initilaize
        optimizer.zero_grad()
        # prediction
        recon_batch, mu, logvar = model(data)
        # get loss
        loss = loss_function(recon_batch, data, mu, logvar, batch_size, input_dim)
        # backpropagate
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        print('Batch %d loss : %.4f' % (idx_batch, loss.item()))
        
    train_loss = train_loss / (len(train_loader.dataset) / batch_size)
    train_loss_list.append(train_loss)
        
    '''
    # validation set
    model.eval()
    for data, _ in valid_loader :
        yhat = model(data)
        criterion = nn.MSELoss()
        loss = criterion(yhat, _)
        valid_loss += loss.item()
    valid_loss = valid_loss / (len(valid_loader.dataset) / batch_size)  
    valid_loss_list.append(valid_loss)
    
    # test set
    for data, _ in test_labeled_only_loader :
        yhat = model(data)
        criterion = nn.MSELoss()
        loss = criterion(yhat, _.float())
        test_loss += loss.item()
    test_loss = test_loss / (len(test_loader.dataset) / batch_size)
    test_loss_list.append(test_loss)
    '''
    
plt.plot(train_loss_list)
plt.show()


1332.7265625
Batch 0 loss : 0.7190
Batch 1 loss : 0.7146
Batch 2 loss : 0.7117
Batch 3 loss : 0.7057
Batch 4 loss : 0.7089
Batch 5 loss : 0.7034
Batch 6 loss : 0.6975
Batch 7 loss : 0.7067
Batch 8 loss : 0.6911
Batch 9 loss : 0.6877
Batch 10 loss : 0.6964
Batch 11 loss : 0.6893
Batch 12 loss : 0.6839
Batch 13 loss : 0.6883
Batch 14 loss : 0.6772
Batch 15 loss : 0.6777
Batch 16 loss : 0.6732
Batch 17 loss : 0.6744
Batch 18 loss : 0.6718
Batch 19 loss : 0.6651
Batch 20 loss : 0.6594
Batch 21 loss : 0.6702
Batch 22 loss : 0.6601
Batch 23 loss : 0.6599
Batch 24 loss : 0.6565
Batch 25 loss : 0.6540
Batch 26 loss : 0.6481
Batch 27 loss : 0.6493
Batch 28 loss : 0.6514
Batch 29 loss : 0.6382
Batch 30 loss : 0.6415
Batch 31 loss : 0.6467
Batch 32 loss : 0.6351
Batch 33 loss : 0.6433
Batch 34 loss : 0.6326
Batch 35 loss : 0.6288
Batch 36 loss : 0.6300
Batch 37 loss : 0.6293
Batch 38 loss : 0.6351
Batch 39 loss : 0.6142
Batch 40 loss : 0.6176
Batch 41 loss : 0.6178
Batch 42 loss : 0.6188
Batch 43

## input_dim=33, hnum =2, z=5