In [1]:
import torchvision
from torchvision import transforms
from torch import nn
import torch.nn.functional as F
import torch

import time
from sklearn.preprocessing import MinMaxScaler

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.neighbors import NearestNeighbors

ModuleNotFoundError: No module named 'torchvision'

In [40]:
data = pd.read_csv('dataset_with_targets.csv', header=None)

val_size = 5000

test_size = 10000

In [41]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,0.19,0.077409,1250.0,-90.3698,53.700094,-59.15493,0.276798,0.021783,11.644999,34.5399,-0.1495,2.372547,2.116338,1250.0,-305.75,-0.908867,0.051593
1,-0.26,0.126186,350.0,-86.317435,77.264642,-0.0,0.450444,0.121033,6.811875,142.023494,-0.381,0.772327,-0.003629,350.0,-244.75,0.893989,0.124688
2,-0.8,-0.398406,300.0,-86.232217,53.064664,-53.333333,0.014723,0.000987,7.820611,57.045273,0.00282,7.115817,6.407732,-300.0,-0.0,0.239919,0.4
3,0.54,-0.042877,4684.0,1634.814815,53.68,-38.888889,0.024397,0.001594,10.776373,26.110229,0.052905,6.099106,1.952781,-4684.0,10.71,-0.310448,0.268817
4,0.11,-0.057443,3020.0,-77.865729,55.213228,-25.757576,0.708303,0.079423,8.518989,69.86777,-0.208755,0.770984,0.061754,-3020.0,-443.74,-0.406361,0.083654


In [42]:
train_data = data.iloc[:-(test_size + val_size)]

val_data = data.iloc[-(test_size + val_size):-test_size]

test_data = data.iloc[-test_size:]

In [43]:
#using of standard uniform scale to normalize 

normalizer = MinMaxScaler(feature_range=(0,1))

X_train = normalizer.fit_transform(train_data.values)

X_val = normalizer.transform(val_data.values)

X_test = normalizer.transform(test_data.values)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(device)

cuda


In [44]:
class VariationalAutoEncoder(nn.Module):
    def __init__(self, intermediate_dims, latent_dim, input_shape):
        """
        Parameters
        ----------

        intermediate_dims - list of ints (with len equal to the number of dense layers in encoder)
                          List of dims of dense layers. The last number is the
                          dimension of latent space
        latent_dim - int, dimension of mu and std vectors

        input_shape - tuple, shape of an input sample 
        """
        super().__init__()
        self.register_buffer('_initial_mu', torch.zeros((latent_dim)))
        self.register_buffer('_initial_sigma', torch.ones((latent_dim)))

        self.latent_distribution = torch.distributions.normal.Normal(
            loc=self._initial_mu,
            scale=self._initial_sigma
        )
        input_dim = np.prod(input_shape)
        self.encoder = nn.Sequential(*[
            nn.Linear(input_dim, intermediate_dims[0]),
            nn.ReLU(),
            nn.BatchNorm1d(intermediate_dims[0]),
            nn.Dropout(0.3),
            nn.Linear(intermediate_dims[0], intermediate_dims[1]),
            nn.ReLU(),
            nn.BatchNorm1d(intermediate_dims[1]),
            nn.Dropout(0.3),
            nn.Linear(intermediate_dims[1], intermediate_dims[2]),
            nn.ReLU(),
            nn.BatchNorm1d(intermediate_dims[2]),
            nn.Dropout(0.3)
        ])

        self.mu_repr = nn.Linear(intermediate_dims[2], latent_dim)
        self.log_sigma_repr = nn.Linear(intermediate_dims[2], latent_dim)
        
        self.decoder = nn.Sequential(*[
            nn.Linear(latent_dim, intermediate_dims[2]),
            nn.LeakyReLU(),
            nn.BatchNorm1d(intermediate_dims[2]),
            nn.Dropout(0.3),
            nn.Linear(intermediate_dims[2], intermediate_dims[1]),
            nn.LeakyReLU(),
            nn.BatchNorm1d(intermediate_dims[1]),
            nn.Dropout(0.3),
            nn.Linear(intermediate_dims[1], intermediate_dims[0]),
            nn.LeakyReLU(),
            nn.BatchNorm1d(intermediate_dims[0]),
            nn.Dropout(0.3),
            nn.Linear(intermediate_dims[0], input_dim),
            nn.Sigmoid()
        ])
    
    def _encode(self, x):
        latent_repr = self.encoder(x)
        mu_values = self.mu_repr(latent_repr)
        log_sigma_values = self.log_sigma_repr(latent_repr)
        return mu_values, log_sigma_values, latent_repr

    def decode(self, latent_sample):
      return self.decoder(latent_sample)
    
    def _reparametrize(self, sample, mu_values, log_sigma_values):
        latent_sample = torch.exp(log_sigma_values) * sample + mu_values
        return latent_sample

    def forward(self, x, raw_sample=None):
        mu_values, log_sigma_values, latent_repr = self._encode(x)

        if raw_sample is None:
            raw_sample = torch.randn_like(mu_values)

        latent_sample = self._reparametrize(raw_sample, mu_values, log_sigma_values)
        
        reconstructed_repr = self.decoder(latent_sample)
        
        return reconstructed_repr, latent_sample, mu_values, log_sigma_values

In [68]:

model = VariationalAutoEncoder([16, 12, 8], 8, X_train[0].shape).to(device)


In [69]:
# loss compution - the linear combination of reconstuction_loss and KL divergence

def compute_loss(batch_x):
    batch_x = torch.FloatTensor(batch_x).to(device)

    [predictions, latent, mu_values, log_sigma_values] = model(batch_x.to(device))
    
    kl_loss = 0.5 * torch.mean(torch.sum(
          mu_values.pow(2) + torch.exp(log_sigma_values) - 1. - log_sigma_values,
          dim=1
      ))
    bce_loss = loss_func(predictions, batch_x)
    
    return kl_loss, bce_loss, (bce_loss + kl_loss)/2./batch_x.shape[1]

In [70]:
def iterate_minibatches(X, batchsize, shuffle=True):
    indices = np.arange(len(X))
    if shuffle:
        indices = np.random.permutation(indices)
    for start in range(0, len(indices), batchsize):
        ix = indices[start: start + batchsize]
        yield X[ix]

In [71]:
#training process

opt = torch.optim.Adam(model.parameters(), lr=1e-4)

loss_func = torch.nn.modules.loss.BCELoss()  #reconstruction loss

epochs = 150

batch_size = 128

model.train()
loss_accumulator = []
bce_acc = []
kl_acc = []

val_loss_accumulator = []
val_bce_acc = []
val_kl_acc = []

for epoch_num in range(epochs):
  start_time = time.time()

  model.train(True)
  for batch_x in iterate_minibatches(X_train, batch_size):
    kl_loss, bce_loss, loss = compute_loss(batch_x)
    loss.backward()
    opt.step()
    opt.zero_grad()

    loss_accumulator.append(float(loss))
    bce_acc.append(float(bce_loss))
    kl_acc.append(float(kl_loss))

  model.train(False)

  for batch_x in iterate_minibatches(X_val, batch_size):
    kl_loss, bce_loss, loss = compute_loss(batch_x)

    val_loss_accumulator.append(float(loss))
    val_bce_acc.append(float(bce_loss))
    val_kl_acc.append(float(kl_loss))

  if epoch_num % 5 == 0:
    print("Epoch {} of {} took {:.3f}s".format(
          epoch_num + 1, epochs, time.time() - start_time))
    print("Training loss={:.6f}, KL divergence={:.7f}, BCE Loss={:.6f}".format(
        np.mean(loss_accumulator[-len(X_train) // batch_size :]),
        np.mean(kl_acc[-len(X_train) // batch_size :]),
        np.mean(bce_acc[-len(X_train) // batch_size :])))
    
    print("Val loss={:.6f}, KL divergence={:.7f}, BCE Loss={:.6f}".format(
        np.mean(val_loss_accumulator[-len(X_val) // batch_size :]),
        np.mean(val_kl_acc[-len(X_val) // batch_size :]),
        np.mean(val_bce_acc[-len(X_val) // batch_size :])))



Epoch 1 of 150 took 7.009s
Training loss=0.091291, KL divergence=2.3843787, BCE Loss=0.719516
Val loss=0.057942, KL divergence=1.2836504, BCE Loss=0.686387
Epoch 6 of 150 took 6.908s
Training loss=0.021533, KL divergence=0.1337659, BCE Loss=0.598361
Val loss=0.017943, KL divergence=0.0201699, BCE Loss=0.589893
Epoch 11 of 150 took 6.890s
Training loss=0.017626, KL divergence=0.0106856, BCE Loss=0.588609
Val loss=0.017236, KL divergence=0.0005467, BCE Loss=0.585462
Epoch 16 of 150 took 6.834s
Training loss=0.017295, KL divergence=0.0004862, BCE Loss=0.587550
Val loss=0.017227, KL divergence=0.0000125, BCE Loss=0.585709
Epoch 21 of 150 took 6.920s
Training loss=0.017265, KL divergence=0.0000006, BCE Loss=0.587010
Val loss=0.017210, KL divergence=0.0000004, BCE Loss=0.585137
Epoch 26 of 150 took 6.960s
Training loss=0.017257, KL divergence=0.0000004, BCE Loss=0.586741
Val loss=0.017231, KL divergence=0.0000002, BCE Loss=0.585860
Epoch 31 of 150 took 7.035s
Training loss=0.017252, KL diver

augmentations

In [72]:
def augmentation_forward(model, data):
    """
    simple augmentation - using original latent_sample as a sample to decode
    
    Returns
    ---------
    generated data - with shape of data
    """
    model.eval()
    
    _, _, latent_samples = model._encode(torch.FloatTensor(data).to(device))

    return model.decode(latent_samples).detach().cpu().numpy()

In [73]:
def augmentation_add_noise(model, data, alpha=0.1):
    """
    generating via adding noise to the latent samples of orignal data
    
    z_i_new = z_i + alpha*\ksi , where \ksi from normal(0, sigma), where
                          sigma is a sample standard deviation of latent samples
    
    Parameters
    ----------
    alpha - int, regularization for pertubation

    Returns
    ---------
    generated data - with shape of data
    """
    model.eval()

    _, _, latent_samples = model._encode(torch.FloatTensor(data).to(device))
    latent_samples = latent_samples.detach().cpu().numpy()
    
    sigma = latent_samples.std(axis=0)

    new_latent_samples = latent_samples + alpha*np.random.normal(scale=sigma, 
                                                      size=latent_samples.shape)

    return model.decode(torch.FloatTensor(new_latent_samples).to(device)).detach().cpu().numpy()


In [75]:
def augmentation_extrapolation(model, data, k=3, alpha=0.5):
    """
    generating via extrapolations of the latent samples
    using the nearest neighbours

    z_i_new = (z_k - z_i)*alpha + z_i, where z_k is the k-th nearest neighbor 
    in the latent space

    Parameters
    ----------
    k - int, number of neighbours used in KNN
    alpha -
    len(data)*k - number of generated data

    Returns
    ---------
    generated data - with shape (data.shape[0]*k, data.shape[1])
    """
    model.eval()

    _, _, latent_samples = model._encode(torch.FloatTensor(data).to(device))
    latent_samples = latent_samples.detach().cpu()

    neigh = NearestNeighbors(n_neighbors=k+1)
    neigh.fit(latent_samples)

    ids = neigh.kneighbors(latent_samples, return_distance=False)

    #extrapolations between closest latent vectors
    extrs = (latent_samples[ids] - latent_samples[:, np.newaxis])[:,1:,:]*alpha + \
                       latent_samples[:, np.newaxis]

    extrs =  extrs.reshape(extrs.shape[0]*extrs.shape[1],
                           extrs.shape[2])

    return model.decode(torch.FloatTensor(extrs).to(device)).detach().cpu().numpy()


In [76]:
generated_data_1 = augmentation_forward(model, np.concatenate((X_train, X_val), axis=0))

In [77]:
generated_data_2 = augmentation_extrapolation(model, np.concatenate((X_train, X_val), axis=0), k=4)

In [78]:
generated_data_3 = augmentation_add_noise(model, np.concatenate((X_train, X_val), axis=0), alpha=1)

converting data to original format

In [None]:
def to_original_format(generated_data, normalizer):
  """
  noramlizer - to denormalize data
  """
  descaled = normalizer.inverse_transform(generated_data)

  descaled[:, [2, 13]] = np.round(descaled[:, [2, 13]])

  gen_data = pd.DataFrame(descaled)

  return gen_data

In [None]:
to_original_format(generated_data_1, normalizer).to_csv("gen_data_1.csv", header=None, index=False)
to_original_format(generated_data_2, normalizer).to_csv("gen_data_2.csv", header=None, index=False)
to_original_format(generated_data_3, normalizer).to_csv("gen_data_3.csv", header=None, index=False)