# Loading Dataset

In [1]:
import logging
# Configure the root logger at INFO level: logging.info(...) calls are shown in the
# cell output, while the many logging.debug(...) calls below stay silent.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# Alternative setups kept for reference: write DEBUG-level output to a log file instead.
#logging.basicConfig(filename='output.log',level=logging.DEBUG)
#logging.basicConfig(filename='example.log', filemode='w', level=logging.DEBUG)

import numpy as np

from collections import defaultdict

def text2numpy(num, lines_list, dim, labels, l_dict=None, i_dict=None):
    """Parse ``LABEL,V0,V1,...`` text lines into a feature matrix and one-hot labels.

    Parameters
    ----------
    num : int
        Number of rows to allocate. At most the first ``num`` entries of
        ``lines_list`` are parsed; rows without a corresponding line stay zero.
    lines_list : sequence of str
        Comma-separated records whose first field is the label and whose
        remaining fields are numeric feature values.
    dim : int
        Number of feature columns in ``X``.
    labels : int
        Number of columns in the one-hot matrix ``Y``.
    l_dict : dict or None
        Mapping label -> column index. When ``None`` the mapping is built from
        the labels found in the parsed lines (sorted order).
    i_dict : dict or None
        Mapping column index -> label. Rebuilt when ``l_dict`` is ``None``,
        otherwise returned exactly as passed in.

    Returns
    -------
    tuple
        ``(X, Y, l_dict, i_dict)`` where ``X`` has shape ``(num, dim)`` and
        ``Y`` has shape ``(num, labels)``.

    Raises
    ------
    IndexError
        If a line carries more than ``dim`` feature values.
    KeyError
        If a label is missing from ``l_dict`` (including the case where more
        than ``labels`` distinct labels are found while building it).
    """
    X = np.zeros((num, dim))
    Y = np.zeros((num, labels))

    # Labels are collected here only when the label dictionary has to be built.
    label_list = []

    for i, l in enumerate(lines_list[:num]):

        tokens = l.strip().split(",")
        values = tokens[1:]

        for d, v in enumerate(values):
            X[i, d] = float(v)

        if l_dict is not None:
            Y[i, l_dict[tokens[0]]] = 1
        else:
            label_list.append(tokens[0])

    if l_dict is None:

        # Sorted unique labels get consecutive column indices 0..labels-1.
        l_dict = dict(zip(sorted(set(label_list)), range(labels)))
        logging.debug("Label dict: {}".format(l_dict))

        i_dict = {index: label for label, index in l_dict.items()}
        logging.debug("Index dict: {}".format(i_dict))

        # Iterate over the labels actually parsed by THIS call instead of relying
        # on a notebook-level global for the row count.
        for i, label in enumerate(label_list):
            Y[i, l_dict[label]] = 1

    return X, Y, l_dict, i_dict


# Read every record of the letter-recognition file; the context manager guarantees
# the file handle is closed once the lines are in memory.
with open("DATASETS/letters/letter-recognition.data") as f:
    lines = f.readlines()

logging.debug("Input data sample: LABEL, V0, V1, ..., V15")
# Backslashes are escaped explicitly; the emitted text is unchanged.
logging.debug("Input labels \\in \\{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', "
              "       'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'}")

total_examples = len(lines)
logging.info("# Samples: {}".format(total_examples))

# Train/test split: the first num_tr_examples lines are used for training,
# everything after them for testing.
num_tr_examples = 16000
num_te_examples = total_examples - num_tr_examples
dim = 16      # feature values per record (V0..V15)
labels = 26   # one column per letter A..Z

logging.info("# training samples: {}".format(num_tr_examples))
logging.info("# test samples: {}".format(num_te_examples))

# The label dictionaries are built from the training lines and then reused for
# the test lines so both splits share the same one-hot column order.
X_tr, Y_tr, label_dict, ind_dict = text2numpy(num_tr_examples, lines, dim, labels)
X_te, Y_te, label_dict, ind_dict = text2numpy(num_te_examples, lines[num_tr_examples:],
                                              dim, labels, label_dict, ind_dict)

# Sanity checks (visible only at DEBUG level): compare parsed rows with the raw text.
logging.debug("Read examples: {}".format(X_tr[:10]))

for y in Y_tr[:10]:
    ind = np.argmax(y, axis=0)
    logging.debug("Read labels: {}".format(ind_dict[ind]))

for i in lines[:10]:
    logging.debug("Real instances: {}".format(i))

# Normalization -> from 0-15 to 0-1
X_tr = X_tr / 15.0
X_te = X_te / 15.0

# Each row of Y is one-hot, so the sums should equal the number of labelled rows.
logging.debug("Cheking things...{}".format(np.sum(Y_tr)))
logging.debug("Cheking things...{}".format(np.sum(Y_te)))

# Quick look at the difference and squared Euclidean distance between two samples.
one_sample = X_tr[0, :].copy()
another_sample = X_tr[1, :].copy()

print((one_sample - another_sample))
print(np.sum(np.square(one_sample - another_sample)))

INFO:root:# Samples: 20000
INFO:root:# training samples: 16000
INFO:root:# training samples: 4000


[-0.2        -0.26666667  0.         -0.13333333 -0.06666667 -0.13333333
  0.53333333 -0.33333333  0.13333333 -0.46666667  0.46666667 -0.06666667
 -0.13333333  0.         -0.26666667 -0.13333333]
1.11111111111


# Preparing the model

In [2]:
#Condensed code based on the code from: https://jmetzen.github.io/2015-11-27/vae.html
%matplotlib inline

import numpy as np
import tensorflow as tf
import os

# Seed NumPy's global generator and TensorFlow's graph-level seed before any
# graph op is created, so that random initial values are repeatable between runs.
np.random.seed(0)
tf.set_random_seed(0)

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

def encoder(x, weights, biases):
    """Probabilistic encoder (recognition network).

    Passes ``x`` through two softplus layers and returns the parameters of a
    normal distribution in latent space.

    Parameters
    ----------
    x : tensor of input samples.
    weights : dict with keys 'h1', 'h2', 'out_mean', 'out_log_sigma'.
    biases : dict with keys 'b1', 'b2', 'out_mean', 'out_log_sigma'.

    Returns
    -------
    tuple
        ``(z_mean, z_log_sigma_sq)`` - both are plain affine outputs of the
        second hidden layer; sampling z from them is left to the caller.
    """
    def affine(inputs, w_key, b_key):
        # inputs @ W + b for the requested parameter pair
        return tf.add(tf.matmul(inputs, weights[w_key]), biases[b_key])

    hidden = tf.nn.softplus(affine(x, 'h1', 'b1'))
    hidden = tf.nn.softplus(affine(hidden, 'h2', 'b2'))

    # Mean and log-variance heads share the same hidden representation.
    z_mean = affine(hidden, 'out_mean', 'out_mean')
    z_log_sigma_sq = affine(hidden, 'out_log_sigma', 'out_log_sigma')
    return (z_mean, z_log_sigma_sq)

def decoder(z, weights, biases):
    """Decoder (generator network): maps latent vectors back to data space.

    Parameters
    ----------
    z : tensor of latent vectors.
    weights : dict; only keys 'h1', 'h2' and 'out_mean' are read.
    biases : dict; only keys 'b1', 'b2' and 'out_mean' are read.

    Returns
    -------
    tensor
        The reconstruction mean. The output layer is linear: the sigmoid
        (Bernoulli-style) variant is kept below only as a comment.
    """
    def affine(inputs, w_key, b_key):
        # inputs @ W + b for the requested parameter pair
        return tf.add(tf.matmul(inputs, weights[w_key]), biases[b_key])

    hidden = tf.nn.softplus(affine(z, 'h1', 'b1'))
    hidden = tf.nn.softplus(affine(hidden, 'h2', 'b2'))

    # Sigmoid alternative (disabled):
    # x_reconstr_mean = tf.nn.sigmoid(affine(hidden, 'out_mean', 'out_mean'))
    x_reconstr_mean = affine(hidden, 'out_mean', 'out_mean')
    return x_reconstr_mean

def xavier_init(fan_in, fan_out, constant=1):
    """Xavier (Glorot) uniform initial values for a ``(fan_in, fan_out)`` weight matrix.

    Values are drawn uniformly between ``-b`` and ``b`` with
    ``b = constant * sqrt(6 / (fan_in + fan_out))``.

    See https://stackoverflow.com/questions/33640581/how-to-do-xavier-initialization-on-tensorflow
    """
    bound = constant * np.sqrt(6.0 / (fan_in + fan_out))
    return tf.random_uniform((fan_in, fan_out),
                             minval=-bound,
                             maxval=bound,
                             dtype=tf.float32)



# Instantiating the Model

In [3]:
# The model is instantiated against the training split.
X, Y = X_tr, Y_tr

# Per-feature mean and its shape (visible only at DEBUG level).
logging.debug(np.mean(X, axis=0))
logging.debug(np.mean(X, axis=0).shape)

# Data dimensions: rows are samples, columns are features.
n_samples, x_dim = X.shape[0], X.shape[1]
original_shape = x_dim

# Size of the latent space.
z_dim = 2

# Hidden-layer widths of the recognition (encoder) network ...
n_hidden_recog_1 = n_hidden_recog_2 = 128

# ... and of the generator (decoder) network.
n_hidden_gener_1 = n_hidden_gener_2 = 128

# All trainable parameters of the VAE, grouped by sub-network.
# NOTE(review): none of these tf.Variable objects is given an explicit name, so
# presumably the checkpoint restored later identifies them by their auto-generated,
# creation-order-dependent names - confirm before reordering or removing entries.
network_weights = dict()
# Recognition (encoder) weights: x_dim -> hidden 1 -> hidden 2 -> (mean, log sigma^2) of size z_dim.
network_weights['weights_recog'] = {
    'h1': tf.Variable(xavier_init(x_dim, n_hidden_recog_1)),
    'h2': tf.Variable(xavier_init(n_hidden_recog_1, n_hidden_recog_2)),
    'out_mean': tf.Variable(xavier_init(n_hidden_recog_2, z_dim)),
    'out_log_sigma': tf.Variable(xavier_init(n_hidden_recog_2, z_dim))}
# Recognition biases, all initialised to zero.
network_weights['biases_recog'] = {
    'b1': tf.Variable(tf.zeros([n_hidden_recog_1], dtype=tf.float32)),
    'b2': tf.Variable(tf.zeros([n_hidden_recog_2], dtype=tf.float32)),
    'out_mean': tf.Variable(tf.zeros([z_dim], dtype=tf.float32)),
    'out_log_sigma': tf.Variable(tf.zeros([z_dim], dtype=tf.float32))}

# Generator (decoder) weights: z_dim -> hidden 1 -> hidden 2 -> x_dim.
# The 'out_log_sigma' entries are created here but decoder() never reads them.
network_weights['weights_gener'] = {
    'h1': tf.Variable(xavier_init(z_dim, n_hidden_gener_1)),
    'h2': tf.Variable(xavier_init(n_hidden_gener_1, n_hidden_gener_2)),
    'out_mean': tf.Variable(xavier_init(n_hidden_gener_2, x_dim)),
    'out_log_sigma': tf.Variable(xavier_init(n_hidden_gener_2, x_dim))}
# Generator biases, all initialised to zero.
network_weights['biases_gener'] = {
    'b1': tf.Variable(tf.zeros([n_hidden_gener_1], dtype=tf.float32)),
    'b2': tf.Variable(tf.zeros([n_hidden_gener_2], dtype=tf.float32)),
    'out_mean': tf.Variable(tf.zeros([x_dim], dtype=tf.float32)),
    'out_log_sigma': tf.Variable(tf.zeros([x_dim], dtype=tf.float32))}


# Placeholder for a batch of original inputs, one row per sample.
x_original = tf.placeholder(tf.float32, [None, x_dim])

# Encoding layer: parameters of the approximate posterior q(z|x).
z_mean, z_log_sigma_sq = encoder(x_original, network_weights["weights_recog"], network_weights["biases_recog"])

# Standard-normal noise with the same shape as the latent mean.
eps = tf.random_normal(shape=tf.shape(z_mean), mean=0.0, stddev=1.0, dtype=tf.float32)

# Sampling procedure (reparametrization trick):
# z = mu + sigma * epsilon, with sigma = sqrt(exp(log sigma^2))
z = tf.add(z_mean, tf.multiply(tf.sqrt(tf.exp(z_log_sigma_sq)), eps))

# Decoding layer applied to the sampled latent vector.
x_reconstructed = decoder(z, network_weights["weights_gener"], network_weights["biases_gener"])

# Latent vectors supplied from outside the graph (used to decode arbitrary z values).
z_input = tf.placeholder(tf.float32, shape=[None, z_dim])

# Same decoder (shared weights) applied to the externally supplied latent vectors.
x_reconstructed_sample = decoder(z_input, network_weights["weights_gener"], network_weights["biases_gener"])

# Reconstruction loss per sample: mean squared error over the feature axis.
# NOTE(review): the original author flagged this term for review.
per_sample_reconstructed_loss = tf.reduce_mean(tf.squared_difference(x_original, x_reconstructed), axis=1)

# Scalar reconstruction loss: average over the batch.
reconstructed_loss = tf.reduce_mean(per_sample_reconstructed_loss)

# Per-dimension terms of the KL divergence between q(z|x) and a standard normal prior.
kl_divergence_vector = 1. + z_log_sigma_sq - tf.pow(z_mean, 2) - tf.exp(z_log_sigma_sq)

# Per-sample KL divergence. `axis` replaces the deprecated `reduction_indices`
# keyword and matches the reduce_mean call above.
latent_loss = -.5 * tf.reduce_sum(kl_divergence_vector, axis=1)

# The scalar reconstruction loss is broadcast onto the per-sample latent loss
# before averaging over the batch.
cost = tf.reduce_mean(reconstructed_loss + latent_loss)

learning_rate=0.0001

# Plain gradient descent on the combined cost.
optimizer=tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(cost)

# Training-loop settings; no cell visible in this part of the notebook reads
# training_epochs or display_step.
training_epochs, display_step, batch_size = 1000, 100, 100

# Op that initialises every variable, and the saver used to restore checkpoints.
init = tf.global_variables_initializer()
VAE_SAVER = tf.train.Saver()

# Where the model checkpoint lives.
MODELS_PATH = "models_vae/"
save_path = "models_vae/VANILLA_VAE.ckpt"

# Dataset folder and the common filename prefix of the .npy arrays handled below.
root_dir = "DATASETS/letters/"
base_file = "letters_Z_"

# Loading projections

In [4]:
# Load the stored training projections (X) and their label array (Y).
# NOTE(review): presumably these are latent-space encodings produced elsewhere - confirm.
dataset_z_x, dataset_z_y = (
    np.load(root_dir + base_file + "X_training.npy"),
    np.load(root_dir + base_file + "Y_training.npy"),
)

# Noise

In [5]:
# Re-seed so the generated noise is the same on every run of this cell.
np.random.seed(0)

# Fraction of the per-dimension standard deviation used as noise amplitude.
global_scale = 0.5

n_samples = dataset_z_x.shape[0]

# Spread of each latent dimension over the whole dataset.
std_dev = np.std(dataset_z_x, axis=0)

noisy_dataset_z_x = np.zeros((n_samples, z_dim))

# One independent zero-mean Gaussian draw per row, scaled per dimension by std_dev.
for i, z_row in enumerate(dataset_z_x):
    noise = np.random.normal(loc=0, scale=std_dev)
    noisy_dataset_z_x[i] = z_row + global_scale * noise


In [6]:
# Persist the noisy latent vectors; the label array is saved unchanged alongside them.
np.save(file=root_dir + "noise_" + base_file + "X_training.npy",
        arr=noisy_dataset_z_x)
np.save(file=root_dir + "noise_" + base_file + "Y_training.npy",
        arr=dataset_z_y)

# Noise - Reconstructed

In [7]:
# Decode every noisy latent vector back into the original feature space.
noisy_dataset_x_x = np.zeros((noisy_dataset_z_x.shape[0], original_shape))

batch_size = 100

# Ceiling division so that a trailing partial batch is decoded as well; floor
# division would leave those rows of the output filled with zeros.
n_batches = (noisy_dataset_z_x.shape[0] + batch_size - 1) // batch_size

print("# batches: {}".format(n_batches))

with tf.Session() as sess:

    sess.run(init)

    # Replace the freshly initialised variables with the values stored in the checkpoint.
    VAE_SAVER.restore(sess, save_path)
    print("Model restored in file: {}".format(save_path))

    for n_batch in range(n_batches):

        # NumPy clips slices that run past the end, so the last batch may be shorter.
        batch_rows = slice(n_batch * batch_size, (n_batch + 1) * batch_size)

        batch_xs = noisy_dataset_z_x[batch_rows]

        rescontructed = sess.run(x_reconstructed_sample, feed_dict={z_input: batch_xs})

        # Slice assignment copies the values into the pre-allocated output.
        noisy_dataset_x_x[batch_rows] = rescontructed


# batches: 160
Model restored in file: models_vae/VANILLA_VAE.ckpt


In [8]:
# Persist the decoded noisy samples; the label array is saved unchanged alongside them.
np.save(file=root_dir + "noise_reconstructed_" + base_file + "X_training.npy",
        arr=noisy_dataset_x_x)
np.save(file=root_dir + "noise_reconstructed_" + base_file + "Y_training.npy",
        arr=dataset_z_y)


# Interpolation

In [9]:
from sklearn.neighbors import NearestNeighbors
import time

n_samples, n_labels = dataset_z_x.shape[0], dataset_z_y.shape[1]

# Number of neighbours requested for every sample
K = 4

# Per-class lookups, keyed by label column:
#   indexes_per_class   -> rows of the full dataset belonging to the class
#   distances_per_class -> neighbour distances inside the class
#   indices_per_class   -> neighbour positions, relative to the class subset
indexes_per_class, distances_per_class, indices_per_class = {}, {}, {}

total_start_time = time.time()

for l in range(n_labels):

    start_time = time.time()

    # Samples whose label column l is set
    indexes_per_class[l] = np.where(dataset_z_y[:, l] == 1)
    class_members = dataset_z_x[indexes_per_class[l]]

    # Nearest-neighbour search restricted to members of the same class
    nbrs = NearestNeighbors(n_neighbors=K, algorithm='ball_tree')
    nbrs = nbrs.fit(class_members)

    # Querying with the fitted points themselves
    distances, indices = nbrs.kneighbors(class_members)

    distances_per_class[l] = distances.copy()
    indices_per_class[l] = indices.copy()

    logging.debug("Per label time: {}".format(time.time() - start_time))

logging.debug("Total time: {}".format(time.time() - total_start_time))

In [10]:
# Next free row of the output arrays
counter = 0

# Every sample contributes one new vector per ordered pair of its K neighbours.
total_interpolated_instances_x = np.zeros((n_samples * K * (K - 1), z_dim))
total_interpolated_instances_y = np.zeros((n_samples * K * (K - 1), n_labels))

total_start_time = time.time()

for l in range(n_labels):

    start_time = time.time()

    # Latent vectors of the current class
    class_members = dataset_z_x[indexes_per_class[l]]

    n_members = class_members.shape[0]

    for n_member in range(n_members):

        # The K neighbours found for this member (positions are class-relative)
        kneighbors = class_members[indices_per_class[l][n_member]]

        # Position of the new point on the segment j -> i (0.5 = midpoint).
        # NOTE(review): at 0.5 the pairs (i, j) and (j, i) are mathematically the
        # same point, so each midpoint is emitted twice.
        degree_of_interpolation = 0.5

        # Walk over the K * (K - 1) ordered pairs of distinct neighbours
        for i in range(K):
            i_sample = kneighbors[i].copy()
            for j in range(K):
                if j != i:
                    j_sample = kneighbors[j].copy()

                    interpolated_instance = (i_sample - j_sample) * degree_of_interpolation + j_sample

                    total_interpolated_instances_x[counter] = interpolated_instance.copy()
                    # The generated point inherits the class of its neighbours
                    total_interpolated_instances_y[counter, l] = 1
                    counter += 1

    logging.debug("Per label time: {}".format(time.time() - start_time))

logging.info("Total number of instances: {}".format(counter))
logging.info("Total number of instances (pred): {}".format(total_interpolated_instances_x.shape[0]))
logging.info("Total time: {}".format(time.time() - total_start_time))

INFO:root:Total number of instances: 192000
INFO:root:Total number of instances (pred): 192000
INFO:root:Total time: 0.6484637260437012


In [11]:
# Shuffle samples and labels with one shared, seeded permutation so rows stay aligned.
# Note: re-running this cell shuffles the already shuffled arrays again.
np.random.seed(0)

perm = np.random.permutation(len(total_interpolated_instances_x))

total_interpolated_instances_x, total_interpolated_instances_y = (
    total_interpolated_instances_x[perm],
    total_interpolated_instances_y[perm],
)

In [12]:
# Persist the shuffled interpolated latent vectors and their labels.
np.save(file=root_dir + "interpolated_" + base_file + "X_training.npy",
        arr=total_interpolated_instances_x)
np.save(file=root_dir + "interpolated_" + base_file + "Y_training.npy",
        arr=total_interpolated_instances_y)

# Interpolated - Reconstructed

In [13]:
# Decode every interpolated latent vector back into the original feature space.
interpolated_dataset_x_x = np.zeros((total_interpolated_instances_x.shape[0], original_shape))

batch_size = 100

# Ceiling division so that a trailing partial batch is decoded as well; floor
# division would leave those rows of the output filled with zeros.
n_batches = (total_interpolated_instances_x.shape[0] + batch_size - 1) // batch_size

print("# batches: {}".format(n_batches))

with tf.Session() as sess:

    sess.run(init)

    # Replace the freshly initialised variables with the values stored in the checkpoint.
    VAE_SAVER.restore(sess, save_path)
    print("Model restored in file: {}".format(save_path))

    for n_batch in range(n_batches):

        # NumPy clips slices that run past the end, so the last batch may be shorter.
        batch_rows = slice(n_batch * batch_size, (n_batch + 1) * batch_size)

        batch_xs = total_interpolated_instances_x[batch_rows]

        rescontructed = sess.run(x_reconstructed_sample, feed_dict={z_input: batch_xs})

        # Slice assignment copies the values into the pre-allocated output.
        interpolated_dataset_x_x[batch_rows] = rescontructed


# batches: 1920
Model restored in file: models_vae/VANILLA_VAE.ckpt


In [14]:
# Persist the decoded interpolated samples together with their labels.
np.save(file=root_dir + "interpolated_reconstructed_" + base_file + "X_training.npy",
        arr=interpolated_dataset_x_x)
np.save(file=root_dir + "interpolated_reconstructed_" + base_file + "Y_training.npy",
        arr=total_interpolated_instances_y)

# Extrapolation

In [15]:
from sklearn.neighbors import NearestNeighbors
import time

# NOTE(review): init and VAE_SAVER are already created when the model is
# instantiated; they are rebuilt here exactly as in the original cell.
init = tf.global_variables_initializer()
VAE_SAVER = tf.train.Saver()

n_samples, n_labels = dataset_z_x.shape[0], dataset_z_y.shape[1]

# Number of neighbours requested for every sample
K = 4

# Per-class lookups, keyed by label column:
#   indexes_per_class   -> rows of the full dataset belonging to the class
#   distances_per_class -> neighbour distances inside the class
#   indices_per_class   -> neighbour positions, relative to the class subset
indexes_per_class, distances_per_class, indices_per_class = {}, {}, {}

total_start_time = time.time()

for l in range(n_labels):

    start_time = time.time()

    # Samples whose label column l is set
    indexes_per_class[l] = np.where(dataset_z_y[:, l] == 1)
    class_members = dataset_z_x[indexes_per_class[l]]

    # Nearest-neighbour search restricted to members of the same class
    nbrs = NearestNeighbors(n_neighbors=K, algorithm='ball_tree')
    nbrs = nbrs.fit(class_members)

    # Querying with the fitted points themselves
    distances, indices = nbrs.kneighbors(class_members)

    distances_per_class[l] = distances.copy()
    indices_per_class[l] = indices.copy()

    logging.debug("Per label time: {}".format(time.time() - start_time))

logging.info("Total time: {}".format(time.time() - total_start_time))

INFO:root:Total time: 0.05968928337097168


In [16]:
# Next free row of the output arrays
counter = 0

# Every sample contributes one new vector per ordered pair of its K neighbours.
total_extrapolated_instances_x = np.zeros((n_samples * K * (K - 1), z_dim))
total_extrapolated_instances_y = np.zeros((n_samples * K * (K - 1), n_labels))

total_start_time = time.time()

for l in range(n_labels):

    start_time = time.time()

    # Latent vectors of the current class
    class_members = dataset_z_x[indexes_per_class[l]]

    n_members = class_members.shape[0]

    for n_member in range(n_members):

        # The K neighbours found for this member (positions are class-relative)
        kneighbors = class_members[indices_per_class[l][n_member]]

        # How far to move past j, as a fraction of the i -> j displacement.
        degree_of_extrapolation = 0.5

        # Generating extrapolations: walk over the K * (K - 1) ordered pairs of
        # distinct neighbours and push each j further away from i.
        for i in range(K):
            i_sample = kneighbors[i].copy()
            for j in range(K):
                if j != i:
                    j_sample = kneighbors[j].copy()

                    extrapolated_instance = (j_sample - i_sample) * degree_of_extrapolation + j_sample

                    total_extrapolated_instances_x[counter] = extrapolated_instance.copy()
                    # The generated point inherits the class of its neighbours
                    total_extrapolated_instances_y[counter, l] = 1
                    counter += 1

    logging.debug("Per label time: {}".format(time.time() - start_time))

logging.info("Total number of instances: {}".format(counter))
logging.info("Total number of instances (pred): {}".format(total_extrapolated_instances_x.shape[0]))
logging.info("Total time: {}".format(time.time() - total_start_time))

INFO:root:Total number of instances: 192000
INFO:root:Total number of instances (pred): 192000
INFO:root:Total time: 0.6861999034881592


In [17]:
# Shuffle samples and labels with one shared, seeded permutation so rows stay aligned.
# Note: re-running this cell shuffles the already shuffled arrays again.
np.random.seed(0)

perm = np.random.permutation(len(total_extrapolated_instances_x))

total_extrapolated_instances_x, total_extrapolated_instances_y = (
    total_extrapolated_instances_x[perm],
    total_extrapolated_instances_y[perm],
)

In [18]:
# Persist the shuffled extrapolated latent vectors and their labels.
np.save(file=root_dir + "extrapolated_" + base_file + "X_training.npy",
        arr=total_extrapolated_instances_x)
np.save(file=root_dir + "extrapolated_" + base_file + "Y_training.npy",
        arr=total_extrapolated_instances_y)

In [21]:
# Decode every extrapolated latent vector back into the original feature space.
extrapolated_dataset_x_x = np.zeros((total_extrapolated_instances_x.shape[0], original_shape))

batch_size = 100

# Ceiling division so that a trailing partial batch is decoded as well; floor
# division would leave those rows of the output filled with zeros.
n_batches = (total_extrapolated_instances_x.shape[0] + batch_size - 1) // batch_size

print("# batches: {}".format(n_batches))

with tf.Session() as sess:

    sess.run(init)

    # Replace the freshly initialised variables with the values stored in the checkpoint.
    VAE_SAVER.restore(sess, save_path)
    print("Model restored in file: {}".format(save_path))

    for n_batch in range(n_batches):

        # NumPy clips slices that run past the end, so the last batch may be shorter.
        batch_rows = slice(n_batch * batch_size, (n_batch + 1) * batch_size)

        batch_xs = total_extrapolated_instances_x[batch_rows]

        rescontructed = sess.run(x_reconstructed_sample, feed_dict={z_input: batch_xs})

        # Slice assignment copies the values into the pre-allocated output.
        extrapolated_dataset_x_x[batch_rows] = rescontructed


# batches: 1920
Model restored in file: models_vae/VANILLA_VAE.ckpt


In [22]:
# Persist the decoded extrapolated samples together with their labels.
np.save(file=root_dir + "extrapolated_reconstructed_" + base_file + "X_training.npy",
        arr=extrapolated_dataset_x_x)
np.save(file=root_dir + "extrapolated_reconstructed_" + base_file + "Y_training.npy",
        arr=total_extrapolated_instances_y)