In [None]:
from __future__ import print_function
from torch import nn, optim
import numpy as np
from random import randint
import copy                                                # to keep track of the evolutoin of the weights of the network
import os   
# from sklearn import manifold                               # for the t-SNE
# import matplotlib                                          # for the t-SNE
# from itertools import cycle, islice                        # for the t-SNE
from matplotlib import pyplot as plt 
from collections import OrderedDict
import pandas as pd
import datetime
import time
import torch                        
import torch.utils.data
from torch.nn import functional as F
# from torchvision import transforms
from datasets import YeastDataset # dataloader loading the yeast dataset in the correct format, replace it with one suitable for your problem
from data_elaboration_utilities import *
print("Libraries loaded")

In [None]:
# Recover the hyper-parameters of the selected experiment and build the
# training / validation / test datasets used by the cells below.

# Raw string: "\R" is not a valid escape sequence, so a plain literal triggers an
# "invalid escape sequence" warning on recent Python versions. The value is unchanged.
records_path = r"where the previsously tried combinations are stored \Records.csv"
experiment_path = "specific experiment"
records = pd.read_csv(records_path, encoding="utf-8")
# Alternative selection: pick the record with the lowest validation loss.
# best = sorted(records["validation_loss"])[:10]
# parameters = records.loc[records["validation_loss"] == best[0]].iloc[0]
matching_records = records.loc[records["experiment"] == experiment_path]
if matching_records.empty:
    # Same exception type as the bare `.iloc[0]` would raise, but with a usable message.
    raise IndexError(
        "No record with experiment == {!r} found in {!r}".format(experiment_path, records_path)
    )
# `.copy()` detaches the row from `records`, so the assignments below neither
# touch the original frame nor raise pandas' SettingWithCopyWarning.
parameters = matching_records.iloc[0].copy()

no_cuda = False               # SHOULD BE FALSE (set to True to force CPU execution)
seed = 1
log_interval = 10
percent_train = 0.7
percent_validation = 0.2

dataset_path = parameters["dataset_path"]

training_path = ""
test_path = ""
parameters["training_path"] = training_path
parameters["test_path"] = test_path

# NOTE(review): the network input size is read from the "reconstruction_weight"
# column; presumably the two values were equal when the record was written
# (see the list built right below) -- confirm against the training notebook.
input_size = parameters["reconstruction_weight"]  # default

reconstruction_weight = [input_size]

num_neurons = parameters["num_neurons"]

z_size = parameters["z_size"]

cuda = not no_cuda and torch.cuda.is_available()
# Only torch's RNG is seeded here.
# NOTE(review): if splitTrainingValidationTestSet relies on `random` / `numpy.random`,
# the split is not reproducible between runs -- verify in data_elaboration_utilities.
torch.manual_seed(seed)

device = torch.device("cuda" if cuda else "cpu")               # use GPU if available

# Extra DataLoader options are only passed when running on GPU.
kwargs = {'num_workers': 1, 'pin_memory': True} if cuda else {}
partition, labels = splitTrainingValidationTestSet(dataset_path, percent_train, percent_validation)

training_set = YeastDataset(partition['training'], labels)
validation_set = YeastDataset(partition['validation'], labels)
test_set = YeastDataset(partition['test'], labels)

torch.set_printoptions(precision=9)                                 # to print more digits for the loss

print("Device: ", device)                           # simple check

In [None]:
# Sanity check: show the dataset path recovered from the selected experiment record.
print(dataset_path)

In [5]:
class trialVAE(nn.Module):
    """Fully connected variational autoencoder with one hidden layer per side.

    Encoder: ``fc1`` (input -> hidden, ReLU), then ``fc21`` / ``fc22`` produce the
    two latent outputs (used as ``mu`` and ``logvar``).
    Decoder: ``fc3`` (latent -> hidden, ReLU), then ``fc4`` (hidden -> input) followed
    by a sigmoid.

    The submodule names (``fc1``, ``fc21``, ``fc22``, ``fc3``, ``fc4``, ``dropt``) are part of
    the saved ``state_dict`` layout and must not be renamed.

    Args:
        num_neurons: width of the hidden layers.
        z_size: dimensionality of the latent space.
        dropout: dropout probability applied to the hidden activations.
        input_size: number of input features. When omitted, the module-level
            variable ``input_size`` is used, which is how the class was
            originally configured.

    Raises:
        NameError: if ``input_size`` is omitted and no module-level
            ``input_size`` is defined.
        ValueError: if a size is not integral (e.g. ``10.5``).
    """

    def __init__(self, num_neurons, z_size, dropout, input_size=None):
        super(trialVAE, self).__init__()

        if input_size is None:
            # Backward-compatible fallback on the global set in the configuration cell.
            try:
                input_size = globals()["input_size"]
            except KeyError:
                raise NameError(
                    "name 'input_size' is not defined: pass input_size=... "
                    "or define a module-level input_size"
                )

        # Sizes often come from a pandas row and may be numpy ints or floats such
        # as 100.0; nn.Linear needs integers, so they are normalised here.
        self.input_size = self._as_layer_size(input_size, "input_size")
        self.num_neurons = self._as_layer_size(num_neurons, "num_neurons")
        self.z_size = self._as_layer_size(z_size, "z_size")
        self.dropout = dropout

        self.fc1 = nn.Linear(self.input_size, self.num_neurons)
        self.fc21 = nn.Linear(self.num_neurons, self.z_size)
        self.fc22 = nn.Linear(self.num_neurons, self.z_size)
        self.fc3 = nn.Linear(self.z_size, self.num_neurons)
        self.fc4 = nn.Linear(self.num_neurons, self.input_size)
        self.dropt = nn.Dropout(self.dropout)

    @staticmethod
    def _as_layer_size(value, name):
        """Return ``value`` as a Python int, rejecting non-integral numbers."""
        size = int(value)
        if size != value:
            raise ValueError("{} must be an integer, got {!r}".format(name, value))
        return size

    def encode(self, x):
        """Map the input ``x`` to the pair ``(mu, logvar)``; ``x`` is cast to float32."""
        h1 = F.relu(self.fc1(x.float()))
        # h1 = torch.tanh(self.fc1(x.float()))      # MSE doesn't learn and all the weights change at approximately each epoch
        # Dropout is applied separately for each head, so in training mode the two
        # heads see independently sampled dropout masks.
        return self.fc21(self.dropt(h1)), self.fc22(self.dropt(h1))

    def reparameterize(self, mu, logvar):
        """Draw ``z = mu + eps * exp(0.5 * logvar)`` with ``eps`` sampled from N(0, 1)."""
        std = torch.exp(0.5*logvar)
        eps = torch.randn_like(std)
        return eps.mul(std).add_(mu)

    def decode(self, z):
        """Reconstruct the input from the latent code ``z``; outputs lie in [0, 1]."""
        h3 = F.relu(self.fc3(z))
        # h3 = torch.tanh(self.fc3(z))              # MSE doesn't learn and all the weights change at approximately each epoch
        return torch.sigmoid(self.fc4(self.dropt(h3)))
        # return F.softmax(self.fc4(h3), dim=1)     # MSE doesn't decrease with it
        # return torch.tanh(self.fc4(h3))           # MSE doesn't decrease like with the sigmoid

    def forward(self, x):
        """Return ``(reconstruction, mu, logvar)`` for the input ``x``."""
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)        # representation to get
        return self.decode(z), mu, logvar

In [6]:
# Rebuild the model of the selected experiment, load its trained weights and
# export the latent representation of every sample of each partition to CSV.
batch_size = int(parameters["batch_size"])
# Values read from the records row may be numpy scalars; cast them like batch_size.
model = trialVAE(int(parameters["num_neurons"]), int(parameters["z_size"]), float(parameters["dropout"]))
# os.path.join instead of a hard-coded "\\" keeps the paths valid outside Windows.
weights_path = os.path.join("path to the weights of the selected experiment",
                            parameters["experiment"],
                            "weights.pt")
# map_location lets weights saved on a GPU be loaded on a CPU-only machine.
model.load_state_dict(torch.load(weights_path, map_location=device), strict=True)
model.to(device)
model.eval()                                               # inference only: disables dropout

train_loader = torch.utils.data.DataLoader(training_set, shuffle=True, batch_size=batch_size, **kwargs)
validation_loader = torch.utils.data.DataLoader(validation_set, batch_size=batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size, shuffle=True, **kwargs)

# Each entry: (loader, key of the partition in `partition`, name of the output file).
processed_datasets = [(train_loader, "training", # modify the path below so that it is possible to get more processed datasets
                       # for the same original dataset
                       "training-associated file.csv"),
                      (validation_loader, "validation", "validation-associated file.csv"),
                      (test_loader, "test", "test-associated file.csv")
                     ]

# Joining with "" appends the platform's path separator, as the original "\\" did on Windows.
directory_path = os.path.join("where to save the results", "")

for processed in processed_datasets:
    loader, partition_name, output_file = processed
    samples = partition[partition_name]
    new_data = list()
    with torch.no_grad():
        # The loader is only used for the dataset length; rows are read directly
        # from the partition so that the label and value columns stay available.
        for i in range(len(loader.dataset)):
            sample = samples[i]
            label, value, data = sample[0], sample[1], sample[2:]
            data = torch.from_numpy(data).to(device)
            mu, logvar = model.encode(data)                # process the data with the autoencoder
            # z is a random sample around mu, so the exported values depend on
            # the state of torch's random number generator.
            z = model.reparameterize(mu, logvar).cpu().numpy()
            new_data.append([label, value] + list(z))

    new_dataset = pd.DataFrame(np.array(new_data)).rename(columns={0: 'Row', 1: 'log2relT'})
    new_dataset.to_csv(os.path.join(directory_path, output_file), encoding="utf-8")
