In [2]:
import os
import numpy as np 

import torch
import torchvision
from torch import nn
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset

from time import time

# Train on the GPU if one is available.
cuda = torch.cuda.is_available()

# Notebook-wide mode switch: when True the NSW data directories, feature count
# and training hyperparameters are used; when False the alternative settings
# defined in the `else` branches of the cells below apply.
NSW_MODE = True

## Autoencoder

The code below implements an Autoencoder as described in Section 3 of the paper. 

### Data Loading

`CovariateDataset` extends the base PyTorch `Dataset`. It loads saved covariate files from the data folder and serves them to the PyTorch model during the training process. 

In [3]:
# Both modes read comma-separated ".csv" files; only the directories differ.
FILE_TYPE = ".csv"
DELIM = ","

if NSW_MODE:
    RAW_DATA_DIR, PROCESSED_DATA_DIR = "../Data/NSW/Raw/", "../Data/NSW/Processed/"
else:
    RAW_DATA_DIR, PROCESSED_DATA_DIR = "../Data/Raw/", "../Data/Processed/"

class CovariateDataset(Dataset):
    """Dataset of covariate rows loaded from a delimited text file.

    The file is read from ``RAW_DATA_DIR`` once at construction time and kept
    in memory as a 2-D numpy array (``self.data``). Encoded versions of the
    data can be written back to ``PROCESSED_DATA_DIR``.
    """

    def __init__(self, file_name_pattern, file_name_args=()):
        """Load the covariate file.

        Args:
            file_name_pattern: File name without extension; may contain
                ``str.format`` placeholders.
            file_name_args: Positional values substituted into the pattern.
                Defaults to an empty tuple (an immutable default avoids the
                shared-mutable-default pitfall).
        """
        self.file_name = file_name_pattern.format(*file_name_args)
        # ndmin=2 keeps the array two-dimensional even when the file holds a
        # single row, so __len__ and the column slice below stay valid.
        self.data = np.loadtxt(
            RAW_DATA_DIR + self.file_name + FILE_TYPE, delimiter=DELIM, ndmin=2)
        if not NSW_MODE:
            self.data = self.data[:, 1:]  # remove bias (drops the first column)

    def __getitem__(self, index):
        """Return ``(row, 0)``: the row as a float array plus a constant dummy label."""
        return (self.data[index].astype(float), 0)

    def __len__(self):
        """Return the number of rows in the loaded data."""
        return self.data.shape[0]

    def save_processed_data(self, data, loss):
        """Write ``data`` as CSV to ``PROCESSED_DATA_DIR``.

        The output file is named ``<file_name>_<loss>.csv``.

        Args:
            data: Array-like accepted by ``np.savetxt``.
            loss: Label appended to the file name.
        """
        name = PROCESSED_DATA_DIR + self.file_name + "_{}.csv".format(loss)
        np.savetxt(name, data, delimiter=",")

In [35]:
# CovariateDataset("nsw74_all_covars").data

### Model Definition

The finalized Autoencoder had one hidden layer with 128 units and ReLU activations. This architecture results from experimentation with deeper networks which performed substantially worse when measured using the loss functions described below. This lines up with the theory outlined in Section 3 of the paper and described further in Goodfellow (2016). A single hidden layer tends to work best in Autoencoders. The choice to use 4 units in the innermost layer was largely arbitrary. There is nothing which serves to guide this choice a priori. The rough rule of thumb I used was around 50% of the original number of dimensions. 

A summary of the complete architecture is presented below. 

In [4]:
# Width of the autoencoder's input and output layers: 8 in NSW mode, 10 otherwise.
FEATURES = 8 if NSW_MODE else 10
    
class autoencoder(nn.Module):
    """Fully connected autoencoder with one hidden layer on each side.

    The encoder maps ``n_features -> hidden_units -> latent_units`` and the
    decoder mirrors it back to ``n_features``. ReLU is applied after the
    hidden layer of both halves; the latent and output layers are linear.

    Args:
        n_features: Width of the input/output layer. When ``None`` (the
            default) the module-level ``FEATURES`` constant is used, so
            ``autoencoder()`` builds the same network as before.
        hidden_units: Width of the hidden layer in encoder and decoder.
        latent_units: Width of the innermost (encoded) layer.
    """

    def __init__(self, n_features=None, hidden_units=128, latent_units=4):
        super(autoencoder, self).__init__()
        if n_features is None:
            # Resolved at construction time so the notebook-level setting applies.
            n_features = FEATURES
        self.encoder = nn.Sequential(
            nn.Linear(n_features, hidden_units),
            nn.ReLU(True),
            nn.Linear(hidden_units, latent_units))
        self.decoder = nn.Sequential(
            nn.Linear(latent_units, hidden_units),
            nn.ReLU(True),
            nn.Linear(hidden_units, n_features))

    def forward(self, x):
        """Return ``(reconstruction, encoded_values)`` for the input batch ``x``."""
        encoded_values = self.encoder(x)
        x = self.decoder(encoded_values)
        return x, encoded_values

In [5]:
# Show the layer structure of a freshly constructed model.
print(autoencoder())

autoencoder(
  (encoder): Sequential(
    (0): Linear(in_features=8, out_features=128, bias=True)
    (1): ReLU(inplace)
    (2): Linear(in_features=128, out_features=4, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=4, out_features=128, bias=True)
    (1): ReLU(inplace)
    (2): Linear(in_features=128, out_features=8, bias=True)
  )
)


### Define Loss Functions

Two flavours of autoencoder were implemented in this paper. The first was an unregularized autoencoder and the second was regularized through an L1 measure of sparsity in the latent space. The loss for the first version is based purely on quality of reconstruction as measured through mean squared error between the output and input (as suggested in Goodfellow, 2016). The regularized AE adds the L1 norm of the compressed representation to the reconstruction loss. It was found that a hyper-parameter was required to scale the regularizer. Without this parameter, the model tended to produce latent representations extremely close to zero in all dimensions and had very poor reconstruction performance; i.e., it discarded most information in order to satisfy the regularization. The hyperparameter was tuned via experimentation in order to bring the reconstruction loss roughly in line with the results from the unregularized AE with some disparity allowed/encouraged as a positive result of regularization on the latent space. In the sparse encoder, perfect reconstruction is not the goal. See Section 3 above for more on this.  

In [6]:
def reconstruction_sparsity_loss(output, target, encoded_values, sparsity_scale=0.0005):
    """Mean squared reconstruction error plus a scaled L1 penalty on the encoding.

    Args:
        output: Reconstructed batch produced by the model.
        target: Batch the reconstruction is compared against.
        encoded_values: Latent representation of the batch; the sum of its
            absolute values is the sparsity term.
        sparsity_scale: Multiplier applied to the L1 term (default 0.0005).

    Returns:
        A one-element tensor of shape ``(1,)`` holding
        ``MSE(output, target) + sparsity_scale * sum(|encoded_values|)``.
    """
    # Build the scale on the same device/dtype as the encoding so the function
    # does not depend on a global CUDA flag. Keeping it a one-element tensor
    # preserves the (1,)-shaped result that callers may index with [0].
    sparsity_scalar = encoded_values.new_tensor([sparsity_scale])

    mse_loss = nn.MSELoss()
    reconstruction_term = mse_loss(output, target)
    sparsity_term = encoded_values.abs().sum() * sparsity_scalar
    return reconstruction_term + sparsity_term
    
def reconstruction_loss(output, target, encoded_values):
    """Return the mean squared error between ``output`` and ``target``.

    ``encoded_values`` is accepted so that this function can be called with
    the same arguments as ``reconstruction_sparsity_loss``; it is not used.
    """
    return nn.MSELoss()(output, target)

### Training Code

The code below follows a fairly standard template for training a PyTorch model. We run a specified number of epochs made up of minibatches drawn from the DataLoader instance. For each minibatch, a forward pass is performed through the model and loss is calculated. Then backpropagation is performed to find the gradient of the loss with regard to the model parameters. The ADAM optimizer is used to adjust the network parameters. 

Two important hyperparameters are set on the optimizer. The first is the learning rate, which controls how big an adjustment takes place in the weights for a given gradient, and the second is the weight decay, which acts as a secondary regularizer. The learning rate needs to be large enough for meaningful adjustment to take place based on the information coming from the loss value but also small enough to allow convergence in the stochastic gradient descent process. The trade-off between these two goals is somewhat mitigated by the implementation of learning rate annealing: the learning rate starts off relatively high (at 0.1 in this case) and is reduced by a factor of 10 after 2000 epochs and again at 5000 epochs. This allows large updates to the weights during the start of training and smaller updates as the network approaches optima later on. Ideally, we would implement 'restarts' which involve increasing the learning rate and repeatedly annealing. This can prevent the network from converging to a relatively poor local optimum by forcing it to explore the loss space more widely - see Huang (2017) for more on this. This was not implemented here because it isn't a feature of PyTorch and I didn't feel it justified the effort. The value of `weight_decay` specifies how quickly the network weights should 'decay' to zero if no other update is made to them. It was found that a small value for this parameter aided in the quality of reconstruction. 

The batch size is set to 1000 (the size of the dataset) because the model is small enough that the GPU I was using could handle this. This improves the accuracy of the 'stochastic' gradient (which is actually the true gradient in this case given a batch is the full sample). 


In [9]:
# Names accepted by train_model's `loss` argument: index 0 selects
# reconstruction_loss, index 1 selects reconstruction_sparsity_loss.
loss_functions = ["reconstruction", "sparsity"]

def train_model(model_class, dataset, dataset_tag, loss="reconstruction", verbose=True):
    """Train a freshly constructed model on ``dataset`` and save its weights.

    Hyperparameters (epochs, batch size, learning rate and the learning-rate
    milestones) are selected by the module-level ``NSW_MODE`` flag. The model
    is optimised with Adam (weight decay 1e-5) and a ``MultiStepLR`` schedule
    that divides the learning rate by 10 at each milestone.

    Args:
        model_class: Zero-argument callable returning the model. Its forward
            pass must return ``(reconstruction, encoded_values)``.
        dataset: Dataset yielding ``(covariates, label)`` pairs; the label is
            ignored.
        dataset_tag: Value formatted into the saved weights file name
            ``../Models/simple_autoencoder_<tag>.pth``.
        loss: One of the names in ``loss_functions``.
        verbose: When true, print the loss ten times over the course of
            training. The final loss is always printed.

    Returns:
        ``(model, final_loss)`` where ``final_loss`` is the Python float loss
        of the last batch of the last epoch.

    Raises:
        ValueError: If ``loss`` is not a known loss name or ``dataset`` is empty.
    """
    # Resolve the loss function first so a bad name fails before any training.
    if loss == loss_functions[0]:
        criterion = reconstruction_loss
    elif loss == loss_functions[1]:
        criterion = reconstruction_sparsity_loss
    else:
        raise ValueError(
            "Unknown loss {!r}; expected one of {}".format(loss, loss_functions))

    if len(dataset) == 0:
        raise ValueError("Cannot train on an empty dataset")

    model = model_class()
    if cuda:
        model = model.cuda()

    if NSW_MODE:
        num_epochs = 20000
        batch_size = 1024
        learning_rate = 1e-2
        schedule = [1500, 7000]
    else:
        num_epochs = 10000
        batch_size = 1000
        learning_rate = 1e-1
        schedule = [2000, 5000]

    lr_sched = True

    optimizer = torch.optim.Adam(
        model.parameters(), lr=learning_rate, weight_decay=1e-5)

    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, schedule, gamma=0.1)

    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    final_loss = None
    log_every = int(num_epochs / 10)

    for epoch in range(num_epochs):
        # Iterate over batches in the epoch (only 1 in this model)
        for data_batch, _ in dataloader:
            data_batch = data_batch.float()

            if cuda:
                data_batch = data_batch.cuda()

            # Forward pass
            output, encoded_values = model(data_batch)

            # Find the loss. A distinct name keeps the `loss` argument (the
            # loss *name*) from being overwritten by a tensor.
            batch_loss = criterion(output, data_batch, encoded_values)

            # Backward pass
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        # Advance the schedule after this epoch's optimizer steps. Stepping
        # the scheduler before the optimizer skips the first learning-rate
        # value on PyTorch >= 1.1, moving every milestone one epoch early.
        if lr_sched:
            scheduler.step()

        # .item() works for both 0-dim and one-element losses; indexing a
        # 0-dim tensor with [0] raises on current PyTorch releases.
        loss_value = batch_loss.item()

        # ===================log========================
        if verbose and epoch % log_every == log_every - 1:
            print('epoch [{}/{}], loss:{:.4f}'
                  .format(epoch + 1, num_epochs, loss_value))

        if epoch == (num_epochs - 1):
            final_loss = loss_value
            print("Final loss: loss:{:.4f}".format(final_loss))

    torch.save(model.state_dict(), "../Models/simple_autoencoder_{}.pth".format(dataset_tag))
    return model, final_loss

def encode_data(model, dataset, loss):
    """Encode every row of ``dataset`` with ``model.encoder`` and persist the result.

    Args:
        model: Trained model exposing an ``encoder`` sub-module.
        dataset: Object with a numpy array in ``.data`` and a
            ``save_processed_data(data, loss)`` method.
        loss: Label forwarded to ``dataset.save_processed_data``.

    Returns:
        The encoded data as a numpy array (the same array that was saved).
    """
    # Run on whichever device the model's weights live on, so a CPU model on a
    # CUDA-capable machine (or the reverse) does not hit a device mismatch.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    all_data = torch.from_numpy(dataset.data).float().to(device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        encoded = model.encoder(all_data)

    encoded_array = encoded.cpu().numpy()
    dataset.save_processed_data(encoded_array, loss)
    return encoded_array

#### Perform a single training run on Monte Carlo

In [None]:
# Train a sparsity-regularised autoencoder on one Monte Carlo dataset, then
# encode that dataset and write the encoding to disk.
ds_number = 1
dataset = CovariateDataset(
    "n_{}_model_{}_v_{}_covar_data",
    [1000, "A_add_lin", ds_number],
)
trained_model, final_loss = train_model(
    autoencoder, dataset, ds_number, loss="sparsity", verbose=True)
encode_data(trained_model, dataset, "sparsity")

#### Perform a single training run on NSW

In [10]:
# Train an unregularised (reconstruction-loss) autoencoder on the NSW covariates.
dataset = CovariateDataset("nsw74_all_covars")
trained_model, final_loss = train_model(
    autoencoder, dataset, "NSW", loss="reconstruction", verbose=True)

epoch [2000/20000], loss:9.4558
epoch [4000/20000], loss:7.0777
epoch [6000/20000], loss:9.1079
epoch [8000/20000], loss:1.8893
epoch [10000/20000], loss:1.6152
epoch [12000/20000], loss:1.1869
epoch [14000/20000], loss:0.8571
epoch [16000/20000], loss:0.7110
epoch [18000/20000], loss:0.6589
epoch [20000/20000], loss:0.4651
Final loss: loss:0.4651


In [1]:
# Encode the NSW data with the model trained above. This cell needs
# `encode_data`, `trained_model` and `dataset` from the earlier cells to exist
# in the current kernel; run on its own in a fresh kernel it raises NameError.
encode_data(trained_model, dataset, "reconstruction")

NameError: name 'encode_data' is not defined

### Train and Encode for Monte Carlo

This is utility code which trains an Autoencoder model for a given range of persisted datasets and then persists the encoded data back to disk. This was run in order to create input for the Monte Carlo Experiment. The only nuance worth mentioning is that the function stores and reports the dataset parameters for any models which had an end loss higher than prespecified targets. This can occur by pure chance if a model gets stuck in a flat, high-loss part of the loss space. These models were rerun later and, in all cases, better network parameters which satisfied the loss criteria were found. 

In [None]:
def run_for_range(start, end):
    """Train and encode every (dataset, assignment model, loss) combination.

    For each dataset number in ``range(start, end)``, each assignment model
    name and each entry of ``loss_functions``, the matching covariate file is
    loaded, an autoencoder is trained on it and the encoded data is saved.

    Args:
        start: First dataset number (inclusive).
        end: Last dataset number (exclusive).

    Returns:
        List of ``(model_name, dataset_number, loss_type)`` tuples whose final
        loss exceeded the acceptance limit for that loss type.
    """
    assignment_model_names = [
        'A_add_lin', 'B_add_mild_nlin', 'C_add_mod_nlin', 'D_mild_nadd_lin',
        'E_mild_nadd_mild_nlin', 'F_mod_nadd_lin', 'G_mod_nadd_mod_nlin',
    ]
    # Highest final loss accepted for each loss type; anything above is
    # recorded so it can be trained again.
    loss_limits = {loss_functions[0]: 0.30, loss_functions[1]: 1.0}
    models_to_rerun = []

    for dataset_number in range(start, end):
        print("Starting run for Dataset {}".format(dataset_number))

        for model_name in assignment_model_names:
            print("-- Running for model name: ", model_name)

            for loss_type in loss_functions:
                print("---- Running for loss: ", loss_type)

                started_at = time()

                dataset = CovariateDataset(
                    "n_{}_model_{}_v_{}_covar_data",
                    [1000, model_name, dataset_number])
                # NOTE(review): only dataset_number is passed as the dataset
                # tag, for every model_name/loss_type combination. If the
                # trainer derives its output file name from the tag alone,
                # later combinations overwrite earlier ones - confirm intent.
                trained_model, final_loss = train_model(
                    autoencoder, dataset, dataset_number,
                    loss=loss_type, verbose=True)
                encode_data(trained_model, dataset, loss=loss_type)

                print("---- Done in ", time() - started_at, " seconds\n")

                # Catch bad runs
                limit = loss_limits.get(loss_type)
                if limit is not None and final_loss > limit:
                    models_to_rerun.append((model_name, dataset_number, loss_type))

        print("================\n\n")

    print("Rerun: ", models_to_rerun)
    return models_to_rerun

In [None]:
# Retrain and re-encode each combination flagged as a bad run.
# NOTE(review): `models_to_rerun` is not assigned at notebook scope in the
# visible cells (it is only a local inside run_for_range); presumably it should
# be bound to run_for_range's return value before this cell runs - confirm.
for model_name, dataset_number, loss_type in models_to_rerun:
    dataset = CovariateDataset(
        "n_{}_model_{}_v_{}_covar_data",
        [1000, model_name, dataset_number])
    trained_model, final_loss = train_model(
        autoencoder, dataset, dataset_number, loss=loss_type, verbose=True)
    encode_data(trained_model, dataset, loss=loss_type)