In [1]:
import os
import numpy as np 

import torch
import torchvision
from torch import nn
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset

from time import time

# Notebook-wide flag: True when PyTorch reports a usable CUDA device.
# Code further down reads this global to decide whether to move tensors/models to the GPU.
cuda = torch.cuda.is_available()

### Load Data

In [2]:
# Directory prefix of the raw covariate CSV files read by CovariateDataset.__init__.
RAW_DATA_DIR = "../Data/Raw/"
# Directory prefix under which CovariateDataset.save_processed_data writes its CSV output.
PROCESSED_DATA_DIR = "../Data/Processed/"

class CovariateDataset(Dataset):
    """Covariate matrix read from a raw CSV file and exposed as a torch ``Dataset``.

    The file is located at ``RAW_DATA_DIR + <formatted name> + ".csv"``; the
    first column of the file is discarded on load.
    """

    def __init__(self, file_name_pattern, file_name_args):
        """Load the CSV identified by ``file_name_pattern.format(*file_name_args)``.

        Args:
            file_name_pattern: ``str.format`` template for the file stem.
            file_name_args: sequence of positional values for the template.
        """
        self.file_name = file_name_pattern.format(*file_name_args)
        csv_path = RAW_DATA_DIR + self.file_name + ".csv"
        raw_table = np.loadtxt(csv_path, delimiter=",")
        # Drop column 0 (the bias column) and keep the remaining covariates.
        self.data = raw_table[:, 1:]

    def __len__(self):
        """Number of rows in the loaded covariate matrix."""
        n_rows = self.data.shape[0]
        return n_rows

    def __getitem__(self, index):
        """Return ``(row, 0)``: the indexed row as float64 plus a constant dummy label."""
        row = self.data[index].astype(float)
        return (row, 0)

    def save_processed_data(self, data, loss):
        """Write ``data`` as CSV to ``PROCESSED_DATA_DIR + <file_name>_<loss>.csv``.

        Args:
            data: array-like accepted by ``np.savetxt``.
            loss: tag appended to the file stem (the loss name in this notebook).
        """
        suffix = "_{}.csv".format(loss)
        target_path = PROCESSED_DATA_DIR + self.file_name + suffix
        np.savetxt(target_path, data, delimiter=",")

### Define Model

In [3]:
class autoencoder(nn.Module):
    """Fully connected autoencoder with one hidden layer on each side.

    Architecture: ``input_dim -> hidden_dim -> latent_dim`` (encoder) and
    ``latent_dim -> hidden_dim -> input_dim`` (decoder), with a ReLU after each
    hidden layer. The defaults (10 / 128 / 4) reproduce the original fixed
    architecture, so ``autoencoder()`` and previously saved state dicts
    (keys ``encoder.0``, ``encoder.2``, ``decoder.0``, ``decoder.2``) keep working.

    Args:
        input_dim: number of input (and reconstructed) features.
        hidden_dim: width of the hidden layer in encoder and decoder.
        latent_dim: size of the encoded representation.
    """

    def __init__(self, input_dim=10, hidden_dim=128, latent_dim=4):
        super(autoencoder, self).__init__()
        # Encoder is built before the decoder so the order of random weight
        # initialisation matches the original definition.
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(True),
            nn.Linear(hidden_dim, latent_dim))
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(True),
            nn.Linear(hidden_dim, input_dim))

    def forward(self, x):
        """Encode ``x`` and decode it again.

        Returns:
            tuple ``(reconstruction, encoded_values)``.
        """
        encoded_values = self.encoder(x)
        reconstruction = self.decoder(encoded_values)
        return reconstruction, encoded_values

### Train and Process Utils

In [4]:
def reconstruction_sparsity_loss(output, target, encoded_values, sparsity_weight=0.0005):
    """Mean-squared reconstruction error plus an L1 sparsity penalty on the code.

    Computes ``MSE(output, target) + sparsity_weight * sum(|encoded_values|)``.

    The penalty weight is applied as a plain Python scalar, so the result
    lives on whatever device the inputs are on; no global CUDA flag is
    consulted and no ``Variable`` wrapper is needed.

    Args:
        output: reconstructed batch produced by the model.
        target: batch the reconstruction is compared against.
        encoded_values: latent codes for the batch.
        sparsity_weight: multiplier of the L1 penalty (default 0.0005, the
            value that used to be hard-coded).

    Returns:
        A one-element tensor holding the combined loss.
    """
    mse_loss = nn.MSELoss()
    reconstruction_term = mse_loss(output, target)
    sparsity_term = encoded_values.abs().sum() * sparsity_weight
    # view(1) keeps the one-element shape the previous implementation
    # returned, so callers that index the result (loss.data[0]) still work.
    return (reconstruction_term + sparsity_term).view(1)
    
def reconstruction_loss(output, target, encoded_values):
    """Plain mean-squared reconstruction error between ``output`` and ``target``.

    ``encoded_values`` is accepted only so this function has the same call
    signature as the other loss functions; it is not used.
    """
    criterion = nn.MSELoss()
    return criterion(output, target)

In [5]:
# Names accepted by train_model's `loss` argument: index 0 selects the plain
# reconstruction loss, index 1 the reconstruction + sparsity loss.
loss_functions = ["reconstruction", "sparsity"]

def train_model(model_class, dataset, dataset_number, loss="reconstruction", verbose=True,
                num_epochs=10000, batch_size=1000, learning_rate=1e-1, lr_sched=True,
                model_dir="../Models/", device=None):
    """Train a fresh ``model_class()`` autoencoder on ``dataset`` and save its weights.

    Optimisation uses Adam (weight decay 1e-5) with an optional MultiStepLR
    schedule that multiplies the learning rate by 0.1 at epochs 2000 and 5000.

    Args:
        model_class: zero-argument callable returning an ``nn.Module`` whose
            forward pass returns ``(reconstruction, encoded_values)``.
        dataset: dataset yielding ``(row, label)`` pairs; the label is ignored.
        dataset_number: identifier used in the checkpoint file name.
        loss: one of the names in ``loss_functions``, or a callable
            ``criterion(output, target, encoded_values)`` returning a
            one-element loss tensor.
        verbose: when True, print the loss ten times over the course of training.
        num_epochs: number of passes over ``dataset``.
        batch_size: mini-batch size handed to the ``DataLoader``.
        learning_rate: initial Adam learning rate.
        lr_sched: whether to apply the MultiStepLR schedule.
        model_dir: directory the checkpoint is written to (created if missing).
        device: device to train on; ``None`` follows the notebook-wide
            ``cuda`` flag.

    Returns:
        ``(model, final_loss)`` where ``final_loss`` is the float loss of the
        last mini-batch of the last epoch (``None`` if no batch was processed).

    Raises:
        ValueError: if ``loss`` is neither a known name nor a callable.
    """
    # Resolve the criterion first so that a misspelled loss name fails
    # immediately instead of surfacing as a NameError after the setup work.
    if callable(loss):
        criterion = loss
    elif loss == loss_functions[0]:
        criterion = reconstruction_loss
    elif loss == loss_functions[1]:
        criterion = reconstruction_sparsity_loss
    else:
        raise ValueError(
            "Unknown loss {!r}; expected one of {} or a callable".format(loss, loss_functions))

    if device is None:
        device = torch.device("cuda" if cuda else "cpu")

    model = model_class().to(device)

    optimizer = torch.optim.Adam(
        model.parameters(), lr=learning_rate, weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [2000, 5000], gamma=0.1)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Log roughly ten times per run; max() avoids a zero interval (and a
    # modulo-by-zero) when fewer than ten epochs are requested.
    log_every = max(1, num_epochs // 10)
    final_loss = None

    for epoch in range(num_epochs):
        batch_loss = None
        for data_batch, _ in dataloader:
            data_batch = data_batch.float().to(device)

            # Forward pass
            output, encoded_values = model(data_batch)
            batch_loss = criterion(output, data_batch, encoded_values)

            # Backward pass
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        if batch_loss is None:
            # No batch was produced this epoch; nothing to schedule or report.
            continue

        # The scheduler is stepped after the epoch's optimizer updates, so the
        # decay takes effect exactly at the configured milestone epochs.
        if lr_sched:
            scheduler.step()

        is_last_epoch = epoch == num_epochs - 1
        should_log = verbose and epoch % log_every == log_every - 1
        if should_log or is_last_epoch:
            # .item() works for both 0-dim and one-element loss tensors.
            loss_value = batch_loss.item()
            if should_log:
                print('epoch [{}/{}], loss:{:.4f}'
                      .format(epoch + 1, num_epochs, loss_value))
            if is_last_epoch:
                final_loss = loss_value
                print("Final loss: loss:{:.4f}".format(final_loss))

    # NOTE(review): the file name depends only on dataset_number, so successive
    # runs for the same dataset number overwrite each other's checkpoint.
    os.makedirs(model_dir, exist_ok=True)
    checkpoint_path = os.path.join(model_dir, "simple_autoencoder_{}.pth".format(dataset_number))
    torch.save(model.state_dict(), checkpoint_path)
    return model, final_loss

def encode_data(model, dataset, loss):
    """Encode every row of ``dataset`` with ``model.encoder`` and persist the result.

    The input is moved to the device the model's parameters live on, so the
    call works regardless of where the model was trained. Encoding runs under
    ``torch.no_grad()`` because no gradients are needed for inference.

    Args:
        model: module exposing an ``encoder`` sub-module.
        dataset: object with a numpy ``data`` attribute and a
            ``save_processed_data(data, loss)`` method.
        loss: tag forwarded to ``dataset.save_processed_data``.

    Returns:
        The encoded rows as a numpy array (the same array that was saved).
    """
    # Follow the model's own device; fall back to CPU for parameter-less models.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    all_data = torch.from_numpy(dataset.data).float().to(device)

    with torch.no_grad():
        encoded = model.encoder(all_data).detach().cpu().numpy()

    dataset.save_processed_data(encoded, loss)
    return encoded

In [6]:
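# Example single run (kept commented out; uncomment to smoke-test one dataset).
# dataset = CovariateDataset("n_{}_model_{}_v_{}_covar_data", [1000, "A_add_lin", 1])
# trained_model, final_loss = train_model(
#                                     autoencoder,
#                                     dataset,
#                                     1,  # dataset_number (required positional argument)
#                                     loss="sparsity",
#                                     verbose=True)
# encode_data(trained_model, dataset, "sparsity")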

### Train and Encode

In [15]:
def run_for_range(start, end):
    """Train and encode every (dataset, assignment model, loss) combination.

    For each dataset number in ``range(start, end)``, each assignment model
    name and each entry of ``loss_functions``, the matching covariate file is
    loaded, an autoencoder is trained on it and the encoded data is saved.

    Runs whose final loss exceeds the per-loss threshold (0.30 for the first
    loss function, 1.0 for the second) are collected for a later rerun.

    Returns:
        list of ``(model_name, dataset_number, loss_type)`` tuples to rerun.
    """
    file_pattern = "n_{}_model_{}_v_{}_covar_data"
    assignment_model_names = ['A_add_lin', 'B_add_mild_nlin', 'C_add_mod_nlin', 'D_mild_nadd_lin',
                              'E_mild_nadd_mild_nlin', 'F_mod_nadd_lin', 'G_mod_nadd_mod_nlin']
    flagged_runs = []

    for dataset_number in range(start, end):
        print("Starting run for Dataset {}".format(dataset_number))

        for model_name in assignment_model_names:
            print("-- Running for model name: ", model_name)

            for loss_type in loss_functions:
                print("---- Running for loss: ", loss_type)

                began_at = time()

                dataset = CovariateDataset(file_pattern, [1000, model_name, dataset_number])
                trained_model, final_loss = train_model(
                    autoencoder,
                    dataset,
                    dataset_number,
                    loss=loss_type,
                    verbose=True)
                encode_data(trained_model, dataset, loss=loss_type)

                print("---- Done in ", time() - began_at, " seconds\n")

                # Catch bad runs: each loss type has its own acceptance threshold.
                is_bad_run = (
                    (loss_type == loss_functions[0] and final_loss > 0.30)
                    or (loss_type == loss_functions[1] and final_loss > 1.0)
                )
                if is_bad_run:
                    flagged_runs.append((model_name, dataset_number, loss_type))

        print("================\n\n")

    print("Rerun: ", flagged_runs)
    return flagged_runs


# In[38]:


# Train and encode all assignment model / loss combinations for dataset
# numbers 275..299 (the bounds are passed to range(), so the end is exclusive).
run_for_range(275, 300)

Starting run for Dataset 275
-- Running for model name:  A_add_lin
---- Running for loss:  reconstruction
epoch [1000/10000], loss:0.6139
epoch [2000/10000], loss:0.6139
epoch [3000/10000], loss:0.6139
epoch [4000/10000], loss:0.6139
epoch [5000/10000], loss:0.6139
epoch [6000/10000], loss:0.6139
epoch [7000/10000], loss:0.6139
epoch [8000/10000], loss:0.6139
epoch [9000/10000], loss:0.4075
epoch [10000/10000], loss:0.3119
Final loss: loss:0.3119
---- Done in  306.6536464691162  seconds

---- Running for loss:  sparsity
epoch [1000/10000], loss:0.4444
epoch [2000/10000], loss:0.3923
epoch [3000/10000], loss:0.3325
epoch [4000/10000], loss:0.3244
epoch [5000/10000], loss:0.3215
epoch [6000/10000], loss:0.3200
epoch [7000/10000], loss:0.3195
epoch [8000/10000], loss:0.3187
epoch [9000/10000], loss:0.3179
epoch [10000/10000], loss:0.3170
Final loss: loss:0.3170
---- Done in  239.48682641983032  seconds

-- Running for model name:  B_add_mild_nlin
---- Running for loss:  reconstruction
epo

KeyboardInterrupt: 

In [10]:
# Every (assignment model, dataset number, loss) run flagged as a bad run so far,
# in the order the runs were recorded. This single list supersedes the partial
# lists that used to be assigned (and immediately overwritten) above it.
all_flagged_runs = [
    ('A_add_lin', 169, 'sparsity'), ('D_mild_nadd_lin', 169, 'reconstruction'),
    ('E_mild_nadd_mild_nlin', 170, 'reconstruction'),
    ('E_mild_nadd_mild_nlin', 144, 'reconstruction'),
    ('F_mod_nadd_lin', 144, 'reconstruction'),
    ('C_add_mod_nlin', 146, 'reconstruction'),
    ('E_mild_nadd_mild_nlin', 147, 'reconstruction'),
    ('B_add_mild_nlin', 148, 'reconstruction'), ('B_add_mild_nlin', 149, 'reconstruction'),
    ('C_add_mod_nlin', 120, 'sparsity'), ('C_add_mod_nlin', 122, 'reconstruction'),
    ('D_mild_nadd_lin', 123, 'reconstruction'),
    # A_add_lin, sparsity
    ('A_add_lin', 78, 'sparsity'), ('A_add_lin', 79, 'sparsity'), ('A_add_lin', 86, 'sparsity'),
    ('A_add_lin', 102, 'sparsity'), ('A_add_lin', 105, 'sparsity'), ('A_add_lin', 117, 'sparsity'),
    ('A_add_lin', 144, 'sparsity'), ('A_add_lin', 165, 'sparsity'), ('A_add_lin', 167, 'sparsity'),
    ('A_add_lin', 175, 'sparsity'), ('A_add_lin', 196, 'sparsity'),
    # B_add_mild_nlin, sparsity
    ('B_add_mild_nlin', 75, 'sparsity'), ('B_add_mild_nlin', 98, 'sparsity'),
    ('B_add_mild_nlin', 135, 'sparsity'), ('B_add_mild_nlin', 174, 'sparsity'),
    ('B_add_mild_nlin', 183, 'sparsity'), ('B_add_mild_nlin', 184, 'sparsity'),
    ('B_add_mild_nlin', 198, 'sparsity'),
    # C_add_mod_nlin, sparsity
    ('C_add_mod_nlin', 55, 'sparsity'), ('C_add_mod_nlin', 84, 'sparsity'),
    ('C_add_mod_nlin', 99, 'sparsity'), ('C_add_mod_nlin', 104, 'sparsity'),
    ('C_add_mod_nlin', 116, 'sparsity'), ('C_add_mod_nlin', 119, 'sparsity'),
    ('C_add_mod_nlin', 136, 'sparsity'), ('C_add_mod_nlin', 169, 'sparsity'),
    ('C_add_mod_nlin', 176, 'sparsity'),
    # D_mild_nadd_lin, sparsity
    ('D_mild_nadd_lin', 61, 'sparsity'), ('D_mild_nadd_lin', 68, 'sparsity'),
    ('D_mild_nadd_lin', 112, 'sparsity'), ('D_mild_nadd_lin', 123, 'sparsity'),
    ('D_mild_nadd_lin', 128, 'sparsity'), ('D_mild_nadd_lin', 134, 'sparsity'),
    ('D_mild_nadd_lin', 143, 'sparsity'), ('D_mild_nadd_lin', 154, 'sparsity'),
    ('D_mild_nadd_lin', 156, 'sparsity'), ('D_mild_nadd_lin', 165, 'sparsity'),
    ('D_mild_nadd_lin', 172, 'sparsity'), ('D_mild_nadd_lin', 177, 'sparsity'),
    ('D_mild_nadd_lin', 183, 'sparsity'),
    # E_mild_nadd_mild_nlin, sparsity
    ('E_mild_nadd_mild_nlin', 54, 'sparsity'), ('E_mild_nadd_mild_nlin', 86, 'sparsity'),
    ('E_mild_nadd_mild_nlin', 153, 'sparsity'), ('E_mild_nadd_mild_nlin', 155, 'sparsity'),
    ('E_mild_nadd_mild_nlin', 163, 'sparsity'), ('E_mild_nadd_mild_nlin', 173, 'sparsity'),
    ('E_mild_nadd_mild_nlin', 191, 'sparsity'), ('E_mild_nadd_mild_nlin', 197, 'sparsity'),
    # F_mod_nadd_lin, sparsity
    ('F_mod_nadd_lin', 77, 'sparsity'), ('F_mod_nadd_lin', 80, 'sparsity'),
    ('F_mod_nadd_lin', 111, 'sparsity'), ('F_mod_nadd_lin', 117, 'sparsity'),
    ('F_mod_nadd_lin', 120, 'sparsity'), ('F_mod_nadd_lin', 122, 'sparsity'),
    ('F_mod_nadd_lin', 124, 'sparsity'), ('F_mod_nadd_lin', 137, 'sparsity'),
    ('F_mod_nadd_lin', 199, 'sparsity'),
    # G_mod_nadd_mod_nlin, sparsity
    ('G_mod_nadd_mod_nlin', 68, 'sparsity'), ('G_mod_nadd_mod_nlin', 85, 'sparsity'),
    ('G_mod_nadd_mod_nlin', 89, 'sparsity'), ('G_mod_nadd_mod_nlin', 118, 'sparsity'),
    ('G_mod_nadd_mod_nlin', 120, 'sparsity'), ('G_mod_nadd_mod_nlin', 128, 'sparsity'),
    ('G_mod_nadd_mod_nlin', 153, 'sparsity'), ('G_mod_nadd_mod_nlin', 162, 'sparsity'),
    ('G_mod_nadd_mod_nlin', 196, 'sparsity'), ('G_mod_nadd_mod_nlin', 199, 'sparsity'),
    ('G_mod_nadd_mod_nlin', 187, 'reconstruction'),
]

# Only the non-sparsity runs are retrained by the cell below; the full record
# stays available in all_flagged_runs, so re-running this cell is idempotent.
models_to_rerun = [run for run in all_flagged_runs if run[2] != "sparsity"]
len(models_to_rerun)

11

In [12]:
# Optional manual override of the rerun list (left disabled):
# models_to_rerun = [('A_add_lin', 12, 'sparsity'), ('G_mod_nadd_mod_nlin', 40, 'sparsity')]

# Retrain and re-encode each flagged (model_name, dataset_number, loss_type) run.
# The returned final_loss is not checked against the bad-run thresholds here,
# so a rerun that is still poor has to be spotted in the printed log.
for model_name, dataset_number, loss_type in models_to_rerun:
    dataset = CovariateDataset("n_{}_model_{}_v_{}_covar_data", [1000, model_name, dataset_number])
    trained_model, final_loss = train_model(
                                        autoencoder,
                                        dataset,
                                        dataset_number,
                                        loss=loss_type,
                                        verbose=True)
    # Overwrites the processed CSV previously written for this file name and loss.
    encode_data(trained_model, dataset, loss=loss_type)

epoch [1000/10000], loss:0.1666
epoch [2000/10000], loss:0.1637
epoch [3000/10000], loss:0.1496
epoch [4000/10000], loss:0.1477
epoch [5000/10000], loss:0.1454
epoch [6000/10000], loss:0.1446
epoch [7000/10000], loss:0.1442
epoch [8000/10000], loss:0.1428
epoch [9000/10000], loss:0.1414
epoch [10000/10000], loss:0.1396
Final loss: loss:0.1396
epoch [1000/10000], loss:0.1917
epoch [2000/10000], loss:0.1723
epoch [3000/10000], loss:0.1670
epoch [4000/10000], loss:0.1620
epoch [5000/10000], loss:0.1576
epoch [6000/10000], loss:0.1566
epoch [7000/10000], loss:0.1555
epoch [8000/10000], loss:0.1545
epoch [9000/10000], loss:0.1528
epoch [10000/10000], loss:0.1505
Final loss: loss:0.1505
epoch [1000/10000], loss:0.1786
epoch [2000/10000], loss:0.1634
epoch [3000/10000], loss:0.1559
epoch [4000/10000], loss:0.1526
epoch [5000/10000], loss:0.1493
epoch [6000/10000], loss:0.1486
epoch [7000/10000], loss:0.1479
epoch [8000/10000], loss:0.1471
epoch [9000/10000], loss:0.1457
epoch [10000/10000], l