In [3]:
import os
import numpy as np 

import torch
import torchvision
from torch import nn
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset

# Module-level flag: True when PyTorch reports a usable CUDA device. The loss,
# training and encoding helpers in this notebook read it to decide whether to
# move models and tensors to the GPU.
cuda = torch.cuda.is_available()

### Load Data

In [31]:
# Directory CovariateDataset reads its input CSV from. The trailing slash is
# required because the file path is built by plain string concatenation.
RAW_DATA_DIR = "../Data/Raw/"
# Directory CovariateDataset.save_processed_data writes its CSV output to
# (same trailing-slash requirement as above).
PROCESSED_DATA_DIR = "../Data/Processed/"

class CovariateDataset(Dataset):
    """Dataset backed by a single comma-separated covariate file.

    The file name is produced by formatting ``file_name_pattern`` with
    ``file_name_args``; the file is read from ``RAW_DATA_DIR`` and the same
    name is reused when writing processed output to ``PROCESSED_DATA_DIR``.
    """

    def __init__(self, file_name_pattern, file_name_args):
        """Load the CSV and keep every column except the first one."""
        self.file_name = file_name_pattern.format(*file_name_args)
        raw_path = RAW_DATA_DIR + self.file_name
        loaded = np.loadtxt(raw_path, delimiter=",")
        # Column 0 is the bias column and is not part of the covariates.
        self.data = loaded[:, 1:]

    def __getitem__(self, index):
        """Return ``(row_as_float_array, 0)``; the label is a constant dummy."""
        sample = self.data[index].astype(float)
        return sample, 0

    def __len__(self):
        """Number of rows (samples) in the loaded array."""
        return len(self.data)

    def save_processed_data(self, data):
        """Write ``data`` as CSV under ``PROCESSED_DATA_DIR`` using this dataset's file name."""
        target_path = PROCESSED_DATA_DIR + self.file_name
        np.savetxt(target_path, data, delimiter=",")

### Define Model

In [17]:
class autoencoder(nn.Module):
    """Fully connected autoencoder with one ReLU hidden layer per half.

    The encoder maps ``input_dim`` features to a ``code_dim``-dimensional code
    through a hidden layer of width ``hidden_dim``; the decoder mirrors that
    path back to ``input_dim`` features.

    Args:
        input_dim: Number of features per sample. Defaults to 10.
        hidden_dim: Width of the hidden layer in both halves. Defaults to 128.
        code_dim: Size of the encoded representation. Defaults to 4.

    The defaults reproduce the original fixed 10-128-4-128-10 layout and the
    ``encoder``/``decoder`` submodules keep the same layer order, so
    ``autoencoder()`` and state dicts saved from it are unaffected.
    """

    def __init__(self, input_dim=10, hidden_dim=128, code_dim=4):
        super(autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(True),  # True -> in-place activation
            nn.Linear(hidden_dim, code_dim))
        self.decoder = nn.Sequential(
            nn.Linear(code_dim, hidden_dim),
            nn.ReLU(True),  # True -> in-place activation
            nn.Linear(hidden_dim, input_dim))

    def forward(self, x):
        """Return ``(reconstruction, encoded_values)`` for the batch ``x``."""
        encoded_values = self.encoder(x)
        x = self.decoder(encoded_values)
        return x, encoded_values

### Train and Process Utils

In [33]:
def reconstruction_sparsity_loss(output, target, encoded_values, sparsity_weight=0.1):
    """Mean-squared reconstruction error plus an L1 penalty on the code.

    Args:
        output: Reconstructed batch produced by the model.
        target: Batch the reconstruction is compared against.
        encoded_values: Encoder activations to be penalised.
        sparsity_weight: Multiplier applied to the L1 penalty. Defaults to
            0.1, the value that used to be hard-coded.

    Returns:
        A one-element tensor holding
        ``MSE(output, target) + sparsity_weight * sum(|encoded_values|)``.
        Note that the penalty is a sum, not a mean, so it grows with the
        number of encoded elements in the batch.
    """
    # Create the weight next to the activations (same device and dtype)
    # instead of consulting the module-level ``cuda`` flag; this keeps the
    # loss usable for CPU tensors on a machine that also has a GPU. Keeping
    # it a one-element tensor preserves the shape of the returned loss.
    sparsity_scalar = encoded_values.new_tensor([sparsity_weight])

    mse_loss = nn.MSELoss()
    reconstruction_term = mse_loss(output, target)
    sparsity_term = encoded_values.abs().sum() * sparsity_scalar
    return reconstruction_term + sparsity_term
    
def reconstruction_loss(output, target, encoded_values):
    """Plain mean-squared error between ``output`` and ``target``.

    ``encoded_values`` is accepted only so this function shares a signature
    with the other loss functions; it is not used.
    """
    criterion = nn.MSELoss()
    return criterion(output, target)

In [37]:
def train_model(model_class, dataset, loss="reconstruction", verbose=True,
                num_epochs=15000, batch_size=1000, learning_rate=1e-1,
                lr_sched=True, save_path=None):
    """Train a fresh ``model_class()`` on ``dataset`` and save its weights.

    Args:
        model_class: Zero-argument callable returning an ``nn.Module`` whose
            forward pass returns ``(output, encoded_values)``.
        dataset: Dataset yielding ``(sample, label)`` pairs; the label is
            ignored and each sample is used as its own reconstruction target.
        loss: ``"reconstruction"`` or ``"sparsity"`` to select one of this
            notebook's loss functions, or a callable
            ``criterion(output, target, encoded_values)``.
        verbose: When true, print the last batch loss ten times over the run.
        num_epochs: Number of passes over the dataset.
        batch_size: Mini-batch size handed to the ``DataLoader``.
        learning_rate: Initial Adam learning rate.
        lr_sched: When true, multiply the learning rate by 0.1 after epochs
            2000 and 5000.
        save_path: Destination for the saved ``state_dict``. When ``None`` the
            historical path ``../Models/simple_autoencoder_<dataset_number>.pth``
            is used, where ``dataset_number`` is looked up in the notebook's
            global namespace (it is the driver loop's variable).

    Returns:
        The trained model (on the GPU when the module-level ``cuda`` flag is set).

    Raises:
        ValueError: If ``loss`` is neither a known name nor a callable.
    """
    # Resolve the criterion first so a typo fails immediately instead of
    # surfacing as an unbound local inside the training loop.
    if callable(loss):
        criterion = loss
    elif loss == "reconstruction":
        criterion = reconstruction_loss
    elif loss == "sparsity":
        criterion = reconstruction_sparsity_loss
    else:
        raise ValueError(
            "Unknown loss {!r}; expected 'reconstruction', 'sparsity' or a callable".format(loss))

    model = model_class()
    if cuda:
        model = model.cuda()

    optimizer = torch.optim.Adam(
        model.parameters(), lr=learning_rate, weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [2000, 5000], gamma=0.1)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Log roughly ten times per run; max() avoids a zero modulus when
    # num_epochs is smaller than ten.
    log_every = max(1, num_epochs // 10)
    batch_loss = None  # stays None if the dataloader yields no batches

    for epoch in range(num_epochs):
        for data_batch, _ in dataloader:
            data_batch = data_batch.float()
            if cuda:
                data_batch = data_batch.cuda()

            # Forward pass
            output, encoded_values = model(data_batch)
            batch_loss = criterion(output, data_batch, encoded_values)

            # Backward pass
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        # Step the scheduler after the epoch's optimizer updates; stepping it
        # first shifts every milestone one epoch early in current PyTorch.
        if lr_sched:
            scheduler.step()

        # ===================log========================
        if verbose and batch_loss is not None and (epoch + 1) % log_every == 0:
            # .item() works for both 0-dim and one-element loss tensors,
            # unlike indexing ``.data[0]``.
            print('epoch [{}/{}], loss:{:.4f}'
                  .format(epoch + 1, num_epochs, batch_loss.item()))

    if save_path is None:
        # Backward-compatible default: relies on the notebook-level
        # ``dataset_number`` variable set by the driver loop.
        save_path = "../Models/simple_autoencoder_{}.pth".format(dataset_number)
    torch.save(model.state_dict(), save_path)
    return model

def encode_data(model, dataset):
    """Encode every row of ``dataset.data`` and hand the result to the dataset.

    Args:
        model: Trained model exposing an ``encoder`` submodule.
        dataset: Object with a NumPy ``data`` array and a
            ``save_processed_data(array)`` method.

    The whole array is pushed through ``model.encoder`` in one batch (on the
    GPU when the module-level ``cuda`` flag is set) and the float32 codes are
    passed to ``dataset.save_processed_data`` as a NumPy array. Returns ``None``.
    """
    all_data = torch.from_numpy(dataset.data).float()
    if cuda:
        all_data = all_data.cuda()

    # Pure inference: skip autograd bookkeeping so no graph is kept alive for
    # the full dataset.
    with torch.no_grad():
        encoded = model.encoder(all_data)

    # .cpu() is a no-op for tensors that already live on the CPU.
    dataset.save_processed_data(encoded.detach().cpu().numpy())

In [None]:
# Identifiers substituted into the ``model_{}`` slot of the covariate CSV
# file name.
assignment_model_names = ['A_add_lin', 'B_add_mild_nlin', 'C_add_mod_nlin', 'D_mild_nadd_lin',
                     'E_mild_nadd_mild_nlin', 'F_mod_nadd_lin', 'G_mod_nadd_mod_nlin']
# Number of dataset versions (the ``v_{}`` slot of the file name) to process.
num_datasets_to_process = 100

# NOTE: the loop variable must keep the name ``dataset_number`` -- train_model
# reads it as a global when it builds the checkpoint file name.
for dataset_number in range(num_datasets_to_process):
    print("Starting run for Dataset {}".format(dataset_number))
    # The [:1] slice restricts the run to the first assignment model only.
    for model_name in assignment_model_names[:1]:
        print("Running for model name: ", model_name)
        # 1000 fills the ``n_{}`` slot of the file name (presumably the sample
        # count -- confirm against the data generator).
        dataset = CovariateDataset("n_{}_model_{}_v_{}_covar_data.csv", [1000, model_name, dataset_number])
        # Train a fresh autoencoder on this file, then write its encoded
        # covariates to the processed-data directory.
        trained_model = train_model(autoencoder, dataset, verbose=False)
        encode_data(trained_model, dataset)
    print("================\n\n")

Starting run for Dataset 0
Running for model name:  A_add_lin
