In [1]:
import os
import numpy as np 

import torch
import torchvision
from torch import nn
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset

# Global GPU switch: when True, train_model and encode_data move the model and
# their input tensors to the GPU with .cuda(); when False nothing is moved.
cuda = False

### Load Data

In [36]:
# Directory containing the input CSV files that CovariateDataset loads.
RAW_DATA_DIR = "../Data/Raw/"
# Directory where CovariateDataset.save_processed_data writes its CSV output.
PROCESSED_DATA_DIR = "../Data/Processed/"

class CovariateDataset(Dataset):
    """Covariate matrix loaded from one CSV file, exposed as a torch Dataset.

    The file name is ``file_name_pattern.format(file_number)`` and is looked up
    inside ``RAW_DATA_DIR``. The first CSV column is dropped on load (the
    original code labels it as the bias column).

    Attributes:
        file_name: Resolved CSV file name; reused for the processed output.
        data: 2-D array with one row per sample and the first column removed.
    """

    def __init__(self, file_name_pattern, file_number):
        """Load the CSV identified by ``file_name_pattern`` and ``file_number``.

        Args:
            file_name_pattern: ``str.format`` pattern with one placeholder.
            file_number: Value substituted into the pattern.
        """
        self.file_name = file_name_pattern.format(file_number)
        # ndmin=2 keeps a single-row file two-dimensional; without it loadtxt
        # returns a 1-D array and the column slice below raises IndexError.
        raw = np.loadtxt(
            os.path.join(RAW_DATA_DIR, self.file_name), delimiter=",", ndmin=2)
        self.data = raw[:, 1:]  # remove bias

    def __getitem__(self, index):
        """Return ``(row, 0)``: the sample as a float array plus a constant 0.

        The second element is a fixed dummy value; train_model discards it.
        """
        return (self.data[index].astype(float), 0)

    def __len__(self):
        """Return the number of rows (samples) in ``data``."""
        return self.data.shape[0]

    def save_processed_data(self, data):
        """Write ``data`` as CSV to ``PROCESSED_DATA_DIR`` under ``file_name``.

        The output directory is created when it does not exist yet, so a fresh
        checkout does not fail at the very last step of a run.
        """
        out_path = os.path.join(PROCESSED_DATA_DIR, self.file_name)
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        np.savetxt(out_path, data, delimiter=",")

### Define Model

In [33]:
class autoencoder(nn.Module):
    """Fully connected autoencoder with one hidden ReLU layer on each side.

    The encoder maps ``input_dim -> hidden_dim -> latent_dim`` and the decoder
    mirrors it back to ``input_dim``. The defaults reproduce the original
    fixed architecture (11 -> 128 -> 4 -> 128 -> 11), so ``autoencoder()``
    builds the same network and the same ``state_dict`` keys as before.

    Args:
        input_dim: Number of features per input sample.
        hidden_dim: Width of the hidden layer in both encoder and decoder.
        latent_dim: Size of the encoded representation.
    """

    def __init__(self, input_dim=11, hidden_dim=128, latent_dim=4):
        super(autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_dim, latent_dim))
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_dim, input_dim))

    def forward(self, x):
        """Encode ``x`` and decode it again, returning the reconstruction."""
        return self.decoder(self.encoder(x))

### Train and Process Utils

In [34]:
def train_model(model_class, dataset, num_epochs=20000, batch_size=1000,
                learning_rate=1e-1, lr_sched=True, lr_milestones=(2000, 5000),
                model_path=None):
    """Train a freshly constructed model to reconstruct the rows of ``dataset``.

    The model is optimised with Adam (weight decay 1e-5) against an MSE
    reconstruction loss. The learning rate is multiplied by 0.1 at each epoch
    listed in ``lr_milestones`` when ``lr_sched`` is true. The trained weights
    are saved with ``torch.save`` and the model is returned.

    Args:
        model_class: Zero-argument callable returning an ``nn.Module``.
        dataset: Dataset yielding ``(sample, label)`` pairs; the label is
            ignored.
        num_epochs: Number of passes over the dataset.
        batch_size: Mini-batch size handed to the ``DataLoader``.
        learning_rate: Initial Adam learning rate.
        lr_sched: Whether to apply the multi-step learning-rate schedule.
        lr_milestones: Epoch indices at which the learning rate is decayed.
        model_path: Where to save the ``state_dict``. When ``None`` the legacy
            path ``../Models/simple_autoencoder_<dataset_number>.pth`` is used,
            which reads the notebook-level ``dataset_number`` variable.

    Returns:
        The trained model instance.
    """
    # Resolve the checkpoint location before training so that a missing
    # `dataset_number` global or an unwritable directory fails immediately
    # instead of after the whole training run.
    if model_path is None:
        model_path = "../Models/simple_autoencoder_{}.pth".format(dataset_number)
    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    model = model_class()
    if cuda:
        model = model.cuda()

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(
        model.parameters(), lr=learning_rate, weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, list(lr_milestones), gamma=0.1)

    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Log roughly ten times per run; max() avoids a modulo-by-zero when
    # num_epochs is smaller than 10.
    log_every = max(1, num_epochs // 10)

    for epoch in range(num_epochs):
        loss = None
        for data_batch, _ in dataloader:
            data_batch = data_batch.float()
            if cuda:
                data_batch = data_batch.cuda()

            # Forward pass
            output = model(data_batch)
            loss = criterion(output, data_batch)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Step the schedule after the optimizer updates of this epoch; stepping
        # first makes current PyTorch skip the initial learning-rate value and
        # hit every milestone one epoch early.
        if lr_sched:
            scheduler.step()

        # ===================log========================
        if loss is not None and (epoch + 1) % log_every == 0:
            # .item() extracts the Python float from the 0-dim loss tensor;
            # indexing it with [0] raises on current PyTorch.
            print('epoch [{}/{}], loss:{:.4f}'
                  .format(epoch + 1, num_epochs, loss.item()))

    torch.save(model.state_dict(), model_path)
    return model

def encode_data(model, dataset):
    """Encode every row of ``dataset.data`` and save the result.

    The full data matrix is converted to a float32 tensor, passed through
    ``model.encoder`` in one batch, and the encoded array is handed to
    ``dataset.save_processed_data``.

    Args:
        model: Module exposing an ``encoder`` sub-module.
        dataset: Object with a NumPy ``data`` array and a
            ``save_processed_data(array)`` method.
    """
    all_data = torch.from_numpy(dataset.data).float()
    if cuda:
        all_data = all_data.cuda()

    # Inference only, so skip building the autograd graph.
    with torch.no_grad():
        output = model.encoder(all_data)

    # .cpu() is required before .numpy() when the tensor lives on the GPU;
    # it is a no-op for CPU tensors.
    dataset.save_processed_data(output.cpu().numpy())

In [35]:
# Number of consecutively numbered dataset files (starting at 0) to process.
num_datasets_to_process = 1

# NOTE: the loop variable must stay named `dataset_number`; train_model reads
# it as a notebook-level global when it builds the checkpoint file name.
for dataset_number in range(num_datasets_to_process):
    print("Starting run for Dataset {}".format(dataset_number))
    # Load the covariate CSV whose file name contains this dataset number.
    dataset = CovariateDataset("n_1000_model_A_add_lin_v_{}_covar_data.csv", dataset_number)
    # Train a new autoencoder instance on this dataset, then pass the encoder
    # output for all rows to dataset.save_processed_data.
    trained_model = train_model(autoencoder, dataset)
    encode_data(trained_model, dataset)
    print("================\n\n")

Starting run for Dataset 0
epoch [2000/20000], loss:0.1650
epoch [4000/20000], loss:0.1396
epoch [6000/20000], loss:0.1359
epoch [8000/20000], loss:0.1349
epoch [10000/20000], loss:0.1329
epoch [12000/20000], loss:0.1300
epoch [14000/20000], loss:0.1264
epoch [16000/20000], loss:0.1211
epoch [18000/20000], loss:0.1169
epoch [20000/20000], loss:0.1145


