In [1]:
import os
import torch
import torch.utils.data
from torch import nn, optim
from torch.autograd import Variable
from torch.nn import functional as F
from torchvision import datasets, transforms

from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader

import numpy as np
from time import time

CUDA = torch.cuda.is_available()

In [65]:
set([1,2,3])^set([1])

{2, 3}

In [83]:
RAW_DATA_DIR = "../Data/Raw/"
PROCESSED_DATA_DIR = "../Data/Processed/Regression/"

class CovariateDataset(Dataset):
    def __init__(self, file_name_pattern, file_name_args, train=0.8):
        self.train = True
        
        self.file_name = file_name_pattern.format(*file_name_args, "covar")
        self.assignment_file_name = file_name_pattern.format(*file_name_args, "assignment")
        
        self.data = np.loadtxt(RAW_DATA_DIR + self.file_name + ".csv", delimiter=",")[:, 1:] # remove bias
        self.assignment_data = np.loadtxt(RAW_DATA_DIR + self.assignment_file_name + ".csv", delimiter=",")
        
        all_indeces = list(range(len(self.data)))
        self.train_indeces = np.random.choice(all_indeces, int(len(self.data)*train))
        self.test_indeces = list(set(all_indeces)^set(self.train_indeces))
    
    def active_data(self):
        if self.train:
            return self.data[self.train_indeces], self.assignment_data[self.train_indeces]
        else:
            return self.data[self.test_indeces], self.assignment_data[self.test_indeces]
            
    def __getitem__(self, index):
        covar_data, assignment_data = self.active_data()
        return (covar_data[index], assignment_data[index])

    def __len__(self):
        return self.active_data()[0].shape[0]
    
    def save_processed_data(self, data):
        name = PROCESSED_DATA_DIR + self.file_name+".csv"
        np.savetxt(name, data, delimiter=",")

In [148]:
# Based on an example from https://github.com/pytorch/examples/blob/master/vae/main.py
# Extended to place a different prior on binary vs normal vars

INTERMEDIATE_DIMS_1 = 1024
INTERMEDIATE_DIMS_2 = 512
INTERMEDIATE_DIMS_3 = 512
INTERMEDIATE_DIMS_4 = 256
INTERMEDIATE_DIMS_5 = 128
INTERMEDIATE_DIMS_6 = 64
INTERMEDIATE_DIMS_7 = 32

FEATURES = 10

LOSS_SCALE = 1000000

class Regressor(nn.Module):
    def __init__(self):
        super(Regressor, self).__init__()
        
        # ENCODER LAYERS
        self.dense1 = nn.Linear(FEATURES, INTERMEDIATE_DIMS_1)
        self.dense2 = nn.Linear(INTERMEDIATE_DIMS_1, INTERMEDIATE_DIMS_2)
        self.dense3 = nn.Linear(INTERMEDIATE_DIMS_2, INTERMEDIATE_DIMS_3)
        self.dense4 = nn.Linear(INTERMEDIATE_DIMS_3, INTERMEDIATE_DIMS_4)
        self.dense5 = nn.Linear(INTERMEDIATE_DIMS_4, INTERMEDIATE_DIMS_5)
        self.dense6 = nn.Linear(INTERMEDIATE_DIMS_5, INTERMEDIATE_DIMS_6)
        self.dense7 = nn.Linear(INTERMEDIATE_DIMS_6, INTERMEDIATE_DIMS_7)
        self.dense8 = nn.Linear(INTERMEDIATE_DIMS_7, 1)
        
        # Activations
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        h1 = self.dropout(self.relu(self.dense1(x)))
        h2 = self.dropout(self.relu(self.dense2(h1)))
        h3 = self.dropout(self.relu(self.dense3(h2)))
        h4 = self.dropout(self.relu(self.dense4(h3)))
        h5 = self.dropout(self.relu(self.dense5(h4)))
        h6 = self.dropout(self.relu(self.dense6(h5)))
        h7 = self.relu(self.dense7(h6))
        
        return self.sigmoid(self.dense8(h7))

def train(model, optimizer, epoch, train_loader, log_results=False):
    model.train()
    train_loss = 0

    for batch_idx, (data, target_class) in enumerate(train_loader):
        data = Variable(data)
        target_class = Variable(target_class)
        
        data = data.float()
        target_class = target_class.float()
        
        if CUDA:
            data = data.cuda()
            target_class = target_class.cuda()
        
        optimizer.zero_grad()

        output_propensity = model(data)
        
        # calculate loss
        loss = F.binary_cross_entropy(output_propensity, target_class.view(-1, 1))*LOSS_SCALE
        train_loss += loss.data[0]
        
        # Find the gradient and descend
        loss.backward()
        optimizer.step()
        
    if log_results:
        print('====> Epoch: {} Average loss: {:.8f}'.format(
              epoch, train_loss / len(train_loader.dataset)))
        
        
def test(model, epoch, test_loader):
    # toggle model to test / inference mode
    model.eval()
    test_loss = 0

    for i, (data, target_class) in enumerate(test_loader):
        data = Variable(data, volatile=True)
        target_class = Variable(target_class, volatile=True)
        
        data = data.float()
        target_class = target_class.float()
        
        if CUDA:
            data = data.cuda()
            target_class = target_class.cuda()

        output_propensity = model(data)
        
        # calculate loss
        loss = F.binary_cross_entropy(output_propensity, target_class.view(-1, 1))*LOSS_SCALE
        test_loss += loss.data[0]

    test_loss /= len(test_loader.dataset)
    print('====> Test set loss: {:.4f}'.format(test_loss))
    
def predict(model, test_loader):
    # Show reconstruction
    model.eval()
    print("Training state: ", model.training)
    
    original_data, targets = next(iter(test_loader))
    
    original_data = Variable(original_data)
    original_data = original_data.float()
    
    if CUDA:
        original_data = original_data.cuda()
        
    return original_data, targets, model(original_data)

In [149]:
def train_model(model_class, train_set, test_set, dataset_number, verbose=True, model=None):
    if model is None:
        model = model_class()
        if CUDA:
            model = model.cuda()

    num_epochs = 10000
    train_batch_size = 400
    test_batch_size = 200
    learning_rate = 1e-2
    lr_sched = True
         
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [int(num_epochs/4), int(num_epochs/2)], gamma=0.5)

    train_loader = DataLoader(train_set, batch_size=train_batch_size, shuffle=True)
    test_loader = DataLoader(test_set, batch_size=test_batch_size, shuffle=True)
    
    for epoch in range(1, num_epochs+1):
        checkpoint_interval = int(num_epochs/10)
        
        if lr_sched:
            scheduler.step()

        log = False
        if epoch%checkpoint_interval == 0:
            log = True
            
        train(model, optimizer, epoch, train_loader, log_results=log)
        if log:
            test(model, epoch, train_loader)
    
    original_data, targets, output = predict(model, test_loader)
    
    return model, original_data, targets, output

def encode_data(model, dataset):
    original_data, output = predict(model, dataset)
    
    if CUDA:
        output = output.cpu()
    
    output = output.numpy()
        
    dataset.save_processed_data(output)

In [150]:
train_set = CovariateDataset("n_{}_model_{}_v_{}_{}_data", [1000, "A_add_lin", 1])
test_set = CovariateDataset("n_{}_model_{}_v_{}_{}_data", [1000, "A_add_lin", 1])
test_set.train = False

trained_model, original_data, targets, output = \
    train_model(Regressor, train_set, test_set, 1,verbose=True)

# encode_data(trained_model, dataset)

====> Epoch: 1000 Average loss: 1688.20226563
====> Test set loss: 1689.8445
====> Epoch: 2000 Average loss: 1689.42031250
====> Test set loss: 1689.8373
====> Epoch: 3000 Average loss: 1689.78710938
====> Test set loss: 1689.8380
====> Epoch: 4000 Average loss: 1773.02562500
====> Test set loss: 1689.8438
====> Epoch: 5000 Average loss: 1690.94406250
====> Test set loss: 1689.8412
====> Epoch: 6000 Average loss: 1689.37773438
====> Test set loss: 1689.8438
====> Epoch: 7000 Average loss: 1689.54195312
====> Test set loss: 1689.8402
====> Epoch: 8000 Average loss: 1689.26132813
====> Test set loss: 1689.8404
====> Epoch: 9000 Average loss: 1687.45156250
====> Test set loss: 1689.8413
====> Epoch: 10000 Average loss: 1852.04562500
====> Test set loss: 1689.8564
Training state:  False


In [147]:
res = np.abs(output.data.numpy().reshape(1, -1)[0] -  targets.numpy().reshape(1, -1)[0])> 0.5
np.unique(res, return_counts=True)

(array([False,  True]), array([169,  31]))

In [93]:
39/200

0.195