In [1]:
import os
import torch
import torch.utils.data
from torch import nn, optim
from torch.autograd import Variable
from torch.nn import functional as F
from torchvision import datasets, transforms

from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader

import numpy as np
from time import time

CUDA = torch.cuda.is_available()

In [None]:
# all_indeces = np.array(range(len(self.data)))
# treat_indeces = all_indeces[self.assignment_data.astype(int) == 1]
# control_indeces = all_indeces[self.assignment_data.astype(int) == 0]

# self.treat_train_indeces = np.random.choice(all_indeces, int(len(self.data)*train/2), replace=False)
# self.treat_test_indeces = list(set(treat_indeces)^set(self.treat_train_indeces))

# self.control_train_indeces = np.random.choice(control_indeces, int(len(self.data)*train/2), replace=False)
# self.control_test_indeces = list(set(control_indeces)^set(self.control_train_indeces))

# self.train_indeces = np.hstack([self.treat_train_indeces, self.control_train_indeces])
# self.test_indeces = np.hstack([self.treat_test_indeces, self.control_test_indeces])

In [542]:
RAW_DATA_DIR = "../Data/Raw/"
PROCESSED_DATA_DIR = "../Data/Processed/Regression/"

class CovariateDataset(Dataset):
    def __init__(self, file_name_pattern, file_name_args, train=0.8):
        self.train = True
        self.test_on_all = False
        
        self.file_name = file_name_pattern.format(*file_name_args, "covar")
        self.assignment_file_name = file_name_pattern.format(*file_name_args, "assignment")
        
        self.data = np.loadtxt(RAW_DATA_DIR + self.file_name + ".csv", delimiter=",")[:, 1:] # remove bias
        self.assignment_data = np.loadtxt(
            RAW_DATA_DIR + self.assignment_file_name + ".csv", delimiter=",").astype(int)
        
        self.all_indeces = np.array(range(len(self.data)))
        treat_indeces = self.all_indeces[self.assignment_data.astype(int) == 1]
        control_indeces = self.all_indeces[self.assignment_data.astype(int) == 0]
        num_training = int(len(self.data)*train)
        
        self.train_indeces = np.random.choice(self.all_indeces, num_training, replace=False)
        self.test_indeces = list(set(self.all_indeces)^set(self.train_indeces))
        
        num_treated_in_train = len(np.intersect1d(treat_indeces, self.train_indeces, assume_unique=True))
        num_control_in_train = num_training - num_treated_in_train
        
        treat_weight = num_training / (2 * num_treated_in_train)
        control_weight = num_training / (2 * num_control_in_train)
        
        weighter = np.vectorize(lambda index: treat_weight if index in\
            treat_indeces else control_weight)
        
        self.weights = weighter(self.all_indeces)
        
    def active_data(self, index=0):
        if self.train:
            return self.data[self.train_indeces], self.assignment_data[self.train_indeces], \
                self.weights[self.train_indeces][index]
        elif:
            if self.test_on_all:
                indeces = self.all_indeces
            else: 
                indeces = self.test_indeces
            
            return self.data[indeces], self.assignment_data[indeces], 1
            
    def __getitem__(self, index):
        covar_data, assignment_data, weight_data = self.active_data(index)
        class_vector = np.zeros(2)
        class_vector[int(assignment_data[index])] = 1
        
        return (covar_data[index], class_vector, weight_data)

    def __len__(self):
        return self.active_data()[0].shape[0]
    
    def save_processed_data(self, data):
        name = PROCESSED_DATA_DIR + self.file_name+".csv"
        np.savetxt(name, data, delimiter=",")

SyntaxError: invalid syntax (<ipython-input-542-0b21ffedcb38>, line 37)

In [536]:
# Based on an example from https://github.com/pytorch/examples/blob/master/vae/main.py
# Extended to place a different prior on binary vs normal vars

class Regressor(nn.Module):
    def __init__(self):
        super(Regressor, self).__init__()
        
        INTERMEDIATE_DIMS_1 = 32
        INTERMEDIATE_DIMS_2 = 32
        INTERMEDIATE_DIMS_3 = 16
        INTERMEDIATE_DIMS_4 = 16
#         INTERMEDIATE_DIMS_5 = 16
#         INTERMEDIATE_DIMS_6 = 8

        FEATURES = 10

        LOSS_SCALE = 1

        # ENCODER LAYERS
        self.dense1 = nn.Linear(FEATURES, INTERMEDIATE_DIMS_1)
        self.dense2 = nn.Linear(INTERMEDIATE_DIMS_1, INTERMEDIATE_DIMS_2)
        self.dense3 = nn.Linear(INTERMEDIATE_DIMS_2, INTERMEDIATE_DIMS_3)
        self.dense4 = nn.Linear(INTERMEDIATE_DIMS_3, INTERMEDIATE_DIMS_4)
#         self.dense5 = nn.Linear(INTERMEDIATE_DIMS_4, INTERMEDIATE_DIMS_5)
#         self.dense6 = nn.Linear(INTERMEDIATE_DIMS_5, INTERMEDIATE_DIMS_6)
        self.dense5 = nn.Linear(INTERMEDIATE_DIMS_4, 2)
        
        # Activations
        self.softmax = nn.Softmax(dim=1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.6)

    def forward(self, x):
        h1 = self.dropout(self.relu(self.dense1(x)))
        h2 = self.dropout(self.relu(self.dense2(h1)))
        h3 = self.dropout(self.relu(self.dense3(h2)))
        h4 = self.dropout(self.relu(self.dense4(h3)))
#         h5 = self.dropout(self.relu(self.dense5(h4)))
#         h6 = self.dropout(self.relu(self.dense6(h5)))
        
        return self.softmax(self.dense5(h4))



def train(model, optimizer, epoch, train_loader, log_results=False):
    model.train()
    train_loss = 0

    for batch_idx, (data, target_class, weights) in enumerate(train_loader):
        data = Variable(data)
        target_class = Variable(target_class)
        weights = Variable(weights)
        
        data = data.float()
        target_class = target_class.float()
        weights = weights.float()
        
        if CUDA:
            data = data.cuda()
            target_class = target_class.cuda()
            weights = weights.cuda()
        
        optimizer.zero_grad()

        output_propensity = model(data)
        
        # calculate loss
        loss_criterion = nn.BCELoss(weight=weights.view(weights.shape[0], 1), size_average=False)
        loss = loss_criterion(output_propensity, target_class)

        train_loss += loss.data[0]
        
        # Find the gradient and descend
        loss.backward()
        optimizer.step()
        
    if log_results:
        print('====> Epoch: {} Average loss: {:.8f}'.format(
              epoch, train_loss / len(train_loader.dataset)))
        
        
def test(model, epoch, test_loader):
    # toggle model to test / inference mode
    model.eval()
    test_loss = 0

    for i, (data, target_class, weights) in enumerate(test_loader):
        data = Variable(data, volatile=True)
        target_class = Variable(target_class, volatile=True)
        weights = Variable(weights, volatile=True)
        
        data = data.float()
        target_class = target_class.float()
        weights = weights.float()
        
        if CUDA:
            data = data.cuda()
            target_class = target_class.cuda()
            weights = weights.cuda()

        output_propensity = model(data)
        
        # calculate loss
        loss_criterion = nn.BCELoss(weight=weights.view(weights.shape[0], 1), size_average=False)
        loss = loss_criterion(output_propensity, target_class)
        test_loss += loss.data[0]

    test_loss /= len(test_loader.dataset)
    print('====> Test set loss: {:.4f}'.format(test_loss))
    
def predict(model, predict_loader):
    # Show reconstruction
    model.eval()
    print("Training state: ", model.training)
    
    original_data, targets, _ = next(iter(predict_loader))
    
    original_data = Variable(original_data)
    original_data = original_data.float()
    
    if CUDA:
        original_data = original_data.cuda()
        
    return original_data, targets, model(original_data)

In [554]:
def train_model(model_class, train_set, test_set, predict_set, dataset_number, verbose=True, model=None):
    if model is None:
        model = model_class()
        if CUDA:
            model = model.cuda()

    num_epochs = 1000
    train_batch_size = 128
    test_batch_size = 200
    learning_rate = 1e-3
    lr_sched = True
         
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [int(num_epochs/10), int(num_epochs/2)], gamma=0.1)

    train_loader = DataLoader(train_set, batch_size=train_batch_size, shuffle=True)
    test_loader = DataLoader(test_set, batch_size=test_batch_size, shuffle=True)
    predict_loader = DataLoader(predict_set, batch_size=1000, shuffle=True)
    
    for epoch in range(1, num_epochs+1):
        checkpoint_interval = int(num_epochs/10)
        
        if lr_sched:
            scheduler.step()

        log = False
        if epoch%checkpoint_interval == 0:
            log = True
            
        train(model, optimizer, epoch, train_loader, log_results=log)
        if log:
            test(model, epoch, test_loader)
    
    original_data, targets, output = predict(model, predict_loader)
    
    return model, original_data, targets, output

def encode_data(model, dataset):
    original_data, output = predict(model, dataset)
    
    if CUDA:
        output = output.cpu()
    
    output = output.numpy()
        
    dataset.save_processed_data(output)

In [560]:
# G_mod_nadd_mod_nlin
import copy
train_set = CovariateDataset("n_{}_model_{}_v_{}_{}_data", [1000, "G_mod_nadd_mod_nlin", 1], train=0.8)
test_set = copy.deepcopy(train_set)
test_set.train = False

predict_set = copy.deepcopy(train_set)
predict_set.train = False
predict_set.test_on_all = True

trained_model, original_data, targets, output = \
    train_model(Regressor, train_set, test_set,predict_set, 1,verbose=True)
# encode_data(trained_model, dataset)

====> Epoch: 100 Average loss: 1.24830600
====> Test set loss: 1.1533
====> Epoch: 200 Average loss: 1.23988860
====> Test set loss: 1.1396
====> Epoch: 300 Average loss: 1.24327443
====> Test set loss: 1.1302
====> Epoch: 400 Average loss: 1.25340954
====> Test set loss: 1.1169
====> Epoch: 500 Average loss: 1.25517376
====> Test set loss: 1.1101
====> Epoch: 600 Average loss: 1.18138154
====> Test set loss: 1.1092
====> Epoch: 700 Average loss: 1.23065025
====> Test set loss: 1.1087
====> Epoch: 800 Average loss: 1.23970110
====> Test set loss: 1.1078
====> Epoch: 900 Average loss: 1.20337008
====> Test set loss: 1.1071
====> Epoch: 1000 Average loss: 1.23729564
====> Test set loss: 1.1062
Training state:  False


In [561]:
# res = np.abs(output.data.numpy().reshape(1, -1)[0] -  targets.numpy().reshape(1, -1)[0])> 0.5
# np.unique(res, return_counts=True)

classes = np.argmax(output.data.numpy(), axis=1)

treat = 0
treat_hits = 0
control = 0
control_hits = 0

checker = lambda target, klass: target[klass] == 1
for target, klass in (zip(targets.numpy(), classes)):
    if klass == 1:
        treat += 1
        if checker(target, klass):
            treat_hits += 1
    else:
        control += 1
        if checker(target, klass):
            control_hits += 1
        
print("Overall", (treat_hits + control_hits)/len(classes))
print("Class 1: ", treat_hits/treat)
print("Class 0: ", control_hits/control)

Overall 0.741
Class 1:  0.7887788778877888
Class 0:  0.6675126903553299


In [562]:
(np.round(output.data.numpy()[:, 1], 2))

array([0.32, 0.61, 0.49, 0.71, 0.23, 0.71, 0.71, 0.68, 0.48, 0.62, 0.71,
       0.71, 0.68, 0.6 , 0.41, 0.55, 0.69, 0.61, 0.62, 0.71, 0.71, 0.51,
       0.7 , 0.51, 0.43, 0.39, 0.71, 0.51, 0.71, 0.71, 0.5 , 0.69, 0.71,
       0.33, 0.39, 0.71, 0.71, 0.44, 0.63, 0.46, 0.47, 0.49, 0.18, 0.4 ,
       0.51, 0.44, 0.46, 0.52, 0.46, 0.71, 0.51, 0.61, 0.15, 0.71, 0.15,
       0.62, 0.7 , 0.71, 0.11, 0.71, 0.47, 0.36, 0.55, 0.44, 0.29, 0.47,
       0.44, 0.28, 0.67, 0.49, 0.62, 0.71, 0.5 , 0.66, 0.44, 0.45, 0.55,
       0.71, 0.49, 0.57, 0.53, 0.14, 0.45, 0.29, 0.71, 0.32, 0.64, 0.41,
       0.19, 0.54, 0.51, 0.55, 0.71, 0.54, 0.71, 0.71, 0.39, 0.71, 0.23,
       0.65, 0.39, 0.41, 0.59, 0.45, 0.27, 0.7 , 0.5 , 0.71, 0.68, 0.71,
       0.71, 0.71, 0.45, 0.29, 0.6 , 0.6 , 0.44, 0.49, 0.37, 0.67, 0.71,
       0.62, 0.71, 0.71, 0.47, 0.15, 0.29, 0.65, 0.71, 0.5 , 0.51, 0.71,
       0.16, 0.45, 0.66, 0.54, 0.57, 0.48, 0.48, 0.51, 0.7 , 0.44, 0.71,
       0.52, 0.42, 0.5 , 0.71, 0.41, 0.68, 0.39, 0.