In [2]:
import os
import torch
import torch.utils.data
from torch import nn, optim
from torch.autograd import Variable
from torch.nn import functional as F
from torchvision import datasets, transforms

from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader

import numpy as np
from time import time
import copy

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

CUDA = torch.cuda.is_available()
NSW_MODE = True

### Load Data

This Dataset class is an augmented version of the code in the Autoencoder/Variational Autoencoder. The propensity model in this file is trained via supervised learning and as such we must manage training labels as well as covariate data. Additionally, we now need to create a training and test set to evaluate performance and to implement a mechanism to ensure class-label balance in these sets. Unfortunately, PyTorch doesn't provide these mechanisms out the box. This class provides a weighting utility which will weight the loss associated with observations inversely to their commonality in the training set in order to ensure balanced learning. In hindsight, I probably should have found a way to implement this funcitonality using Scikit learn code (as used below for accuracy analysis) rather than implementing it myself. 

In [37]:
if NSW_MODE:
    RAW_DATA_DIR = "../Data/NSW/Raw/"
    PROCESSED_DATA_DIR = "../Data/NSW/Processed/Regression/"
    COVAR_SUFFIX = "covars"
    ASS_SUFFIX = "assignments"
    FILE_TYPE = ".csv"
    DELIM = ","
else:
    RAW_DATA_DIR = "../Data/NSW/Raw/"
    PROCESSED_DATA_DIR = "../Data/NSW/Processed/Regression/"
    COVAR_SUFFIX = "covar"
    ASS_SUFFIX = "assignment"
    FILE_TYPE = ".csv"
    DELIM = ","

class CovariateDataset(Dataset):
    def __init__(self, file_name_pattern, file_name_args=[], train_size=0.8, test_size=0.2, test_train_complement=True):
        self.train = True
        self.test_on_all = False
        
        self.file_name = file_name_pattern.format(*file_name_args, COVAR_SUFFIX)
        self.assignment_file_name = file_name_pattern.format(*file_name_args, ASS_SUFFIX)
        
        self.data = np.loadtxt(RAW_DATA_DIR + self.file_name + ".csv", delimiter=",")
        if not NSW_MODE:
            self.data = self.data[:, 1:] # remove bias
        self.assignment_data = np.loadtxt(
            RAW_DATA_DIR + self.assignment_file_name + ".csv", delimiter=",").astype(int)
        
        # Create a test and train set
        self.all_indeces = np.array(range(len(self.data)))
        treat_indeces = self.all_indeces[self.assignment_data.astype(int) == 1]
        control_indeces = self.all_indeces[self.assignment_data.astype(int) == 0]
        
        num_training = int(len(self.data)*train_size)
        
        # Random select training set
        self.train_indeces = np.random.choice(self.all_indeces, num_training, replace=False)
        
        # Create a test set based on supplied settings.
        if test_train_complement:
            self.test_indeces = list(set(self.all_indeces)^set(self.train_indeces))      
        else:
            self.test_indeces = np.random.choice(self.all_indeces, int(len(self.data)*(1-test_size)), replace=False)
        
        # Calculate class weights for the training set
        num_treated_in_train = len(np.intersect1d(treat_indeces, self.train_indeces, assume_unique=True))
        num_control_in_train = num_training - num_treated_in_train
        
        treat_weight = num_training / (2 * num_treated_in_train)
        control_weight = num_training / (2 * num_control_in_train)
        
        weighter = np.vectorize(lambda index: treat_weight if index in\
            treat_indeces else control_weight)
        
        self.weights = weighter(self.all_indeces)
        
    def active_data(self, index=0):
        if self.train:
            return self.data[self.train_indeces], self.assignment_data[self.train_indeces], \
                self.weights[self.train_indeces][index]
        else:
            if self.test_on_all:
                indeces = self.all_indeces
            else: 
                indeces = self.test_indeces
            
            return self.data[indeces], self.assignment_data[indeces], 1
            
    def __getitem__(self, index):
        covar_data, assignment_data, weight_data = self.active_data(index)
        class_vector = np.zeros(2)
        class_vector[int(assignment_data[index])] = 1
        
        return (covar_data[index], class_vector, weight_data)

    def __len__(self):
        return self.active_data()[0].shape[0]
    
    def save_processed_data(self, data):
        name = PROCESSED_DATA_DIR + self.file_name+".csv"
        np.savetxt(name, data, delimiter=",")
        
def get_datasets(file_name_format, file_name_args, **kwargs):
    train_set = CovariateDataset(file_name_format, file_name_args, **kwargs)
    test_set = copy.deepcopy(train_set)
    test_set.train = False

    predict_set = copy.deepcopy(train_set)
    predict_set.train = False
    predict_set.test_on_all = True
    
    return train_set, test_set, predict_set

In [16]:
CovariateDataset("nsw74_all_{}").data.shape

(16177, 8)

### Model Definition

The general architecture for this model is discussed extensively in Section 3. It invoves a number of hidden layers ending in two hidden units with a softmax activation. In hindsight, it might have been better to use the functionally identical design of a single output unit with a sigmoid activation. The 2-unit output was my first instinctual design. 

It was found that a relatively large number of narrow layers led to the best performance. As discussed in Section 3 - this is probably because depth allows the network to learn progressively more abstract features about the input data.

The only nuance worth mentioned is the presence of the dropout layers which randomly zero the weights in the hidden layers with probability 0.6 on each run. It was found that a combination of multiple hidden layers and aggressive regualrization (p=0.6) provided the best classification results. This is in line with the best practice in the field which is that networks should not be regularized by reducing their size. Rather, the size should be expetnded to allow more complex abstract representations with regularization used to prevent overfit. 

In [33]:
# Based on an example from https://github.com/pytorch/examples/blob/master/vae/main.py
# Extended to place a different prior on binary vs normal vars
if NSW_MODE:
    FEATURES = 8
    LAYER_WIDTH_1 = 64
    LAYER_WIDTH_2 = 32
else:
    FEATURES = 10
    LAYER_WIDTH_1 = 16
    LAYER_WIDTH_2 = 16
    
class Regressor(nn.Module):
    def __init__(self):
        super(Regressor, self).__init__()
        
        INTERMEDIATE_DIMS_1 = LAYER_WIDTH_1
        INTERMEDIATE_DIMS_2 = LAYER_WIDTH_1
        INTERMEDIATE_DIMS_3 = LAYER_WIDTH_2
        INTERMEDIATE_DIMS_4 = LAYER_WIDTH_2

        

        LOSS_SCALE = 1

        # ENCODER LAYERS
        self.dense1 = nn.Linear(FEATURES, INTERMEDIATE_DIMS_1)
        self.dense2 = nn.Linear(INTERMEDIATE_DIMS_1, INTERMEDIATE_DIMS_2)
        self.dense3 = nn.Linear(INTERMEDIATE_DIMS_2, INTERMEDIATE_DIMS_3)
        self.dense4 = nn.Linear(INTERMEDIATE_DIMS_3, INTERMEDIATE_DIMS_4)
        self.dense5 = nn.Linear(INTERMEDIATE_DIMS_4, 2)
        
        # Activations
        self.softmax = nn.Softmax(dim=1)
        self.relu = nn.ReLU()
        
        if NSW_MODE:
            self.dropout = nn.Dropout(p=0.2)
        else:
            self.dropout = nn.Dropout(p=0.6)
            
    
    # Perform a forward pass
    def forward(self, x):
        h1 = self.dropout(self.relu(self.dense1(x)))
        h2 = self.dropout(self.relu(self.dense2(h1)))
        h3 = self.dropout(self.relu(self.dense3(h2)))
        h4 = self.dropout(self.relu(self.dense4(h3)))
        
        return self.softmax(self.dense5(h4))

In [32]:
print(Regressor())

Regressor(
  (dense1): Linear(in_features=8, out_features=64, bias=True)
  (dense2): Linear(in_features=64, out_features=64, bias=True)
  (dense3): Linear(in_features=64, out_features=32, bias=True)
  (dense4): Linear(in_features=32, out_features=32, bias=True)
  (dense5): Linear(in_features=32, out_features=2, bias=True)
  (softmax): Softmax()
  (relu): ReLU()
  (dropout): Dropout(p=0.3)
)


### Train and Test Code

This code is very similar in structure to the code used afor the AE and VAE above. The only difference is that additional methods are provided to test the trained model used the labelled test set after each epoch. The two methods which perform this role are `test` and `accuracy`. The accuracy is a simple measure of percentage-correct. In this case, there is no particular emphasis on false positives/negatives so this metric is sufficient.

Wrapper code is provided to run a scikit learn logistic regression on a PyTorch dataset. The results from the standard logistic regression were used as a benchmark for the models performence during optimization. 

In [19]:
def train(model, optimizer, epoch, train_loader, log_results=False):
    model.train()
    train_loss = 0

    for batch_idx, (data, target_class, weights) in enumerate(train_loader):
        data = Variable(data)
        target_class = Variable(target_class)
        weights = Variable(weights)
        
        data = data.float()
        target_class = target_class.float()
        weights = weights.float()
        
        if CUDA:
            data = data.cuda()
            target_class = target_class.cuda()
            weights = weights.cuda()
        
        optimizer.zero_grad()

        output_propensity = model(data)
        
        # calculate loss
        loss_criterion = nn.BCELoss(weight=weights.view(weights.shape[0], 1), size_average=False)
        loss = loss_criterion(output_propensity, target_class)

        train_loss += loss.data[0]
        
        # Find the gradient and descend
        loss.backward()
        optimizer.step()
        
    if log_results:
        print('====> Epoch: {} Average loss: {:.8f}'.format(
              epoch, train_loss / len(train_loader.dataset)))
        
# Evaluate the model's performance on the test set      
def test(model, epoch, test_loader):
    # toggle model to test / inference mode
    # to prevent the dropout regularization
    model.eval()
    test_loss = 0

    for i, (data, target_class, weights) in enumerate(test_loader):
        data = Variable(data, volatile=True)
        target_class = Variable(target_class, volatile=True)
        weights = Variable(weights, volatile=True)
        
        data = data.float()
        target_class = target_class.float()
        weights = weights.float()
        
        if CUDA:
            data = data.cuda()
            target_class = target_class.cuda()
            weights = weights.cuda()

        output_propensity = model(data)
        
        # calculate loss
        loss_criterion = nn.BCELoss(weight=weights.view(weights.shape[0], 1), size_average=False)
        loss = loss_criterion(output_propensity, target_class)
        test_loss += loss.data[0]

    test_loss /= len(test_loader.dataset)
    
    if CUDA:
        output_propensity = output_propensity.cpu()
        target_class = target_class.cpu()
        
    score = accuracy(output_propensity.data.numpy(), target_class.data.numpy(), verbose=False)
    print('====> Test set loss: {:.4f}, {}%'.format(test_loss, score*100))

# Output predictions for all data
def predict(model, predict_loader):
    # Show reconstruction
    model.eval()
    print("Training state: ", model.training)
    
    original_data, targets, _ = next(iter(predict_loader))
    
    original_data = Variable(original_data)
    original_data = original_data.float()
    
    if CUDA:
        original_data = original_data.cuda()
        
    return original_data, targets, model(original_data)

# Use scikit learn functions to evaluate accuracy
def accuracy(output_data, targets, verbose=True):
    classes = np.argmax(output_data, axis=1)
    targets = np.argmax(targets, axis=1)
    
    if verbose:
        print(classification_report(targets, classes))
    return accuracy_score(targets, classes)

# Run a standard logistic classifier on a PyTorch dataset
# to serve as a benchmark for the NN performance.
def run_logistic(train_set, verbose=True):
    model = LogisticRegression(class_weight="balanced")
    
    X = train_set.data
    y = train_set.assignment_data

    X_train = X[train_set.train_indeces]
    X_test = X[train_set.test_indeces]
    y_train = y[train_set.train_indeces]
    y_test = y[train_set.test_indeces]

    model.fit(X_train, y_train)
    predictions = model.predict(X)
    
    if verbose:
        print(classification_report(y, predictions))
    
    return accuracy_score(y, predictions)

In [21]:
def train_model(model_class, train_set, test_set, predict_set, dataset_number, verbose=True, model=None):
    if model is None:
        model = model_class()
        if CUDA:
            model = model.cuda()

    num_epochs = 10
    train_batch_size = 512
    test_batch_size = 250
    learning_rate = 1e-3
    lr_sched = False
         
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [int(num_epochs/5), int(num_epochs/2)], gamma=0.1)

    train_loader = DataLoader(train_set, batch_size=train_batch_size, shuffle=True)
    test_loader = DataLoader(test_set, batch_size=test_batch_size, shuffle=True)
    predict_loader = DataLoader(predict_set, batch_size=1000, shuffle=False)
    
    for epoch in range(1, num_epochs+1):
        checkpoint_interval = int(num_epochs/10)
        
        if lr_sched:
            scheduler.step()

        log = False
        if epoch%checkpoint_interval == 0:
            log = True
            
        train(model, optimizer, epoch, train_loader, log_results=log)
        if log:
            test(model, epoch, test_loader)
    
    original_data, targets, output = predict(model, predict_loader)
    if CUDA:
        output = output.cpu()
        targets = targets.cpu()
    
    return model, original_data, targets, output

def encode_data(dataset, output_data):
    
    if CUDA:
        output_data = output_data.cpu()
        
    dataset.save_processed_data(output_data.data.numpy()[:, 1])


#### Run single Monte Carlo training session

In [None]:
train_set, test_set, predict_set = get_datasets(
    "n_{}_model_{}_v_{}_{}_data", [1000, "G_mod_nadd_mod_nlin", 1],
    train_size=0.8, test_train_complement=True)

start = time()
trained_model, original_data, targets, output = \
    train_model(Regressor, train_set, test_set,predict_set, 1,verbose=True)
print("Elapsed: ", time() - start)


acc = accuracy(output.data.cpu().numpy(), targets.numpy(), verbose=False)
print("Complete set accuracy: {}%".format(acc*100))

encode_data(train_set, output)

#### Run single NSW training session

In [22]:
train_set, test_set, predict_set = get_datasets(
    "nsw74_all_{}", [],
    train_size=0.8, test_train_complement=True)

In [25]:
run_logistic(train_set, verbose=True)

             precision    recall  f1-score   support

          0       1.00      0.92      0.96     15992
          1       0.11      0.87      0.19       185

avg / total       0.99      0.92      0.95     16177



0.917660876553131

In [34]:
start = time()
trained_model, original_data, targets, output = \
    train_model(Regressor, train_set, test_set,predict_set, 1,verbose=True)
print("Elapsed: ", time() - start)

====> Epoch: 1 Average loss: 10.17286797
====> Test set loss: 0.4339, 99.57627118644068%
====> Epoch: 2 Average loss: 10.30364156
====> Test set loss: 0.4363, 97.45762711864407%
====> Epoch: 3 Average loss: 10.42895070
====> Test set loss: 0.4365, 98.72881355932203%
====> Epoch: 4 Average loss: 10.06276574
====> Test set loss: 0.4365, 99.57627118644068%
====> Epoch: 5 Average loss: 10.12491370
====> Test set loss: 0.4367, 99.15254237288136%
====> Epoch: 6 Average loss: 10.70515808
====> Test set loss: 0.4367, 97.88135593220339%
====> Epoch: 7 Average loss: 10.98514212
====> Test set loss: 0.4367, 99.15254237288136%
====> Epoch: 8 Average loss: 10.79809389
====> Test set loss: 0.4367, 99.15254237288136%
====> Epoch: 9 Average loss: 9.88545247
====> Test set loss: 0.4367, 97.88135593220339%
====> Epoch: 10 Average loss: 10.75704047
====> Test set loss: 0.4367, 99.15254237288136%
Training state:  False
Elapsed:  487.83007884025574


In [35]:
acc = accuracy(output.data.cpu().numpy(), targets.numpy(), verbose=False)
print("Complete set accuracy: {}%".format(acc*100))

Complete set accuracy: 84.2%


In [36]:
encode_data(train_set, output)

### Process Data

In [None]:
assignment_model_names = ['A_add_lin', 'B_add_mild_nlin', 'C_add_mod_nlin', 'D_mild_nadd_lin',
                     'E_mild_nadd_mild_nlin', 'F_mod_nadd_lin', 'G_mod_nadd_mod_nlin']

nn_accuracies = []
log_accuracies = []

for dataset_number in range(275, 300):
    print("Starting run for Dataset {}".format(dataset_number))
    
    for model_name in assignment_model_names:
        print("---- Running for model name: ", model_name)
        
        start = time()

        train_set, test_set, predict_set = get_datasets(
            "n_{}_model_{}_v_{}_{}_data", [1000, model_name, dataset_number], train_size=0.8)

        trained_model, original_data, targets, output = \
            train_model(Regressor, train_set, test_set,predict_set, 1,verbose=True)
        
        nn_acc = accuracy(output.data.cpu().numpy(), targets.numpy(), verbose=False)
        print("Complete set accuracy: {}%".format(nn_acc*100))
        
        log_acc = run_logistic(train_set, verbose=False)
        print("Log accuracy: {}%".format(log_acc*100))
        
        nn_accuracies.append(nn_acc)
        log_accuracies.append(log_acc)

        encode_data(train_set, output)

        print("---- Done in ", time() - start, " seconds\n")
                
    print("================\n\n")