# Disentangled Causal Effect Variational Autoencoder

**Inputs:**
- data/heart_disease_cleaned.csv

**Outputs:**
- DCEVAE model
- data/fair_disease_dcevae.csv
- data/cf_disease_dcevea.csv

## Setup and imports

In [1]:
# Resolve PROJECT_ROOT. When the google.colab package is importable, mount
# Drive and read the value stored under 'PROJECT_ROOT' via google.colab.userdata;
# otherwise fall back to '/'.
try:
  from google.colab import drive, userdata
  drive.mount('/content/drive')
  PROJECT_ROOT = userdata.get('PROJECT_ROOT')
except ImportError:
  # google.colab could not be imported, so no Drive mount is attempted.
  PROJECT_ROOT = '/'


Mounted at /content/drive


In [89]:
import pandas as pd
import numpy as np
import torch
import torch.utils.data as utils
import re
from torch import nn

## Classes and functions





### DCEVAE Model

### Utils

In [93]:
def make_loader(X_ind, X_desc, X_corr, X_sens, Y, index, batch_size=32):
  '''
    Creates a shuffled DataLoader over the rows selected by `index`, pairing
    each selected ("factual") row with a randomly drawn ("fake") row taken
    from the same selection.

    Input:
      - X_ind, X_desc, X_corr, X_sens, Y: arrays sharing the same first axis
      - index: row indices selecting the subset to load
      - batch_size: the batch size for the DataLoader

    Output:
      - loader: a DataLoader whose batches hold ten float32 tensors, in order:
        (X_ind, X_desc, X_corr, X_sens, Y) for the factual rows, followed by
        the same five arrays for the fake rows
  '''
  # Factual samples: the rows of every array selected by `index`.
  fact_arrays = [arr[index] for arr in (X_ind, X_desc, X_corr, X_sens, Y)]

  # Fake samples: one shared permutation of the *selected* rows, so a fake
  # sample stays a consistent row across all five arrays. The permutation
  # holds positions within the subset, so it must index the subset and not
  # the full arrays; otherwise rows outside `index` leak into this loader.
  permuted_indices = np.random.permutation(fact_arrays[0].shape[0])
  fake_arrays = [arr[permuted_indices] for arr in fact_arrays]

  tensors = [torch.tensor(arr, dtype=torch.float32)
             for arr in fact_arrays + fake_arrays]

  dataset = utils.TensorDataset(*tensors)
  loader = utils.DataLoader(dataset, batch_size=batch_size, shuffle=True)

  return loader

def make_bucketed_loader(dataset, map, val_size=0.1, test_size=0.1, batch_size=32, seed=4):
  '''
    Creates train, validation and test DataLoader for the given dataset,
    separating features into independent, sensitive, descendant, and correlated features.

    Input:
      - dataset: a pandas DataFrame
      - map: a dictionary mapping buckets to feature names; 'ind', 'desc' and
        'corr' hold lists of name patterns, 'sens' and 'target' hold a single
        column name
      - val_size: the proportion of the dataset to use for validation
      - test_size: the proportion of the dataset to use for testing
      - batch_size: the batch size for the DataLoader
      - seed: a seed for the random number generator

    Output:
      - train_loader: Training DataLoader
      - val_loader: Validation DataLoader
      - test_loader: Testing DataLoader
  '''
  # Seeds NumPy's global RNG so the split below is reproducible for a given seed.
  np.random.seed(seed=seed)

  ## BUCKET DATASET
  def select_columns(names):
    # The names are joined into one alternation pattern. DataFrame.filter
    # keeps every column the pattern matches anywhere in its label, so a name
    # such as 'cp' also selects derived columns like 'cp_1'.
    # NOTE(review): the pattern is unanchored and unescaped; a short name
    # also selects any unrelated column that merely contains it.
    pattern = re.compile('|'.join(names))
    return dataset.filter(regex=pattern).to_numpy()

  # Independent, Descendant, Correlated features
  X_ind = select_columns(map['ind'])
  X_desc = select_columns(map['desc'])
  X_corr = select_columns(map['corr'])

  # Sensitive attribute and Target, reshaped to column vectors
  X_sens = dataset[map['sens']].to_numpy().reshape(-1, 1)
  Y = dataset[map['target']].to_numpy().reshape(-1, 1)

  ## TRAIN-VAL-TEST SPLIT
  N = X_ind.shape[0]
  shuffled_indices = np.random.permutation(N)
  val_count = int(N * val_size)
  test_count = int(N * test_size)
  # Disjoint slices of one permutation: validation first, then test, rest is training.
  val_index = shuffled_indices[:val_count]
  test_index = shuffled_indices[val_count:val_count+test_count]
  train_index = shuffled_indices[val_count+test_count:]

  # Training loader
  train_loader = make_loader(X_ind, X_desc, X_corr, X_sens, Y, train_index, batch_size)

  # Validation loader
  val_loader = make_loader(X_ind, X_desc, X_corr, X_sens, Y, val_index, batch_size)

  # Test loader
  test_loader = make_loader(X_ind, X_desc, X_corr, X_sens, Y, test_index, batch_size)

  return train_loader, val_loader, test_loader

## Data preparation

In [94]:
# Load the cleaned heart-disease table from under PROJECT_ROOT.
heart_disease = pd.read_csv(PROJECT_ROOT + '/data/heart_disease_cleaned.csv')

# One-hot encoding for categorical features; drop_first=True drops the first
# level of each encoded column, and the resulting dummy columns are integers.
heart_disease_encoded = pd.get_dummies(heart_disease, columns=['cp','ecg','slope'], drop_first=True, dtype=int)

# Bucket assignment consumed by make_bucketed_loader. The 'ind', 'desc' and
# 'corr' entries are used there as regex patterns over column names, so an
# encoded feature such as 'cp' selects its dummy columns as well.
feature_mapping = {
    'ind': ['age'], # Features independent of the protected attribute and unconfounded
    'sens': 'sex', # Sensitive attribute
    'desc': ['cp', 'ecg', 'ang'], # Features descendant of the protected attribute
    'corr': ['bp', 'chol', 'fbs', 'mhr', 'st', 'slope'], # Features correlated with the protected attribute
    'target': 'cvd' # Target outcome
}

# Bucketed data loaders for training, validation, and test
# (default split sizes, batch size and seed of make_bucketed_loader are used)
train_loader, val_loader, test_loader = make_bucketed_loader(heart_disease_encoded, feature_mapping)