# Robustness Analysis for Deep Vertically Federated Learning

Within this notebook, we train an ensemble of “black box” deep neural networks in a vertically federated setting and ask cognitively motivated questions, such as:
- How does the network’s behavior / performance change as a function of an exogenous factor?
    - Network depth
    - Noise applied to different parts of the ensemble model
- How can we modify the training algorithm to increase the network’s robustness?
    - Can we take advantage of the vertically federated setting?

In [3]:
# Importing Required Libraries

# Libraries for Data Handling
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset

# Libraries for Algorithms
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import average_precision_score, accuracy_score
import helper

# Libraries for Data Visualization Tools
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Experiment parameters (everything a reader might want to tune lives in this cell)

# Number of distinct parties
n_distinct_parties = 5

# Dataset directory (must be generated by the pre-processor for the respective n_distinct_parties)
dataset_dir = f"datasets/pre_processed_adult_{n_distinct_parties}p"

# Results directory (where to save the results)
results_dir = f"results/adult_{n_distinct_parties}p"

# Fix the global numpy / torch seeds so that downstream randomness (e.g. the
# shuffled training DataLoader) is repeatable under Restart Kernel -> Run All.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

In [5]:
# Load the pre-processed dataset: one feature CSV per party (files are numbered from 1)
test_datframes = [
    pd.read_csv(f"{dataset_dir}/test_data_party_{i}.csv", sep=',', header=0)
    for i in range(1, n_distinct_parties + 1)
]
train_dataframes = [
    pd.read_csv(f"{dataset_dir}/train_data_party_{i}.csv", sep=',', header=0)
    for i in range(1, n_distinct_parties + 1)
]

# Labels are stored once, separately from the per-party feature files
test_y_dataframe = pd.read_csv(f"{dataset_dir}/test_labels.csv", sep=',', header=0)
train_y_dataframe = pd.read_csv(f"{dataset_dir}/train_labels.csv", sep=',', header=0)

# Convert every DataFrame into a float32 tensor
test_datasets = [
    torch.tensor(frame.values, dtype=torch.float32) for frame in test_datframes
]
train_datasets = [
    torch.tensor(frame.values, dtype=torch.float32) for frame in train_dataframes
]

test_y = torch.tensor(test_y_dataframe.values, dtype=torch.float32)
train_y = torch.tensor(train_y_dataframe.values, dtype=torch.float32)

# Feature -> party partition: number of feature columns owned by each party
party_paritions = [frame.shape[1] for frame in train_dataframes]

In [6]:
# Build one dataset / dataloader pair per split. The per-party feature tensors are
# concatenated along the column axis so a single loader serves every party.

# Datasets: (concatenated party features, labels)
train_dataset, test_dataset = (
    TensorDataset(torch.cat(party_tensors, dim=1), labels)
    for party_tensors, labels in ((train_datasets, train_y), (test_datasets, test_y))
)

# Dataloaders: only the training split is shuffled
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=128)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=128)

In [7]:
# Models: one client model per party plus a single fusion model
client_output_embedding_size = 32
client_models = helper.generateClientModels(
    party_paritions,
    client_output_embedding_size,
)

# The fusion model is sized for (embedding size x number of parties) inputs
fusion_model = helper.generateFusionModel(
    n_distinct_parties * client_output_embedding_size
)

# Loss function and optimizers: Adam for each client model, SGD for the fusion model
loss_fn = nn.BCELoss()
client_optimizers = [
    optim.Adam(client.parameters(), lr=0.001) for client in client_models
]
fusion_optimizer = optim.SGD(fusion_model.parameters(), lr=0.001)

In [9]:
%%script true
# Train ensemble model without noise

training_params = {
    "client_models": client_models,
    "fusion_model": fusion_model,
    "party_paritions": party_paritions,
    "train_dataloader": train_dataloader,
    "client_optimizers": client_optimizers,
    "fusion_optimizer": fusion_optimizer,
    "loss_fn": loss_fn,
    "epochs": 10
}

losses, epo, auprc_values = helper.trainEnsemble(**training_params)