In [1]:
import sys
from pathlib import Path

# Make the notebook's parent directory importable.
# NOTE(review): presumably this is where Distributed_HM_Data and utils_models
# live — confirm.
parent_dir = str(Path.cwd().parent)
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

import syft as sy
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# Seed every RNG the notebook draws from. NumPy alone is not enough: weight
# initialisation and DataLoader shuffling use torch's global generator.
SEED = 666
np.random.seed(SEED)
torch.manual_seed(SEED)

from Distributed_HM_Data import HMSaleTrainDataLoader, Distributed_HM, binary_acc
from utils_models import SalesNN, CustomersNN, ProductsNN

# Training stays on the CPU: according to the original author, PySyft 0.2.9
# has bugs when used with CUDA.
# Input CSV files are read from <cwd>/../../Data.
dataDir = Path.cwd().parent.parent/'Data/'

In [2]:
import sys
import logging

# Preserve the training log by mirroring notebook output into config2.log.
# The file is opened for writing (any previous log is truncated) with a
# buffer size of 10, presumably so text reaches the file almost immediately.
# NOTE(review): the handle is never closed in the visible cells — confirm it
# is meant to stay open for the lifetime of the kernel.
so = open("config2.log", 'w', 10)
# NOTE(review): presumably relies on ipykernel's output streams copying what
# they write to their `echo` attribute — verify for the installed version.
sys.stdout.echo = so
sys.stderr.echo = so

# Point the first handler of the logger exposed as get_ipython().log at the
# same file and set that logger's level to INFO.
# NOTE(review): assumes at least one handler exists and that it has a
# writable `stream` attribute — TODO confirm.
get_ipython().log.handlers[0].stream = so
get_ipython().log.setLevel(logging.INFO)

In [3]:
# customized models for specific configuration

class GovernanceNN(nn.Module):
    """Server-side ("governance") segment of the split network.

    Two fully connected layers, ``input_size -> hidden_size_1 -> hidden_size_2``,
    each followed by a LeakyReLU (the last layer included).

    Args:
        input_size: Width of the aggregated latent vector received from the
            client models. NOTE(review): the default of 85 presumably matches
            the concatenated client output width — confirm.
        hidden_size_1: Output width of the first linear layer.
        hidden_size_2: Output width of the second linear layer, i.e. the
            width of the tensor returned by ``forward``.
    """

    def __init__(
            self,
            input_size: int = 85,
            hidden_size_1: int = 128,
            hidden_size_2: int = 64,
        ):
        super().__init__()
        # Attribute names and creation order (relu, then decoder.0, decoder.1)
        # determine the state_dict keys used by the saved checkpoints.
        self.relu = nn.LeakyReLU()
        widths = (input_size, hidden_size_1, hidden_size_2)
        self.decoder = nn.Sequential(
            *(nn.Linear(n_in, n_out) for n_in, n_out in zip(widths[:-1], widths[1:]))
        )

    def forward(self, agg_latent_input):
        """Apply every linear layer followed by the activation and return the result."""
        hidden = agg_latent_input
        for layer in self.decoder:
            hidden = self.relu(layer(hidden))
        return hidden

class LabelOwner(nn.Module):
    """Final segment of the split network, held by the label owner.

    A single linear layer that maps the server output to ``output_size``
    scores; the training step feeds these scores to ``nn.CrossEntropyLoss``.

    Args:
        input_size: Width of the tensor produced by the server segment.
        output_size: Number of scores produced per sample.
    """

    def __init__(
            self,
            input_size: int = 64,
            output_size: int = 2,
        ):
        super().__init__()
        self.lin = nn.Linear(input_size, output_size)

    def forward(self, server_out):
        """Return the linear projection of ``server_out``."""
        return self.lin(server_out)

# define the Split Neural Network

class SplitNN(nn.Module):
    """Coordinates a split neural network whose segments live on different workers.

    ``forward`` runs one client model per data owner, moves every client
    output to ``server``, concatenates them along the last dimension, runs the
    ``"server"`` model and finally the ``"label_owner"`` model.

    The segments are kept in a plain dict, so they are not registered as
    sub-modules of this class; ``train``/``eval`` therefore forward the mode
    to every segment explicitly.

    Args:
        models: Dict mapping a segment key to its module. ``forward`` needs
            the keys ``"server"`` and ``"label_owner"`` plus one entry per
            data-owner id.
        optimizers: Iterable of optimizers; ``zero_grads`` and ``step`` call
            every one of them.
        data_owner: Iterable of workers (objects with an ``id``) that hold
            the input data.
        server: Destination passed to ``.move()`` for the client outputs.
        label_owner: Destination passed to ``.move()`` for the server output.
    """

    # Number of leading entries of ``data_pointer[owner.id]`` that each client
    # model receives as positional arguments.
    _CLIENT_INPUT_COUNTS = {
        "sales_domain": 4,
        "customer_domain": 2,
        "product_domain": 3,
    }

    def __init__(self, models, optimizers, data_owner, server, label_owner):
        # Initialise nn.Module first: its attribute machinery expects the
        # internal registries to exist before attributes are assigned.
        super().__init__()
        self.models = models
        self.optimizers = optimizers
        self.data_owners = data_owner
        self.server = server
        self.label_owner = label_owner

    def forward(self, data_pointer):
        """Run the full pipeline and return the label-owner model's output.

        Args:
            data_pointer: Mapping from data-owner id to an indexable
                collection of that owner's input tensors.
        """
        # Client outputs after they have been moved to the server.
        remote_output = []

        for owner in self.data_owners:
            n_inputs = self._CLIENT_INPUT_COUNTS.get(owner.id)
            if n_inputs is None:
                # Owners with an unknown id are skipped, as before.
                continue
            inputs = data_pointer[owner.id]
            client_output = self.models[owner.id](
                *(inputs[i] for i in range(n_inputs))
            )
            # Use the destination given to the constructor rather than a
            # notebook-level global of the same name.
            remote_output.append(
                client_output.move(self.server, requires_grad=True)
            )

        # Concatenate the client outputs to form the server input.
        server_input = torch.cat(remote_output, dim=-1)
        server_output = self.models["server"](server_input)
        # NOTE(review): unlike the client outputs above, the value returned by
        # .move() is not used here. Left unchanged — confirm whether PySyft's
        # move relocates the pointer in place or the result should be kept.
        server_output.move(self.label_owner, requires_grad=True)
        return self.models["label_owner"](server_output)

    def zero_grads(self):
        """Reset the gradients held by every optimizer."""
        for opt in self.optimizers:
            opt.zero_grad()

    def step(self):
        """Apply one update with every optimizer."""
        for opt in self.optimizers:
            opt.step()

    def train(self, mode=True):
        """Switch every segment to training (``mode=True``) or evaluation mode.

        Accepts ``mode`` and returns ``self`` like ``nn.Module.train`` so that
        calls such as ``train(False)`` keep working.
        """
        super().train(mode)
        for model in self.models.values():
            if mode:
                model.train()
            else:
                model.eval()
        return self

    def eval(self):
        """Switch every segment to evaluation mode and return ``self``."""
        return self.train(False)

    def load_weights(self, file_prefix):
        """Load ``{file_prefix}_{key}_weights.pth`` into every segment.

        NOTE(review): torch.load unpickles the file — only load checkpoints
        from a trusted source.
        """
        for loc, model in self.models.items():
            model.load_state_dict(torch.load(f"{file_prefix}_{loc}_weights.pth"))

    @property
    def location(self):
        """Location of the first segment, or ``None`` when there are no segments.

        Works for both a dict and a sequence of models (indexing a dict with
        ``0`` raised ``KeyError``).
        NOTE(review): the segments sit on several workers, so a single
        location is ambiguous — confirm what callers expect.
        """
        if not self.models:
            return None
        segments = self.models.values() if isinstance(self.models, dict) else self.models
        return next(iter(segments)).location

# training function
    
def train(x, label, splitNN):
    """Run one training step of the split network on a single batch.

    Args:
        x: Batch inputs, handed unchanged to ``splitNN.forward``.
        label: Targets for ``nn.CrossEntropyLoss``; the training cell sends
            them to the label owner before calling this function.
        splitNN: Object providing ``zero_grads()``, ``forward()`` and ``step()``.

    Returns:
        ``loss.detach().get()`` — presumably the batch loss fetched back from
        the remote worker as a local tensor (PySyft ``.get()``); verify.
    """

    #1) Zero our grads
    splitNN.zero_grads()

    #2) Make a prediction
    pred = splitNN.forward(x)

    #3) Figure out how much we missed by
    criterion = nn.CrossEntropyLoss()
    loss = criterion(pred, label)

    #4) Backprop the loss on the end layer
    loss.backward()

    #5) Feed Gradients backward through the network
    # NOTE(review): this call is disabled and SplitNN defines no backward().
    # Gradients would have to cross worker boundaries through loss.backward()
    # and the requires_grad=True moves in forward — TODO confirm that the
    # client and server segments actually receive gradients.
    #splitNN.backward()

    #6) Change the weights
    splitNN.step()

    return loss.detach().get()
 

In [4]:
# Load the training table and wrap it in the project's dataset and a shuffled
# DataLoader with 1024 samples per batch.
hm_data = pd.read_csv(dataDir/'medium_train.csv')
all_products_id = hm_data["article_id"].unique()
train_data = HMSaleTrainDataLoader(hm_data, all_products_id)
train_loader = DataLoader(train_data, batch_size=1024, shuffle=True)

# set up virtual worker
# One PySyft VirtualWorker per party. The names `server` and `label_owner`
# are also read as globals by SplitNN.forward, so they must keep these names.
hook = sy.TorchHook(torch)
sales_domain = sy.VirtualWorker(hook, id="sales_domain")
customer_domain = sy.VirtualWorker(hook, id="customer_domain")
product_domain = sy.VirtualWorker(hook, id="product_domain")
server = sy.VirtualWorker(hook, id="server")
label_owner = sy.VirtualWorker(hook, id="label_owner")

# Workers that hold input features, and the worker each model segment is sent to.
data_owners = (sales_domain, customer_domain, product_domain)
model_locations = [sales_domain, customer_domain, product_domain, server, label_owner]

# NOTE(review): presumably splits every batch between the data owners —
# verify against Distributed_HM.
distributed_trainloader = Distributed_HM(data_owners=data_owners, data_loader=train_loader)

# set up parameters for model
# Number of distinct values per categorical column, passed to the model
# constructors below.
num_users = len(hm_data.customer_id.unique())
print("num_users:", num_users)
num_items = len(all_products_id)
print("num_items:", num_items)
num_product_groups = len(hm_data.product_group_name.unique())
print("num_product_groups:", num_product_groups)
num_color_groups = len(hm_data.colour_group_name.unique())
print("num_color_groups:", num_color_groups)
# NOTE(review): the only cardinality that is not printed.
num_index_name = len(hm_data.index_name.unique())

# One model segment per worker, keyed by the worker id.
models = {
    "sales_domain": SalesNN(num_users=num_users, num_items=num_items),
    "customer_domain": CustomersNN(),
    "product_domain": ProductsNN(num_product_groups=num_product_groups, num_color_groups=num_color_groups, num_index_name=num_index_name),
    "server": GovernanceNN(),
    "label_owner": LabelOwner(),
}

# set up optimizer for clients' model
# One Adam optimizer (lr=0.003) per segment, in model_locations order.
# NOTE(review): the optimizers are created before the models are sent; the
# reload cell further down sends first and creates them afterwards — TODO
# confirm which order PySyft expects.
optimizers = [
    optim.Adam(models[location.id].parameters(), lr=0.003)
    for location in model_locations
]

# Send every segment to the worker with the same id.
for location in model_locations:
    models[location.id].send(location)

  0%|          | 0/26416 [00:00<?, ?it/s]

num_users: 880
num_items: 16450
num_product_groups: 13
num_color_groups: 49


In [5]:
print(models)

epochs = 120
# NOTE(review): anomaly detection is a debugging aid and stays enabled for the
# whole run — confirm it is still needed.
torch.autograd.set_detect_anomaly(True)
splitnn = SplitNN(models, optimizers, data_owners, server, label_owner)

for i in range(epochs):
    splitnn.train()
    running_loss = 0.0
    for data_ptr, labels in distributed_trainloader:
        # The targets go to the label owner before the training step.
        labels = labels.send(label_owner)
        loss = train(data_ptr, labels, splitnn)
        running_loss += loss
    # Mean batch loss of the epoch (the inner loop never breaks, so this
    # always runs once per epoch).
    print("Epoch {} - Training loss: {}".format(i, running_loss/len(distributed_trainloader)))
        

{'sales_domain': SalesNN(
  (user_embedding_layer): Embedding(880, 16)
  (item_embedding_layer): Embedding(16450, 32)
  (relu): LeakyReLU(negative_slope=0.01)
), 'customer_domain': CustomersNN(
  (relu): LeakyReLU(negative_slope=0.01)
  (encoder): Sequential(
    (0): Linear(in_features=2, out_features=5, bias=True)
  )
), 'product_domain': ProductsNN(
  (product_group_embedding_layer): Embedding(13, 8)
  (color_group_embedding_layer): Embedding(49, 16)
  (index_name_embedding_layer): Embedding(10, 6)
  (relu): LeakyReLU(negative_slope=0.01)
), 'server': GovernanceNN(
  (relu): LeakyReLU(negative_slope=0.01)
  (decoder): Sequential(
    (0): Linear(in_features=85, out_features=128, bias=True)
    (1): Linear(in_features=128, out_features=64, bias=True)
  )
), 'label_owner': LabelOwner(
  (lin): Linear(in_features=64, out_features=2, bias=True)
)}
Epoch 0 - Training loss: 0.5059341788291931
Epoch 1 - Training loss: 0.497627854347229
Epoch 2 - Training loss: 0.48744991421699524
Epoch 3 -

In [6]:
def binary_acc(y_pred, y_test):
    """Return the fraction of samples whose top-scoring class equals the label.

    NOTE(review): this definition shadows the ``binary_acc`` imported from
    ``Distributed_HM_Data`` at the top of the notebook — confirm which one is
    meant to be used.

    Args:
        y_pred: Tensor of per-class scores with the classes along dim 1.
        y_test: Tensor holding one class index per sample; a column vector of
            shape (N, 1) is accepted as well as shape (N,).

    Returns:
        Accuracy as a Python float between 0 and 1.

    Raises:
        ValueError: If the number of labels differs from the number of rows
            in ``y_pred``.
        ZeroDivisionError: If the batch is empty (unchanged behaviour).
    """
    # softmax is strictly increasing, so the arg-max of the raw scores already
    # selects the predicted class.
    predicted = y_pred.argmax(dim=1)
    # Flatten the labels so they are compared element-wise instead of being
    # broadcast against the predictions.
    targets = y_test.reshape(-1)
    if targets.shape != predicted.shape:
        raise ValueError(
            "binary_acc expects one label per prediction, got {} labels for {} predictions".format(
                targets.shape[0], predicted.shape[0]
            )
        )
    correct = (predicted == targets).sum().item()
    return correct / targets.shape[0]

def predict(models, dataloader, dataset_name, splitNN=None):
    """Print the accuracy of the split network on ``dataloader``.

    The accuracy is the number of correct predictions divided by the number
    of samples, so a shorter final batch is not over-weighted. The caller is
    responsible for putting the network into evaluation mode first.

    Args:
        models: Unused; kept so existing calls keep working.
        dataloader: Iterable of ``(data_ptr, label)`` batches with local labels.
        dataset_name: Name shown in the printed message.
        splitNN: Network to evaluate. When omitted, the notebook-level
            ``splitnn`` variable is used, as before.

    Returns:
        None; the result is printed.

    Raises:
        ZeroDivisionError: If ``dataloader`` yields no samples.
    """
    if splitNN is None:
        # Backward-compatible fallback to the notebook global.
        splitNN = splitnn

    correct = 0.0
    n_samples = 0
    with torch.no_grad():
        for data_ptr, label in dataloader:
            output = splitNN.forward(data_ptr).get()
            batch_size = label.shape[0]
            # Weight each batch accuracy by the batch size.
            correct += binary_acc(output, label) * batch_size
            n_samples += batch_size

    # '{:.1f}' instead of '{:.3}': three significant digits printed 100.0 as '1e+02'.
    print("Accuracy on dataset {} is ({:.1f}%)".format(dataset_name, 100 * correct / n_samples))

In [7]:
# Prepare the test set and distribute it across the data owners.
hm_test_data = pd.read_csv(dataDir/'medium_test.csv')
test_data = HMSaleTrainDataLoader(hm_test_data, all_products_id)
# Evaluation does not need shuffling; a fixed order removes one source of
# run-to-run variation in how samples are grouped into batches.
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)
distributed_testloader = Distributed_HM(data_owners=data_owners, data_loader=test_loader)

# Accuracy on the test set (uncomment the first call to score the train set too).
splitnn.eval()
# predict(models, distributed_trainloader, "Train set")
predict(models, distributed_testloader, "Test set")

  0%|          | 0/498 [00:00<?, ?it/s]

Accuracy on dataset Test set is (87.7%)


In [8]:
def save_weights(models, file_prefix):
    """Write each model's state_dict to ``{file_prefix}_{key}_weights.pth``.

    Every model is retrieved with ``.get()`` before its state_dict is read.
    NOTE(review): presumably this pulls the remote model back to the local
    worker, so the models are no longer on their workers afterwards — verify.

    Args:
        models: Dict mapping a segment key to its model.
        file_prefix: Path prefix shared by all checkpoint files.
    """
    for loc, model in models.items():
        state = model.get().state_dict()
        torch.save(state, f"{file_prefix}_{loc}_weights.pth")

# Writes Split_RecNN_<model key>_weights.pth for every entry of `models`.
save_weights(models, "Split_RecNN")

# Load weights and reproduce experiments

In [9]:
def load_weights(models, file_prefix):
    """Load ``{file_prefix}_{key}_weights.pth`` into every model of ``models``.

    Does the same as ``SplitNN.load_weights`` but works on a bare dict of
    models. NOTE(review): torch.load unpickles the file — only load
    checkpoints from a trusted source.

    Args:
        models: Dict mapping a segment key to its model; updated in place.
        file_prefix: Path prefix shared by all checkpoint files.
    """
    for loc, model in models.items():
        model.load_state_dict(torch.load(f"{file_prefix}_{loc}_weights.pth"))

In [10]:
# NOTE(review): this cell repeats the training set-up cell almost line by
# line (data loading, cardinalities, model construction, workers). Consider a
# shared helper so the two copies cannot drift apart.
hm_data = pd.read_csv(dataDir/'medium_train.csv')
all_products_id = hm_data["article_id"].unique()
train_data = HMSaleTrainDataLoader(hm_data, all_products_id)
train_loader = DataLoader(train_data, batch_size=1024, shuffle=True)

# set up parameters for model
# Number of distinct values per categorical column, passed to the model
# constructors below.
num_users = len(hm_data.customer_id.unique())
print("num_users:", num_users)
num_items = len(all_products_id)
print("num_items:", num_items)
num_product_groups = len(hm_data.product_group_name.unique())
print("num_product_groups:", num_product_groups)
num_color_groups = len(hm_data.colour_group_name.unique())
print("num_color_groups:", num_color_groups)
num_index_name = len(hm_data.index_name.unique())

# Build fresh, untrained model segments; the saved weights are loaded below.
models = {
    "sales_domain": SalesNN(num_users=num_users, num_items=num_items),
    "customer_domain": CustomersNN(),
    "product_domain": ProductsNN(num_product_groups=num_product_groups, num_color_groups=num_color_groups, num_index_name=num_index_name),
    "server": GovernanceNN(),
    "label_owner": LabelOwner(),
}

# Load the weights locally
# (before the models are sent to their workers).
load_weights(models, "Split_RecNN")

# set up virtual worker
# NOTE(review): when the notebook is run top to bottom this hooks torch and
# creates workers with the same ids a second time — TODO confirm PySyft
# tolerates that.
hook = sy.TorchHook(torch)
sales_domain = sy.VirtualWorker(hook, id="sales_domain")
customer_domain = sy.VirtualWorker(hook, id="customer_domain")
product_domain = sy.VirtualWorker(hook, id="product_domain")
server = sy.VirtualWorker(hook, id="server")
label_owner = sy.VirtualWorker(hook, id="label_owner")

data_owners = (sales_domain, customer_domain, product_domain)
model_locations = [sales_domain, customer_domain, product_domain, server, label_owner]
distributed_trainloader = Distributed_HM(data_owners=data_owners, data_loader=train_loader)

# Send every segment to the worker with the same id.
for location in model_locations:
    models[location.id].send(location)

# set up optimizer for clients' model
# NOTE(review): here the optimizers are created after the models were sent,
# the opposite of the training set-up cell. No cell after this one calls
# step(); the optimizers are only passed to the SplitNN constructor.
optimizers = [
    optim.Adam(models[location.id].parameters(), lr=0.003)
    for location in model_locations
]


  0%|          | 0/26416 [00:00<?, ?it/s]



num_users: 880
num_items: 16450
num_product_groups: 13
num_color_groups: 49


In [12]:
# Prepare the test set and distribute it across the data owners.
hm_test_data = pd.read_csv(dataDir/'medium_test.csv')
test_data = HMSaleTrainDataLoader(hm_test_data, all_products_id)
# Evaluation does not need shuffling; a fixed order removes one source of
# run-to-run variation in how samples are grouped into batches.
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)
distributed_testloader = Distributed_HM(data_owners=data_owners, data_loader=test_loader)

# Wrap the reloaded segments in a new SplitNN.
splitnn = SplitNN(models, optimizers, data_owners, server, label_owner)

# Accuracy on the test set (uncomment the first call to score the train set too).
# NOTE(review): the recorded output of this cell (87.1%) differs from the
# 87.7% recorded right after training although the weights are reloaded from
# disk — TODO find the remaining source of nondeterminism.
# NOTE(review): the execution count jumps from 10 to 12; re-run the notebook
# with Restart & Run All before relying on the recorded numbers.
splitnn.eval()
# predict(models, distributed_trainloader, "Train set")
predict(models, distributed_testloader, "Test set")

  0%|          | 0/498 [00:00<?, ?it/s]

Accuracy on dataset Test set is (87.1%)
