In [1]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

import syft as sy
import numpy as np
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm
np.random.seed(666)

hook = sy.TorchHook(torch)
from Distributed_HM_Data import Distributed_HM, binary_acc

dataDir = Path.cwd().parent/'Data/'
print("Test")

Test


In [2]:
class HMSaleTrainDataLoader(Dataset):
    """HMSaleTrainDataLoader Training set of HM sales data

    Args:
        transactions (pd.DataFrame): Dataframe of transaction records
        all_products_id (list): A list contains all product ids
    """
    def __init__(self, transactions, all_products_id):
        self.customers, self.products, self.prices, self.sales_channels, \
        self.club_status, self.age_groups, self.product_groups, self.color_groups, \
        self.index_name, self.labels = self.get_dataset(transactions, all_products_id)

    def __len__(self):
        return len(self.customers)
    
    def __getitem__(self, idx):
        return self.customers[idx], self.products[idx], self.prices[idx], self.sales_channels[idx], self.club_status[idx], \
               self.age_groups[idx], self.product_groups[idx], self.color_groups[idx], self.index_name[idx], self.labels[idx]
    
    def get_dataset(self, transactions, all_products_id):
        customers, products, prices, sales_channels, club_status, age_groups, product_groups, color_groups, index_name, labels  = [], [], [], [], [], [], [], [], [], []
        customer_product_set = set(zip(transactions["customer_id"], transactions["article_id"], 
                                       transactions["price"], transactions["sales_channel_id"], 
                                       transactions["club_member_status"], transactions["age"], 
                                       transactions["product_group_name"], transactions["colour_group_name"], transactions["index_name"]))
        
        """negative sampling"""
        # set up negative:positive ratio as 4:1
        negative_samples = 4

        for u, i, price, sale, club, age, product, color, index in tqdm(customer_product_set):
            customers.append(u)
            products.append(i)
            prices.append(price)
            sales_channels.append(sale)
            club_status.append(club)
            age_groups.append(age)
            product_groups.append(product)
            color_groups.append(color)
            index_name.append(index)
            labels.append(1)
            for _ in range(negative_samples):
                negative_product = np.random.choice(all_products_id)
                while (u, negative_product, price, sale, club, age, product, color, index) in customer_product_set:
                    negative_product = np.random.choice(all_products_id)
                customers.append(u)
                products.append(negative_product)
                prices.append(price)
                sales_channels.append(sale)
                club_status.append(club)
                age_groups.append(age)
                product_groups.append(product)
                color_groups.append(color)
                index_name.append(index)
                labels.append(0)
        return torch.tensor(customers), torch.tensor(products), torch.tensor(prices), torch.tensor(sales_channels), \
               torch.tensor(club_status), torch.tensor(age_groups), torch.tensor(product_groups), torch.tensor(color_groups), \
               torch.tensor(index_name), torch.tensor(labels)


In [3]:
hm_data = pd.read_csv(dataDir/'small_train.csv')
all_products_id = hm_data["article_id"].unique()
train_data = HMSaleTrainDataLoader(hm_data, all_products_id)
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)

  0%|          | 0/1403 [00:00<?, ?it/s]

In [4]:
# set up virtual worker
sales_domain = sy.VirtualWorker(hook, id="sales_domain")
customer_domain = sy.VirtualWorker(hook, id="customer_domain")
product_domain = sy.VirtualWorker(hook, id="product_domain")
server = sy.VirtualWorker(hook, id="server")

data_owners = (sales_domain, customer_domain, product_domain)
model_locations = [sales_domain, customer_domain, product_domain, server]

distributed_trainloader = Distributed_HM(data_owners=data_owners, data_loader=train_loader)

In [89]:
class SalesNN(nn.Module):
    """ Partial model for sales domain
    Args:
        num_users (int): Number of users
        num_items (int): Number of products
        prices (float): price of transactions
        sales_channels (float): sales channels
    
    """
    def __init__(
            self, 
            num_users: int, 
            num_items: int,
            input_size: int = 194,
            user_embedding_dim: int = 64,
            item_embedding_dim: int = 128,
            hidden_size_1: int = 64,
            hidden_size_2: int = 256,
            output_size: int = 32,
        ):
        super().__init__()
        self.user_embedding_layer = nn.Embedding(num_embeddings=num_users, embedding_dim=user_embedding_dim)
        self.item_embedding_layer = nn.Embedding(num_embeddings=num_items, embedding_dim=item_embedding_dim)
        self.relu = nn.LeakyReLU()
        self.encoder = nn.Sequential()
        if num_users and num_items is not None:
                in_channels = (
                    [input_size] 
                    + [hidden_size_2]
                    + [hidden_size_1]
                    + [output_size]
                )
        else:
            raise ValueError
        for i in range(len(in_channels)-1):
            if i != len(in_channels)-1:
                self.encoder.append(nn.Linear(in_features=in_channels[i], out_features=in_channels[i+1]))
        
    def forward(self, user_input, item_input, prices, sales_channels):
        user_embedding = self.user_embedding_layer(user_input)
        item_embedding = self.item_embedding_layer(item_input)
        latent_vec = torch.cat([user_embedding, item_embedding, prices, sales_channels], dim=-1)
        
        for layer in self.encoder:
            latent_vec = layer(latent_vec)
            latent_vec = self.relu(latent_vec)
        
        return latent_vec

class CustomersNN(nn.Module):
    """ Partial model for customer domain
    Args:
        club_status (int): active or inactive customers' status
        age_groups (int): age of customers
    
    """
    def __init__(
            self,
            input_size: int = 2,
            hidden_size_1: int = 16,
            hidden_size_2: int = 32,
            output_size: int = 8,
        ):
        super().__init__()
        self.relu = nn.LeakyReLU()
        self.encoder = nn.Sequential()
        in_channels = (
                    [input_size] 
                    + [hidden_size_2]
                    + [hidden_size_1]
                    + [output_size]
                )
        for i in range(len(in_channels)-1):
            if i != len(in_channels)-1:
                self.encoder.append(nn.Linear(in_features=in_channels[i], out_features=in_channels[i+1]))
        
    def forward(self, club_status, age_groups):
        
        latent_vec = torch.cat([club_status, age_groups], dim=-1)
        
        for layer in self.encoder:
            latent_vec = layer(latent_vec)
            latent_vec = self.relu(latent_vec)
        
        return latent_vec

class ProductsNN(nn.Module):
    """ Partial model for product domain
    Args:
        num_product_groups (int): Number of product groups
        num_color_groups: (int): Number of color groups
        num_index_name: (int): Number of index name
    
    """
    def __init__(
            self,
            num_product_groups: int,
            num_color_groups: int,
            num_index_name: int,
            product_group_embedding_dim: int = 8,
            color_group_embedding_dim: int = 16,
            index_name_embedding_dim: int = 6,
            input_size: int = 30,
            hidden_size_1: int = 32,
            hidden_size_2: int = 64,
            output_size: int = 16,
        ):
        super().__init__()
        self.product_group_embedding_layer = nn.Embedding(num_embeddings=num_product_groups, embedding_dim=product_group_embedding_dim)
        self.color_group_embedding_layer = nn.Embedding(num_embeddings=num_color_groups, embedding_dim=color_group_embedding_dim)
        self.index_name_embedding_layer = nn.Embedding(num_embeddings=num_index_name, embedding_dim=index_name_embedding_dim)

        self.relu = nn.LeakyReLU()
        self.encoder = nn.Sequential()
        if num_product_groups and num_color_groups and num_index_name is not None:
                in_channels = (
                    [input_size] 
                    + [hidden_size_2]
                    + [hidden_size_1]
                    + [output_size]
                )
        else:
            raise ValueError
        for i in range(len(in_channels)-1):
            if i != len(in_channels)-1:
                self.encoder.append(nn.Linear(in_features=in_channels[i], out_features=in_channels[i+1]))
        
    def forward(self, product_groups, color_groups, index_name):
        product_group_embedding = self.product_group_embedding_layer(product_groups)
        color_group_embedding = self.color_group_embedding_layer(color_groups)
        index_name_embedding = self.index_name_embedding_layer(index_name)
        latent_vec = torch.cat([product_group_embedding, color_group_embedding, index_name_embedding], dim=-1)
        
        for layer in self.encoder:
            latent_vec = layer(latent_vec)
            latent_vec = self.relu(latent_vec)
        
        return latent_vec

class GovernanceNN(nn.Module):
    """ Partial model for goverance side
    Args:
        agg_latent_input (int): aggregated input of latent vectors from client models
    
    """
    def __init__(
            self,
            input_size: int = 56,
            hidden_size_1: int = 128,
            hidden_size_2: int = 64,
            output_size: int = 2,
        ):
        super().__init__()
        self.relu = nn.LeakyReLU()
        self.decoder = nn.Sequential()
        in_channels = (
                    [input_size] 
                    + [hidden_size_2]
                    + [hidden_size_1]
                    + [output_size]
                )
        for i in range(len(in_channels)-1):
            if i != len(in_channels)-1:
                self.decoder.append(nn.Linear(in_features=in_channels[i], out_features=in_channels[i+1]))
        
    def forward(self, agg_latent_input):
        
        for layer in self.decoder:
            agg_latent_input = layer(agg_latent_input)
            agg_latent_input = self.relu(agg_latent_input)
        
        out = agg_latent_input
        
        return out

In [None]:
# embedding_size = [16, 32, 10, 8, 6]
# input_size = [50, 2, 24]
# hidden_size = [16, 32, 64, 128]
# output_size = [112, 2]

# class ConcatLayer(nn.Module):
#     def __init__(self, num_inputs):
#         super().__init__()
#         self.num_inputs = num_inputs
        
#     def forward(self, *inputs):
#         squeezed_inputs = [torch.squeeze(x, dim=0) for x in inputs]
#         return torch.cat(squeezed_inputs, dim=-1)

# models = {
#     "sales_domain": nn.ModuleList([
#                     nn.Embedding(num_embeddings=num_users, embedding_dim=embedding_size[0]),
#                     nn.Embedding(num_embeddings=num_items, embedding_dim=embedding_size[1]),
#                     ConcatLayer(num_inputs=3),
#                     nn.Sequential(
#                         nn.Linear(in_features=input_size[0], out_features=hidden_size[3], bias=True),
#                         nn.LeakyReLU(),
#                         nn.Linear(in_features=hidden_size[3], out_features=hidden_size[2], bias=True),
#                         nn.LeakyReLU()
#                     )
#     ]),
#     "customer_domain": nn.Sequential(
#                        nn.Linear(in_features=input_size[1], out_features=hidden_size[1], bias=True),
#                        nn.LeakyReLU(),
#                        nn.Linear(in_features=hidden_size[1], out_features=hidden_size[0], bias=True),
#                        nn.LeakyReLU()
    
#     ),
#     "product_domain": nn.ModuleList([
#                       nn.Embedding(num_embeddings=num_product_groups, embedding_dim=embedding_size[2]),
#                       nn.Embedding(num_embeddings=num_color_groups, embedding_dim=embedding_size[3]),
#                       nn.Embedding(num_embeddings=num_index_name, embedding_dim=embedding_size[4]),
#                       ConcatLayer(num_inputs=3),
#                       nn.Sequential(
#                           nn.Linear(in_features=input_size[2], out_features=hidden_size[2], bias=True),
#                           nn.LeakyReLU(),
#                           nn.Linear(in_features=hidden_size[2], out_features=hidden_size[1], bias=True),
#                           nn.LeakyReLU()
#                       )
#     ]),
#     "server": nn.Sequential(
#               nn.Linear(in_features=output_size[0], out_features=hidden_size[2], bias=True),
#               nn.LeakyReLU(),
#               nn.Linear(in_features=hidden_size[2], out_features=hidden_size[0], bias=True),
#               nn.LeakyReLU(),
#               nn.Linear(in_features=hidden_size[0], out_features=output_size[1], bias=True),
#     )
# }

In [73]:
# set up parameters for model
num_users = len(hm_data.customer_id.unique())
print("num_users:", num_users)
num_items = len(all_products_id)
print("num_items:", num_items)
num_product_groups = len(hm_data.product_group_name.unique())
print("num_product_groups:", num_product_groups)
num_color_groups = len(hm_data.colour_group_name.unique())
print("num_color_groups:", num_color_groups)
num_index_name = len(hm_data.index_name.unique())

models = {
    "sales_domain": SalesNN(num_users=num_users, num_items=num_items),
    "customer_domain": CustomersNN(),
    "product_domain": ProductsNN(num_product_groups=num_product_groups, num_color_groups=num_color_groups, num_index_name=num_index_name),
    "server": GovernanceNN(),
}

# set up optimizer for clients' model
optimizers = [
    optim.SGD(models[location.id].parameters(), lr=0.05)
    for location in model_locations
]

for location in model_locations:
    models[location.id].send(location)

num_users: 44
num_items: 1355
num_product_groups: 9
num_color_groups: 44


In [84]:
def predict(data_pointer, models, data_owners, server):
    
    # receive output from each clients
    client_output = {}
    
    # aggregated client output for server side
    remote_output = []
    
    for owner in data_owners:
        if owner.id == "sales_domain":
            sales_part_ptr = models[owner.id](data_pointer[owner.id][0], data_pointer[owner.id][1], data_pointer[owner.id][2], data_pointer[owner.id][3])
            
            print(sales_part_ptr.get())
            print(sales_part_ptr.get().size())
            print("TEST")
            
            client_poutput[owner.id] = models[owner.id][3](concat_part_ptr)
            remote_output.append(
                client_output[owner.id].move(server)
            )
        elif owner.id == "customer_domain":
            client_output[owner.id] = models[owner.id](data_pointer[owner.id])
            remote_output.append(
                client_output[owner.id].move(server)
            )
        elif owner.id == "product_domain":
            part_1 = models[owner.id](data_pointer[owner.id][0])
            part_2 = models[owner.id](data_pointer[owner.id][1])
            part_3 = models[owner.id](data_pointer[owner.id][2])
            concat_part_ptr = models[owner.id][3](part_1, part_2, part_3)
            client_output[owner.id] = models[owner.id][3](concat_part_ptr)
            remote_output.append(
                client_output[owner.id].move(server)
            )
    
def train(data_pointer, label, data_owners, models, optmizers, server):
    for opt in optimizers:
        opt.zero_grad()
    
    pred = predict(data_pointer, models, data_owners, server)
    acc = binary_acc(pred, label)
    
    # loss function
    loss_fn = nn.CrossEntropyLoss()
    loss = loss_fn(pred, label)
    loss.backward()
    
    for opt in optimizers:
        opt.step()
    
    return loss.detach().get(), acc
    

In [85]:
# model training
epochs = 20

for i in range(epochs):
    epoch_loss = 0.0
    epoch_acc = 0.0
    
    #  iterate over each datapoints 
    for data_ptr, label in distributed_trainloader:
        
        # send labels to server's location for training
        label = label.send(server)
        
        loss, acc = train(data_ptr, label, data_owners, models, optimizers, server)
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    if cur_acc > best_acc:
        best_acc = cur_acc
        # torch.save(model.state_dict(), 'best_model.pt')
    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(trainloader):.6f} | Acc: {epoch_acc/len(trainloader):.4f}')

print(f'\nBest Accuracy: {best_acc:.3f}')

tensor([[ 4.9529e-01,  6.2474e-01, -8.9927e-01,  ...,  1.2804e+00,
          3.2604e-01,  4.5298e-01],
        [-1.5664e+00,  1.6941e+00, -2.0681e-03,  ..., -1.9709e+00,
         -1.0262e+00,  2.1196e+00],
        [ 4.9529e-01,  6.2474e-01, -8.9927e-01,  ..., -9.1416e-01,
          1.9485e-01, -1.1253e+00],
        ...,
        [-8.7036e-01, -1.2154e+00,  9.0904e-01,  ...,  4.1737e-01,
         -3.9016e-01, -4.2336e-01],
        [ 6.1625e-01,  6.6836e-01,  8.5924e-02,  ..., -9.1518e-01,
         -7.8924e-01,  5.0368e-01],
        [ 8.0404e-01, -7.6087e-01, -6.6247e-01,  ...,  1.0041e+00,
         -1.5550e-01,  1.3555e+00]], grad_fn=<CatBackward>)
torch.Size([64, 48])
TEST


IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)