In [1]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

import syft as sy
import numpy as np
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm
np.random.seed(666)

hook = sy.TorchHook(torch)
from Distributed_HM_Data import Distributed_HM, binary_acc

dataDir = Path.cwd().parent/'Data/'
print("Test")

Test


In [2]:
class HMSaleTrainDataLoader(Dataset):
    """HMSaleTrainDataLoader Training set of HM sales data

    Args:
        transactions (pd.DataFrame): Dataframe of transaction records
        all_products_id (list): A list contains all product ids
    """
    def __init__(self, transactions, all_products_id):
        self.customers, self.products, self.prices, self.sales_channels, \
        self.club_status, self.age_groups, self.product_groups, self.color_groups, \
        self.index_name, self.labels = self.get_dataset(transactions, all_products_id)

    def __len__(self):
        return len(self.customers)
    
    def __getitem__(self, idx):
        return self.customers[idx], self.products[idx], self.prices[idx], self.sales_channels[idx], self.club_status[idx], \
               self.age_groups[idx], self.product_groups[idx], self.color_groups[idx], self.index_name[idx], self.labels[idx]
    
    def get_dataset(self, transactions, all_products_id):
        customers, products, prices, sales_channels, club_status, age_groups, product_groups, color_groups, index_name, labels  = [], [], [], [], [], [], [], [], [], []
        customer_product_set = set(zip(transactions["customer_id"], transactions["article_id"], 
                                       transactions["price"], transactions["sales_channel_id"], 
                                       transactions["club_member_status"], transactions["age"], 
                                       transactions["product_group_name"], transactions["colour_group_name"], transactions["index_name"]))
        
        """negative sampling"""
        # set up negative:positive ratio as 4:1
        negative_samples = 4

        for u, i, price, sale, club, age, product, color, index in tqdm(customer_product_set):
            customers.append(u)
            products.append(i)
            prices.append(price)
            sales_channels.append(sale)
            club_status.append(club)
            age_groups.append(age)
            product_groups.append(product)
            color_groups.append(color)
            index_name.append(index)
            labels.append(1)
            for _ in range(negative_samples):
                negative_product = np.random.choice(all_products_id)
                while (u, negative_product, price, sale, club, age, product, color, index) in customer_product_set:
                    negative_product = np.random.choice(all_products_id)
                customers.append(u)
                products.append(negative_product)
                prices.append(price)
                sales_channels.append(sale)
                club_status.append(club)
                age_groups.append(age)
                product_groups.append(product)
                color_groups.append(color)
                index_name.append(index)
                labels.append(0)
        return torch.tensor(customers), torch.tensor(products), torch.tensor(prices), torch.tensor(sales_channels), \
               torch.tensor(club_status), torch.tensor(age_groups), torch.tensor(product_groups), torch.tensor(color_groups), \
               torch.tensor(index_name), torch.tensor(labels)


In [3]:
hm_data = pd.read_csv(dataDir/'small_train.csv')
all_products_id = hm_data["article_id"].unique()
train_data = HMSaleTrainDataLoader(hm_data, all_products_id)
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)

  0%|          | 0/1403 [00:00<?, ?it/s]

In [4]:
# set up virtual worker
sales_domain = sy.VirtualWorker(hook, id="sales_domain")
customer_domain = sy.VirtualWorker(hook, id="customer_domain")
product_domain = sy.VirtualWorker(hook, id="product_domain")
server = sy.VirtualWorker(hook, id="server")

data_owners = (sales_domain, customer_domain, product_domain)
model_locations = [sales_domain, customer_domain, product_domain, server]

distributed_trainloader = Distributed_HM(data_owners=data_owners, data_loader=train_loader)

In [None]:
class SplitNN(torch.nn.modules):
    def __init__(self, models, optimizers, data_owner, server):
        self.data_owner = 

In [11]:
# set up parameters for model
num_users = len(train_transactions.customer_id.unique())
print("num_users:", num_users)
num_items = len(all_products_id)
print("num_items:", num_items)
num_product_groups = len(train_transactions.product_group_name.unique())
print("num_product_groups:", num_product_groups)
num_color_groups = len(train_transactions.colour_group_name.unique())
print("num_color_groups:", num_color_groups)
num_index_name = len(train_transactions.index_name.unique())

embedding_size = [16, 32, 10, 8, 6]
input_size = [50, 2, 24]
hidden_size = [64, 128, 256]

models = {
    "sales_domain": nn.ModuleList([
                    nn.Embedding(num_embeddings=num_users, embedding_dim=embedding_size[0]),
                    nn.Embedding(num_embeddings=num_items, embedding_dim=embedding_size[1]),
                    nn.Sequential(
                        nn.Linear(in_features=input_size[0], out_features=hidden_size[], bias=True),
                        nn.LeakyReLU(),
                        nn.Linear(),
                        nn.LeakyReLU()
                    )
    ]),
    "customer_domain": nn.Sequential(
                       nn.Linear(in_features=input_size[1], out_features=hidden_size[], bias=True),
                       nn.LeakyReLU(),
                       nn.Linear(),
                       nn.LeakyReLU()
    
    ),
    "product_domain": nn.ModuleList([
                      nn.Embedding(num_embeddings=num_product_groups, embedding_dim=embedding_size[2]),
                      nn.Embedding(num_embeddings=num_color_groups, embedding_dim=embedding_size[3])
                      nn.Embedding(num_embeddings=num_index_name, embedding_dim=embedding_size[4])
                      nn.Sequential(
                          nn.Linear(in_features=input_size[2], out_features=hidden_size[], bias=True),
                          nn.LeakyReLU(),
                          nn.Linear(),
                          nn.LeakyReLU()
                      )
    ]),
    "server": nn.Sequential(
              nn.Linear(),
              nn.LeakyReLU(),
              nn.Linear(),
              nn.LeakyReLU(),
              nn.Linear(),
              nn.LeakyReLU(),
              nn.Linear(),
              nn.LeakyReLU(),
              nn.Linear(),
    )
}

SyntaxError: invalid syntax (2874729936.py, line 4)