In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

# Seed NumPy's global RNG up front so the random customer sampling and the
# negative sampling performed later are repeatable.
np.random.seed(666)

# The raw data directory sits two levels above the current working directory.
dataDir = Path.cwd().parent.parent.joinpath('backup/HM_data')

# Only the transaction log is loaded; the other two tables are left disabled.
transactions = pd.read_csv(dataDir.joinpath('transactions.csv'))
# articles = pd.read_csv(dataDir/'articles.csv')
# customers = pd.read_csv(dataDir/'customers.csv')

In [None]:
# Number of distinct customers in the transaction log (a missing id, if any,
# is counted once, exactly as len(unique()) would).
transactions['customer_id'].nunique(dropna=False)

In [None]:
# Keep a random 10% of the customers (sampled without replacement).
# NOTE: this cell overwrites `transactions`; re-running it without reloading
# the CSV samples again from the already reduced frame.
SAMPLE_FRACTION = 0.1
all_customer_ids = transactions['customer_id'].unique()
rand_userIds = np.random.choice(all_customer_ids,
                                size=int(len(all_customer_ids) * SAMPLE_FRACTION),
                                replace=False)

# .copy() detaches the subset from the full frame, so the column assignments
# below write to an independent DataFrame instead of a slice (assigning to a
# slice is what triggers pandas' SettingWithCopyWarning).
transactions = transactions.loc[transactions['customer_id'].isin(rand_userIds)].copy()
transactions["interaction"] = 1

print('There are {} rows of data from {} users'.format(len(transactions), len(rand_userIds)))

# Map every customer_id to a consecutive integer starting at 1, numbered in
# order of first appearance in the sampled frame.
customer_mapper = {
    customer: index
    for index, customer in enumerate(transactions['customer_id'].unique(), start=1)
}

transactions["customer_id"] = transactions["customer_id"].map(customer_mapper)
transactions.head()

In [None]:
# training set and test set

# Rank each customer's transactions by t_dat in descending order, so rank 1 is
# the latest one (assuming t_dat sorts chronologically). method='first' breaks
# ties by row order, so exactly one row per customer gets rank 1.
# .assign() returns a new frame instead of writing a column into `transactions`
# in place, which avoids SettingWithCopyWarning when `transactions` is a
# filtered slice and makes the cell safe to re-run.
transactions = transactions.assign(
    rank_latest=transactions.groupby(['customer_id'])['t_dat'].rank(method='first', ascending=False)
)

# Leave-one-out split: each customer's rank-1 row is held out for testing and
# every other row is used for training.
# NOTE(review): a customer with a single transaction ends up only in the test
# set and has no training rows.
is_latest = transactions['rank_latest'] == 1
train_transactions = transactions[~is_latest]
test_transactions = transactions[is_latest]

# get a list of all articles id
all_products_id = transactions["article_id"].unique()

# drop columns that we no longer need
# train_transactions = train_transactions[['customer_id', 'article_id', 'price']]
# test_transactions = test_transactions[['customer_id', 'article_id', 'price']]
# comb_transactions = train_transactions.groupby(by=["customer_id", "article_id"], sort=False, as_index=False).sum(["interaction"])

In [None]:
class HMSaleTrainDataLoader(Dataset):
    """Training set of HM sales data with randomly sampled negatives.

    Despite its name this is a ``torch.utils.data.Dataset``; wrap it in a
    ``DataLoader`` to obtain batches. Every distinct (customer, product) pair
    found in ``transactions`` yields one positive sample (label 1) followed by
    ``negative_samples`` products drawn at random that the customer has not
    interacted with (label 0).

    Args:
        transactions (pd.DataFrame): Transaction records providing the
            ``customer_id`` and ``article_id`` columns. Only this frame is
            used to build the samples.
        all_products_id (array-like): All product ids negatives are drawn from.
        negative_samples (int): Number of negatives generated per positive.
            Defaults to 4 (a 4:1 negative:positive ratio).
    """
    def __init__(self, transactions, all_products_id, negative_samples=4):
        self.customers, self.products, self.labels = self.get_dataset(
            transactions, all_products_id, negative_samples
        )

    def __len__(self):
        """Return the total number of samples (positives plus negatives)."""
        return len(self.customers)

    def __getitem__(self, idx):
        """Return the ``(customer, product, label)`` tensors at position ``idx``."""
        return self.customers[idx], self.products[idx], self.labels[idx]

    def get_dataset(self, transactions, all_products_id, negative_samples=4):
        """Build the customer, product and label tensors.

        Args:
            transactions (pd.DataFrame): Frame with ``customer_id`` and
                ``article_id`` columns.
            all_products_id (array-like): Candidate product ids for negatives.
            negative_samples (int): Negatives to draw per positive pair.

        Returns:
            tuple: Three tensors ``(customers, products, labels)`` of equal
            length, where ``labels`` is 1 for observed pairs and 0 for
            sampled negatives.

        Negatives are drawn with ``np.random.choice``, i.e. from NumPy's
        global RNG; seed it beforehand for repeatable results. A customer who
        has already interacted with every candidate product receives no
        negatives, because none exists.
        """
        customers, products, labels = [], [], []

        # Convert once so np.random.choice does not re-convert a list on every draw.
        all_products_id = np.asarray(all_products_id)
        catalogue = set(all_products_id.tolist())

        # Use the frame that was passed in, not a notebook-level global.
        customer_product_set = set(zip(transactions['customer_id'], transactions['article_id']))

        # Products seen per customer: used for the rejection test and to
        # detect customers for whom no negative can exist.
        purchased_by_customer = {}
        for customer, product in customer_product_set:
            purchased_by_customer.setdefault(customer, set()).add(product)

        for u, i in tqdm(customer_product_set):
            customers.append(u)
            products.append(i)
            labels.append(1)

            bought = purchased_by_customer[u]
            if catalogue <= bought:
                # Every candidate is a positive for this customer; rejection
                # sampling would never terminate, so emit no negatives.
                continue

            for _ in range(negative_samples):
                # Rejection sampling: redraw until the product is unseen for u.
                negative_product = np.random.choice(all_products_id)
                while negative_product in bought:
                    negative_product = np.random.choice(all_products_id)
                customers.append(u)
                products.append(negative_product)
                labels.append(0)
        return torch.tensor(customers), torch.tensor(products), torch.tensor(labels)
    

In [None]:
# Build the dataset from the training split only, so the rows held out in
# test_transactions never contribute positive samples.
HEHE = HMSaleTrainDataLoader(train_transactions, all_products_id)

In [None]:
# Inspect one sample: a (customer, product, label) triple.
HEHE[1]