In [3]:
import matplotlib.pyplot as plt
import pandas as pd
from tqdm.notebook import tqdm
from typing import Tuple, Dict
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from random import randint

from tqdm import tqdm_notebook
from einops import rearrange
import faiss
from metrics import apk

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
%ls data

articles.csv  customers.csv  sample_submission.csv  transactions_train.csv


In [3]:
transactions = pd.read_csv('data/transactions_train.csv', dtype={'article_id': str})
transactions.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


## 1. Embeddings of each article:

In [171]:
articles = pd.read_csv('data/articles.csv')
articles.set_index('article_id', inplace=True, drop=False)
articles.head()

Unnamed: 0_level_0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
108775015,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
108775044,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
108775051,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
110065001,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
110065002,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [172]:
def reduce_unique_values(dataframe: pd.DataFrame, col_max_counts: Dict) -> Dict:
    '''
    Takes as input a dataframe, with some columns we'd like embeddings of,
    and maximum unique values per column.
    
    Changes the dataframe inplace, so that only the top n values for each column remain,
    and returns the mappings string -> idx for each column. 
    '''
    
    mappings = {}
    for col in col_max_counts:
        top_n = dataframe[col].value_counts().head(col_max_counts[col]-1).index.to_list() # minus one, to leave room for "other"
        # change all other values to "other"
        if dataframe[col].nunique() > col_max_counts[col]:
            dataframe.loc[~dataframe[col].isin(top_n), col] = 'other'
            top_n.append('other')
        mappings[col] = {top_n_attr: idx for (top_n_attr, idx) in zip(top_n, range(len(top_n)))}
    return mappings

In [173]:
# I want to create embeddings of all these; 
# but first I need to select only those top N values, which have a meaningful number of occurences

select_top_n = {
    'prod_name': 500,
    'product_type_name': 500,
    'product_group_name': 100,    
    'graphical_appearance_name': 100,
    'colour_group_name': 100, 
    'perceived_colour_value_name': 100,
    'perceived_colour_master_name': 100,
    'department_name': 230,
    'index_name': 100,
    'garment_group_name': 100,
    'section_name': 100,
    'garment_group_name': 100
}

mappings_articles = reduce_unique_values(articles, select_top_n)
mappings_articles['perceived_colour_value_name']

{'Dark': 0,
 'Dusty Light': 1,
 'Light': 2,
 'Medium Dusty': 3,
 'Bright': 4,
 'Medium': 5,
 'Undefined': 6,
 'Unknown': 7}

In [174]:
class GroupEmbedding(nn.Module):
    '''
    Given something with a lot of thing we want embeddings of - 
    for example, articles of clothing, or customers,
    and mappings string->index, creates torch.nn.Embedding layers,
    and concatenates their outputs
    '''
    
    def __init__(self, mappings: Dict):
        super().__init__()
        self.mappings = mappings
        self.embeddings = {}
        for col in mappings:
            embedding_dim = 10
            self.embeddings[col] = nn.Embedding(num_embeddings = len(mappings[col]), embedding_dim=10)
            for param in self.embeddings[col].parameters():
                self.register_parameter(name=f'embedding_{col}', 
                                        param=param)
        
    def forward(self, dataframe: pd.DataFrame) -> torch.Tensor:
        embedded_columns = []
        for col in self.mappings:
            strings = dataframe[col]
            indexes = torch.tensor([self.mappings[col][item] for item in strings], dtype=torch.long)
            embedded_columns.append(self.embeddings[col](indexes))
            
        return torch.cat(embedded_columns, axis=1)

In [175]:
article_embedder = GroupEmbedding(mappings_articles)
article_embedder(articles.sample(100))

tensor([[ 0.4207, -0.9936,  0.2123,  ..., -0.9900,  1.6430, -1.2487],
        [ 0.4207, -0.9936,  0.2123,  ...,  0.0239,  0.1084,  0.0634],
        [ 0.4207, -0.9936,  0.2123,  ...,  1.4412, -1.0926,  1.3601],
        ...,
        [ 0.4207, -0.9936,  0.2123,  ...,  0.9818, -1.5221,  1.1575],
        [ 0.4207, -0.9936,  0.2123,  ..., -0.0825, -0.5578,  0.4116],
        [ 0.4207, -0.9936,  0.2123,  ..., -0.1279,  0.6329, -1.1696]],
       grad_fn=<CatBackward0>)

## 2. Embeddings of each customer

In [176]:
customers = pd.read_csv('data/customers.csv')
customers.age.fillna(32, inplace=True) # median age
customers.set_index('customer_id', inplace=True, drop=False)

In [177]:
select_top_n = {
    'customer_id': 100_000_000, # separate embedding for each customer. 
    'age': 100, # all ages are embedded, unless we have really old people. 
    'postal_code': 50_000 # 5 or 6 users for the smallest codes remaining. 
}

mappings_customers = reduce_unique_values(customers, select_top_n)

In [178]:
customer_embedder = GroupEmbedding(mappings_customers)
customer_embedder(customers.sample(10_000)).shape

torch.Size([10000, 30])

## 3. Data Loader

In [100]:
transactions = pd.read_csv('data/transactions_train.csv')

## select a period of 3 months for train, and a week for val
train = transactions[transactions.t_dat.between('2020-06-15', '2020-09-14')]
val = transactions[transactions.t_dat.between('2020-09-15', '2020-09-22')]

In [99]:
class TransactionDataLoader():
    def __init__(self, 
                 customers: pd.DataFrame, 
                 articles: pd.DataFrame, 
                 txns: pd.DataFrame, 
                 negative_samples: int = 10):
        self.customers = customers
        self.articles = articles
        self.txns = txns
#         self.current_idx = 0 
        self.negative_samples = negative_samples # per one positive sample
    
    def __len__(self):
        return len(self.txns)
    
    def get_batch(self, batch_size: int = 1000):
        batch_index_start = randint(0, len(self) - batch_size)
        # for speed purposes, do not sample randomly, but give data iteratively
        batch_txns = self.txns.iloc[batch_index_start:batch_index_start + batch_size]
        batch_customers = self.customers.loc[batch_txns.customer_id]
        positive_articles = self.articles.loc[batch_txns.article_id]

        # now, negative sampling:
        negative_articles = self.articles.sample(batch_size*self.negative_samples)
        labels = torch.ones(batch_size*(1+self.negative_samples))
        labels[batch_size:] = -1
        batch_customers = pd.concat([batch_customers for _ in range(self.negative_samples + 1)])
        batch_articles = pd.concat([positive_articles, negative_articles])
        return batch_customers, batch_articles, labels

In [98]:
tdl = TransactionDataLoader(customers, articles, train, negative_samples=5)

## 4. create models and train

In [214]:
customer_tower = nn.Sequential(
    customer_embedder,
    nn.ReLU(),
    nn.Linear(30, 128),
    nn.Linear(128, 128)
)

article_tower = nn.Sequential(
    article_embedder,
    nn.ReLU(),
    nn.Linear(110, 128),
    nn.ReLU(),
    nn.Linear(128, 128),
    nn.ReLU(),
    nn.Linear(128, 128)
)

# we want a higher learning rate, since we have a lot fewer datapoints
optimizer_customers = torch.optim.Adam(params=customer_tower.parameters(), 
                                       lr = 0.05) 
optimizer_articles = torch.optim.Adam(params = article_tower.parameters(),
                                      lr = 0.001)

In [222]:
writer = SummaryWriter(log_dir='logs/two_tower_v8_cosine_loss')

In [None]:
## Noise contrastive estimation (NCE) loss
## https://sthalles.github.io/simple-self-supervised-learning/

cosine_loss = nn.CosineEmbeddingLoss()
for batch in range(20_000): # around 2 iterations over the train dataset
    batch_cust, batch_articles, batch_labels = tdl.get_batch()
    cust_emb = customer_tower(batch_cust)
    art_emb = article_tower(batch_articles)
    loss = cosine_loss(
        art_emb,
        cust_emb,
        batch_labels
    )
    loss.backward()
    optimizer_articles.step()
    optimizer_customers.step()
    optimizer_articles.zero_grad()
    optimizer_customers.zero_grad()
    writer.add_scalar("Loss/train", loss.item(), batch)

## 4. Evaluate

In [228]:
all_article_embeddings = article_tower(articles)
val_customers = customers[customers.customer_id.isin(val.customer_id.unique())]
val_customers_embeddings = customer_tower(customers)

## normalize by norm
all_article_embeddings = torch.div(all_article_embeddings, 
                                   rearrange(torch.linalg.norm(all_article_embeddings, dim=1), 'w -> w 1'))
val_customers_embeddings = torch.div(val_customers_embeddings, 
                                     rearrange(torch.linalg.norm(val_customers_embeddings, dim=1), 'w -> w 1'))

In [229]:
torch.save(all_article_embeddings, 'article_embeddings.pt')
torch.save(val_customers_embeddings, 'val_customer_embeddings.pt')

In [4]:
all_article_embeddings = torch.load('article_embeddings.pt')
val_customers_embeddings = torch.load('val_customer_embeddings.pt')

In [254]:
index = faiss.IndexFlatIP(128)

In [255]:
index.add(all_article_embeddings.detach().numpy())

### we'll first evaluate for the first 10 thousand customers in the val dataset:

In [257]:
predicted_products = index.search(val_customers_embeddings[:10000].detach().numpy(), k=12)[1]

In [259]:
val_10k_cust = val_customers[:10_000].customer_id.unique()
val_10k = val[val.customer_id.isin(val_10k_cust)]
articles['idx'] = range(len(articles))
val_10k_art = pd.merge(val_10k, articles[['article_id', 'idx']], 
                       how='left', 
                       left_on='article_id', 
                       right_index=True)
actual_bought_articles = [list(val_10k_art[val_10k_art.customer_id == cid].idx.unique()) for cid in val_10k_cust]

### So, awful predictive power. 

In [273]:
mapk(actual_bought_articles, predicted_idx,k=12)

0.0

### Are we at least predicting the right section? Yes. 

In [302]:
## top 12 products predicted from this sections:
articles[articles.idx.isin(predicted_products[7])].section_name

article_id
589599033    Womens Swimwear, beachwear
599580012    Womens Swimwear, beachwear
599580017    Womens Swimwear, beachwear
599580044    Womens Swimwear, beachwear
688728023               Womens Lingerie
730683040              Ladies H&M Sport
730683054              Ladies H&M Sport
759871004                Divided Basics
776237008    Womens Swimwear, beachwear
854679002    Womens Swimwear, beachwear
854683002    Womens Swimwear, beachwear
894140003    Womens Swimwear, beachwear
Name: section_name, dtype: object

In [301]:
## actually bought from the following sections:
articles[articles.idx.isin(actual_bought_articles[7])].section_name

article_id
158340001    Womens Nightwear, Socks & Tigh
448509014                Divided Collection
486639003                   Womens Lingerie
799365027                      Ladies Denim
867966009                      Ladies Denim
879189005                    Kids Outerwear
903762001                    Kids Outerwear
907149001                    Divided Basics
915529003        Womens Everyday Collection
915529005        Womens Everyday Collection
918171001    Womens Nightwear, Socks & Tigh
932798002    Womens Nightwear, Socks & Tigh
935892001    Womens Nightwear, Socks & Tigh
936622001    Womens Nightwear, Socks & Tigh
Name: section_name, dtype: object