# Two tower model

In this notebook I implement my two-tower model. It consists of two sub-models, each of which learns an embedding: one learns the item (article) embedding and the other learns the user (customer) embedding. I use PyTorch for the neural network implementations. Additional resources are required to train the models on the full transaction dataset.

In [4]:
# Necessary additional packages
!pip install pytorch-nlp

Collecting pytorch-nlp
  Downloading pytorch_nlp-0.5.0-py3-none-any.whl (90 kB)
[K     |████████████████████████████████| 90 kB 2.3 MB/s eta 0:00:011
Installing collected packages: pytorch-nlp
Successfully installed pytorch-nlp-0.5.0


In [5]:
# imports
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim
from torchnlp.encoders import LabelEncoder

In [6]:
# Hyperparameters (to fine-tune)

# learning rate handed to the Adam optimizer
lr = 0.05

# number of (customer, article) transactions per mini-batch in the dataloaders
batch_size = 512

# size of the user and item embedding vectors
embed_dim = 128

# loss applied to the score matrix inside the custom loss wrapper
internal_criterion = nn.CrossEntropyLoss()

In [7]:
# Load data into dataframes -> paths need to be adjusted based on where the notebook is run.
# Only the transaction log is used below; the article and customer tables are left commented out.
# article_df = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/articles.csv")
# customer_df = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/customers.csv")
transactions_df = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv")

# Split data by transaction date:
# training uses 2020-09-05 through 2020-09-10 (both inclusive), testing uses 2020-09-15 onwards.
train_df = transactions_df[transactions_df['t_dat'].between('2020-09-05', '2020-09-10')]
test_df = transactions_df[transactions_df['t_dat'].ge('2020-09-15')]

# Encoders to map customer and article ids to numerical indices.
# Index 0 is reserved for the 'unknown' label, which is also used for ids not seen in training.
customer_encoder = LabelEncoder(
    train_df['customer_id'].unique(),
    reserved_labels=['unknown'],
    unknown_index=0,
)
article_encoder = LabelEncoder(
    train_df['article_id'].unique(),
    reserved_labels=['unknown'],
    unknown_index=0,
)


FileNotFoundError: [Errno 2] No such file or directory: '../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv'

In [273]:
# Custom dataset (for more flexibility if needed)
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that yields one (customer_id, article_id) pair per row.

    `transactions` is expected to be a DataFrame with 'customer_id' and
    'article_id' columns; rows are addressed by position, not by index label.
    """

    def __init__(self, transactions):
        self.transactions = transactions

    def __len__(self) -> int:
        """Returns the total number of samples."""
        return len(self.transactions)

    def __getitem__(self, index: int):
        """Return the (customer_id, article_id) pair stored at position `index`."""
        # Selecting with a one-element list keeps a one-row frame, so each column
        # is a length-1 Series that .item() unwraps into a plain scalar.
        sample = self.transactions.iloc[[index]]
        customer_id, article_id = (
            sample[column].item() for column in ('customer_id', 'article_id')
        )
        return customer_id, article_id


In [274]:
# Dataset and dataloader initialization
train_data = CustomDataset(train_df[['customer_id','article_id']])
test_data = CustomDataset(test_df[['customer_id', 'article_id']])

# dataloaders
# Training reshuffles every epoch; drop_last keeps every training batch at exactly batch_size rows.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, drop_last=True)
# Evaluation should see every test transaction exactly once: shuffling adds nothing, and dropping
# the last partial batch would leave samples unscored while the recall is still divided by the
# full dataset size.
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False, drop_last=False)

In [275]:
# Create Two tower model using PyTorch
class TwoTower(nn.Module):
    """Two-tower model: a user tower and an item tower scored by dot product.

    Each tower is an embedding table followed by a small MLP
    (embedding_dim -> 64 -> 32, LeakyReLU after each linear layer).

    Args:
        n_users: number of rows in the user embedding table.
        n_items: number of rows in the item embedding table.
        embedding_dim: width of both embedding tables. When None (the default)
            the notebook-level `embed_dim` hyperparameter is used.
    """

    # In our case, items are articles and users are customers
    def __init__(self, n_users, n_items, embedding_dim=None):
        super().__init__()

        if embedding_dim is None:
            embedding_dim = embed_dim

        self.user_embedding = nn.Embedding(num_embeddings=n_users, embedding_dim=embedding_dim)
        self.item_embedding = nn.Embedding(num_embeddings=n_items, embedding_dim=embedding_dim)

        # The tower input width follows the embedding width, so changing the
        # embedding size no longer causes a shape mismatch in the first linear layer.
        self.user_layers = self._build_tower(embedding_dim)
        self.item_layers = self._build_tower(embedding_dim)

        self.dot_product = torch.matmul

    @staticmethod
    def _build_tower(input_dim):
        """Build one tower MLP: input_dim -> 64 -> 32 with LeakyReLU activations."""
        return nn.Sequential(
            nn.Linear(input_dim, 64, bias=True),
            nn.LeakyReLU(),
            nn.Linear(64, 32, bias=True),
            nn.LeakyReLU(),
        )

    def forward(self, users, items):
        """Score every given user against every given item.

        For 1-D index tensors `users` and `items` the result is a matrix of
        shape (len(users), len(items)) whose entry [i, j] is the dot product of
        user i's and item j's tower outputs.
        """
        user_embedding = self.user_layers(self.user_embedding(users))
        item_embedding = self.item_layers(self.item_embedding(items))

        return self.dot_product(user_embedding, item_embedding.t())

In [276]:
# Empirical probability of each article among the training transactions
# -> necessary for the in-batch negative sampling correction
article_counts = train_df.groupby('article_id').size().to_dict()
amount_of_transactions = train_df.shape[0]
article_probs = {
    article: count / amount_of_transactions
    for article, count in article_counts.items()
}

# Custom loss class, to separate the training and evaluation loss computation
class CustomLoss:
    """Cross-entropy style loss with an in-batch sampling correction for training.

    Args:
        article_probs: dict mapping a raw article id to its sampling probability.
        encoder: object whose `batch_decode` turns encoded article indices back
            into raw article ids. When None, the notebook-level
            `article_encoder` is used.
        base_criterion: callable `(scores, targets) -> loss`. When None, the
            notebook-level `internal_criterion` is used.
    """

    def __init__(self, article_probs, encoder=None, base_criterion=None):
        self.article_probs = article_probs
        self.encoder = encoder
        self.base_criterion = base_criterion

    def __call__(self, predicted_values, true_values, training):
        """Compute the loss for one batch.

        When `training` is true, `predicted_values` is treated as an in-batch
        score matrix whose column j belongs to the j-th article of the batch:
        the log sampling probability of each column's article is subtracted
        from that column, and the target of row i becomes column i.
        Otherwise the scores and `true_values` are passed to the base
        criterion unchanged.
        """
        loss_fn = self.base_criterion if self.base_criterion is not None else internal_criterion

        if training:
            encoder = self.encoder if self.encoder is not None else article_encoder
            decoded_values = encoder.batch_decode(true_values)
            # Built on the same device/dtype as the scores so the subtraction
            # also works when the scores do not live on the CPU.
            true_value_probs = torch.tensor(
                [self.article_probs[article] for article in decoded_values],
                dtype=predicted_values.dtype,
                device=predicted_values.device,
            )
            predicted_values = predicted_values - torch.log(true_value_probs)
            # Size the targets from the batch actually received rather than a
            # global batch size, so a partial batch is handled correctly.
            true_values = torch.arange(predicted_values.size(0), device=predicted_values.device)

        return loss_fn(predicted_values, true_values)

# Loss instance called by both the training and the evaluation function
criterion = CustomLoss(article_probs)

In [277]:
# Device selection: use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Selected device: {device}')

Selected device: cpu


In [278]:
# Training function
def train(model, device, dataloader, optimizer):
    """Run one training epoch and return the sum of the per-batch losses.

    Each batch of raw (customer_id, article_id) pairs is encoded with the
    notebook-level encoders, scored by `model`, and optimized against the
    notebook-level `criterion` in training mode. `device` is accepted but not
    used by this function.
    """
    model.train()
    n_batches = len(dataloader)
    batch_losses = []
    for step, (customers, articles) in enumerate(dataloader):
        print(f"{step}/{n_batches}")
        customer_ids = customer_encoder.batch_encode(customers)
        article_ids = article_encoder.batch_encode(articles.tolist())

        optimizer.zero_grad()

        scores = model(customer_ids, article_ids)
        batch_loss = criterion(scores, article_ids, training=True)

        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())
    return sum(batch_losses)
        
# Evaluation function
def evaluation(model, device, dataloader):
    """Evaluate `model` and return (summed loss, recall@1000, recall@500, recall@100).

    Every batch of customers is scored against all encoded article indices
    (0 .. vocab_size - 1), so that column j of the score matrix corresponds to
    encoded article index j. The loss targets and the recall checks both use
    encoded article indices, so they only line up with the scores when the
    columns are laid out this way.

    Recall is the fraction of evaluated samples whose true article is among
    the k highest scoring articles. `device` is accepted but not used by this
    function.
    """
    model.eval()
    val_loss = 0
    cutoffs = (1000, 500, 100)
    hits = {k: 0 for k in cutoffs}
    n_evaluated = 0

    # The candidate set is the same for every batch, so build it once.
    all_articles = torch.arange(article_encoder.vocab_size)
    # topk raises if k exceeds the number of candidates.
    max_k = min(max(cutoffs), all_articles.numel())

    with torch.no_grad():
        for batchidx, data in enumerate(dataloader):
            print(str(batchidx) + "/" + str(len(dataloader)))
            customers, articles = data
            customers = customer_encoder.batch_encode(customers)
            articles = article_encoder.batch_encode(articles.tolist())

            # all articles -> to get accurate recall values
            outputs = model(customers, all_articles)

            loss = criterion(outputs, articles, training=False)
            val_loss += loss.item()

            # topk returns indices ordered by descending score, so the first k
            # columns of top_indices are the top-k for every smaller cutoff.
            top_indices = torch.topk(outputs, max_k).indices
            is_hit = top_indices == articles.unsqueeze(1)
            for k in cutoffs:
                hits[k] += int(is_hit[:, :k].any(dim=1).sum())
            n_evaluated += articles.numel()

    # Divide by the samples actually scored: a loader that drops the last
    # partial batch evaluates fewer samples than the dataset holds.
    if n_evaluated == 0:
        return (val_loss, 0.0, 0.0, 0.0)
    return (
        val_loss,
        hits[1000] / n_evaluated,
        hits[500] / n_evaluated,
        hits[100] / n_evaluated,
    )


In [279]:
# model: one embedding row per distinct training customer / article,
# plus one extra row each for the reserved 'unknown' index of the encoders
model = TwoTower(
    n_users=len(train_df['customer_id'].unique()) + 1,
    n_items=len(train_df['article_id'].unique()) + 1,
)

# optimizer
optimizer = optim.Adam(params=model.parameters(), lr=lr)

training starts
0/457
1/457
2/457
3/457
4/457
5/457
6/457
7/457
8/457
9/457
10/457
11/457
12/457
13/457
14/457
15/457
16/457
17/457
18/457
19/457
20/457
21/457
22/457
23/457
24/457
25/457
26/457
27/457
28/457
29/457
30/457
31/457
32/457
33/457
34/457
35/457
36/457
37/457
38/457
39/457
40/457
41/457
42/457
43/457
44/457
45/457
46/457
47/457
48/457
49/457
50/457
51/457
52/457
53/457
54/457
55/457
56/457
57/457
58/457
59/457
60/457
61/457
62/457
63/457
64/457
65/457
66/457
67/457
68/457
69/457
70/457
71/457
72/457
73/457
74/457
75/457
76/457
77/457
78/457
79/457
80/457
81/457
82/457
83/457
84/457
85/457
86/457
87/457
88/457
89/457
90/457
91/457
92/457
93/457
94/457
95/457
96/457
97/457
98/457
99/457
100/457
101/457
102/457
103/457
104/457
105/457
106/457
107/457
108/457
109/457
110/457
111/457
112/457
113/457
114/457
115/457
116/457
117/457
118/457
119/457
120/457
121/457
122/457
123/457
124/457
125/457
126/457
127/457
128/457
129/457
130/457
131/457
132/457
133/457
134/457
135/457
136/45

In [None]:
# Training loop
# Per-epoch history of the losses and of recall@100 / @500 / @1000
train_losses, val_losses = [], []
top_100_recall_lst, top_500_recall_lst, top_1000_recall_lst = [], [], []

epochs = 1
for epoch in range(epochs):

    print(f"Epoch:{epoch}")

    # One pass over the training data, then a full evaluation on the test data
    train_loss = train(model, device, train_loader, optimizer)
    val_loss, top_1000_recall, top_500_recall, top_100_recall = evaluation(model, device, test_loader)

    train_losses.append(train_loss)
    val_losses.append(val_loss)

    top_100_recall_lst.append(top_100_recall)
    top_500_recall_lst.append(top_500_recall)
    top_1000_recall_lst.append(top_1000_recall)
