In [None]:
from google.colab import drive
import shutil
# Attach Google Drive so the project folder under MyDrive becomes readable.
drive.mount('/content/drive')
# Stage the helper module in the working directory (imported later as `preprocessing`).
shutil.copyfile(
    src="/content/drive/MyDrive/RetailRecommender/preprocessing.py",
    dst="preprocessing.py",
)
# Stage the processed retail CSV under sample_data/, where the notebook reads it from.
shutil.copyfile(
    src="/content/drive/MyDrive/RetailRecommender/Datasets/online_retail_processed.csv",
    dst="sample_data/online_retail_processed.csv",
)


In [None]:
# PyG's compiled extensions come from the wheel index built for torch 1.13.0 + CUDA 11.6.
# NOTE(review): that index presumably has to match the torch build of the runtime — confirm.
!pip install pyg-lib torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-1.13.0+cu116.html
# torch-geometric and sentence-transformers are installed without a version pin.
!pip install torch-geometric
!pip install sentence-transformers

In [None]:
# Initialization
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import random
from torch.utils.data import Dataset, DataLoader
import torch_geometric.transforms as T
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
from sentence_transformers import SentenceTransformer

from preprocessing import split_temporal
# Use the first CUDA device when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
  device = "cuda:0"
else:
  device = "cpu"

The Dataset

In [None]:
# Load the processed retail CSV staged under sample_data/ and preview its first rows.
df = pd.read_csv('sample_data/online_retail_processed.csv')
df.head()

In [None]:
# Summary statistics of the loaded frame.
df.describe()

In [None]:
# Load a pretrained Sentence-BERT model and embed the Description of every row in df.
# NOTE(review): `sentence_embeddings` is reassigned from train_df and test_df in later
# cells before it is read, so this full-dataset pass looks redundant — confirm before removing.
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')
sentence_embeddings = sbert_model.encode(df['Description'])

In [None]:
# Split the interactions into train/test frames on the InvoiceDate column.
# split_temporal comes from preprocessing.py; its cut-off rule is not visible here.
train_df, test_df = split_temporal(df, "InvoiceDate")

In [None]:
# Display the training frame produced by the split.
train_df

In [None]:
class RetailData(Dataset):
  """Tensor-backed dataset built from a DataFrame.

  Columns are split by position: every column except the last becomes a feature,
  the last column becomes the target. Both are stored as float32 tensors.
  """

  def __init__(self, df):
    last = df.shape[1] - 1
    feature_block = df.iloc[:, :last].values
    target_column = df.iloc[:, last].values
    self.x_train = torch.tensor(feature_block, dtype=torch.float32)
    self.y_train = torch.tensor(target_column, dtype=torch.float32)

  def __len__(self):
    # One sample per target entry.
    return self.y_train.shape[0]

  def __getitem__(self, idx):
    sample = (self.x_train[idx], self.y_train[idx])
    return sample

In [None]:
# Sanity-check the split across the full, train and test frames:
# row/column counts, number of distinct customers, and the largest customer id.
print(*(frame.shape for frame in (df, train_df, test_df)))
print(*(frame['CustomerID'].unique().shape for frame in (df, train_df, test_df)))
print(*(frame['CustomerID'].max() for frame in (df, train_df, test_df)))

In [None]:
# Wrap the selected training columns in a RetailData dataset. RetailData treats the last
# column as the target, so StockCodeID is the label and the other four columns are features.
# Note: despite its name this object is a Dataset, not a torch DataLoader.
train_loader = RetailData(train_df[["Quantity", "UnitPrice", "CustomerID", "CountryID", "StockCodeID"]])

In [None]:
# Node layout of the bipartite graph: customers take node ids [0, num_users) and items
# follow at [num_users, total_len). This relies on CustomerID / StockCodeID already being
# dense 0-based codes, which is verified on the training edges below.
num_users = len(df['CustomerID'].unique())
total_len = num_users + len(df['StockCodeID'].unique())

# encode() looks its inputs up by position, so pass a plain list rather than a Series
# whose index labels may no longer be 0..n-1 after the split.
sentence_embeddings = sbert_model.encode(train_df['Description'].tolist())

# Node features are just node ids; the model turns them into learned embeddings.
x = torch.arange(total_len)

# Build from NumPy arrays: indexing a pandas Series with an int is label-based, so
# converting Series objects directly can fail or pick the wrong rows.
edge_index = torch.as_tensor(np.stack([
    train_df['CustomerID'].to_numpy(),
    train_df['StockCodeID'].to_numpy() + num_users,
])).long()
assert edge_index.numel() == 0 or (
    0 <= int(edge_index[0].min())
    and int(edge_index[0].max()) < num_users <= int(edge_index[1].min())
    and int(edge_index[1].max()) < total_len
), "CustomerID / StockCodeID must be dense 0-based ids"

# Edge features, one row per interaction: column 0 is the monetary weight
# (UnitPrice * Quantity), the remaining columns are the description embedding.
# The two parts have different widths, so they are stacked column-wise instead of being
# passed to torch.tensor as a ragged list.
edge_attr = torch.as_tensor(
    np.column_stack([
        (train_df['UnitPrice'] * train_df['Quantity']).to_numpy(dtype=np.float32),
        sentence_embeddings,
    ]),
    dtype=torch.float32,
)
# Average over the weight column only; the embedding columns are not weights.
avg_edge_weight = edge_attr[:, 0].mean()
print(x.shape, edge_index.shape, edge_attr.shape, avg_edge_weight)
tr_data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)

In [None]:
# Largest node id referenced by the training edges; it has to stay below total_len,
# the number of node ids in `x`.
edge_index.max()

In [None]:
# encode() looks its inputs up by position, so pass a plain list rather than a Series
# whose index labels may no longer be 0..n-1 after the split.
sentence_embeddings = sbert_model.encode(test_df['Description'].tolist())

# Same node-id space as the training graph: customers first, items offset by num_users.
te_x = torch.arange(total_len)
te_edge_index = torch.as_tensor(np.stack([
    test_df['CustomerID'].to_numpy(),
    test_df['StockCodeID'].to_numpy() + num_users,
])).long()
# One row per test interaction: column 0 is UnitPrice * Quantity, the remaining columns
# are the description embedding. Stacked column-wise because the two parts have
# different widths and cannot be combined through a ragged Python list.
te_edge_attr = torch.as_tensor(
    np.column_stack([
        (test_df['UnitPrice'] * test_df['Quantity']).to_numpy(dtype=np.float32),
        sentence_embeddings,
    ]),
    dtype=torch.float32,
)
te_data = Data(x=te_x, edge_index=te_edge_index, edge_attr=te_edge_attr)

In [None]:
# Edge counts of the training and test graphs, one shape per line.
print(edge_index.shape, te_edge_index.shape, sep="\n")

Customer and Item Nodes

In [None]:
# Distinct customer and item ids over the full dataset, as 1-D tensors.
customer_ids = torch.tensor(pd.unique(df['CustomerID']))
item_ids = torch.tensor(pd.unique(df['StockCodeID']))
print(item_ids.shape, customer_ids.shape)

In [None]:
# `negative_sampling` is taken from the public `torch_geometric.utils` namespace. The
# `torch_geometric.utils.negative_sampling` submodule path is not importable in recent
# PyG releases (and PyG is installed unpinned above), while this form works on old and new.
from torch_geometric.utils import negative_sampling
import torch_geometric as ptg
import torch.nn.functional as F
from torch_geometric.utils import degree

In [None]:
class GraphEmbedder(nn.Module):
  """Learned node-id embeddings refined by a stack of GCN layers.

  Args:
    embed_size: width of the embedding table and of every GCN layer.
    num_layers: number of GCNConv layers applied in sequence.
    num_nodes: number of rows in the embedding table.
    dropout_p: dropout probability applied to the input of every layer.
  """

  def __init__(self, embed_size, num_layers, num_nodes, dropout_p=0.5):
    super().__init__()
    self.embed = nn.Embedding(num_nodes, embed_size)
    self.layers = nn.ModuleList()
    for _ in range(num_layers):
      self.layers.append(ptg.nn.GCNConv(embed_size, embed_size))
    self.dropout_p = dropout_p

  def forward(self, data):
    """Return one embedding row per entry of `data.x` (node ids fed to the table)."""
    hidden = self.embed(data.x)
    final_depth = len(self.layers) - 1
    for depth, conv in enumerate(self.layers):
      regularised = F.dropout(hidden, p=self.dropout_p, training=self.training)
      hidden = conv(regularised, data.edge_index)
      # ReLU sits between layers only; the last layer's output is returned as-is.
      if depth != final_depth:
        hidden = F.relu(hidden)
    return hidden

In [None]:
def recall_k(net, data, k):
  """Mean recall@k of `net`'s item ranking against the edges stored in `data`.

  Every user is scored against every item with the dot product of their embeddings and
  the `k` best-scoring items form that user's recommendation list. A user's recall is
  the share of their distinct items in `data.edge_index` that appear in that list.

  Args:
    net: embedding module; `net(data)` must return one row per node with the user rows
      first (the first `num_users` rows, `num_users` being the notebook-level global).
    data: graph whose `edge_index[0]` holds user ids and whose `edge_index[1]` holds
      item node ids offset by `num_users`.
    k: length of the recommendation list; clamped to the number of items.

  Returns:
    0-dim float tensor with the recall averaged over the users that have at least one
    edge in `data` (0.0 when no user has any).
  """
  was_training = net.training
  net.eval()
  try:
    with torch.no_grad():
      embeds = net(data)
  finally:
    # Restore the caller's mode so evaluating mid-training does not leave dropout off.
    net.train(was_training)

  user_embeds, item_embeds = embeds[:num_users], embeds[num_users:]
  num_items = item_embeds.shape[0]
  # Raw dot products rank items the same way as their log-sigmoid would.
  scores = user_embeds @ item_embeds.T
  if data.edge_index.shape[1] == 0:
    return scores.new_zeros(())

  top_k = torch.topk(scores, min(k, num_items), dim=1).indices
  # recommended[u, i] is True when item i is in user u's top-k list.
  recommended = torch.zeros(num_users, num_items, dtype=torch.bool, device=scores.device)
  rows = torch.arange(num_users, device=scores.device).unsqueeze(1)
  recommended[rows, top_k] = True

  # Ground truth as distinct (user, item) pairs, with item ids shifted back to 0-based so
  # they live in the same id space as the top-k indices.
  pairs = torch.stack([data.edge_index[0], data.edge_index[1] - num_users])
  pairs = torch.unique(pairs, dim=1)
  users, items = pairs[0], pairs[1]

  hits = recommended[users, items].to(scores.dtype)
  hit_counts = torch.bincount(users, weights=hits, minlength=num_users)
  relevant = torch.bincount(users, minlength=num_users)
  # Users without any edge have no defined recall; leave them out of the average.
  active = relevant > 0
  return (hit_counts[active] / relevant[active]).mean()

In [None]:
def _link_loss(encoder, data):
    """Weighted log-sigmoid link loss of `encoder` on the user->item edges of `data`.

    Observed edges are pushed towards high scores, scaled by their edge weight (column 0
    of a 2-D `edge_attr`, or `edge_attr` itself when it is 1-D). Sampled non-edges are
    pushed towards low scores, scaled by the notebook-level `avg_edge_weight`.
    NOTE(review): this presumes non-negative edge weights — a negative weight would
    reverse the direction of its term; confirm against the data.
    """
    embedded = encoder(data)
    user_embeds, item_embeds = embedded[:num_users], embedded[num_users:]
    # Work in bipartite coordinates: users in [0, num_users), items in [0, num_items).
    users, items = data.edge_index[0], data.edge_index[1] - num_users
    # Giving num_nodes as a (src, dst) tuple makes PyG sample user->item pairs only.
    # Sampling on the offset node ids could return item->item or user->user pairs,
    # which are invalid indices into the user/item embedding blocks.
    neg = negative_sampling(torch.stack([users, items]),
                            num_nodes=(num_users, item_embeds.shape[0]))
    pos_scores = (user_embeds[users] * item_embeds[items]).sum(dim=1)
    neg_scores = (user_embeds[neg[0]] * item_embeds[neg[1]]).sum(dim=1)
    weights = data.edge_attr if data.edge_attr.dim() == 1 else data.edge_attr[:, 0]
    pos_term = F.logsigmoid(pos_scores * weights)
    neg_term = F.logsigmoid(-neg_scores * avg_edge_weight)
    # Negative log-likelihood: minimising it raises positive scores and lowers negative ones.
    return -(pos_term.sum() + neg_term.sum())


def train(encoder, optim, epochs, tr_data, te_data, recall_at=150):
    """Train `encoder` on `tr_data`, track test metrics per epoch, and plot the curves.

    Args:
        encoder: embedding module called as `encoder(data)`.
        optim: optimizer over the encoder's parameters.
        epochs: number of full-graph gradient steps.
        tr_data: training graph.
        te_data: held-out graph used for the per-epoch test loss and recall, or None to
            skip evaluation.
        recall_at: list length used for the recall metric.

    Side effects: prints the training loss every 10th epoch and draws a two-panel figure
    (losses on top, recall below). Relies on the notebook-level `num_users`,
    `avg_edge_weight` and `recall_k`.
    """
    tr_losses = []
    te_losses = []
    te_recalls = []
    for i in range(epochs):
        # Evaluation below switches to eval mode, so re-enable dropout every epoch.
        encoder.train()
        optim.zero_grad()
        loss = _link_loss(encoder, tr_data)
        loss.backward()
        optim.step()
        tr_losses.append(loss.item())

        # epoch testing: no dropout and no gradient tracking
        if te_data is not None:
            encoder.eval()
            with torch.no_grad():
                te_losses.append(_link_loss(encoder, te_data).item())
            te_recalls.append(float(recall_k(encoder, te_data, recall_at)))

        if i % 10 == 9:
            print(tr_losses[-1])

    fig, (ax1, ax2) = plt.subplots(2)
    fig.set_figheight(15)
    fig.set_figwidth(15)
    ax1.plot(tr_losses, label="Train Loss", color="#cc6462")
    ax1.plot(te_losses, label="Valid Loss", color="#9e9e9e")
    ax1.set(xlabel="Iterations")
    ax1.set_title("Training and Validation Losses")
    ax1.legend()

    ax2.plot(te_recalls, color="#cc6462")
    ax2.set(xlabel="Iterations")
    ax2.set_title(f"Recall {recall_at}")

In [None]:
# 32-dimensional embeddings, two GCN layers, one embedding row per graph node.
embed_net = GraphEmbedder(embed_size=32, num_layers=2, num_nodes=total_len)
# Adam with PyTorch's default hyper-parameters.
optimizer = torch.optim.Adam(params=embed_net.parameters())

In [None]:
# Train for 200 epochs; te_data supplies the held-out graph that train() uses for the
# per-epoch test loss and recall.
train(embed_net, optimizer, 200, tr_data, te_data)