# Recommendation based on past purchases

Import the neccessary libraries.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import io
import os
import math
import copy
import pickle
import zipfile
from textwrap import wrap
from pathlib import Path
from itertools import zip_longest
from collections import defaultdict
from urllib.error import URLError
from urllib.request import urlopen

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch import optim
from torch.nn import functional as F 
from torch.optim.lr_scheduler import _LRScheduler

# Get Datasets

We get the two datasets, which we created from the Data Preparation code. 

In [3]:
data_rating = pd.read_csv("user_rating.csv")

In [4]:
def create_dataset(ratings, top=None):
    if top is not None:
        ratings.groupby('UserID')['Rating'].count()
    
    unique_users = ratings.UserID.unique()
    user_to_index = {old: new for new, old in enumerate(unique_users)}
    new_users = ratings.UserID.map(user_to_index)
    
    unique_items = ratings.ItemID.unique()
    item_to_index = {old: new for new, old in enumerate(unique_items)}
    new_items = ratings.ItemID.map(item_to_index)
    
    unique_postcode = ratings.Postcode.unique()
    postcode_to_index = {old: new for new, old in enumerate(unique_postcode)}
    new_postcode = ratings.Postcode.map(postcode_to_index)

    n_users = unique_users.shape[0]
    n_items = unique_items.shape[0]
    n_postcode = unique_postcode.shape[0]
    
    X = pd.DataFrame({'UserID': new_users, 'ItemID': new_items, 'Postcode': new_postcode})
    y = ratings['Rating'].astype(np.float32)
    return (n_users, n_items, n_postcode), (X, y), (user_to_index, item_to_index, postcode_to_index)

In [5]:
(n, m, l), (X, y), _ = create_dataset(data_rating)
print(f'Embeddings: {n} users, {m} items, {l} postcode')
print(f'Dataset shape: {X.shape}')
print(f'Target shape: {y.shape}')

Embeddings: 500 users, 500 items, 10 postcode
Dataset shape: (125000, 3)
Target shape: (125000,)


In [6]:
class ReviewsIterator:
    
    def __init__(self, X, y, batch_size=32, shuffle=True):
        X, y = np.asarray(X), np.asarray(y)
        
        if shuffle:
            index = np.random.permutation(X.shape[0])
            X, y = X[index], y[index]
            
        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.n_batches = int(math.ceil(X.shape[0] // batch_size))
        self._current = 0
        
    def __iter__(self):
        return self
    
    def __next__(self):
        return self.next()
    
    def next(self):
        if self._current >= self.n_batches:
            raise StopIteration()
        k = self._current
        self._current += 1
        bs = self.batch_size
        return self.X[k*bs:(k + 1)*bs], self.y[k*bs:(k + 1)*bs]

In [7]:
def batches(X, y, bs=32, shuffle=True):
    for xb, yb in ReviewsIterator(X, y, bs, shuffle):
        xb = torch.LongTensor(xb)
        yb = torch.FloatTensor(yb)
        yield xb, yb.view(-1, 1)

In [8]:
for x_batch, y_batch in batches(X, y, bs=4):
    print(x_batch)
    print(y_batch)
    break

tensor([[141, 189,   3],
        [328, 363,   9],
        [ 22,  30,   5],
        [159,  93,   7]])
tensor([[0.],
        [4.],
        [4.],
        [7.]])


# Training Data

In [9]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=1)
datasets = {'train': (X_train, y_train), 'val': (X_valid, y_valid)}
dataset_sizes = {'train': len(X_train), 'val': len(X_valid)}

In [10]:
minmax = [data_rating.Rating.min(), data_rating.Rating.max()]
minmax = torch.Tensor(minmax)

In [11]:
class EmbeddingNet(nn.Module):
    """
    Creates a dense network with embedding layers.
    
    Args:
    
        n_users:            
            Number of unique users in the dataset.

        n_items: 
            Number of unique items in the dataset.
        
        n_postcodes: 
            Number of unique postcodes in the dataset.

        n_factors: 
            Number of columns in the embeddings matrix.

        embedding_dropout: 
            Dropout rate to apply right after embeddings layer.

        hidden:
            A single integer or a list of integers defining the number of 
            units in hidden layer(s).

        dropouts: 
            A single integer or a list of integers defining the dropout 
            layers rates applyied right after each of hidden layers.
            
    """
    def __init__(self, n_users, n_items, n_postcodes,
                 n_factors=50, embedding_dropout=0.02, 
                 hidden=10, dropouts=0.2):
        
        super().__init__()
        hidden = get_list(hidden)
        dropouts = get_list(dropouts)
        n_last = hidden[-1]
        
        def gen_layers(n_in):
            """
            A generator that yields a sequence of hidden layers and 
            their activations/dropouts.
            
            Note that the function captures `hidden` and `dropouts` 
            values from the outer scope.
            """
            nonlocal hidden, dropouts
            assert len(dropouts) <= len(hidden)
            
            for n_out, rate in zip_longest(hidden, dropouts):
                yield nn.Linear(n_in, n_out)
                yield nn.ReLU()
                if rate is not None and rate > 0.:
                    yield nn.Dropout(rate)
                n_in = n_out
            
        self.u = nn.Embedding(n_users, n_factors)
        self.m = nn.Embedding(n_items, n_factors)
        self.p = nn.Embedding(n_postcodes, n_factors)
        self.drop = nn.Dropout(embedding_dropout)
        self.hidden = nn.Sequential(*list(gen_layers(n_factors * 3)))
        self.fc = nn.Linear(n_last, 1)
        self._init()
        
    def forward(self, users, items, postcodes, minmax=None):
        features = torch.cat([self.u(users), self.m(items), self.p(postcodes)], dim=1)
        x = self.drop(features)
        x = self.hidden(x)
        out = torch.sigmoid(self.fc(x))
        if minmax is not None:
            min_rating, max_rating = minmax
            out = out*(max_rating - min_rating + 1) + min_rating - 0.5
        return out
    
    def _init(self):
        """
        Setup embeddings and hidden layers with reasonable initial values.
        """
        
        def init(m):
            if type(m) == nn.Linear:
                torch.nn.init.xavier_uniform_(m.weight)
                m.bias.data.fill_(0.01)
                
        self.u.weight.data.uniform_(-0.05, 0.05)
        self.m.weight.data.uniform_(-0.05, 0.05)
        self.p.weight.data.uniform_(-0.05, 0.05)
        self.hidden.apply(init)
        init(self.fc)
    
    
def get_list(n):
    if isinstance(n, (int, float)):
        return [n]
    elif hasattr(n, '__iter__'):
        return list(n)
    raise TypeError('layers configuraiton should be a single number or a list of numbers')

In [12]:
net = EmbeddingNet(
    n_users=n, n_items=m, n_postcodes= l,
    n_factors=150, hidden=[500, 500, 500], 
    embedding_dropout=0.05, dropouts=[0.5, 0.5, 0.25])

In [13]:
lr = 1e-3
wd = 1e-5
bs = 2000
n_epochs = 100
patience = 10
no_improvements = 0
best_loss = np.inf
best_weights = None
history = []
lr_history = []

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

net.to(device)
criterion = nn.MSELoss(reduction='sum')
optimizer = optim.Adam(net.parameters(), lr=lr, weight_decay=wd)
iterations_per_epoch = int(math.ceil(dataset_sizes['train'] // bs))

for epoch in range(n_epochs):
    stats = {'epoch': epoch + 1, 'total': n_epochs}
    
    for phase in ('train', 'val'):
        training = phase == 'train'
        running_loss = 0.0
        n_batches = 0
        
        for batch in batches(*datasets[phase], shuffle=training, bs=bs):
            x_batch, y_batch = [b.to(device) for b in batch]
            optimizer.zero_grad()
        
            # compute gradients only during 'train' phase
            with torch.set_grad_enabled(training):
                outputs = net(x_batch[:, 0], x_batch[:, 1], x_batch[:, 2], minmax)
                loss = criterion(outputs, y_batch)
                
                # don't update weights and rates when in 'val' phase
                if training:
                    loss.backward()
                    optimizer.step()
                    
            running_loss += loss.item()
            
        epoch_loss = running_loss / dataset_sizes[phase]
        stats[phase] = epoch_loss
        
        # early stopping: save weights of the best model so far
        if phase == 'val':
            if epoch_loss < best_loss:
                print('loss improvement on epoch: %d' % (epoch + 1))
                best_loss = epoch_loss
                best_weights = copy.deepcopy(net.state_dict())
                no_improvements = 0
            else:
                no_improvements += 1
                
    history.append(stats)
    print('[{epoch:03d}/{total:03d}] train: {train:.4f} - val: {val:.4f}'.format(**stats))
    if no_improvements >= patience:
        print('early stopping after epoch {epoch:03d}'.format(**stats))
        break

loss improvement on epoch: 1
[001/100] train: 9.4976 - val: 8.8756
[002/100] train: 9.1839 - val: 8.8919
[003/100] train: 9.1447 - val: 8.9033
[004/100] train: 9.1106 - val: 8.9071
[005/100] train: 9.0721 - val: 8.9619
[006/100] train: 9.0062 - val: 9.0019
[007/100] train: 8.8793 - val: 9.1124
[008/100] train: 8.7216 - val: 9.2520
[009/100] train: 8.5027 - val: 9.4709
[010/100] train: 8.2367 - val: 9.6135
[011/100] train: 7.9876 - val: 9.8968
early stopping after epoch 011


In [14]:
net.load_state_dict(best_weights)

<All keys matched successfully>

In [15]:
groud_truth, predictions = [], []

with torch.no_grad():
    for batch in batches(*datasets['val'], shuffle=False, bs=bs):
        x_batch, y_batch = [b.to(device) for b in batch]
        outputs = net(x_batch[:, 0], x_batch[:, 1], x_batch[:, 2], minmax)
        groud_truth.extend(y_batch.tolist())
        predictions.extend(outputs.tolist())

groud_truth = np.asarray(groud_truth).ravel()
predictions = np.asarray(predictions).ravel()

In [16]:
final_loss = np.sqrt(np.mean((predictions - groud_truth)**2))
print(f'Final RMSE: {final_loss:.4f}')

Final RMSE: 3.0373


In [17]:
with open('best.weights', 'wb') as file:
    pickle.dump(best_weights, file)

# Make Predictions

We are going to predict the preferences of User 1 for all items. Our input will be User 1 (list of 1s), list of items and list of items' corresponding postcode. For each element in the output tensor, the higher the value, the higher the preference value for its corresponding item.  

In [50]:
net.eval()
user = torch.tensor([1] * 499)
data_postcode = pd.read_csv("list_postcode.csv")

item_list = []
postcode_list = []

for data_val in data_postcode.values.tolist():
    item_list.append(data_val[0])
    postcode_list.append(data_val[1])

item_list.pop()
item = torch.tensor(item_list)

unique_postcode = set(postcode_list)
postcode_to_index = {old: new for new, old in enumerate(unique_postcode)}
new_postcode = list(map(lambda x: postcode_to_index[x],postcode_list))

new_postcode.pop()
postcode = torch.tensor(new_postcode)

with torch.no_grad():
    output = net(user,item, postcode)

We have the output now. We create a dict where we map each items to its output score.

In [51]:
output_list = list(output.numpy().flatten())
item_rating = {}

for i in range(0,499):
    item_rating[i+1] = output_list[i]

Finally, we sorted the dict by its output score in reverse order. In the user preference list, the first item has the highest preferences from User 1 and the last item has the lowest preferences from User 1

In [55]:
user_preference = sorted(item_rating.items(), key=lambda kv: kv[1], reverse=True)
print(user_preference)

[(343, 0.32295188), (360, 0.32132372), (70, 0.31864417), (20, 0.31855217), (257, 0.31781465), (440, 0.3174612), (218, 0.317338), (396, 0.31694615), (342, 0.31688058), (38, 0.3166604), (479, 0.31660828), (486, 0.3164373), (438, 0.31621048), (25, 0.315508), (77, 0.31547478), (88, 0.31544355), (239, 0.3152673), (482, 0.31512365), (80, 0.31492028), (96, 0.31489322), (264, 0.31437412), (329, 0.31418476), (307, 0.31383252), (460, 0.31366792), (397, 0.3135863), (222, 0.3131963), (421, 0.3131191), (292, 0.3130056), (164, 0.31283438), (318, 0.3127181), (336, 0.31267098), (17, 0.31266433), (451, 0.3126604), (66, 0.31258672), (214, 0.3125857), (395, 0.3125601), (308, 0.3124216), (224, 0.31221506), (226, 0.312189), (399, 0.3121024), (155, 0.31205645), (170, 0.31194192), (271, 0.3117826), (361, 0.3117192), (386, 0.31161782), (245, 0.31155628), (365, 0.3113514), (499, 0.3112461), (148, 0.31122497), (89, 0.3111962), (141, 0.31117266), (207, 0.31116918), (75, 0.31114137), (115, 0.31113774), (401, 0.31