# Part 2 Recommender System

## Dependencies

In [1]:
# basic packages
import numpy as np
import pandas as pd
import os
from tqdm import tqdm as progress_bar

In [2]:
# traditional recommender systems packages
import surprise
from surprise import model_selection

In [3]:
# for neural nets
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

In [4]:
torch.set_float32_matmul_precision('medium')  # trade float32 matmul precision for performance

In [5]:
# Seed both random number generators the notebook relies on so that a fresh
# "Restart & Run All" is repeatable: NumPy's global generator and PyTorch's
# default generator (the latter initialises the NCF weights).
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

## Helper Functions

In [6]:
def MSE(y_pred, y_test):
    """
    Mean squared error between predictions and ground truth.

    Input:
        y_pred (array-like): predicted ratings
        y_test (array-like): true ratings, broadcastable against y_pred

    Output:
        float: mean of the squared element-wise differences
    """
    # np.asarray lets plain Python lists work as well (list - list raises
    # TypeError); float dtype avoids wrap-around on unsigned integer input.
    y_pred = np.asarray(y_pred, dtype=float)
    y_test = np.asarray(y_test, dtype=float)
    return np.mean((y_pred - y_test) ** 2)

In [7]:
def accuracy(y_pred, y_test):
    """
    Share of predictions that equal the true rating once rounded.

    Input:
        y_pred: predicted ratings (rounded with np.round before comparing)
        y_test: true ratings

    Output:
        float: mean of the element-wise equality between the two
    """
    rounded = np.round(y_pred)
    hits = rounded == y_test
    return np.mean(hits)

In [8]:
def Jaccard(s1, s2):
    """
    Jaccard similarity of two sets: |s1 & s2| / |s1 | s2|.

    Returns 0 when both sets are empty (the union has no elements).
    """
    union_size = len(s1.union(s2))
    if not union_size:
        return 0
    shared = len(s1.intersection(s2))
    return shared / union_size

## Load Data

In [9]:
# Load the ratings file into a Surprise Dataset.
# Each line of data/rec.tsv is parsed as tab-separated "user item rating" fields.
filepath = os.path.join('data', 'rec.tsv')
reader = surprise.Reader(line_format='user item rating', sep='\t')
data = surprise.Dataset.load_from_file(filepath, reader=reader)

In [10]:
# Hold out 25% of the ratings as the test set.
trainset, testset = model_selection.train_test_split(data, test_size=.25)
# NOTE: this split ignores review time, so the model may be trained on ratings
# made after the ones it is tested on (temporal data leakage). A leave-one-out
# test set built from each user's latest rating would be a sounder evaluation.

Notice that most users and items in our dataset are associated with only a very small number of reviews:

In [11]:
# Average number of training ratings per user
user_length = [len(ratings) for ratings in trainset.ur.values()]
np.average(user_length)

1.3543476368547949

In [12]:
# Average number of training ratings per item
item_length = [len(ratings) for ratings in trainset.ir.values()]
np.average(item_length)

14.155460029797524

## Problem Statement

Given a user-item pair, predict the rating the user would give the item.

**Evaluation**: Mean Squared Error and rounded accuracy (the share of predictions that equal the true rating after rounding) on the test set.

### Baseline Model: Similarity-Based Rating Estimation

Given a (user, item) pair, we consider all the other items the user has rated. The prediction is the target item's average rating plus a similarity-weighted average of how far the user's ratings on those other items deviate from each item's own average rating.

In [13]:
# prepare item average dictionary
# Mean training rating of every item, keyed by the item's inner id
item_average = {
    item: np.mean([rating for _, rating in reviews])
    for item, reviews in trainset.ir.items()
}

In [14]:
def predict_ratings(user, item):
    """
    Predict a rating from the user's other ratings, weighted by item similarity.

    The prediction is the target item's mean training rating plus the
    similarity-weighted average of how far the user's other ratings sit from
    those items' means. Similarity between two items is the Jaccard index of
    the sets of users who rated them in the training set.

    Input:
        user: raw user id
        item: raw item id

    Output:
        float: predicted rating; the training-set global mean when the user or
        item is missing from the training set, or when none of the user's other
        items has a positive similarity to the target item
    """
    try:
        # from raw id to inner id
        user_iid = trainset.to_inner_uid(user)
        item_iid = trainset.to_inner_iid(item)
    except ValueError:
        # Surprise raises ValueError for ids absent from the training set.
        # Catching only that keeps Ctrl-C and genuine bugs visible instead of
        # silently turning them into a global-mean prediction.
        return trainset.global_mean

    # Loop-invariant: the users who rated the target item (built once, not per review)
    users_this_item = {uid for uid, _ in trainset.ir[item_iid]}

    ratings = []
    similarities = []
    for another_item, rating in trainset.ur[user_iid]:
        if another_item == item_iid:
            continue  # skip the target item itself

        # how far the user's rating sits from that item's average
        ratings.append(rating - item_average[another_item])

        users_another_item = {uid for uid, _ in trainset.ir[another_item]}
        similarities.append(Jaccard(users_this_item, users_another_item))

    total_similarity = sum(similarities)
    if total_similarity > 0:
        weighted_sum = sum(r * s for r, s in zip(ratings, similarities))
        return item_average[item_iid] + weighted_sum / total_similarity
    return trainset.global_mean

In [15]:
# Score every held-out (user, item) pair with the similarity baseline
y_pred = np.array([
    predict_ratings(user, item)
    for user, item, _ in progress_bar(testset)
])
y_test = np.array([rating for _, _, rating in testset])

100%|████████████████████████████████████████████████████████████████████████| 772756/772756 [02:48<00:00, 4596.66it/s]


In [16]:
# Test-set mean squared error of the similarity baseline
MSE(y_pred, y_test)

1.9474959814844819

In [17]:
# Test-set rounded accuracy of the similarity baseline
accuracy(y_pred, y_test)

0.21011289462650565

### Latent Factor Model
Based on the idea that k latent factors can represent different characteristics of users and items.

In [18]:
# Surprise SVD latent factor model with 128 factors
model = surprise.SVD(n_factors=128)

In [19]:
%%time
# Fit on the training split, then predict every (user, item) pair of the test split
model.fit(trainset)
predictions = model.test(testset)

CPU times: total: 36.5 s
Wall time: 36.2 s


In [20]:
# Pull the estimated rating out of each Surprise prediction
# (y_test is reused from the baseline evaluation above)
y_pred = np.array([prediction.est for prediction in predictions])

In [21]:
# Test-set mean squared error of the SVD model
MSE(y_pred, y_test)

1.6888076658234337

In [22]:
# Test-set rounded accuracy of the SVD model
accuracy(y_pred, y_test)

0.2831695904011098

In [None]:
# Drop the Surprise objects; the neural-network section below rebinds
# `data`, `trainset` and `testset` to pandas DataFrames.
del data, trainset, testset, item_average

### Neural Network: *Neural Collaborative Filtering (NCF)*
Reference: https://arxiv.org/abs/1708.05031

Idea: Feed the network the encoded user and item directly, and let it learn how to predict the rating by itself.

#### Reconstruct the Dataset

In [26]:
# Re-read the same ratings file as a plain DataFrame and name the first three columns
filepath = os.path.join('data', 'rec.tsv')
data = (
    pd.read_csv(filepath, sep='\t', header=None)
    .rename(columns={0: 'user_id', 1: 'item_id', 2: 'rating'})
)

In [27]:
# Replace the raw ids with consecutive integer codes (0, 1, 2, ...) that can
# index an embedding table; the *_mapper objects hold the unique raw ids, so
# mapper[code] recovers the original id.
data['user_id'], user_id_mapper = pd.factorize(data['user_id'])
data['item_id'], item_id_mapper = pd.factorize(data['item_id'])

In [28]:
# Shuffle once with a fixed seed so the split is identical on every run, and
# keep the shuffled copy under its own name so that re-running this cell does
# not reshuffle an already shuffled frame (`data` itself is left untouched).
data_shuffled = data.sample(frac=1, random_state=42)

# 95% / 5% train-test split; the boundary is computed once so both slices agree
split_at = round(len(data_shuffled) * 0.95)
trainset = data_shuffled.iloc[:split_at]
testset = data_shuffled.iloc[split_at:]

In [29]:
class Train_dataset(torch.utils.data.Dataset):
    """
    Dataset for training

    Holds three aligned 1-D tensors (users, items, ratings) and serves one
    (user, item, rating) triple per index.

    Input:
        data (pd.DataFrame): DataFrame containing the product ratings, with
            'user_id', 'item_id' and 'rating' columns
    """

    def __init__(self, data):
        self.users, self.items, self.ratings = self.get_dataset(data)

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.ratings[idx]

    def get_dataset(self, data):
        """
        Convert the three rating columns into tensors, in row order.

        Each column goes through `.to_numpy()` first so torch receives a plain
        array instead of a pandas Series: this matches the evaluation code and
        does not depend on how torch interprets a Series whose index has been
        shuffled.
        """
        users = torch.tensor(data['user_id'].to_numpy())
        items = torch.tensor(data['item_id'].to_numpy())
        ratings = torch.tensor(data['rating'].to_numpy())
        return users, items, ratings

#### The Model

In [30]:
class NCF(pl.LightningModule):
    """
    Neural Collaborative Filtering

    A user embedding and an item embedding are concatenated and passed through
    two ReLU-activated fully connected layers and a linear output layer that
    produces one predicted rating per (user, item) pair.

    Input:
        num_users (int): number of unique users
        num_items (int): number of unique items
        data (pd.DataFrame): DataFrame containing the product ratings
        embedding_dim (int): size of each user / item embedding vector
        batch_size (int): number of ratings per training batch
    """

    def __init__(self, num_users, num_items, data, embedding_dim=128, batch_size=256):
        super().__init__()
        self.user_embedding = torch.nn.Embedding(num_embeddings=num_users, embedding_dim=embedding_dim)
        self.item_embedding = torch.nn.Embedding(num_embeddings=num_items, embedding_dim=embedding_dim)
        # fc1 consumes the user and item embeddings concatenated
        self.fc1 = torch.nn.Linear(in_features=2 * embedding_dim, out_features=512)
        self.fc2 = torch.nn.Linear(in_features=512, out_features=64)
        self.output = torch.nn.Linear(in_features=64, out_features=1)
        self.data = data
        self.batch_size = batch_size

    def forward(self, user_input, item_input):
        """Return predicted ratings with a trailing dimension of size 1."""
        # pass through embedding layers
        user_embedded = self.user_embedding(user_input)
        item_embedded = self.item_embedding(item_input)

        # concat the two embeddings along the feature dimension
        vector = torch.cat([user_embedded, item_embedded], dim=-1)

        # fully connected layers (functional ReLU: no module built per call)
        vector = torch.nn.functional.relu(self.fc1(vector))
        vector = torch.nn.functional.relu(self.fc2(vector))

        # output layer
        return self.output(vector)

    def training_step(self, batch, batch_index):
        """Mean squared error between predicted and true ratings for one batch."""
        user_input, item_input, labels = batch
        predicted_labels = self(user_input, item_input)
        # reshape labels to (batch, 1) floats to match the model output
        return torch.nn.functional.mse_loss(predicted_labels, labels.view(-1, 1).float())

    def configure_optimizers(self):
        """Adam with its default hyper-parameters."""
        return torch.optim.Adam(self.parameters())

    def train_dataloader(self):
        """Batches over the training DataFrame, reshuffled every epoch."""
        # shuffle=True so the batches are not presented in the same order each epoch
        return DataLoader(Train_dataset(self.data), batch_size=self.batch_size, shuffle=True)

In [31]:
# Ids are consecutive codes starting at 0, so the embedding tables need max + 1 rows
num_users, num_items = (data[column].max() + 1 for column in ('user_id', 'item_id'))

model = NCF(num_users, num_items, trainset)

#### Train the model

In [32]:
%%time
# Train for 5 epochs with logging disabled; the model supplies its own
# batches through train_dataloader()
trainer = pl.Trainer(max_epochs=5, logger=False)
trainer.fit(model)

  rank_zero_deprecation(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type      | Params
---------------------------------------------
0 | user_embedding | Embedding | 275 M 
1 | item_embedding | Embedding | 23.8 M
2 | fc1            | Linear    | 131 K 
3 | fc2            | Linear    | 32.8 K
4 | output         | Linear    | 65    
---------------------------------------------
299 M     Trainable params
0         Non-trainable params
299 M     Total params
1,197.994 Total estimated model params size (MB)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


CPU times: total: 1h 35min 35s
Wall time: 1h 35min 40s


#### Evaluate the result

In [33]:
# Inference only: switch to eval mode and disable autograd so that no
# computation graph is built for the whole test set.
model.eval()
with torch.no_grad():
    # create the inputs on whatever device the model currently lives on
    test_user = torch.tensor(testset['user_id'].to_numpy(), device=model.device)
    test_item = torch.tensor(testset['item_id'].to_numpy(), device=model.device)
    # call the module (not .forward) and bring the result back to the CPU for NumPy
    y_pred = model(test_user, test_item).cpu().numpy().flatten()
y_test = testset['rating'].to_numpy()

In [34]:
# Test-set mean squared error of the NCF model
MSE(y_pred, y_test)

0.7011991391584224

In [35]:
# Test-set rounded accuracy of the NCF model
accuracy(y_pred, y_test)

0.5145615363213437