# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm.auto import tqdm
from scipy import sparse
import torch

import warnings
warnings.filterwarnings('ignore')

# Data

## Load data

In [3]:
# Relative path to the final dataset folder: one level up, then data/final_dataset.
_DATA_DIR_PARTS = ("..", "data", "final_dataset")
DATA_DIR = os.path.join(*_DATA_DIR_PARTS)

In [5]:
# Parquet tables: ratings, user features and book features (read in that order).
interactions, users, books = (
    pd.read_parquet(os.path.join(DATA_DIR, parquet_name))
    for parquet_name in (
        "ratings_final.parquet",
        "users_final.parquet",
        "books_final.parquet",
    )
)
# CSV lookup tables for the user and book id mappings.
user_id_map_df, book_id_map_df = (
    pd.read_csv(os.path.join(DATA_DIR, csv_name))
    for csv_name in ("user_id_map.csv", "book_id_map.csv")
)

# One summary line per statistic, emitted in a single print call.
print(
    f"Number of ratings: {len(interactions)}",
    f"Number of unique users: {interactions['user_id'].nunique()}",
    f"Number of books: {interactions['book_id'].nunique()}",
    f"Shape of interactions: {interactions.shape}",
    f"Shape of books: {books.shape}",
    f"Shape of users: {users.shape}",
    sep="\n",
)

Number of ratings: 104756
Number of unique users: 31940
Number of books: 22020
Shape of interactions: (104756, 3)
Shape of books: (22058, 13)
Shape of users: (31940, 35)


In [6]:
# M = number of distinct users, N = number of distinct books in the ratings table.
M, N = (interactions[column].nunique() for column in ("user_id", "book_id"))

print(f"There are {M} unique users and {N} unique books in this data set")

There are 31940 unique users and 22020 unique books in this data set


## Split

In [7]:
def split_dataframe(df, holdout_fraction=0.1, random_state=None):
  """Splits a DataFrame into disjoint training and test sets.

  Rows are selected by position rather than by index label, so the two parts
  always partition `df`, even when its index contains duplicate labels.

  Args:
    df: a dataframe.
    holdout_fraction: fraction of dataframe rows to use in the test set.
    random_state: optional seed (or numpy random state) forwarded to the
      pandas sampler; pass a fixed value to make the split reproducible.
  Returns:
    train: dataframe for training (the remaining rows, in original order)
    test: dataframe for testing (the sampled rows, in sampled order)
  """
  # Sample row positions, not labels: excluding by label would also drop every
  # un-sampled row that happens to share an index label with a sampled one.
  positions = pd.Series(np.arange(len(df)))
  test_positions = positions.sample(
      frac=holdout_fraction, replace=False, random_state=random_state
  ).to_numpy()

  in_test = np.zeros(len(df), dtype=bool)
  in_test[test_positions] = True

  test = df.iloc[test_positions]
  train = df.iloc[~in_test]
  return train, test

# Hold out a random fraction of the ratings (split_dataframe's default) as the test set.
train_df, test_df = split_dataframe(df=interactions)

# Theory

Using neural networks instead of matrix factorization (MF) for recommendation systems has a number of advantages; see the notes section for details. Here, we describe the model architecture that will be used for the recommendation system.

The model will consist of two steps:

## The Embedding Layer

This layer takes the user and item IDs in one-hot encoded form, along with any other user and item features, and passes them through a fully connected layer. The output of this layer is the latent representation of the user and of the item. Let us denote the user input by $\mathbf{u}$ and the item input by $\mathbf{v}$. Their dimensions are $m+m_{uf}$ and $n+n_{if}$ respectively, where $m$ and $n$ are the number of users and items, and $m_{uf}$ and $n_{if}$ are the number of user and item features. Reusing the same symbols for the latent vectors, the output of the embedding layer is $\mathbf{u} \in \mathbb{R}^d$ and $\mathbf{v} \in \mathbb{R}^d$, where $d$ is the dimension of the latent space.

We will have two separate embedding layers, one for users and the other for items. This is required because the numbers of users and items differ and we want to learn different embeddings for them. The user embedding layer will have $m+m_{uf}$ input neurons and the item embedding layer will have $n+n_{if}$ input neurons.

## CF Layers

CF (collaborative filtering) layers consist of one or more fully connected layers. The input to these layers is the concatenation of the user and item latent representations, and their output is the predicted rating.

![](images/dl_01.png)

## Loss Function

The loss function can either be MSE or cross entropy. We will be experimenting with both.

## Data

We will be using all the positive data and a random sample of the negative data. The ratio of positive to negative data will be decided by a parameter.

# Model

In [None]:
class BookDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding one (user, book, rating) interaction per item.

    Args:
        df: interactions dataframe with the columns ``user_id``, ``book_id``
            and ``provided_rating``.
    """

    def __init__(self, df):
        self.df = df

    def __len__(self):
        """Returns the number of interaction rows."""
        return len(self.df)

    def __getitem__(self, idx):
        """Returns the interaction at position ``idx`` as a dict of scalar tensors.

        Returns:
            dict with ``user_id`` and ``book_id`` as long tensors and ``rating``
            as a float tensor.
        """
        row = self.df.iloc[idx]
        user_id = row["user_id"]
        # The ratings frame stores the book identifier under "book_id"; it has
        # no "isbn" column, so that key raised KeyError on every lookup.
        book_id = row["book_id"]
        rating = row["provided_rating"]
        return {
            "user_id": torch.tensor(user_id, dtype=torch.long),
            "book_id": torch.tensor(book_id, dtype=torch.long),
            "rating": torch.tensor(rating, dtype=torch.float),
        }

In [51]:
# Preview the first five rows of the training split.
train_df.head(n=5)

Unnamed: 0,user_id,book_id,provided_rating
0,0,0,6
1,0,1,5
3,2,3,7
4,3,4,8
5,4,5,8
