# Apply Logistic Matrix Factorization (MF) to Amazon book ratings

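Given a dataset of Amazon book ratings, write PyTorch code that implements the following model:
$$\hat{y}_{ij} = \mathrm{sigmoid}(u_i \cdot v_j + b_i + c_j)$$

Here $u_i$ and $v_j$ are the embedding vectors of user $i$ and item $j$, and $b_i$ and $c_j$ are their scalar bias terms.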

The ratings are binary (each rating is either 0 or 1), so the model is trained with log loss, also known as binary cross-entropy.

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [2]:
# Directory (relative to the working directory) that holds the ratings CSV files.
PATH = Path("data/")

In [3]:
# Load the training and validation splits from CSV into pandas DataFrames.
train, valid = (
    pd.read_csv(PATH / csv_name)
    for csv_name in ("train_books_ratings.csv", "valid_books_ratings.csv")
)

In [4]:
# Preview the first rows of the validation ratings.
valid.head()

Unnamed: 0,user,item,rating,timestamp
0,A2E2F4MLVYDGEQ,000100039X,0,1393286400
1,A386A9WE42M4PG,000100039X,0,1371772800
2,A1OGQA984MTKBH,000100039X,0,1372118400
3,A1VVBHGM8DFIZ4,000100039X,0,1387152000
4,AD6E4Y092Y4KP,000100039X,0,1392336000


In [5]:
# Distinct user ids seen in training. np.unique already returns its result
# sorted, so a separate np.sort pass is unnecessary.
train_user_ids = np.unique(train["user"].values)
num_users = len(train_user_ids)
print(num_users)
train_user_ids[:15]

1312778


array(['A000096625CHSNKYTYGZN', 'A00027561NC7JTXEP3EOD',
       'A0002802PGRRB05CR0VT', 'A00031045Q68JAQ1UYT',
       'A00034485ZR6O60DSTB', 'A000474048I5ERWOT4F1',
       'A000546612R3DNRC8556S', 'A00066243R8D11GEHJID0',
       'A00069023W30DWQJNBSPS', 'A00084501WU69W4PMQJWJ',
       'A00085162GMCAJ3DQHUMY', 'A00105581RTVW6FDVGPKJ',
       'A0010876CNE3ILIM9HV0', 'A00109803PZJ91RLT7DPN',
       'A001116435Y409YSMCZKW'], dtype=object)

In [6]:
# Map each raw user id to its position in train_user_ids.
userid2idx = dict(zip(train_user_ids, range(len(train_user_ids))))
# Every training id is a key by construction, so plain indexing is used here.
train["user"] = train["user"].map(lambda raw_id: userid2idx[raw_id])
# Validation ids that never appear in training are encoded as -1.
valid["user"] = valid["user"].map(lambda raw_id: userid2idx.get(raw_id, -1))

In [7]:
# Distinct item ids seen in training. np.unique already returns its result
# sorted, so a separate np.sort pass is unnecessary. Bracket indexing is used
# because `item` is also the name of a pandas method, which makes attribute
# access to this column fragile.
train_item_ids = np.unique(train["item"].values)
num_items = len(train_item_ids)
print(num_items)
train_item_ids[:15]

659279


array(['0000000116', '0000013714', '0000477141', '000100039X',
       '0001053655', '0001203010', '0001360000', '0001473123',
       '0001473905', '0001501232', '000161102X', '0001711296',
       '000171130X', '0001712772', '000171287X'], dtype=object)

In [8]:
# Map each raw item id to its position in train_item_ids.
itemid2idx = dict(zip(train_item_ids, range(len(train_item_ids))))
# Every training id is a key by construction, so plain indexing is used here.
train["item"] = train["item"].map(lambda raw_id: itemid2idx[raw_id])
# Validation ids that never appear in training are encoded as -1.
valid["item"] = valid["item"].map(lambda raw_id: itemid2idx.get(raw_id, -1))

In [9]:
# Keep only validation rows whose user and item both have a non-negative
# index, i.e. drop rows that carry the -1 "unseen in training" marker.
valid = valid[(valid["user"] >= 0) & (valid["item"] >= 0)].copy()

In [10]:
# Preview the training frame now that user and item ids are integer indices.
train.head()

Unnamed: 0,user,item,rating,timestamp
0,527409,0,0,1395619200
1,1059073,1,0,1357516800
2,750064,2,0,1399939200
3,1062362,3,0,1385683200
4,758289,3,0,1391990400


In [11]:
class MF_bias(nn.Module):
    """Logistic matrix factorization with per-user and per-item bias terms.

    For a (user, item) index pair the model outputs
    ``sigmoid(user_emb[u] . item_emb[v] + user_bias[u] + item_bias[v])``,
    i.e. a value in (0, 1) suitable for binary cross-entropy.

    Args:
        num_users: Number of rows in the user embedding and bias tables.
        num_items: Number of rows in the item embedding and bias tables.
        emb_size: Length of each user/item embedding vector.
    """

    def __init__(self, num_users, num_items, emb_size=100):
        super(MF_bias, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.item_bias = nn.Embedding(num_items, 1)
        # Override the default embedding init: small non-negative embedding
        # entries and near-zero biases.
        self.user_emb.weight.data.uniform_(0, 0.05)
        self.item_emb.weight.data.uniform_(0, 0.05)
        self.user_bias.weight.data.uniform_(-0.01, 0.01)
        self.item_bias.weight.data.uniform_(-0.01, 0.01)

    def sigmoid(self, x):
        """Return the logistic function of ``x``.

        Tensors are routed through ``torch.sigmoid`` so the result stays in
        the autograd graph (NumPy cannot operate on tensors that require
        grad). Any other input uses the NumPy formula. ``forward`` does not
        call this helper.
        """
        if torch.is_tensor(x):
            return torch.sigmoid(x)
        return 1/(1+np.exp(-x))

    def forward(self, u, v):
        """Predict the probability of a positive rating for each pair.

        Args:
            u: 1-D LongTensor of user indices.
            v: 1-D LongTensor of item indices, same length as ``u``.

        Returns:
            1-D float tensor with one probability per (u, v) pair.
        """
        U = self.user_emb(u)
        V = self.item_emb(v)
        # Drop only the trailing size-1 bias dimension; a bare squeeze() would
        # also drop the batch dimension when the batch has a single element.
        b_u = self.user_bias(u).squeeze(-1)
        b_v = self.item_bias(v).squeeze(-1)
        return torch.sigmoid((U*V).sum(1) + b_u + b_v)

In [12]:
def train_epocs(model, epochs=10, lr=0.01, wd=0.0):
    """Train ``model`` with full-batch Adam on the global ``train`` frame.

    Each epoch is a single gradient step computed on every training row,
    followed by a validation pass; both losses are printed per epoch.

    Args:
        model: Module called as ``model(users, items)`` that returns
            probabilities accepted by ``F.binary_cross_entropy``.
        epochs: Number of full-batch optimisation steps.
        lr: Adam learning rate.
        wd: Adam weight decay.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    # The training data does not change between epochs, so the tensors are
    # built once instead of on every iteration. Bracket indexing is used
    # because `item` is also the name of a pandas method.
    users = torch.LongTensor(train["user"].values)
    items = torch.LongTensor(train["item"].values)
    ratings = torch.FloatTensor(train["rating"].values)
    for _ in range(epochs):
        # valid_loss() puts the model in eval mode, so re-enable training mode.
        model.train()
        y_hat = model(users, items)
        loss = F.binary_cross_entropy(y_hat, ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        testloss = valid_loss(model)
        print("train loss %.3f valid loss %.3f" % (loss.item(), testloss))

In [13]:
def valid_loss(model):
    """Return the binary cross-entropy of ``model`` on the global ``valid`` frame.

    The model is switched to eval mode and left in that mode on return.

    Args:
        model: Module called as ``model(users, items)`` that returns
            probabilities accepted by ``F.binary_cross_entropy``.

    Returns:
        The validation loss as a Python float.
    """
    model.eval()
    # Bracket indexing is used because `item` is also the name of a pandas method.
    users = torch.LongTensor(valid["user"].values)
    items = torch.LongTensor(valid["item"].values)
    ratings = torch.FloatTensor(valid["rating"].values)
    # Evaluation needs no gradients; no_grad avoids building an autograd
    # graph over the whole validation set.
    with torch.no_grad():
        y_hat = model(users, items)
        loss = F.binary_cross_entropy(y_hat, ratings)
    return loss.item()

In [14]:
# Seed PyTorch's RNG so the random parameter initialisation (and therefore the
# printed losses) is repeatable on a fresh Restart & Run All.
SEED = 0
torch.manual_seed(SEED)
model = MF_bias(num_users, num_items, emb_size=100)

In [15]:
# Run 10 full-batch Adam steps (lr=0.01) with a small weight decay of 1e-5.
train_epocs(model, epochs=10, lr=0.01, wd=1e-5)



train loss 0.694 valid loss 0.689
train loss 0.685 valid loss 0.684
train loss 0.677 valid loss 0.680
train loss 0.671 valid loss 0.676
train loss 0.666 valid loss 0.673
train loss 0.661 valid loss 0.670
train loss 0.658 valid loss 0.668
train loss 0.655 valid loss 0.665
train loss 0.653 valid loss 0.663
train loss 0.651 valid loss 0.660
