In [1]:
from collections import defaultdict
import csv
import scipy
import scipy.optimize
import random
import numpy as np
import time

import torch
from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable

import pandas as pd 
from IPython.display import display

In [2]:
class MF(nn.Module):
    """Biased matrix-factorisation rating model.

    The prediction for one (item, user) row is::

        bias + bias_user[user] + bias_item[item] + dot(user[user], item[item])

    where ``bias`` is a single learned scalar, the per-user and per-item
    biases are 1-dimensional embeddings and the latent factors are
    ``k``-dimensional embeddings.

    Args:
        n_user: number of rows of the user embedding tables; user ids passed
            to ``forward`` index into these tables.
        n_item: number of rows of the item embedding tables; item ids passed
            to ``forward`` index into these tables.
        k: dimensionality of the latent factor vectors.
    """

    def __init__(self, n_user, n_item, k=1):
        super(MF, self).__init__()
        self.k = k
        self.n_user = n_user
        self.n_item = n_item

        # gammas (users and items): latent factor vectors
        self.user = nn.Embedding(n_user, k)
        self.item = nn.Embedding(n_item, k)

        # alpha and betas (users and items): scalar offsets per user / item
        self.bias_user = nn.Embedding(n_user, 1)
        self.bias_item = nn.Embedding(n_item, 1)

        # Global offset shared by every prediction.
        # self.theta = nn.Parameter(torch.ones(1))
        self.bias = nn.Parameter(torch.ones(1))

    def forward(self, train_x):
        """Predict one rating per row of ``train_x``.

        Args:
            train_x: 2-D integer tensor of ids. Column 0 is read as the item
                id and column 1 as the user id; further columns are ignored.

        Returns:
            1-D tensor holding one prediction per input row.
        """
        item_id = train_x[:, 0]
        user_id = train_x[:, 1]
        # freq = train_x[:, 2].float()
        vector_user = self.user(user_id)
        vector_item = self.item(item_id)

        # Pull out biases. Only the trailing embedding dimension (size 1) is
        # removed: a bare squeeze() would also drop the batch dimension when
        # the batch holds a single row.
        bias_user = self.bias_user(user_id).squeeze(-1)
        bias_item = self.bias_item(item_id).squeeze(-1)

        # biases = (self.bias + self.theta*freq + bias_user + bias_item)
        biases = (self.bias + bias_user + bias_item)

        # Row-wise dot product of the user and item factor vectors
        ui_interaction = torch.sum(vector_user * vector_item, dim=1)

        # Add bias prediction to the interaction prediction
        prediction = ui_interaction + biases
        return prediction

    def loss(self, prediction, target):
        """Mean squared error between ``prediction`` and ``target``.

        ``target`` is reshaped to the shape of ``prediction`` so that an
        ``(N, 1)`` target column lines up with the ``(N,)`` predictions for
        every N, including N == 1 (where a bare squeeze() would yield a 0-d
        tensor and trigger a size-mismatch broadcast inside ``mse_loss``).

        Raises:
            RuntimeError: if ``target`` does not hold exactly as many
                elements as ``prediction``.
        """
        return F.mse_loss(prediction, target.reshape(prediction.shape))

In [3]:
# Load the review table (the file name indicates it is sorted by review time).
data = pd.read_csv("../datasets/google_local/reviews_timesorted.csv")
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in display() only adds a stray "None" to the cell output.
data.info()
display(data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10601852 entries, 0 to 10601851
Data columns (total 8 columns):
gPlusPlaceId      int64
gPlusUserId       int64
rating            int64
unixReviewTime    int64
num_reviews       int64
year              int64
month             int64
day               int64
dtypes: int64(8)
memory usage: 647.1 MB


None

Unnamed: 0,gPlusPlaceId,gPlusUserId,rating,unixReviewTime,num_reviews,year,month,day
0,804813,2021440,4,662601600,9,1990,12,31
1,1918972,389663,4,662601600,20,1990,12,31
2,449452,2709545,4,662601600,15,1990,12,31
3,942354,4936600,4,662601600,293,1990,12,31
4,3063673,828378,5,662601600,11,1990,12,31


In [4]:
# Second review table; it is only used here to count distinct ids.
original_data = pd.read_csv("../datasets/google_local/reviews_freq.csv")

# Number of distinct user ids and place ids, in that order.
n_user, n_place = (
    len(original_data[id_column].unique())
    for id_column in ('gPlusUserId', 'gPlusPlaceId')
)

print(n_user,n_place)

5054567 3116785


In [5]:
# Total number of reviews.
N = data.index.size

# Note that the reviews are sorted by time, so cutting by row position gives
# a chronological split: first 70% train, next 15% validation, rest test.
train_split = int(N * 0.70)
valid_split = int(N * 0.85)

feature_cols = ['gPlusPlaceId', 'gPlusUserId', 'num_reviews',
                'unixReviewTime', 'year', 'month', 'day']
target_cols = ['rating']

# .loc slices by label and includes BOTH end points, hence the "+ 1" on the
# start of the later splits to keep the three row ranges disjoint.
train_rows = slice(None, train_split)
valid_rows = slice(train_split + 1, valid_split)
test_rows = slice(valid_split + 1, None)

train_x, train_y = data.loc[train_rows, feature_cols], data.loc[train_rows, target_cols]
valid_x, valid_y = data.loc[valid_rows, feature_cols], data.loc[valid_rows, target_cols]
test_x, test_y = data.loc[test_rows, feature_cols], data.loc[test_rows, target_cols]

print(N, train_x.index.size, valid_x.index.size,test_x.index.size)

10601852 7421297 1590278 1590277


In [6]:
# Show the first five rows of each feature split (train, validation, test).
for split_frame in (train_x, valid_x, test_x):
    display(split_frame[:5])

Unnamed: 0,gPlusPlaceId,gPlusUserId,num_reviews,unixReviewTime,year,month,day
0,804813,2021440,9,662601600,1990,12,31
1,1918972,389663,20,662601600,1990,12,31
2,449452,2709545,15,662601600,1990,12,31
3,942354,4936600,293,662601600,1990,12,31
4,3063673,828378,11,662601600,1990,12,31


Unnamed: 0,gPlusPlaceId,gPlusUserId,num_reviews,unixReviewTime,year,month,day
7421297,2725776,4737472,26,1377798285,2013,8,29
7421298,2604227,43504,50,1377798286,2013,8,29
7421299,1298806,1549363,29,1377798286,2013,8,29
7421300,730982,3720790,6,1377798289,2013,8,29
7421301,2919327,3774840,2,1377798290,2013,8,29


Unnamed: 0,gPlusPlaceId,gPlusUserId,num_reviews,unixReviewTime,year,month,day
9011575,1959671,2938557,1,1387118864,2013,12,15
9011576,2039338,4638537,12,1387118869,2013,12,15
9011577,727670,840634,43,1387118873,2013,12,15
9011578,678877,3725103,6,1387118876,2013,12,15
9011579,2776187,2754988,2,1387118877,2013,12,15


In [7]:
# Hyperparameters
lr, lamb = 1e-2, 1e-6   # optimizer learning rate and weight decay
k = 1                   # latent factor dimension handed to the model
batch_size = 1024       # rows per training mini-batch

# True when PyTorch can see a CUDA device.
cuda = torch.cuda.is_available()
print(cuda)

True


In [None]:
from random import shuffle

# Build the model and move its parameters to the GPU when one is available
# (Module.cuda() moves the parameters in place and returns the module itself).
model = MF(n_user, n_place, k=k)
if cuda:
    model = model.cuda()

# Adam over all model parameters; `lamb` is applied as weight decay.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=lr,
    weight_decay=lamb,
)

def chunks(X, Y, size):
    """Yield aligned ``(X, Y)`` slices of at most ``size`` rows each.

    The start offsets ``0, size, 2 * size, ...`` are shuffled, so the slices
    come out in random order; the rows inside a slice keep their original
    order. The last slice may be shorter than ``size``.
    """
    offsets = [*range(0, len(X), size)]
    shuffle(offsets)
    for begin in offsets:
        end = begin + size
        yield X[begin:end], Y[begin:end]
        
# To keep track of the best validation loss and the hyperparameters that
# produced it. float('inf') is the "nothing seen yet" sentinel, so the first
# finite validation loss always becomes the best one (this replaces the
# fragile identity test `best_loss is 0`).
best_loss = float('inf')
best = []

# Mean loss of every training batch / of every validation pass.
losses = []
valid_losses = []

# Loop-invariant conversions, done once instead of on every epoch / every
# validation pass.
train_features = np.array(train_x)
train_targets = np.array(train_y)

val_feature = torch.from_numpy(np.array(valid_x))
val_target = torch.from_numpy(np.array(valid_y)).type(torch.FloatTensor)
if cuda:
    val_feature = val_feature.cuda()
    val_target = val_target.cuda()

for epoch in range(10+1):

    for i, (feature, target) in enumerate(chunks(train_features, train_targets, batch_size)):
        # model in training mode (the validation pass below switches it to
        # eval mode, so this has to be restored for every batch)
        model.train()

        # This zeros the gradients on every parameter.
        # This is easy to miss and hard to troubleshoot.
        optimizer.zero_grad()

        # Convert the numpy batch to tensors on the model's device
        feature = torch.from_numpy(feature)
        target = torch.from_numpy(target).type(torch.FloatTensor)
        if cuda:
            feature = feature.cuda()
            target = target.cuda()

        # Compute a prediction for these features
        prediction = model(feature)
        # Compute a loss given what the true target outcome was
        loss = model.loss(prediction, target)

        # Backpropagate: compute the gradient of the loss with respect to
        # every model parameter. This only stores the gradients; the
        # parameters themselves are not changed yet.
        loss.backward()
        # Now take a step & update the model parameters from the stored
        # gradients.
        optimizer.step()

        # mse_loss already averages over the batch, so the value is recorded
        # as-is (no further division by the batch size). .item() yields a
        # plain float, so no tensor is kept alive in the history list.
        train_loss = loss.item()
        losses.append(train_loss)

        if i % 1000 == 0:
            print("Epoch[{}] Iteration[{}] Training Loss: {:.2f}".format(epoch, i, train_loss))

            # model in test mode; no_grad() avoids building an autograd
            # graph for the whole validation set
            model.eval()
            with torch.no_grad():
                val_pred = model(val_feature)
                vloss = model.loss(val_pred, val_target).item()
            print("Epoch[{}] Validation Loss: {:.3f} ".format(epoch, vloss))

            # Record the (already per-example mean) validation loss
            valid_losses.append(vloss)

            # NOTE(review): the message says "Save", but only the loss value
            # and hyperparameters are remembered; no weights are stored.
            if vloss < best_loss:
                best_loss = vloss
                best = [vloss, lr, lamb]
                print("Save best theta...")

Epoch[0] Iteration[0] Training Loss: 13.28
Epoch[0] Validation Loss: 13.980 
Save best theta...
Epoch[0] Iteration[1000] Training Loss: 1.61
Epoch[0] Validation Loss: 1.727 
Save best theta...
Epoch[0] Iteration[2000] Training Loss: 1.04
Epoch[0] Validation Loss: 1.667 
Save best theta...
Epoch[0] Iteration[3000] Training Loss: 0.94
Epoch[0] Validation Loss: 1.632 
Save best theta...
Epoch[0] Iteration[4000] Training Loss: 0.96
Epoch[0] Validation Loss: 1.610 
Save best theta...
Epoch[0] Iteration[5000] Training Loss: 1.22
Epoch[0] Validation Loss: 1.595 
Save best theta...
Epoch[0] Iteration[6000] Training Loss: 0.92
Epoch[0] Validation Loss: 1.587 
Save best theta...
Epoch[0] Iteration[7000] Training Loss: 1.56
Epoch[0] Validation Loss: 1.583 
Save best theta...
Epoch[1] Iteration[0] Training Loss: 0.85
Epoch[1] Validation Loss: 1.581 
Save best theta...
Epoch[1] Iteration[1000] Training Loss: 0.71
Epoch[1] Validation Loss: 1.590 
Epoch[1] Iteration[2000] Training Loss: 0.79
Epoch[1]