In [1]:
from collections import defaultdict
import csv
import scipy
import scipy.optimize
import random
import numpy as np
import time

import torch
from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable

import pandas as pd 
from IPython.display import display

In [2]:
class MF(nn.Module):
    itr = 0
    
    def __init__(self, n_user, n_item, k=1, c_vector=1.0, c_bias=1.0, writer=None):
        super(MF, self).__init__()
        self.writer = writer
        self.k = k
        self.n_user = n_user
        self.n_item = n_item
        self.c_bias = c_bias
        self.c_vector = c_vector
        
        # gammas (users and items)
        self.user = nn.Embedding(n_user, k)
        self.item = nn.Embedding(n_item, k)
        
        # alpha and betas (users and items)
        self.bias_user = nn.Embedding(n_user, 1)
        self.bias_item = nn.Embedding(n_item, 1)
        self.bias = nn.Parameter(torch.ones(1))
    
    def forward(self, train_x):
        item_id = train_x[:, 0]
        user_id = train_x[:, 1]
        vector_user = self.user(user_id)
        vector_item = self.item(item_id)
        
        # Pull out biases
        bias_user = self.bias_user(user_id).squeeze()
        bias_item = self.bias_item(item_id).squeeze()
        biases = (self.bias + bias_user + bias_item)
        
        ui_interaction = torch.sum(vector_user * vector_item, dim=1)
        
        # Add bias prediction to the interaction prediction
        prediction = ui_interaction + biases
        return prediction
    
    def loss(self, prediction, target):
        loss_mse = F.mse_loss(prediction, target.squeeze())
        
        # Add new regularization to the biases
        prior_bias_user =  l2_regularize(self.bias_user.weight) * self.c_bias
        prior_bias_item = l2_regularize(self.bias_item.weight) * self.c_bias
        
        prior_user =  l2_regularize(self.user.weight) * self.c_vector
        prior_item = l2_regularize(self.item.weight) * self.c_vector
        total = loss_mse + prior_user + prior_item + prior_bias_user + prior_bias_item
        for name, var in locals().items():
            if type(var) is torch.Tensor and var.nelement() == 1 and self.writer is not None:
                self.writer.add_scalar(name, var, self.itr)
        return total

## user_id and place_id --> user_idx, place_idx

To perform matrix factorization, we need to convert user_id and place_id into their index in the interaction matrix. This has already been done in the Notebook GoogleLocal_reformat.ipynb

### Reviews based on (user_id, place_id, rating, time)

In [3]:
data = pd.read_csv("../datasets/google_local/reviews.csv")
display(data.info())
display(data.head())

n_user = len(data['gPlusUserId'].unique())
n_place = len(data['gPlusPlaceId'].unique())

print(n_user,n_place)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11453845 entries, 0 to 11453844
Data columns (total 4 columns):
gPlusPlaceId      object
gPlusUserId       object
rating            float64
unixReviewTime    object
dtypes: float64(1), object(3)
memory usage: 349.5+ MB


None

Unnamed: 0,gPlusPlaceId,gPlusUserId,rating,unixReviewTime
0,108103314380004200232,100000010817154263736,3.0,1372686659
1,102194128241608748649,100000013500285534661,5.0,1342870724
2,101409858828175402384,100000021336848867366,5.0,1390653513
3,101477177500158511502,100000021336848867366,5.0,1389187706
4,106994170641063333085,100000021336848867366,4.0,1390486279


5054567 3116785


### Reviews based on (user_idx, place_idx, rating, time)

In [4]:
data = pd.read_csv("../datasets/google_local/reviews_reformatted.csv")
display(data.info())
display(data.head())

n_user = len(data['gPlusUserId'].unique())
n_item = len(data['gPlusPlaceId'].unique())

print(n_user,n_item)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11453845 entries, 0 to 11453844
Data columns (total 4 columns):
gPlusPlaceId      int64
gPlusUserId       int64
rating            float64
unixReviewTime    object
dtypes: float64(1), int64(2), object(1)
memory usage: 349.5+ MB


None

Unnamed: 0,gPlusPlaceId,gPlusUserId,rating,unixReviewTime
0,1368311,0,3.0,1372686659
1,370282,1,5.0,1342870724
2,237940,2,5.0,1390653513
3,249417,2,5.0,1389187706
4,1181533,2,4.0,1390486279


5054567 3116785


In [5]:
def l2_regularize(array):
    loss = torch.sum(array ** 2.0)
    return loss

In [6]:
# Shuffle the data
shuffled_data = data.sample(frac=1).reset_index(drop=True)
shuffled_data.head()

Unnamed: 0,gPlusPlaceId,gPlusUserId,rating,unixReviewTime
0,1421526,3291564,4.0,1359071859
1,1436921,194979,5.0,1390412928
2,1951521,1405178,5.0,1323136392
3,2349727,1648361,5.0,1385938274
4,2121811,4200589,5.0,1344854294


### Split into Training, Validation and Test Datasets

In [7]:
N = shuffled_data.index.size

train_split = int(N * 0.70)
valid_split =  int(N * 0.85)

train_x = shuffled_data.loc[:train_split, 'gPlusPlaceId':'gPlusUserId']
train_y = shuffled_data.loc[:train_split, 'rating':'rating']
valid_x = shuffled_data.loc[train_split+1:valid_split, 'gPlusPlaceId':'gPlusUserId']
valid_y = shuffled_data.loc[train_split+1:valid_split, 'rating':'rating']
test_x = shuffled_data.loc[valid_split+1:, 'gPlusPlaceId':'gPlusUserId']
test_y = shuffled_data.loc[valid_split+1:, 'rating':'rating']

display(train_x.describe())
display(train_y.describe())
display(valid_x.describe())
display(valid_y.describe())
display(test_x.describe())
display(test_y.describe())

print(N, train_x.index.size, valid_x.index.size,test_x.index.size)


Unnamed: 0,gPlusPlaceId,gPlusUserId
count,8017692.0,8017692.0
mean,1558401.0,2514285.0
std,899767.5,1457401.0
min,0.0,1.0
25%,779778.0,1250517.0
50%,1558311.0,2509152.0
75%,2337392.0,3774378.0
max,3116784.0,5054566.0


Unnamed: 0,rating
count,8017692.0
mean,4.047443
std,1.195825
min,0.0
25%,3.0
50%,4.0
75%,5.0
max,5.0


Unnamed: 0,gPlusPlaceId,gPlusUserId
count,1718077.0,1718077.0
mean,1559433.0,2512361.0
std,899811.8,1457010.0
min,6.0,0.0
25%,779944.0,1248296.0
50%,1559237.0,2507633.0
75%,2338078.0,3770334.0
max,3116779.0,5054565.0


Unnamed: 0,rating
count,1718077.0
mean,4.045867
std,1.195607
min,0.0
25%,3.0
50%,4.0
75%,5.0
max,5.0


Unnamed: 0,gPlusPlaceId,gPlusUserId
count,1718076.0,1718076.0
mean,1557427.0,2513512.0
std,899871.2,1457395.0
min,1.0,2.0
25%,778447.2,1249095.0
50%,1556118.0,2507648.0
75%,2336836.0,3773078.0
max,3116779.0,5054565.0


Unnamed: 0,rating
count,1718076.0
mean,4.048239
std,1.195381
min,0.0
25%,3.0
50%,4.0
75%,5.0
max,5.0


11453845 8017692 1718077 1718076


In [8]:
# Hyperparameters
lr = 1e-3
k = 1
# New parameter for regularizing bias
c_bias = 1e-5
c_vector = 1e-5
batchsize = 1024

cuda = torch.cuda.is_available()
print (cuda)

True


In [12]:
from random import shuffle

# This code utilizes ignite engine's create_supervised_trainer()
# But we need something more basic

model = MF(n_user, n_item, k=k, c_vector=c_vector)
if cuda:
    model.cuda()
    
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

def chunks(X, Y, size):
    """Yield successive n-sized chunks from l."""
    starts = list(range(0, len(X), size))
    shuffle(starts)
    for i in starts:
        yield (X[i:i + size], Y[i:i + size])
        
batch_size = 1024
losses = []
for epoch in range(5+1):
    
    i = 0
    for feature, target in chunks(np.array(train_x), np.array(train_y), batch_size):
        # This zeros the gradients on every parameter. 
        # This is easy to miss and hard to troubleshoot.
        optimizer.zero_grad()
        # Convert 
        feature = Variable(torch.from_numpy(feature))
        target = Variable(torch.from_numpy(target).type(torch.FloatTensor))
        
        if cuda:
            feature = feature.cuda()
            target = target.cuda()
            
        # Compute a prediction for these features
        prediction = model.forward(feature)
        # Compute a loss given what the true target outcome was
        loss = model.loss(prediction, target)
        # break
        # Backpropagate: compute the direction / gradient every model parameter
        # defined in your __init__ should move in in order to minimize this loss
        # However, we're not actually changing these parameters, we're just storing
        # how they should change.

        loss.backward()
        # Now take a step & update the model parameters. The optimizer uses the gradient at 
        # defined on every parameter in our model and nudges it in that direction.
        optimizer.step()
        
        if i%1000 == 0 and epoch%1 == 0:
            print("Epoch[{}] Iteration[{}] Training Loss: {:.2f}".format(epoch, i, loss.data))

        # Record the loss per example
        losses.append(loss.cpu().data.numpy() / len(feature))
        
        if i%1000 == 0 and epoch%1 == 0:
            feature = torch.from_numpy(np.array(valid_x))
            target = torch.from_numpy(np.array(valid_y)).type(torch.FloatTensor)
            
            if cuda:
                feature = feature.cuda()
                target = target.cuda()

            prediction = model.forward(feature)
            loss = model.loss(prediction, target)
            print("Epoch[{}] Validation Loss: {:.2f} ".format(epoch, loss.data))

        i += 1

Epoch[0] Iteration[0] Training Loss: 8170958.00
Epoch[0] Validation Loss: 8157927.50 
Epoch[0] Iteration[1000] Training Loss: 1728496.25
Epoch[0] Validation Loss: 1725805.62 
Epoch[0] Iteration[2000] Training Loss: 343506.94
Epoch[0] Validation Loss: 342926.66 
Epoch[0] Iteration[3000] Training Loss: 57079.65
Epoch[0] Validation Loss: 56970.56 
Epoch[0] Iteration[4000] Training Loss: 7316.12
Epoch[0] Validation Loss: 7300.06 
Epoch[0] Iteration[5000] Training Loss: 674.81
Epoch[0] Validation Loss: 673.10 
Epoch[0] Iteration[6000] Training Loss: 43.52
Epoch[0] Validation Loss: 43.28 
Epoch[0] Iteration[7000] Training Loss: 3.14
Epoch[0] Validation Loss: 3.14 
Epoch[1] Iteration[0] Training Loss: 1.54
Epoch[1] Validation Loss: 1.53 
Epoch[1] Iteration[1000] Training Loss: 1.33
Epoch[1] Validation Loss: 1.44 
Epoch[1] Iteration[2000] Training Loss: 1.35
Epoch[1] Validation Loss: 1.44 
Epoch[1] Iteration[3000] Training Loss: 1.46
Epoch[1] Validation Loss: 1.43 
Epoch[1] Iteration[4000] Tra

In [39]:
loss.data


 8.1648e+06
[torch.FloatTensor of size 1]

In [32]:
train_x[:10]

Unnamed: 0,gPlusPlaceId,gPlusUserId
0,1786524,4977190
1,1329103,266338
2,159841,3718444
3,1702994,1622642
4,3091879,2894372
5,1293407,2429273
6,2703638,1584233
7,1775170,4037622
8,458729,2213988
9,398873,2950808
