In [1]:
import numpy as np
import scipy as sp
import sklearn
import nltk
import tensorflow as tf
import matplotlib
import gzip
import random
from collections import defaultdict
import sklearn.decomposition


def readGz(f):
  for l in gzip.open(f):
    yield eval(l)

In [2]:
data = []
for l in readGz('train.json.gz'):
    data.append(l)


In [3]:
userList = []
businessList = []
userDict = defaultdict(int)
businessDict = defaultdict(int)
visitedDict = defaultdict(int)
for d in data:
    u = d['userID']
    b = d['businessID']
    if u not in userList:
        userList.append(u)
    if b not in businessList:
        businessList.append(b)
    visitedDict[(u,b)]+=1

    
for i in range(len(userList)):
    userDict[userList[i]] = i
for i in range(len(businessList)):
    businessDict[businessList[i]] = i

In [4]:
random.shuffle(data)
train_data = data[:160000]

validation_data = data[160000:200000]
negative_pair = []
randomCnt = 0
while(randomCnt) < 80000:
    ruIndex = random.randint(0, len(userList)-1)
    rbIndex = random.randint(0, len(businessList)-1)
    ru = userList[ruIndex]
    rb = businessList[rbIndex]
    if visitedDict[(ru, rb)]==0:
        randomCnt +=1
        negative_pair.append((ru,rb))
#Finished creating dataset

ratings = np.array([d['rating'] for d in train_data])
avgRating = np.mean(ratings)
print avgRating


4.18719375


In [5]:
def Mean(lst):
    if len(lst) != 0:
        return np.mean(lst)
    else:
        return avgRating

In [7]:
businessRat = [[] for i in range(len(businessList))]
for d in train_data:
    businessRat[businessDict[d['businessID']]].append(d['rating'])
businessAvg = [Mean(businessRat[i]) for i in range(len(businessList))]
print businessAvg[:20]

[4.333333333333333, 4.375, 4.2127659574468082, 4.2857142857142856, 3.828125, 4.5625, 4.375, 5.0, 4.4545454545454541, 4.2000000000000002, 4.3250000000000002, 4.75, 3.7000000000000002, 4.333333333333333, 3.6818181818181817, 4.2999999999999998, 3.5, 4.1111111111111107, 4.125, 3.2142857142857144]


In [8]:
categoryList = []
for d in train_data:
    for c in d['categories']:
        if c not in categoryList:
            categoryList.append(c)
categoryDict = defaultdict(int)
for c in categoryList:
    categoryDict[c] = categoryList.index(c)

businessCategory = [[0 for j in range(len(categoryList))] for i in range(len(businessList))]
userHistory = [[0 for j in range(len(categoryList))] for i in range(len(userList))]
for d in train_data:
    for c in d['categories']:
        userHistory[userDict[d['userID']]][categoryDict[c]]+=1.0
        businessCategory[businessDict[d['businessID']]][categoryDict[c]]=1
#the list of what each business's category is and what category each user has visited

In [5]:
def feature(datum):
    feat = []
    userOneHot = [0 for u in userList]
    businessOneHot = [0 for b in businessList]
    userOneHot[userDict[datum['userID']]] = 1
    businessOneHot[businessDict[datum['businessID']]] = 1
    feat.append(np.array(userOneHot))
    feat.append(np.array(businessOneHot))
    return feat

In [6]:
#extracting features and ratings

train_rating = [d['rating'] for d in train_data]

validation_rating = [d['rating'] for d in validation_data]

In [7]:
#tensorflow learning hyper parameters
learning_rate = 0.001
regularization_rate = 0.001
Reduction_Size = 100
max_iter = 20000
batch_size = 200

In [26]:
def calc(user, business, regularizer):
    predict = avgRating
    with tf.variable_scope('bu'):
        betau = tf.get_variable(name = 'betau', shape = [len(userList),1], initializer = tf.truncated_normal_initializer(stddev = 0.5))
        predict += tf.matmul(user, betau)
        tf.add_to_collection('losses', regularizer(betau))
    
    with tf.variable_scope('bi'):
        betai = tf.get_variable(name = 'betai', shape = [len(businessList),1], initializer = tf.truncated_normal_initializer(stddev = 0.5))
        predict += tf.matmul(business, betai)
        tf.add_to_collection('losses', regularizer(betai))
    
    with tf.variable_scope('gamma'):
        gammau = tf.get_variable(name = 'gu', shape = [len(userList), Reduction_Size], initializer = tf.truncated_normal_initializer(stddev = 0.5))
        gammai = tf.get_variable(name = 'gi', shape = [Reduction_Size, len(businessList)], initializer = tf.truncated_normal_initializer(stddev = 0.5))
        predict += tf.matmul(tf.matmul(user, gammau), tf.matmul(gammai, business.T))
        tf.add_to_collection('losses', regularizer(gammau))
        tf.add_to_collection('losses', regularizer(gammai))
    
    return predict

In [27]:
def train():
    U = tf.placeholder(tf.float32, [None, len(userList)], name = 'input-u')
    B = tf.placeholder(tf.float32, [None, len(businessList)], name = 'input-b')
    rating = tf.placeholder(tf.float32, [None], name = 'rating')
    regularizer = tf.contrib.layers.l2_regularizer(regularization_rate)
    
    predict_ = calc(U,B,regularizer)
    se = tf.losses.mean_squared_error(labels = rating, predictions = predict_)
    sum_loss = tf.reduce_sum(se)
    loss = sum_loss + tf.add_n(tf.get_collection('losses'))
    
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    
    with tf.Session() as sess:
        tf.initialize_all_variables().run()
        for i in range(max_iter):
            sample = np.random.randint(0, 160000, batch_size)
            x_batch = train_data[sample]
            y_batch = train_rating[sample]
            x_batch_feature = [feature(d) for d in x_batch]
            x_batch_user = np.array([b[0] for b in x_batch_feature])
            x_batch_business = np.array([b[1] for b in x_batch_feature])
            
            l_value, _ = sess.run([loss, train_op], fead_dict = {U:x_batch_user, B: x_batch_business, rating:y_batch})
            if i % 1000 == 0:
                print("After %d iters, loss on training batch is %f."%(i, l_value))
                sample_ = np.random.randint(0, 40000, batch_size)
                v_batch = validation_data[sample_]
                v_rating = validation_rating[sample_]
                v_batch_feature = [feature(d) for d in v_batch]
                v_batch_user = np.array(v[0] for v in v_batch_feature)
                v_batch_business = np.array(v[1] for v in v_batch_feature)
                vl_value = sess.run(loss, feed_dict = {U:v_batch_user, B: v_batch_business, rating : v_rating})
                print("After %d iters, loss on validation batch is %f"%(i, vl_value))
                
                

In [28]:
train()

ValueError: Variable bu/betau already exists, disallowed. Did you mean to set reuse=True in VarScope? Originally defined at:

  File "<ipython-input-23-ed34b79082d2>", line 4, in calc
    betau = tf.get_variable(name = 'betau', shape = [len(userList),1], initializer = tf.truncated_normal_initializer(stddev = 0.5))
  File "<ipython-input-24-a3a8b1bbd2d1>", line 7, in train
    predict_ = calc(U,B,regularizer)
  File "<ipython-input-25-2da0ffaf5447>", line 1, in <module>
    train()
