In [1]:
import sys; sys.path.append('../util')
from load_yelp_data import load_yelp_dataframe, filter_to_location, train_test_split_reviews
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import graphlab as gl
import sklearn.metrics
%matplotlib inline

from matrix_factorization_recommender import MatrixFactorizationRecommender
from simple_averaging_recommender import SimpleAveragingRecommender

In [2]:
# Load the three Yelp tables through the project helper, in the same order as before.
businesses, reviews, users = map(
    load_yelp_dataframe, ('businesses', 'reviews', 'users'))

In [3]:
# Narrow the businesses, reviews and users frames down to 'Phoenix' with the project helper.
# NOTE(review): how a row is matched to the location is decided inside filter_to_location
# (defined in ../util, not visible here) -- confirm there.
phoen_biz, phoen_rev, phoen_usr = filter_to_location('Phoenix', businesses, reviews, users)
# Split only the Phoenix reviews into train and test sets. The split ratio and any
# seeding live inside train_test_split_reviews and are not visible in this notebook.
reviews_train, reviews_test = train_test_split_reviews(phoen_rev)

In [4]:
def utility_matrix_sparsity(reviews):
    """Return the fraction of empty cells in the user x business utility matrix.

    The matrix has one row per distinct ``user_id`` and one column per distinct
    ``business_id`` found in ``reviews``; every row of ``reviews`` is counted as
    one filled cell.

    reviews -- DataFrame of reviews with ``business_id`` and ``user_id`` columns.
    """
    n_businesses = len(reviews.business_id.unique())
    n_users = len(reviews.user_id.unique())
    # float() keeps this a true division under Python 2 as well.
    n_cells = float(n_businesses * n_users)
    return 1 - len(reviews) / n_cells

In [5]:
# Compare the sparsity of the full review set with the Phoenix-only subset.
# Formatting into a single string keeps the printed text the same while using
# the single-argument print(...) form.
print('{} {}'.format('Overall utility matrix sparsity:', utility_matrix_sparsity(reviews)))
print('{} {}'.format('Phoenix utility matrix sparsity:', utility_matrix_sparsity(phoen_rev)))

Overall utility matrix sparsity: 0.999954279094
Phoenix utility matrix sparsity: 0.999885843969


Let's look at how a simple averaging baseline and our [vowpal wabbit](https://github.com/JohnLangford/vowpal_wabbit)-based matrix factorization recommender perform:

In [6]:
# Features are the (user_id, business_id) pair of each review; the target is its star rating.
# Each split is converted to plain arrays via `.values`.
X_train, y_train = (reviews_train[['user_id', 'business_id']].values,
                    reviews_train['stars'].values)
X_test, y_test = (reviews_test[['user_id', 'business_id']].values,
                  reviews_test['stars'].values)

In [20]:
def summarize_performance(model):
    """Print train/test R^2 and RMSE for a fitted recommender.

    Scores are computed against the notebook-level ``X_train``/``y_train`` and
    ``X_test``/``y_test`` arrays, and every line is labelled with the class
    name of ``model``.

    model -- fitted recommender exposing ``r2_score(X, y)`` and ``rmse(X, y)``.
    """
    # Take the label from the model being scored. Reading the global
    # `simple_avg` here labelled every model as SimpleAveragingRecommender.
    name = model.__class__.__name__
    # Single-argument print(...) emits the same text as the print statement.
    print('{} train R^2: {}'.format(name, model.r2_score(X_train, y_train)))
    print('{} test R^2: {}'.format(name, model.r2_score(X_test, y_test)))
    print('{} train RMSE: {}'.format(name, model.rmse(X_train, y_train)))
    print('{} test RMSE: {}'.format(name, model.rmse(X_test, y_test)))

In [21]:
# Baseline: fit the simple averaging recommender on the training pairs, then print
# its R^2 and RMSE on both the training and the test split.
simple_avg = SimpleAveragingRecommender()
simple_avg.fit(X_train, y_train)
summarize_performance(simple_avg)

SimpleAveragingRecommender train R^2: 0.458334735464
SimpleAveragingRecommender test R^2: 0.0353093716165
SimpleAveragingRecommender train RMSE: 1.05046205597
SimpleAveragingRecommender test RMSE: 1.40178931215


In [22]:
# Fit the matrix factorization recommender on the same training pairs and print the
# same metrics, so it can be compared against the averaging baseline.
# NOTE(review): the recorded output reports "Model file already exists, skipping
# vowpal-wabbit fitting", so fit() appears to reuse a cached model file -- confirm that
# the cached model was trained on the current training split.
matrix_fac = MatrixFactorizationRecommender()
matrix_fac.fit(X_train, y_train)
summarize_performance(matrix_fac)

Model file already exists, skipping vowpal-wabbit fitting
SimpleAveragingRecommender train R^2: 0.255887084607
SimpleAveragingRecommender test R^2: 0.253239819851
SimpleAveragingRecommender train RMSE: 1.23121613378
SimpleAveragingRecommender test RMSE: 1.23332998571


These results make sense and are actually pretty encouraging. Simple averaging does an eminently reasonable job as a baseline: it gives us a test R^2 close to 0 (which makes sense for an average) and a respectable test RMSE of about 1.4. Clearly any reasonable model we put forward should do better than that.

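Matrix factorization does better than simple averaging on the test set (test RMSE of about 1.23 versus 1.40, and test R^2 of about 0.25 versus 0.04), and surprisingly, we do just as well on the test set as we do on the training set. That's pretty cool to see. (Note that the output above is labelled `SimpleAveragingRecommender`, but those are the matrix factorization scores.)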

In [23]:
# Wrap each split in a GraphLab SFrame, keeping only the business, user and rating columns.
train_sframe, test_sframe = (
    gl.SFrame(split[['business_id', 'user_id', 'stars']])
    for split in (reviews_train, reviews_test)
)

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1480256873.log


This non-commercial license of GraphLab Create for academic use is assigned to andrew_ross@g.harvard.edu and will expire on November 23, 2017.


In [26]:
# Every GraphLab recommender is trained on the same SFrame with the same column
# roles, so the shared keyword arguments are spelled out once.
recommender_options = dict(
    user_id='user_id', item_id='business_id', target='stars', verbose=False)

pop_model = gl.popularity_recommender.create(train_sframe, **recommender_options)
sim_model = gl.item_similarity_recommender.create(train_sframe, **recommender_options)
fac_model = gl.factorization_recommender.create(train_sframe, **recommender_options)
rank_fac_model = gl.ranking_factorization_recommender.create(
    train_sframe, **recommender_options)

In [27]:
# Report the overall RMSE of each GraphLab recommender on the training and test SFrames.
models = [pop_model, sim_model, fac_model, rank_fac_model]
for model in models:
    # Label each pair of lines with the recommender's class name.
    name = model.__class__.__name__
    train_rmse = model.evaluate_rmse(train_sframe, target='stars')['rmse_overall']
    test_rmse = model.evaluate_rmse(test_sframe, target='stars')['rmse_overall']
    # Single-argument print(...) emits the same text as the print statement.
    print('{} train RMSE: {}'.format(name, train_rmse))
    print('{} test RMSE: {}'.format(name, test_rmse))

PopularityRecommender train RMSE: 1.22130851601
PopularityRecommender test RMSE: 1.30166476847
ItemSimilarityRecommender train RMSE: 4.03787726
ItemSimilarityRecommender test RMSE: 4.04191406029
FactorizationRecommender train RMSE: 0.392297368092
FactorizationRecommender test RMSE: 1.47303586137
RankingFactorizationRecommender train RMSE: 0.456966653589
RankingFactorizationRecommender test RMSE: 2.02394390501


It looks like the graphlab factorization recommenders overfit the training set (train RMSE below 0.5, but test RMSE of about 1.5 and 2.0), while the popularity recommender is much more robust (which we might expect). The item similarity recommender does terribly (RMSE of about 4), but we haven't given it any business or user attributes, which could potentially help it compute similarity. Since many of the fields we'd like to compare are idiosyncratically structured, we might want to hand-write a user-user or business-business similarity score that compares specific attributes, and pass the resulting similar elements in to the graphlab model (which I believe is possible).