In [39]:
import sys; sys.path.append('../util')
from load_yelp_data import load_yelp_dataframe, create_standard_train_and_test
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import graphlab as gl
%matplotlib inline

from matrix_factorization_recommender import MatrixFactorizationRecommender

In [3]:
# Load each raw Yelp table into its own DataFrame (businesses, then reviews, then users).
businesses, reviews, users = map(
    load_yelp_dataframe, ('businesses', 'reviews', 'users'))

# Run the project's standard split over all three tables at once and unpack
# one (train, test) pair per table.
((businesses_train, businesses_test),
 (reviews_train, reviews_test),
 (users_train, users_test)) = create_standard_train_and_test(
    [businesses, reviews, users])

Let's look at how our [Vowpal Wabbit](https://github.com/JohnLangford/vowpal_wabbit)-based matrix factorization recommender performs:

In [4]:
# Each example is a (user_id, business_id) pair; the value to predict is the
# star rating attached to that review.
feature_cols = ['user_id', 'business_id']
target_col = 'stars'

X_train, y_train = reviews_train[feature_cols].values, reviews_train[target_col].values
X_test, y_test = reviews_test[feature_cols].values, reviews_test[target_col].values

In [5]:
# Instantiate the project's matrix factorization recommender and fit it on the
# (user_id, business_id) -> stars training pairs.
# NOTE(review): the recorded output shows fit() skipping the vowpal-wabbit run
# when a model file already exists, so a previously trained model may be
# reused here -- TODO confirm how to force a refit after changing the data.
matrix_fac = MatrixFactorizationRecommender()
matrix_fac.fit(X_train, y_train)

Model file already exists, skipping vowpal-wabbit fitting


In [6]:
# Score the fitted recommender on the held-out reviews; the result is shown as
# the cell's output.
# NOTE(review): presumably the coefficient of determination (R^2) -- confirm
# against MatrixFactorizationRecommender.r2_score.
matrix_fac.r2_score(X_test, y_test)

0.17471762166215021

In [65]:
import sklearn.metrics

# Test-set RMSE: predict the held-out ratings, take the mean squared error
# against the true stars, then its square root.
test_predictions = matrix_fac.predict(X_test)
mse = sklearn.metrics.mean_squared_error(y_test, test_predictions)
np.sqrt(mse)

1.2623265210165748

In [67]:
# Training-set RMSE, for comparison with the held-out figure: a much lower
# value here would indicate overfitting.
train_predictions = matrix_fac.predict(X_train)
train_mse = sklearn.metrics.mean_squared_error(y_train, train_predictions)
np.sqrt(train_mse)

1.1723361155936303

We get a reasonable $R^2$ (about 0.17) on the test set, and an RMSE of 1.26. On the training set, as we might expect, the RMSE is lower (1.17), but not outrageously so; we aren't overfitting by much. Now let's switch to graphlab's models:

In [43]:
# Convert the review tables to graphlab SFrames, keeping only the item id,
# user id and rating columns that the recommenders are given.
sframe_columns = ['business_id', 'user_id', 'stars']
train_sframe = gl.SFrame(reviews_train[sframe_columns])
test_sframe = gl.SFrame(reviews_test[sframe_columns])

In [44]:
# Train four graphlab recommenders on the same training SFrame.
#
# The module is imported as `gl` (and used as `gl.SFrame` when the SFrames are
# built), so it must be referenced through that alias here too: the bare name
# `graphlab` is never bound and raises NameError on a fresh kernel.
#
# All four models share identical column roles and are trained silently, so
# the common keyword arguments are defined once.
recommender_kwargs = dict(
    user_id='user_id', item_id='business_id', target='stars', verbose=False)

# Popularity-based recommender.
pop_model = gl.popularity_recommender.create(train_sframe, **recommender_kwargs)

# Item-similarity recommender.
sim_model = gl.item_similarity_recommender.create(train_sframe, **recommender_kwargs)

# Factorization recommender.
fac_model = gl.factorization_recommender.create(train_sframe, **recommender_kwargs)

# Ranking factorization recommender.
rank_fac_model = gl.ranking_factorization_recommender.create(
    train_sframe, **recommender_kwargs)

In [64]:
models = [pop_model, sim_model, fac_model, rank_fac_model]

# Print each recommender's class name followed by its overall RMSE on the
# held-out reviews.
for model in models:
    print('{}:'.format(type(model).__name__))
    test_scores = model.evaluate_rmse(test_sframe, target='stars')
    print(test_scores['rmse_overall'])

PopularityRecommender:
1.26397284255
ItemSimilarityRecommender:
4.01121272365
FactorizationRecommender:
1.44120740221
RankingFactorizationRecommender:
1.90497278254


In [66]:
# Print each recommender's class name followed by its overall RMSE on the
# reviews it was trained on, to compare against the held-out scores.
for model in models:
    print('{}:'.format(type(model).__name__))
    train_scores = model.evaluate_rmse(train_sframe, target='stars')
    print(train_scores['rmse_overall'])

PopularityRecommender:
1.19856858622
ItemSimilarityRecommender:
4.00711787159
FactorizationRecommender:
0.377976155743
RankingFactorizationRecommender:
0.378911557349


It looks like the graphlab factorization recommenders overfit the training set (training RMSE of about 0.38 versus test RMSEs of 1.44 and 1.90), while the popularity recommender is much more robust (1.20 on train versus 1.26 on test), which we might expect. The item similarity recommender does terribly (RMSE of about 4.0 on both sets), but we haven't given it any business or user attributes, which could potentially help it compute similarity. Since many of the fields we'd like to compare are idiosyncratically structured, we might want to hand-write a user-user or business-business similarity score that compares specific attributes, and pass the resulting similar elements in to the graphlab model (which I believe is possible).