# Yelp Data Challenge - Restaurant Recommender

BitTiger DS501

Nov 2017

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
plt.style.use("ggplot")
import random
from time import time

In [76]:
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from time import time


class ItemItemRecommender(object):

    def __init__(self, neighborhood_size):
        self.neighborhood_size = neighborhood_size

    def fit(self, ratings_mat):
        self.ratings_mat = ratings_mat
        self.n_users = ratings_mat.shape[0]
        self.n_items = ratings_mat.shape[1]
        self.item_sim_mat = cosine_similarity(self.ratings_mat.T)
        self._set_neighborhoods()

    def _set_neighborhoods(self):
        least_to_most_sim_indexes = np.argsort(self.item_sim_mat, 1)
        self.neighborhoods = least_to_most_sim_indexes[:, -self.neighborhood_size:]

    def pred_one_user(self, user_id, report_run_time=False):
        start_time = time()
        items_rated_by_this_user = self.ratings_mat[user_id].nonzero()[1]
        # Just initializing so we have somewhere to put rating preds
        out = np.zeros(self.n_items)
        for item_to_rate in range(self.n_items):
            relevant_items = np.intersect1d(self.neighborhoods[item_to_rate],
                                            items_rated_by_this_user,
                                            assume_unique=True)  # assume_unique speeds up intersection op
            out[item_to_rate] = self.ratings_mat[user_id, relevant_items] * \
                self.item_sim_mat[item_to_rate, relevant_items] / \
                self.item_sim_mat[item_to_rate, relevant_items].sum()
        if report_run_time:
            print("Execution time: %f seconds" % (time()-start_time))
        cleaned_out = np.nan_to_num(out)
        return cleaned_out

    def pred_all_users(self, report_run_time=False):
        start_time = time()
        all_ratings = [
            self.pred_one_user(user_id) for user_id in range(self.n_users)]
        if report_run_time:
            print("Execution time: %f seconds" % (time()-start_time))
        return np.array(all_ratings)

    def top_n_recs(self, user_id, n):
        pred_ratings = self.pred_one_user(user_id)
        item_index_sorted_by_pred_rating = list(np.argsort(pred_ratings))
        items_rated_by_this_user = self.ratings_mat[user_id].nonzero()[1]
        unrated_items_by_pred_rating = [item for item in item_index_sorted_by_pred_rating
                                        if item not in items_rated_by_this_user]
        return unrated_items_by_pred_rating[-n:]

In [75]:
def get_ratings_data():
    ratings_contents = pd.read_table("../data/u.data",
                                     names=["user", "movie", "rating", "timestamp"])
    highest_user_id = ratings_contents.user.max()
    highest_movie_id = ratings_contents.movie.max()
    ratings_as_mat = sparse.lil_matrix((highest_user_id, highest_movie_id))
    for _, row in ratings_contents.iterrows():
            # subtract 1 from id's due to match 0 indexing
        ratings_as_mat[row.user-1, row.movie-1] = row.rating
    return ratings_contents, ratings_as_mat

In [None]:
def get_sparse_matrix(df, userid, item):
    highest_user_id = df.userid.max()
    highest_item_id = df.item.max()
    df_as_matrix = sparse.lil_matrix((highest_user_id, highest_item_id))
    for _, row in df.iterrows():
        df_as_matrix[row.userid-1, row.item-1] = row.stars

In [3]:
pwd

'/Users/zirongwu/DataScience/DS501/BitTiger-DS501-1805/Homework/Yelp_Data_Challenge_Project/code'

In [4]:
cd ..

/Users/zirongwu/DataScience/DS501/BitTiger-DS501-1805/Homework/Yelp_Data_Challenge_Project


In [5]:
df = pd.read_csv('dataset/last_2_years_restaurant_reviews.csv')

In [6]:
df.head()

Unnamed: 0,business_id,name,categories,avg_stars,cool,date,funny,review_id,stars,text,useful,user_id
0,vJIuDBdu01vCA8y1fwR1OQ,CakesbyToi,"American (Traditional), Food, Bakeries, Restau...",1.5,1,2015-08-02,0,UgwmWy_68S_aKR9wTWKGOg,3,I am a huge fan of both locally owned business...,4,jSD05fFrAFa5gX3ZQae1tw
1,vJIuDBdu01vCA8y1fwR1OQ,CakesbyToi,"American (Traditional), Food, Bakeries, Restau...",1.5,2,2015-06-08,2,AywPsODuQbUMhBEjmKiGXw,1,Let me start off by saying. If you cant make a...,8,oagZh5A2cWJXZBLakS_KpQ
2,vJIuDBdu01vCA8y1fwR1OQ,CakesbyToi,"American (Traditional), Food, Bakeries, Restau...",1.5,1,2015-08-08,0,6j39TEUBDBTEK37OKACWLw,1,Decent product. ..HORRIBLE service. Totally un...,2,6BSwuyc7fvcccJgrY4_W5Q
3,kgffcoxT6BQp-gJ-UQ7Czw,Subway,"Fast Food, Restaurants, Sandwiches",2.5,0,2016-07-03,0,c6iTbCMMYWnOd79ZiWwobg,1,"I ordered a few 12 inch sandwiches , a turkey ...",1,ih7Dmu7wZpKVwlBRbakJOQ
4,kgffcoxT6BQp-gJ-UQ7Czw,Subway,"Fast Food, Restaurants, Sandwiches",2.5,0,2018-03-10,0,5iDdZvpK4jOv2w5kZ15TUA,1,Worst subway of any I have visited. I have man...,1,m3WBc9bGxn1q1ikAFq8PaA


In [77]:
df.user_id.max()

'zzzPVqSxSvjzlLR3Q7wsUw'

In [7]:
df.shape

(640718, 12)

In [8]:
df_sample = df.sample(n = int(df.shape[0] * 0.05), random_state = 14)

In [9]:
df_sample.shape

(32035, 12)

## 1. Clean data and get rating data 

#### Select relevant columns in the original dataframe

In [10]:
# Get business_id, user_id, stars for recommender
df_re = df_sample[['business_id', 'user_id', 'stars']]

In [11]:
df_re.head()

Unnamed: 0,business_id,user_id,stars
71253,pHJu8tj3sI8eC5aIHLFEfQ,Jv-qBvI44JFYOCPVEI2Aig,5
162777,h3pMMRwqBjMeMtk0_qQzRQ,58w7YA4li04e3cOw3BKbUQ,4
46778,6qKjl6xJQzLb9ka-QbTudA,uwiwGGCfKNCQx0MMbddKbA,4
106404,YkOCo5ipV2he2WXIAlZb-A,nNMUtoyV7_kuP2Bfg-dQZA,5
30996,yp2nRId4v-bDtrYl5A3F-g,o6KHdnXxxClpOgC2CN3RLA,5


#### There are many users that haven't given many reviews, exclude these users from the item-item similarity recommender

**Q**: How do we recommend to these users anyways?

Item-item similarity is to calculate all pairs of items and calculate their similarity. Then we predict the rating of the user by using simliarity and the user rating of the similar item. With ordering the predicted rating descendingly, we recommend the top k items to the user

#### Create utility matrix from records

In [12]:
# using pandas.pivot_table
df_utility = pd.pivot_table(data = df_re,
                           values = 'stars',
                           index = 'user_id',
                           columns = 'business_id',
                           fill_value = 0)

## 2. Item-Item similarity recommender

In [13]:
df_utility.head()

business_id,--9e1ONYQuAa-CB_Rrw7Tw,-1m9o3vGRA8IBPNvNqKLmA,-1vfRrlnNnNJ5boOVghMPA,-3zffZUHoY8bQjGfPSoBKQ,-8R_-EkGpUhBk55K9Dd4mg,-9YyInW1wapzdNZrhQJ9dg,-AD5PiuJHgdUcAK-Vxao2A,-ADtl9bLp8wNqYX1k3KuxA,-Bf8BQ3yMk8U2f45r2DRKw,-BmqghX1sv7sgsxOIS2yAg,...,zp-K5s3pGTWuuaVBWo6WZA,zpoZ6WyQUYff18-z4ZU1mA,zr42_UsWfaIF-rcp37OpwA,zrQ1zKWC-W2PCvwjBururQ,zsQk990PubOHjr1YcLkQFw,zt9RLUIU32fZYOBh2L0NNQ,zttcrQP4MxNS5X5itzStXg,zuwba6QEBIDZT0tJZmNhdQ,zwNC-Ow4eIMan2__bS9-rg,zx_j6OuuHHa2afVoAZuLpA
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--3WaS23LcIXtxyFULJHTA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
--OuPo_FoEnz41-whCm1hQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
--RlSfc-QmcHFGHyX6aVjA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
--XroDUidjD1PcmgahDk2w,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
--_5wWxmGfXcYDQOhWQUlQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
df_utility.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27214 entries, --3WaS23LcIXtxyFULJHTA to zzhB7A-ZQfaJ8KJhAvf_rA
Columns: 3740 entries, --9e1ONYQuAa-CB_Rrw7Tw to zx_j6OuuHHa2afVoAZuLpA
dtypes: int64(3740)
memory usage: 776.7+ MB


In [14]:
df_utility.shape

(27214, 3740)

In [44]:
df_utility.index[1000]

'1QM296OFfiS2HmxxMcL6pg'

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
item_sim_mat = cosine_similarity(df_utility)

In [17]:
type(item_sim_mat)

numpy.ndarray

In [54]:
item_sim_mat

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [57]:
least_to_most_sim_indexes = np.argsort(item_sim_mat, axis=1)

# Neighborhoods
neighborhood_size = 1000
neighborhoods = least_to_most_sim_indexes[:, -neighborhood_size:]

In [20]:
neighborhoods.shape

(27214, 75)

In [50]:
user_id = df_utility.index[10000]

In [21]:
n_users = df_utility.shape[0]
n_items = df_utility.shape[1]

In [22]:
n_users

27214

In [23]:
# items means the businesses
n_items

3740

In [28]:
df_utility.head()

business_id,--9e1ONYQuAa-CB_Rrw7Tw,-1m9o3vGRA8IBPNvNqKLmA,-1vfRrlnNnNJ5boOVghMPA,-3zffZUHoY8bQjGfPSoBKQ,-8R_-EkGpUhBk55K9Dd4mg,-9YyInW1wapzdNZrhQJ9dg,-AD5PiuJHgdUcAK-Vxao2A,-ADtl9bLp8wNqYX1k3KuxA,-Bf8BQ3yMk8U2f45r2DRKw,-BmqghX1sv7sgsxOIS2yAg,...,zp-K5s3pGTWuuaVBWo6WZA,zpoZ6WyQUYff18-z4ZU1mA,zr42_UsWfaIF-rcp37OpwA,zrQ1zKWC-W2PCvwjBururQ,zsQk990PubOHjr1YcLkQFw,zt9RLUIU32fZYOBh2L0NNQ,zttcrQP4MxNS5X5itzStXg,zuwba6QEBIDZT0tJZmNhdQ,zwNC-Ow4eIMan2__bS9-rg,zx_j6OuuHHa2afVoAZuLpA
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--3WaS23LcIXtxyFULJHTA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
--OuPo_FoEnz41-whCm1hQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
--RlSfc-QmcHFGHyX6aVjA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
--XroDUidjD1PcmgahDk2w,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
--_5wWxmGfXcYDQOhWQUlQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [72]:
# start_time = time()
# return the indices of the elements that are non-zero
items_rated_by_this_user = df_utility.loc[user_id].nonzero()[0]

In [73]:
items_rated_by_this_user

array([2571])

In [74]:
type(items_rated_by_this_user[0])

numpy.int64

In [41]:
# Just initializing so we have somewhere to put rating preds
out = np.zeros(n_items)

In [43]:
len(out)

3740

In [71]:
df_utility.loc[user_id, relevant_items]

Series([], Name: MfWCOcyalc-uz7X65RlQhg, dtype: int64)

In [66]:
item_sim_mat[1, 0]

0.0

In [67]:
item_sim_mat[2, 2571]

0.0

In [70]:
for item_to_rate in range(n_items):
    relevant_items = np.intersect1d(neighborhoods[item_to_rate],
                                    items_rated_by_this_user,
                                    assume_unique=True)  # assume_unique speeds up intersection op
#     print(relevant_items)
    out[item_to_rate] = df_utility.loc[user_id, relevant_items] * \
        item_sim_mat[item_to_rate, relevant_items] / \
        item_sim_mat[item_to_rate, relevant_items].sum()


# pred_ratings = np.nan_to_num(out)
# print(pred_ratings)
# print("Execution time: %f seconds" % (time()-start_time))

ValueError: setting an array element with a sequence.

In [59]:
len(relevant_items)

0

### Let's reuse the ItemItemRecommender class derived from previous exercise

Hint: we need to make modification to accommodate the dense numpy array

In [None]:
# To be implemented

pass

## 3. Matrix Factorization recommender

##### Compare two of the methods demoed in Practice Class: sklearn NMF, sklearn TruncatedSVD, or GraphLab
##### *Extra points for using GraphLab

## 4. Other recommenders (optional)

What are other ways you can build a better recommender?

* Other features (have you noticed there are other features in the Yelp dataset, e.g. tips, etc.?)
* Popularity-based
* Content-based
* Hybrid