# Yelp Data Challenge - Restaurant Recommender

BitTiger DS501

Nov 2017

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
% matplotlib inline
plt.style.use("ggplot")

In [2]:
df = pd.read_csv('dataset/last_2_years_restaurant_reviews.csv')

In [3]:
df.head()

Unnamed: 0,business_id,name,categories,avg_stars,cool,date,funny,review_id,stars,text,useful,user_id
0,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"['Steakhouses', 'Cajun/Creole', 'Restaurants']",4.0,1,2016-05-17,0,0Qc1THNHSapDL7cv-ZzW5g,5,What can I say.. Wowzers! Probably one of the ...,0,4LxKRRIikhr65GfPDW626w
1,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"['Steakhouses', 'Cajun/Creole', 'Restaurants']",4.0,0,2017-01-20,0,L8lo5SKXfZRlbn1bpPiC9w,5,Went here for guys weekend. Unbelievable. Ravi...,0,nT8zgjoc-PbdBoQsFEXFLw
2,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"['Steakhouses', 'Cajun/Creole', 'Restaurants']",4.0,52,2016-09-25,30,6eUT3IwwWPP3CZkAhxqOIw,5,"One word my friends: tableside!!! Yes, tablesi...",56,7RlyCglsIzhBn081inwvcg
3,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"['Steakhouses', 'Cajun/Creole', 'Restaurants']",4.0,1,2017-02-12,0,3cnTdE45VrsS0o4cVhfGog,3,"Located inside my favorite hotel Venetian, Del...",1,rOIrilMC7VFwFVBeQNiKMw
4,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"['Steakhouses', 'Cajun/Creole', 'Restaurants']",4.0,0,2016-10-30,0,tYrSbjX3QgZGBZuQ3n8g6w,5,"After the most incredible service, delicious m...",2,PiWlV_UC_-SXqyxQM9fAtw


## 1. Clean data and get rating data 

#### Select relevant columns in the original dataframe

In [4]:
# Get business_id, user_id, stars for recommender
df_utility = pd.pivot_table(data=df, 
                            values='stars', 
                            index='user_id', 
                            columns='business_id', 
                            fill_value=0)

df_utility.info()

<class 'pandas.core.frame.DataFrame'>
Index: 136735 entries, ---udAKDsn0yQXmzbWQNSw to zzvqVZTYs5VKxPc-IkRQ4A
Columns: 4268 entries, --9e1ONYQuAa-CB_Rrw7Tw to zwNC-Ow4eIMan2__bS9-rg
dtypes: int64(4268)
memory usage: 4.3+ GB


#### There are many users that haven't given many reviews, exclude these users from the item-item similarity recommender

**Q**: How do we recommend to these users anyways?

**A**: We can use item-item based collaborative filtering. Business as item, user as user. So we need to calculate the similarity between business first and then recommend. Or we can use U-V decomposition to build recommender

#### Create utility matrix from records

In [14]:
num_users = df.user_id.value_counts()
num_business = df.business_id.value_counts()
#print(num_users,num_business)
stars_mat = sparse.lil_matrix((num_users.shape[0], num_business.shape[0]))
stars_mat

<136735x4268 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in LInked List format>

In [17]:
df.set_index('user_id', inplace = True)

## 2. Item-Item similarity recommender

### Let's reuse the ItemItemRecommender class derived from previous exercise

Hint: we need to make modification to accommodate the dense numpy array

In [5]:
df.set_index('user_id', inplace = True)
utility_mat = df_utility.as_matrix()
utility_mat.shape

(136735, 4268)

In [7]:
# Item-Item Similarity Matrix
item_sim_mat = cosine_similarity(utility_mat.T)

least_to_most_sim_indexes = np.argsort(item_sim_mat, axis=1)

# Neighborhoods
neighborhood_size = 75
neighborhoods = least_to_most_sim_indexes[:, -neighborhood_size:]

In [8]:
neighborhoods.shape
#

(4268, 75)

In [38]:
# Let's pick a lucky user
user_id = 3

In [39]:
from time import time
n_users = utility_mat.shape[0]
n_items = utility_mat.shape[1]

start_time = time()
items_rated_by_this_user = utility_mat[user_id].nonzero()[0]
# Just initializing so we have somewhere to put rating preds
out = np.zeros(n_items)
for item_to_rate in range(n_items):
    relevant_items = np.intersect1d(neighborhoods[item_to_rate],
                                    items_rated_by_this_user,
                                    assume_unique=True)  # assume_unique speeds up intersection op
    #print(neighborhoods[item_to_rate])
    #print(items_rated_by_this_user)
    if len(relevant_items) != 0:
        #print(relevant_items)
        #print(utility_mat[user_id, relevant_items], '*', item_sim_mat[item_to_rate, relevant_items])
        out[item_to_rate] = sum(utility_mat[user_id, relevant_items] * \
            item_sim_mat[item_to_rate, relevant_items]) / \
            item_sim_mat[item_to_rate, relevant_items].sum()
    else:
        out[item_to_rate] = np.nan


print(out[~np.isnan(out)])
pred_ratings = np.nan_to_num(out)
print (pred_ratings)
print("Execution time: %f seconds" % (time()-start_time))

[ 3.          4.          3.          4.          3.          3.          4.
  3.          3.          3.55785356  3.          4.          3.          3.
  4.          3.47318803  3.          3.          4.          4.          3.
  3.          3.          4.          4.          4.          3.          3.
  3.          3.          3.          3.          3.          4.          4.
  3.          3.          4.          3.          3.          4.          4.
  4.          3.          4.          3.          3.          4.          3.
  3.          3.          4.          3.          3.          4.          3.
  3.          3.          3.          3.          3.          3.          4.
  3.          3.          4.          3.          3.          4.          3.
  3.          3.          4.          3.          3.49174105  3.          3.
  4.          3.          3.          3.          4.          4.          3.
  3.          3.          4.          3.          3.          3.          4.

In [46]:
# Recommend n movies
n = 10

# Get item indexes sorted by predicted rating
item_index_sorted_by_pred_rating = list(np.argsort(pred_ratings))

# Find items that have been rated by user
items_rated_by_this_user = utility_mat[user_id].nonzero()[0]

# We want to exclude the items that have been rated by user
unrated_items_by_pred_rating_it = [item for item in item_index_sorted_by_pred_rating
                                if item not in items_rated_by_this_user]

unrated_items_by_pred_rating_it[-n:]

[44, 3604, 1264, 29, 918, 2372, 2893, 3476, 484, 3908]

In [51]:
recommend_business_id = df_utility.columns[unrated_items_by_pred_rating_it[-n:]].values
for business in recommend_business_id:
    print("recommend business name: ", df.loc[df.business_id == business].name.values[0])

recommend business name:  Beer Park
recommend business name:  Leticia's Mexican Cocina
recommend business name:  Squeeze In
recommend business name:  Jaburritos
recommend business name:  Yong Kang Street
recommend business name:  Estiatorio Milos
recommend business name:  Off The Strip at the LINQ
recommend business name:  Gallagher's Steakhouse
recommend business name:  Lobster ME
recommend business name:  Pin-Up Pizza


## 3. Matrix Factorization recommender

Take a look at Graphlab Create examples

In [40]:
from sklearn.decomposition import TruncatedSVD

def fit_uvd(M,k):
    # use TruncatedSVD to realize UVD
    svd = TruncatedSVD(n_components=k, n_iter=7, random_state=0)
    svd.fit(M)

    V = svd.components_
    U = svd.transform(M) # effectively, it's doing: U = M.dot(V.T)
    # we can ignore svd.singular_values_ for our purpose
    
    # why we can do this?
    # recall: 
    # SVD start from u*s*v=M => u*s=M*v.T, where M*v.T is our transformation above to get U in UVD
    # so the above U is effectively u*s in SVD
    # that's why U*V = u*s*v = M our original matrix
    # there are many ways to understand it!
    # here we by-passed singular values.
    
    return U,V

# decompose
U,V = fit_uvd(utility_mat,200)

# reconstruct
ratings_mat_fitted = U.dot(V) # U*V

# calculate errs
errs = np.array((utility_mat-ratings_mat_fitted).flatten()).squeeze()
mask = np.array(utility_mat.flatten()).squeeze()>0

mse = np.mean(errs[mask]**2)
average_abs_err = abs(errs[mask]).mean()
print (mse)
print (average_abs_err)

9.89994527306
2.46531960038


In [52]:
# get recommendations for one user
user_id = 3
n = 10

pred_ratings = ratings_mat_fitted[user_id,:]
item_index_sorted_by_pred_rating = list(np.argsort(pred_ratings))[::-1]

items_rated_by_this_user = utility_mat[user_id].nonzero()[0]

unrated_items_by_pred_rating_uv = [item for item in item_index_sorted_by_pred_rating
                                if item not in items_rated_by_this_user]

unrated_items_by_pred_rating_uv[:n]

[3908, 484, 3476, 2893, 2372, 918, 29, 1264, 3604, 44]

In [53]:
recommend_business_id = df_utility.columns[unrated_items_by_pred_rating_uv[-n:]].values
for business in recommend_business_id:
    print("recommend business name: ", df.loc[df.business_id == business].name.values[0])

recommend business name:  Culinary Dropout
recommend business name:  Shake Shack
recommend business name:  CUT by Wolfgang Puck
recommend business name:  Nine Fine Irishmen
recommend business name:  BabyStacks Cafe
recommend business name:  Dog Haus
recommend business name:  Rí Rá Irish Pub
recommend business name:  The Bootlegger Italian Bistro
recommend business name:  Scarpetta
recommend business name:  Delmonico Steakhouse


## 4. Other recommenders (optional)

What are other ways you can build a better recommender?

* Other features (have you noticed there are other features in the Yelp dataset, e.g. tips, etc.?)
* Popularity-based
* Content-based
* Hybrid

## Note
> We can change the neighborhood_size to 200 or even larger to improve the item-item based recommender

> We can increase the latent factor from 200 to 500 to improve the U-V decomposition recommender