In [1]:
import pandas as pd
import os 
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
plt.style.use("ggplot")
os.getcwd()
os.chdir('/Users/caden/Desktop/yelp_project/data')

In [2]:
df = pd.read_csv('last_2_year_restaurant_reviews_Las_Vegas.csv')

# 1. Clean data and get rating data

In [3]:
data = df[['business_id', 'user_id', 'stars']].copy()
# exclude the users that haven't give much ratings
clean_user = data['user_id'].value_counts().index
mask_ = data['user_id'].value_counts()>20
clean_user = clean_user[mask_].values
used_data = data[data['user_id'].apply(lambda x: x in clean_user)]

### Define the reference tables from string to the categorical number

In [4]:
user_cat = used_data.user_id.copy().astype('category').cat.codes
user = used_data[['user_id']].copy()
user['cat'] = user_cat
user = user.drop_duplicates(subset = ['cat']).set_index('cat')

business_cat = used_data.business_id.copy().astype('category').cat.codes
business = used_data[['business_id']].copy()
business['cat'] = business_cat
business = business.drop_duplicates(subset = ['cat']).set_index('cat')


In [14]:
used_data = used_data.copy().astype('category').apply(lambda x: x.cat.codes)

# 2. Create utility matrix 

In [16]:
from scipy import sparse
user_scaling = len(used_data.user_id.value_counts())
business_scaling = len(used_data.business_id.value_counts())
rate_matrix = sparse.lil_matrix((user_scaling, business_scaling))
for _, row in used_data.iterrows():
    rate_matrix[row.user_id, row.business_id] = row.stars

In [17]:
rate_matrix

<4790x18011 sparse matrix of type '<class 'numpy.float64'>'
	with 186564 stored elements in LInked List format>

# 3. Item-Item similarity recommender

In [43]:
from sklearn.metrics.pairwise import cosine_similarity
item_sim_mat = cosine_similarity(rate_matrix.T)
similarity_index = np.argsort(item_sim_mat, axis =1)
# choose the 75 most similarity for each business_id
neighborhood = similarity_index[:, -75:]

### Make prediction for the user unrated items

In [104]:
import numpy as np
np.seterr(divide='ignore', invalid='ignore')
def recommend(user_id, rate_matrix, n):
    n_users = rate_matrix.shape[0]
    n_items = rate_matrix.shape[1]
    rated_by_this_user = rate_matrix[user_id].nonzero()[1]
    out = np.zeros(n_items)
    for to_rate in range(n_items):
        # find the intersection items that both in the rated(by this user)list and most similar list
        #if to_rate not in rated_by_this_user:
        relevant_items = np.intersect1d(neighborhood[to_rate],
                                       rated_by_this_user,
                                       assume_unique = True)
        out[to_rate] = rate_matrix[user_id, relevant_items]*item_sim_mat[to_rate, relevant_items]/item_sim_mat[to_rate, relevant_items].sum()
    pre_ratings = np.nan_to_num(out)
    item_sort = list(np.argsort(pred))[::-1]
    unrated_items = [item for item in item_sort if item not in rated_by_this_user]
    return business.loc[unrated_items[:n]]

In [110]:
# the prediction of rates for user:105
recommend(105, rate_matrix, 20)

Unnamed: 0_level_0,business_id
cat,Unnamed: 1_level_1
9005,V18iSIgNurCBmowYm34byw
5926,KEDm72wwss4YFXHhHdsHdQ
14838,omaLPw0gmu8FacEY4EL1XA
14837,omMiNDhPYAX4TPfv4xrBLQ
6077,Knm1LOVKvw6YC0wR1-J92A
6079,KnsAsott2VaBu9sGyTS9kg
6089,Kq2P3SX1pe0RPYM44fd2ZQ
6094,KqxoM5zKgqlgoTjHrBVYHw
6105,Kts2VexuwNq07a-gdpGExQ
14817,oiHwpkJXt0NW3S2UTa0Fdg


# 4. Matrix Factorization recommender

### Build NMF recommender

In [105]:
from sklearn.decomposition import NMF
def fit_nmf(M, K):
    nmf = NMF(n_components = K)
    nmf.fit(M)
    W = nmf.transform(M)
    H = nmf.components_
    err = nmf.reconstruction_err_
    return W, H, err

def MF(fit_nmf, user_id, rate_matrix, n):
    W, H, err = fit_nmf(rate_matrix, 100)
    fitted_matrix = W.dot(H)
    pred_ratings = fitted_matrix[user_id, :]
    sorted_list = list(np.argsort(pred_ratings))[::-1]
    rated_by_this_user = rate_matrix[user_id].nonzero()[1]
    unrated_items = [item for item in sorted_list if item not in rated_by_this_user]
    return business.loc[unrated_items[:n]]


In [108]:
MF(fit_nmf, 105, rate_matrix, 20)

Unnamed: 0_level_0,business_id
cat,Unnamed: 1_level_1
15127,po0p6NIro0cDrmKkcyPy0w
13281,jBh399TajGcH28Zo2J1pHw
4518,FFVAGNz3fQ7Mdsupm5uzMw
13564,kABF0hYfAEnl166mn1zR1A
1934,5xdKUuu4DbJ71KGbwRXfFg
40,-95mbLJsa0CxXhpaNL4LvA
8086,RixnVQV24bc3p92mSntaVA
13216,ixAh9crILnJ9tM8LhWFhkw
4552,FMnH5WIRLyexBYMBUUnDxQ
2451,7sPNbCx7vGAaH7SbNPZ6oA


### Build UVD recommender

In [109]:
from sklearn.decomposition import TruncatedSVD
def fit_uvd(M, K):
    svd = TruncatedSVD(n_components = K, n_iter = 10, random_state = 0)
    svd.fit(M)
    V = svd.components_
    U = svd.transform(M)
    return U, V, svd
MF(fit_uvd, 105, rate_matrix, 20)

Unnamed: 0_level_0,business_id
cat,Unnamed: 1_level_1
15862,sNVGdeOPeitJ3OWUQBINzQ
15127,po0p6NIro0cDrmKkcyPy0w
40,-95mbLJsa0CxXhpaNL4LvA
6530,MQD3EvzB7C8zol6kBxBrSg
9434,WRQ-9LluyivReFiQZFUujw
5618,JAmQCmczUclNUfZjkNdjQA
1934,5xdKUuu4DbJ71KGbwRXfFg
8086,RixnVQV24bc3p92mSntaVA
12410,g83WbX_recywc4DEIZ-xug
4518,FFVAGNz3fQ7Mdsupm5uzMw


In [121]:
k = 0
for i in MF(fit_nmf, 256, rate_matrix, 20).index:
    if i in MF(fit_uvd, 256, rate_matrix, 20).index:
#         if i in recommend(10, rate_matrix, 20):
        k += 1
print('The number of the same recommends of NMF and UVD are %s' %k)

The number of the same recommends of NMF and UVD are 8
