# Collaborative Filtering for Tabelog (食べログ)

In [8]:
import pandas as pd
import numpy as np

In [9]:
# Read the Tabelog restaurant table from a CSV file; the path is relative to the
# directory the notebook kernel is started in.
data = pd.read_csv('./data/food_category_data/tabelog_tokyo_sushi_data.csv')
# Show the first rows as a quick check of the parsed columns.
data.head()

Unnamed: 0,store_id,store_name,score,area,genre,review_count,daytime_price,daytime_price_low,daytime_price_high,nighttime_price,nighttime_price_low,nighttime_price_high,photo_count,like_count,bookmark_count
0,1,地どり居酒屋　個室宴会　キンクラ 大山店,3.24,東京都内,sushi,46,-,,,"￥3,000～￥3,999",3000,3999,317,11,9
1,2,オステリア ラストリカート,3.45,東京都内,sushi,213,"￥4,000～￥4,999",4000.0,4999.0,"￥6,000～￥7,999",6000,7999,192,5,26
2,3,沖縄料理なんくるないさー ヨドバシAkiba店,3.08,東京都内,sushi,17,～￥999,1.0,999.0,"￥3,000～￥3,999",3000,3999,148,14,28
3,4,PRIME TOKYO 新宿野村ビル49F,3.33,東京都内,sushi,133,-,,,"￥5,000～￥5,999",5000,5999,166,1,7
4,5,魚がし,3.55,東京都内,sushi,140,-,,,"￥10,000～￥14,999",10000,14999,227,5,3


In [10]:
# Feature matrix for similarities and predictions: engagement counts plus the
# low/high bounds of the daytime and nighttime price ranges. Missing values
# are replaced by 0.
X = data.loc[:, [
    'review_count', 'photo_count', 'like_count', 'bookmark_count',
    'daytime_price_low', 'daytime_price_high',
    'nighttime_price_low', 'nighttime_price_high',
]].fillna(value=0)
# Prediction target: the restaurant rating score.
y = data.loc[:, 'score']

# Row-wise mean of each restaurant's non-zero features. Zeros are treated as
# missing values and are excluded from the mean.
#
# The previous version indexed the DataFrame as X[i], which looks up a *column*
# labelled i and raises KeyError because the columns are named by strings. The
# computation is done on the underlying array instead, one row per restaurant.
feature_values = X.to_numpy(dtype=float)
observed = feature_values != 0
observed_counts = observed.sum(axis=1)

# Rows with no non-zero feature have no defined mean; they stay NaN without
# triggering a "mean of empty slice" warning.
user_means = np.divide(
    feature_values.sum(axis=1),
    observed_counts,
    out=np.full(len(X), np.nan),
    where=observed_counts > 0,
)

# Center the observed values on their row mean, keeping zeros as zeros.
centered_ratings = np.where(
    observed, feature_values - user_means[:, np.newaxis], 0.0
)

def cosine_similarity(u1, u2):
    """
    Computes the cosine similarity between two feature vectors.

    Args:
        u1 (array): first feature vector
        u2 (array): second feature vector, same length as u1

    Returns:
        similarity (float): cosine of the angle between u1 and u2, or 0.0 when
            either vector has zero length (the angle is undefined there)
    """
    u1 = np.asarray(u1, dtype=float)
    u2 = np.asarray(u2, dtype=float)

    denominator = np.linalg.norm(u1) * np.linalg.norm(u2)
    # Guard against 0/0: an all-zero vector would otherwise yield NaN, which
    # breaks any later sorting or weighting by similarity.
    if denominator == 0:
        return 0.0
    return np.dot(u1, u2) / denominator

def get_cosine_similarity(df):
    """
    Computes the pairwise cosine similarity between restaurants in a given dataset.

    The matrix is obtained by scaling every row to unit length and taking a
    single matrix product, instead of looping over all pairs in Python.

    Args:
        df (DataFrame or 2d array): restaurant features, one row per restaurant

    Returns:
        similarities (array): 2d similarity matrix of shape
            (n_restaurants, n_restaurants); rows that are entirely zero get a
            similarity of 0 with every restaurant, including themselves
    """
    features = np.asarray(df, dtype=float)

    row_norms = np.linalg.norm(features, axis=1)
    # Divide zero-length rows by 1 so they stay all-zero rather than becoming NaN.
    safe_norms = np.where(row_norms > 0, row_norms, 1.0)
    unit_rows = features / safe_norms[:, np.newaxis]

    return unit_rows @ unit_rows.T

def print_sim_matrix(matrix, n):
    """
    Prints the top-left n x n corner of a similarity matrix as a labelled table.

    Args:
        matrix (array): similarity matrix to be printed
        n (integer): number of restaurants to display
    """
    # Rows and columns share the same 1-based restaurant labels.
    labels = []
    for i in range(n):
        labels.append(f"Restaurant {i+1}")

    corner = matrix[:n, :n]
    table = pd.DataFrame(corner, index=labels, columns=labels)
    print(table)

def predict_rating(item_idx, matrix, target, k):
    """
    Predicts the rating of a given restaurant, based on k similar restaurants.

    The prediction is the similarity-weighted average of the ratings of the k
    most similar other restaurants.

    Args:
        item_idx (int): index of the target restaurant
        matrix (array): similarity matrix
        target (Series or array): target feature to be predicted, one value per
            restaurant, in the same order as the rows of matrix
        k (integer): number of neighbors to consider; must be at least 1

    Returns:
        prediction (float): predicted rating. Falls back to the plain mean of
            the neighbors' ratings when their similarities sum to zero, and is
            NaN when there is no neighbor with a defined similarity.

    Raises:
        ValueError: if k is smaller than 1
    """
    # A slice of [-0:] would silently select *every* neighbor, so reject k < 1.
    if k < 1:
        raise ValueError("k must be a positive integer")

    sims = np.asarray(matrix[item_idx], dtype=float)
    # Work on a positional array so both Series and plain arrays are accepted.
    ratings = np.asarray(target, dtype=float)

    # Drop the restaurant itself, and any neighbor whose similarity is NaN
    # (argsort would otherwise rank NaN entries as the "most similar").
    keep = (np.arange(len(sims)) != item_idx) & ~np.isnan(sims)
    other_sims = sims[keep]
    other_ratings = ratings[keep]
    if other_sims.size == 0:
        return np.nan

    top_k_idx = np.argsort(other_sims)[-k:]
    top_k_sims = other_sims[top_k_idx]
    top_k_ratings = other_ratings[top_k_idx]

    weight_total = np.sum(top_k_sims)
    # With no usable weights the weighted average is 0/0; use the unweighted mean.
    if np.isclose(weight_total, 0.0):
        return np.mean(top_k_ratings)
    return np.sum(top_k_sims * top_k_ratings) / weight_total

def get_rating_preds(matrix, target, k, n=5):
    """
    Prints one predicted rating per line for the first n restaurants.

    Args:
        matrix (array): similarity matrix
        target (array): series of target feature to be predicted
        k (integer): number of neighbors to consider
        n (integer): number of restaurants to display
    """
    # Lazy generator: each prediction is computed right before it is printed.
    predictions = (predict_rating(idx, matrix, target, k) for idx in range(n))
    for i, pred in enumerate(predictions):
        print(f"Restaurant {i+1}: {pred:.2f}")
        
def recommended(user_input, df, n, catalog=None):
    """
    Find n restaurants most similar to given input array of features.

    Args:
        user_input (array): array of features user is interested in, in the
            same order as the columns of df
        df (DataFrame or 2d array): features to use for similarities, one row
            per restaurant
        n (integer): number of restaurants to recommend
        catalog (DataFrame, optional): frame the recommended rows are taken
            from, aligned row-for-row with df. When omitted, the notebook-level
            `data` frame is used, as before.

    Returns:
        recommendations (DataFrame): the n rows of the catalog with the highest
            cosine similarity to user_input, most similar first
    """
    features = np.asarray(df, dtype=float)
    query = np.asarray(user_input, dtype=float)

    # Cosine similarity of every restaurant with the query in one pass.
    dot_products = features @ query
    denominators = np.linalg.norm(features, axis=1) * np.linalg.norm(query)
    # Zero-length vectors have no defined angle; score them 0 instead of NaN
    # so the ranking below stays well defined.
    sims = np.divide(
        dot_products,
        denominators,
        out=np.zeros_like(dot_products),
        where=denominators > 0,
    )

    # Stable descending sort: ties keep their original row order.
    top_n = np.argsort(-sims, kind="stable")[:n]

    source = data if catalog is None else catalog
    return source.iloc[top_n]

In [11]:
#sim_matrix = get_cosine_similarity(X)
#print_sim_matrix(sim_matrix, n=5)
#print('\n')
#get_rating_preds(sim_matrix, y, n=5, k=3)

In [12]:
# Desired restaurant profile entered by the user, in the column order of X:
#   review_count, photo_count, like_count, bookmark_count,
#   daytime_price_low, daytime_price_high,
#   nighttime_price_low, nighttime_price_high
fake_review = np.array(
    [0] * 4               # no preference on the four engagement counts
    + [10000, 20000] * 2  # 10,000-20,000 price range for daytime and nighttime
)

# Look up the 5 restaurants most similar to the requested profile.
rec = recommended(user_input=fake_review, df=X, n=5)
print(rec)

     store_id            store_name  score  area  genre  review_count  \
10         11      CAFE A LA TIENNE   3.38  東京都内  sushi           165   
28         29     大衆酒場 もつ焼き 次世代 渋谷店   3.18  東京都内  sushi            61   
106       107  串焼き。ビストロガブリ 新宿西口ハルク店   3.45  東京都内  sushi           243   
47         48              鳥ごころ 新宿店   3.02  東京都内  sushi            15   
172       173   個室居酒屋 茨城の恵み 水戸屋 品川店   3.12  東京都内  sushi            72   

     daytime_price  daytime_price_low  daytime_price_high nighttime_price  \
10   ￥1,000～￥1,999             1000.0              1999.0   ￥1,000～￥1,999   
28   ￥2,000～￥2,999             2000.0              2999.0   ￥2,000～￥2,999   
106  ￥2,000～￥2,999             2000.0              2999.0   ￥2,000～￥2,999   
47   ￥3,000～￥3,999             3000.0              3999.0   ￥3,000～￥3,999   
172  ￥3,000～￥3,999             3000.0              3999.0   ￥3,000～￥3,999   

     nighttime_price_low  nighttime_price_high  photo_count  like_count  \
10                  100