In [27]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [129]:
def select_records(yelp_dohmh_property):
    df = yelp_dohmh_property.sort_values(by = 'GRADE DATE')
    df = df.drop_duplicates(subset ='Name', keep = 'last').reset_index()
    return df

def get_cos_similarity(rest, other_rest):
    """
    compute cosine similarity between a resturant profile with another resturant profiles
    rest, other_rest: array
    """
    dot_product = np.dot(rest, other_rest)
    denominator = np.linalg.norm(rest)* np.linalg.norm(other_rest)
    return dot_product/denominator 


def compute_cos_all(rest_id, all_rest):
    """
    compute cosine similarities between a resturant profile with all other resturant profiles,
    return cosine similarity score.
    rest_id : id of a resturant, number 
    all_rest: array of normalized resturant profiles
    """
    other_rest= np.delete(all_rest, rest_id, 0)
    other_rest_score = []
    for i in range(len(other_rest)):
        cos_score = get_cos_similarity(all_rest[rest_id], other_rest[i])
        other_rest_score.append(cos_score)
    other_rest_score = np.array(other_rest_score)
    return other_rest_score


def normalize(df):
    min_max_scaler = preprocessing.MinMaxScaler()
    minmax_df = min_max_scaler.fit_transform(df)
    return minmax_df
            

def find_topk_restaurants(rest_id, rest_df, topk = 5):
    """ 
    find top k similary restaurants of rest_id according to cosines similarity.
    """
    rest_features = rest_df[['SCORE','CUISINE DESCRIPTION','AVERAGE PRICE', 'review_count', 'rating', 'price']]
    rest_scores = compute_cos_all(rest_id, normalize(np.array(pd.get_dummies(rest_features))))
    
    topk_rest_scores = np.sort(rest_scores)[(len(rest_scores)-topk): len(rest_scores)][::-1]
    rest_indexes = np.argsort(rest_scores)[(len(rest_scores)-topk): len(rest_scores)][::-1]
    
    topk_rest_names = []
    topk_rest_cuisine = []
    topk_rest_address = []
    for i in rest_indexes:
        topk_rest_names.append(rest_df['Name'][i])
        topk_rest_cuisine.append(rest_df['CUISINE DESCRIPTION'][i])
        topk_rest_address.append(rest_df['address'][i])
    return pd.DataFrame({'Resturant Name':topk_rest_names, 
                         'Cuisine Type':topk_rest_cuisine, 
                         'Address': topk_rest_address,
                         'Similarity Score':topk_rest_scores})

In [131]:
yelp_dohmh_property = pd.read_csv('yelp_dohmh_property.csv', index_col=[0])
resturants = select_records(yelp_dohmh_property)
find_topk_restaurants(0, resturants)

Unnamed: 0,Resturant Name,Cuisine Type,Address,Similarity Score
0,OKI JAPANESE RESTAURANT,Japanese,4511 Queens Blvd,0.999892
1,COURT ORDER,Delicatessen,52 Court St,0.999681
2,PALACE CAFE,Jewish/Kosher,2603 Nostrand Ave,0.99967
3,KAHVE,Café/Coffee/Tea,786 9th Ave,0.999669
4,CO BA RESTAURANT,Vietnamese/Cambodian/Malaysia,110 9th Ave,0.999612
