In [1]:
import pandas as pd
import numpy as np
import scipy.sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import warnings
# Silence pandas' SettingWithCopyWarning for the rest of the notebook.
# `category` must be a Warning subclass: a string is not validated by
# `simplefilter`, and it makes the filter machinery raise TypeError as soon as
# any warning is matched against it. The class is looked up defensively because
# not every pandas release exposes it under `pd.errors`.
_setting_with_copy_warning = getattr(pd.errors, "SettingWithCopyWarning", None)
if _setting_with_copy_warning is not None:
    warnings.simplefilter(action='ignore', category=_setting_with_copy_warning)

# Load only the first 40,000 rows of the business table.
df = pd.read_csv("yelp_academic_dataset_business.csv", nrows=40000)

In [2]:
# Preview the first five rows (5 is the default for `head`).
df.head()

Unnamed: 0.1,Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


In [3]:
# Keep only the columns the recommender needs; `.copy()` detaches the result
# from `df` so the assignments below never write through to the raw frame.
content_data = df[['business_id', 'name', 'categories', 'attributes', 'stars', 'review_count']].copy()

# Fill missing values BEFORE casting to str. Casting first turns NaN into the
# literal string 'nan', which `fillna` can no longer see and which would then
# be indexed as a real token. Assigning the result back (instead of
# `inplace=True` on a column selection) avoids chained assignment.
content_data['categories'] = content_data['categories'].fillna('').astype(str)
content_data['attributes'] = content_data['attributes'].fillna('').astype(str)

# Build the free-text document for each business: name + categories + attributes.
# The name is normalised only inside this expression so that the 'name' column
# itself is left exactly as loaded (it is used for look-ups by name).
content_data['content'] = (
    content_data['name'].fillna('').astype(str)
    + ' ' + content_data['categories']
    + ' ' + content_data['attributes']
)

In [4]:
# Turn each business's 'content' text into a TF-IDF vector, dropping English
# stop words.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(content_data['content'])

# Pairwise similarity of every business with every other one. With a single
# argument `linear_kernel` compares the matrix against itself; TF-IDF rows are
# L2-normalised by default, so this dot product is the cosine similarity.
# NOTE(review): the result is a dense n x n array -- with all 40,000 rows loaded
# that is on the order of 12-13 GB of float64; confirm this fits in memory.
cosine_sim = linear_kernel(tfidf_matrix)

In [5]:
def get_content_recommendations(name, cosine_sim=None, content_data=None, threshold=0.1, top_n=10):
    """Return the businesses most similar to the one called ``name``.

    Parameters
    ----------
    name : str
        Exact business name to look up in ``content_data['name']``. If several
        rows share the name, the first one is used as the query.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix whose row/column order matches the row order
        of ``content_data``. Defaults to the module-level ``cosine_sim``,
        resolved when the function is called.
    content_data : pandas.DataFrame, optional
        Frame with at least the columns 'name', 'review_count' and 'stars'.
        Defaults to the module-level ``content_data``, resolved at call time.
    threshold : float, default 0.1
        Only businesses with a similarity strictly greater than this are kept.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or list
        The 'name', 'review_count' and 'stars' columns of up to ``top_n``
        matches, ordered by decreasing similarity (ties keep row order).
        If no business has the given name, a message is printed and an empty
        list is returned.
    """
    # Resolve the defaults at call time so that re-running the cell that builds
    # the similarity matrix is picked up without redefining this function.
    if cosine_sim is None:
        cosine_sim = globals()["cosine_sim"]
    if content_data is None:
        content_data = globals()["content_data"]

    # Row *positions* of the matching businesses. The similarity matrix is
    # positional, so index labels must not be used to address it.
    matches = np.flatnonzero((content_data['name'] == name).to_numpy())
    if matches.size == 0:
        print(f"No restuarents found with name '{name}'")
        return []

    idx = int(matches[0])
    scores = np.asarray(cosine_sim[idx]).ravel()

    # Keep everything above the threshold, explicitly excluding the query row.
    # Dropping "the first sorted entry" instead is wrong whenever another
    # business ties with the query's own score and sorts ahead of it.
    keep = scores > threshold
    keep[idx] = False
    candidates = np.flatnonzero(keep)

    # Stable sort on the negated scores: highest similarity first, ties in
    # ascending row order.
    order = np.argsort(-scores[candidates], kind="stable")
    top_positions = candidates[order[:max(int(top_n), 0)]]

    return content_data.iloc[top_positions][['name', 'review_count', 'stars']]

In [6]:
# Recommend businesses similar to "Pho Voorhees", keeping only matches whose
# similarity score exceeds 0.5.
restaurent_name = "Pho Voorhees"
content_recommendations = get_content_recommendations(name=restaurent_name, threshold=0.5)

# Header line followed by the recommendations on the next line.
print(f"Content-Based Recommendations for {restaurent_name}:", content_recommendations, sep="\n")

Content-Based Recommendations for Pho Voorhees:
               name  review_count  stars
13071    Pho Street           150    4.0
34814         Pho 9           107    4.0
22341  Pho & Beyond           367    4.0
38700         Pho 1           128    4.0
555      Pho Bistro           184    3.0
17253      Pho City            97    4.0
4339      Asian Pho           194    3.5
31705   Street Side           137    4.5
18653           Pho           353    3.5
38158        Pho Le           110    4.0


In [7]:
# Recommend businesses similar to "Tuna Bar", keeping only matches whose
# similarity score exceeds 0.5.
restaurent_name = "Tuna Bar"
content_recommendations = get_content_recommendations(name=restaurent_name, threshold=0.5)

# Header line followed by the recommendations on the next line.
print(f"Content-Based Recommendations for {restaurent_name}:", content_recommendations, sep="\n")

Content-Based Recommendations for Tuna Bar:
                           name  review_count  stars
20070        O Sushi Restaurant           199    4.5
151                     J Sushi            72    4.5
2012                  Sushi Bar           346    4.0
12954            Mizu Sushi Bar           235    3.5
18217               Sushi House           106    3.0
34054                   Sushi 7           287    3.5
21654                Sushi Cafe           291    4.5
22565  Sake Sushi Bar and Grill           113    4.5
32323          Wasabi Sushi Bar           138    3.5
10390          Wasabi Sushi Bar           101    3.0


In [8]:
# Recommend businesses similar to "The Waterwheel", keeping only matches whose
# similarity score exceeds 0.5.
restaurent_name = "The Waterwheel"
content_recommendations = get_content_recommendations(name=restaurent_name, threshold=0.5)

# Header line followed by the recommendations on the next line.
print(f"Content-Based Recommendations for {restaurent_name}:", content_recommendations, sep="\n")

Content-Based Recommendations for The Waterwheel:
                   name  review_count  stars
38983           R Beach            48    4.0
20701          Marathon            61    2.5
25389              Fish           153    4.0
19306             Fresh            66    3.5
10618         Burger Up           857    4.0
11738              Sage           110    3.5
32754   T.G.I. Friday's             9    3.0
28981          Z Grille           238    3.5
24953    Etc Restaurant           170    4.5
21026  Warehouse Grille           271    3.5
