### Importing 3rd party libraries

In [90]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import numpy as np
import scipy as sp
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import shapely
import reverse_geocoder as rg
import sklearn as sk
import matplotlib as mpl
import matplotlib.pylab as plt
import matplotlib.font_manager as fm
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
sns.set(rc={'figure.figsize':(13.7,10.27)})
sns.set_style("whitegrid")
sns.set_color_codes()

### Importing Dask related libraries

In [2]:
from dask.distributed import Client
import dask.bag as db
import dask.dataframe as dd
import dask.array as da
import dask
from ast import literal_eval
from collections.abc import MutableMapping
from collections import Counter
import io
import os

### Importing surprise related libraries

In [3]:
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise.model_selection.split import train_test_split
from surprise.model_selection import cross_validate, GridSearchCV
import pandas as pd
import numpy as np
import os, io
from surprise import KNNBasic, KNNWithMeans
from surprise import SVDpp
from surprise import SVD
from surprise import accuracy

### Import Sklearn libraries

In [4]:
from rake_nltk import Rake
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
Client()

0,1
Client  Scheduler: tcp://127.0.0.1:57385  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 8.59 GB


### Loading and Exploring preprocessed dataframe

In [6]:
joined_df = dd.read_csv('data/joined_df.csv').compute()

In [7]:
joined_df.columns

Index(['Unnamed: 0', 'rating', 'reviewerName', 'categories', 'gPlusPlaceId',
       'gPlusUserId', 'user_lat', 'user_long', 'placeName', 'price', 'address',
       'place_lat', 'place_long'],
      dtype='object')

In [8]:
joined_df = joined_df.loc[:, ~joined_df.columns.str.match('Unnamed')]

In [9]:
joined_df.describe()

Unnamed: 0,rating,user_lat,user_long,place_lat,place_long
count,4102647.0,1019908.0,1019908.0,4102647.0,4102647.0
mean,3.963454,35.43534,-42.30559,35.37519,-39.44279
std,1.130273,18.46955,73.40014,17.88234,73.71885
min,0.0,-90.0,-176.4769,-54.84173,-180.0
25%,3.0,33.05811,-95.71289,32.71132,-93.09486
50%,4.0,39.75509,-75.16379,39.74303,-74.01006
75%,5.0,45.30182,6.123373,44.98173,7.1348
max,5.0,90.0,179.0,76.53802,180.0


In [10]:
joined_df 

Unnamed: 0,rating,reviewerName,categories,gPlusPlaceId,gPlusUserId,user_lat,user_long,placeName,price,address,place_lat,place_long
0,4.0,william spindler,"['Asian Restaurant', 'Chinese Restaurant']",106591714648856494903,100000032416892623125,,,Peking Chinese Restaurant,$$$,"['860 Main St', 'Red Bluff, CA 96080']",40.179159,-122.236162
1,5.0,william spindler,"['European Restaurant', 'Italian Restaurant', ...",109420033090810328045,100000032416892623125,,,Firehouse Pizza,$$,"['734 Main St', 'Red Bluff, CA 96080']",40.178074,-122.235234
2,5.0,william spindler,['Barbecue Restaurant'],111623070919810985923,100000032416892623125,,,Two Buds Barbeque,,"['22825 Antelope Blvd', 'Red Bluff, CA 96080']",40.184955,-122.186557
3,4.0,william spindler,['Restaurant'],113854191152597312098,100000032416892623125,,,Bud's Jolly Kone,,"['455 Antelope Blvd', 'Red Bluff, CA 96080']",40.185961,-122.209742
4,5.0,william spindler,['Mexican Restaurant'],115827996910815192564,100000032416892623125,,,La Corona,$$,"['914 Walnut St', 'Red Bluff, CA 96080']",40.175064,-122.242574
...,...,...,...,...,...,...,...,...,...,...,...,...
196648,2.0,charles mckinney,"['Hamburger Restaurant', 'Fast Food Restaurant']",116458473784504954830,118446742455312620560,,,Fatburger,,"['6780 Cherry Ave', 'Long Beach, CA 90805']",33.878048,-118.168365
196649,2.0,charles mckinney,"['Mexican Restaurant', 'Latin American Restaur...",117332598175065149705,118446742455312620560,,,Super Mex,,"['5660 Atlantic Ave', 'Long Beach, CA 90805']",33.859250,-118.184753
196650,4.0,charles mckinney,"['Hot Dog Restaurant', 'Takeout Restaurant', '...",117868066122653879601,118446742455312620560,,,Wienerschnitzel,$$$,"['1300 E Rosecrans Ave', 'Compton, CA 90221']",33.903287,-118.209676
196651,1.0,charles mckinney,"['Buffet Restaurant', 'American Restaurant']",117952004983617019485,118446742455312620560,,,HomeTown Buffet,$$,"['3102 E Imperial Hwy', 'Lynwood, CA 90262']",33.930282,-118.216058


In [88]:
def places_filter(df, lat_l, lat_h, lon_l, lon_h):
    """Utility place filter function to filter us cities and states
       from rest of the world
       
       Input:
       df (Pandas DataFrame): input dataframe with location cordinates
       lat_l (Float): Source location latitude
       lat_h (Float): Destination location latitude
       lon_l (Float): Source location longitude
       lon_h (Float): Destination location longitude
       
       return (List): List of filtered placed as per the cordinates"""
    filtered_places = []
    for d in df.itertuples(index=False):
        if ((d.place_lat >= lat_l) \
            & (d.place_lat <= lat_h) \
            & (d.place_long >= lon_l) \
            & (d.place_long <= lon_h)):
            filtered_places.append(d)
    return filtered_places

5:36: E502 the backslash is redundant between brackets
6:38: E502 the backslash is redundant between brackets
7:39: E502 the backslash is redundant between brackets
8:5: E129 visually indented line with same indent as next logical line
11:1: W391 blank line at end of file


### Filtering orginal dataframe places to USA

In [12]:
%%time
usa_df = places_filter(joined_df, 19.50139, 64.85694, -161.75583, -68.01197)

CPU times: user 13.8 s, sys: 490 ms, total: 14.3 s
Wall time: 14.3 s


In [13]:
usa_df = pd.DataFrame(usa_df)

In [14]:
usa_df

Unnamed: 0,rating,reviewerName,categories,gPlusPlaceId,gPlusUserId,user_lat,user_long,placeName,price,address,place_lat,place_long
0,4.0,william spindler,"['Asian Restaurant', 'Chinese Restaurant']",106591714648856494903,100000032416892623125,,,Peking Chinese Restaurant,$$$,"['860 Main St', 'Red Bluff, CA 96080']",40.179159,-122.236162
1,5.0,william spindler,"['European Restaurant', 'Italian Restaurant', ...",109420033090810328045,100000032416892623125,,,Firehouse Pizza,$$,"['734 Main St', 'Red Bluff, CA 96080']",40.178074,-122.235234
2,5.0,william spindler,['Barbecue Restaurant'],111623070919810985923,100000032416892623125,,,Two Buds Barbeque,,"['22825 Antelope Blvd', 'Red Bluff, CA 96080']",40.184955,-122.186557
3,4.0,william spindler,['Restaurant'],113854191152597312098,100000032416892623125,,,Bud's Jolly Kone,,"['455 Antelope Blvd', 'Red Bluff, CA 96080']",40.185961,-122.209742
4,5.0,william spindler,['Mexican Restaurant'],115827996910815192564,100000032416892623125,,,La Corona,$$,"['914 Walnut St', 'Red Bluff, CA 96080']",40.175064,-122.242574
...,...,...,...,...,...,...,...,...,...,...,...,...
2285752,2.0,charles mckinney,"['Hamburger Restaurant', 'Fast Food Restaurant']",116458473784504954830,118446742455312620560,,,Fatburger,,"['6780 Cherry Ave', 'Long Beach, CA 90805']",33.878048,-118.168365
2285753,2.0,charles mckinney,"['Mexican Restaurant', 'Latin American Restaur...",117332598175065149705,118446742455312620560,,,Super Mex,,"['5660 Atlantic Ave', 'Long Beach, CA 90805']",33.859250,-118.184753
2285754,4.0,charles mckinney,"['Hot Dog Restaurant', 'Takeout Restaurant', '...",117868066122653879601,118446742455312620560,,,Wienerschnitzel,$$$,"['1300 E Rosecrans Ave', 'Compton, CA 90221']",33.903287,-118.209676
2285755,1.0,charles mckinney,"['Buffet Restaurant', 'American Restaurant']",117952004983617019485,118446742455312620560,,,HomeTown Buffet,$$,"['3102 E Imperial Hwy', 'Lynwood, CA 90262']",33.930282,-118.216058


# Starting Collaborative filtering Recommender System

### Reading ratings from the dataframe

In [15]:
reader = Reader(rating_scale=(1,5))  #invoke reader instance of surprise library
data=Dataset.load_from_df(usa_df[['gPlusUserId','gPlusPlaceId','rating']], reader)

### Creating holdout set

In [16]:
#create training set
trainingSet, testSet = train_test_split(data, test_size=0.2, train_size=None, random_state=None, shuffle=True)

### Training the model using surprise library - SVD algorithm

In [17]:
%%time
algo=SVD(n_factors=10,reg_all=0.01)
algo.fit(trainingSet)
predictions_svd=algo.test(testSet)

CPU times: user 1min 1s, sys: 1.8 s, total: 1min 3s
Wall time: 1min 2s


### Validating rating predictions using RMSE

In [18]:
accuracy.rmse(predictions_svd,verbose=True)

RMSE: 1.1024


1.102446670754025

In [19]:
usa_df.head(10)

Unnamed: 0,rating,reviewerName,categories,gPlusPlaceId,gPlusUserId,user_lat,user_long,placeName,price,address,place_lat,place_long
0,4.0,william spindler,"['Asian Restaurant', 'Chinese Restaurant']",106591714648856494903,100000032416892623125,,,Peking Chinese Restaurant,$$$,"['860 Main St', 'Red Bluff, CA 96080']",40.179159,-122.236162
1,5.0,william spindler,"['European Restaurant', 'Italian Restaurant', ...",109420033090810328045,100000032416892623125,,,Firehouse Pizza,$$,"['734 Main St', 'Red Bluff, CA 96080']",40.178074,-122.235234
2,5.0,william spindler,['Barbecue Restaurant'],111623070919810985923,100000032416892623125,,,Two Buds Barbeque,,"['22825 Antelope Blvd', 'Red Bluff, CA 96080']",40.184955,-122.186557
3,4.0,william spindler,['Restaurant'],113854191152597312098,100000032416892623125,,,Bud's Jolly Kone,,"['455 Antelope Blvd', 'Red Bluff, CA 96080']",40.185961,-122.209742
4,5.0,william spindler,['Mexican Restaurant'],115827996910815192564,100000032416892623125,,,La Corona,$$,"['914 Walnut St', 'Red Bluff, CA 96080']",40.175064,-122.242574
5,5.0,william spindler,['Chinese Restaurant'],116585428624152564242,100000032416892623125,,,China Doll Chinese Restaurant,,"['182 S Main St', 'Red Bluff, CA 96080']",40.170748,-122.228931
6,2.0,Richard Yocom,['Restaurant'],100073820849130920147,100000053212755369563,,,Hardee's / Red Burrito,$$$,"['134 N Hills St', 'Meridian, MS 39305']",32.413658,-88.677648
7,2.0,Richard Yocom,"['Pizza Restaurant', 'European Restaurant']",102333498482915416504,100000053212755369563,,,Pizza Hut,$$$,"['2199 S Byron Butler Pkwy', 'Perry, FL 32348']",30.09469,-83.580572
8,1.0,Richard Yocom,"['Pizza Restaurant', 'European Restaurant']",102471437282277965376,100000053212755369563,,,Hungry Howie's,$$$,"['15028 US Highway 19 S', 'Thomasville, GA 317...",30.80096,-83.935407
9,2.0,Richard Yocom,"['Restaurant', 'American Restaurant']",103519165841762621376,100000053212755369563,,,Ryan's,$$,"['207 S Frontage Rd', 'Meridian, MS 39301']",32.362456,-88.676926


In [20]:
algo.predict('100000053212755369563','106591714648856494903')

Prediction(uid='100000053212755369563', iid='106591714648856494903', r_ui=None, est=3.2218719939490996, details={'was_impossible': False})

### Train complete dataset



In [91]:
%%time
trainset = data.build_full_trainset()

CPU times: user 9.73 s, sys: 8.29 s, total: 18 s
Wall time: 20.4 s


In [22]:
%%time
algo.fit(trainset)

CPU times: user 1min, sys: 753 ms, total: 1min 1s
Wall time: 1min 1s


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x164618150>

In [25]:
from collections import defaultdict

def getRecommendations(userID='100000053212755369563', topN=3):
    """Wrapper function to use the already trained SVD algorithm
       and get top recommendations"""
    top_recs = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions_svd: 
            predicted_value = algo.predict(userID,iid)
            top_recs[iid].append([predicted_value.est])
    output_dict = {}
    for iid, item_ratings in sorted(top_recs.items(), key=lambda item: item[1][0], reverse=True):
        output_dict[iid]=item_ratings
    return output_dict 

In [26]:
%%time
getRecommendations()

CPU times: user 8.99 s, sys: 1.72 s, total: 10.7 s
Wall time: 9.39 s


{'111192969447456401878': [[4.232824489225299],
  [4.232824489225299],
  [4.232824489225299],
  [4.232824489225299],
  [4.232824489225299],
  [4.232824489225299],
  [4.232824489225299],
  [4.232824489225299],
  [4.232824489225299],
  [4.232824489225299],
  [4.232824489225299],
  [4.232824489225299],
  [4.232824489225299],
  [4.232824489225299],
  [4.232824489225299],
  [4.232824489225299],
  [4.232824489225299],
  [4.232824489225299],
  [4.232824489225299],
  [4.232824489225299],
  [4.232824489225299],
  [4.232824489225299],
  [4.232824489225299],
  [4.232824489225299],
  [4.232824489225299],
  [4.232824489225299],
  [4.232824489225299],
  [4.232824489225299],
  [4.232824489225299],
  [4.232824489225299],
  [4.232824489225299],
  [4.232824489225299]],
 '112435049692764936122': [[4.208751842096858],
  [4.208751842096858],
  [4.208751842096858],
  [4.208751842096858],
  [4.208751842096858],
  [4.208751842096858],
  [4.208751842096858],
  [4.208751842096858],
  [4.208751842096858]],
 '101

### Helper function to calculate distance between two cordinates

In [27]:
from math import sin, cos, sqrt, atan2, radians
def calculate_distance(lat1, lon1, lat2, lon2):
    """Helper function to calculate distance between two cordinates"""
    # approximate radius of earth in km
    R = 6373.0

    lat1_r = radians(lat1)
    lon1_r = radians(lon1)
    lat2_r = radians(lat1)
    lon2_r = radians(lon2)

    dlon = lon2_r - lon1_r
    dlat = lat2_r - lat1_r

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = R * c

    return distance

In [28]:
def getRecommendations_from_df(df, userID='100000053212755369563', topN=3, lat=None, lon=None, distance_limit=100):
    top_recs = defaultdict(list)
    output_dict = {}
    dis = None
    for row in df.itertuples():
        iid = row.gPlusPlaceId
        predicted_value = algo.predict(userID, iid)
        
        if lat and lon:
            try:
                dis = calculate_distance(row.place_lat, row.place_long, lat, lon)
            except Exception as e:
                  pass
        if dis <= distance_limit:
            top_recs[iid].append({'PlaceName':row.placeName, 'Distance':dis, 'Category':row.categories, 'Prediction':predicted_value.est})
     
    for iid, item_ratings in sorted(top_recs.items(), key=lambda item: item[1][0].get('Prediction'), reverse=True):
        output_dict[iid]=item_ratings
     
    return output_dict 

In [92]:
%%time
recs= getRecommendations_from_df(usa_df, userID='118446742455312620560', lat=40.179159, lon=-122.236162, distance_limit= 0.5)

CPU times: user 41.8 s, sys: 5.89 s, total: 47.7 s
Wall time: 43.9 s


In [30]:
usa_df

Unnamed: 0,rating,reviewerName,categories,gPlusPlaceId,gPlusUserId,user_lat,user_long,placeName,price,address,place_lat,place_long
0,4.0,william spindler,"['Asian Restaurant', 'Chinese Restaurant']",106591714648856494903,100000032416892623125,,,Peking Chinese Restaurant,$$$,"['860 Main St', 'Red Bluff, CA 96080']",40.179159,-122.236162
1,5.0,william spindler,"['European Restaurant', 'Italian Restaurant', ...",109420033090810328045,100000032416892623125,,,Firehouse Pizza,$$,"['734 Main St', 'Red Bluff, CA 96080']",40.178074,-122.235234
2,5.0,william spindler,['Barbecue Restaurant'],111623070919810985923,100000032416892623125,,,Two Buds Barbeque,,"['22825 Antelope Blvd', 'Red Bluff, CA 96080']",40.184955,-122.186557
3,4.0,william spindler,['Restaurant'],113854191152597312098,100000032416892623125,,,Bud's Jolly Kone,,"['455 Antelope Blvd', 'Red Bluff, CA 96080']",40.185961,-122.209742
4,5.0,william spindler,['Mexican Restaurant'],115827996910815192564,100000032416892623125,,,La Corona,$$,"['914 Walnut St', 'Red Bluff, CA 96080']",40.175064,-122.242574
...,...,...,...,...,...,...,...,...,...,...,...,...
2285752,2.0,charles mckinney,"['Hamburger Restaurant', 'Fast Food Restaurant']",116458473784504954830,118446742455312620560,,,Fatburger,,"['6780 Cherry Ave', 'Long Beach, CA 90805']",33.878048,-118.168365
2285753,2.0,charles mckinney,"['Mexican Restaurant', 'Latin American Restaur...",117332598175065149705,118446742455312620560,,,Super Mex,,"['5660 Atlantic Ave', 'Long Beach, CA 90805']",33.859250,-118.184753
2285754,4.0,charles mckinney,"['Hot Dog Restaurant', 'Takeout Restaurant', '...",117868066122653879601,118446742455312620560,,,Wienerschnitzel,$$$,"['1300 E Rosecrans Ave', 'Compton, CA 90221']",33.903287,-118.209676
2285755,1.0,charles mckinney,"['Buffet Restaurant', 'American Restaurant']",117952004983617019485,118446742455312620560,,,HomeTown Buffet,$$,"['3102 E Imperial Hwy', 'Lynwood, CA 90262']",33.930282,-118.216058


### Starting Content based recommendation system

In [31]:
# initializing the new column
df_cont_based = usa_df.copy(deep=True)

In [32]:
#Function to create bag of words and determine scores for each words
def score_keywords(row):
#     for index, row in df_cont_based.iterrows():
    category = eval(row.categories)
    category_str = ' '.join(category)
    # instantiating Rake, by default it uses english stopwords from NLTK
    # and discards all puntuation characters as well
    r = Rake()

    # extracting the words by passing the text
    r.extract_keywords_from_text(category_str)

    # getting the dictionary whith key words as keys and their scores as values
    key_words_dict_scores = r.get_word_degrees()
    # assigning the key words to the new column for the corresponding movie
    return ' '.join(list(key_words_dict_scores.keys()))

In [33]:
%%time
df_cont_based['bag_of_words'] = df_cont_based.apply(score_keywords, axis=1)

CPU times: user 14min 32s, sys: 2min 16s, total: 16min 49s
Wall time: 2h 12min 1s


In [34]:
df_cont_based

Unnamed: 0,rating,reviewerName,categories,gPlusPlaceId,gPlusUserId,user_lat,user_long,placeName,price,address,place_lat,place_long,bag_of_words
0,4.0,william spindler,"['Asian Restaurant', 'Chinese Restaurant']",106591714648856494903,100000032416892623125,,,Peking Chinese Restaurant,$$$,"['860 Main St', 'Red Bluff, CA 96080']",40.179159,-122.236162,asian restaurant chinese
1,5.0,william spindler,"['European Restaurant', 'Italian Restaurant', ...",109420033090810328045,100000032416892623125,,,Firehouse Pizza,$$,"['734 Main St', 'Red Bluff, CA 96080']",40.178074,-122.235234,european restaurant italian pizza
2,5.0,william spindler,['Barbecue Restaurant'],111623070919810985923,100000032416892623125,,,Two Buds Barbeque,,"['22825 Antelope Blvd', 'Red Bluff, CA 96080']",40.184955,-122.186557,barbecue restaurant
3,4.0,william spindler,['Restaurant'],113854191152597312098,100000032416892623125,,,Bud's Jolly Kone,,"['455 Antelope Blvd', 'Red Bluff, CA 96080']",40.185961,-122.209742,restaurant
4,5.0,william spindler,['Mexican Restaurant'],115827996910815192564,100000032416892623125,,,La Corona,$$,"['914 Walnut St', 'Red Bluff, CA 96080']",40.175064,-122.242574,mexican restaurant
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2285752,2.0,charles mckinney,"['Hamburger Restaurant', 'Fast Food Restaurant']",116458473784504954830,118446742455312620560,,,Fatburger,,"['6780 Cherry Ave', 'Long Beach, CA 90805']",33.878048,-118.168365,hamburger restaurant fast food
2285753,2.0,charles mckinney,"['Mexican Restaurant', 'Latin American Restaur...",117332598175065149705,118446742455312620560,,,Super Mex,,"['5660 Atlantic Ave', 'Long Beach, CA 90805']",33.859250,-118.184753,mexican restaurant latin american
2285754,4.0,charles mckinney,"['Hot Dog Restaurant', 'Takeout Restaurant', '...",117868066122653879601,118446742455312620560,,,Wienerschnitzel,$$$,"['1300 E Rosecrans Ave', 'Compton, CA 90221']",33.903287,-118.209676,hot dog restaurant takeout fast food
2285755,1.0,charles mckinney,"['Buffet Restaurant', 'American Restaurant']",117952004983617019485,118446742455312620560,,,HomeTown Buffet,$$,"['3102 E Imperial Hwy', 'Lynwood, CA 90262']",33.930282,-118.216058,buffet restaurant american


In [99]:
indices = pd.Series(df_cont_based.index)

In [100]:
%%time
# instantiating and generating the count matrix
count = CountVectorizer()
count_matrix = count.fit_transform(df_cont_based['bag_of_words'])

CPU times: user 14.1 s, sys: 514 ms, total: 14.6 s
Wall time: 14.7 s


### Recommendation based on Content only

### Based on dynamic cosine sim calculation

In [101]:
#  defining the function that takes in places 
# as input and returns the top 10 recommendations only based on content
def recommendations_content(gPlusPlaceId, df, indices=indices):
    # initializing the empty list of recommended places
    recommended_places = []
    # gettin the index of the places that matches the placeID
    idx = df.loc[df['gPlusPlaceId']==str(gPlusPlaceId)]
    idx = idx.index.values[0]
    
    #Calculate cosine similarity
    cosine_sim = cosine_similarity(count_matrix[idx], count_matrix)
    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_sim[0]).sort_values(ascending = False)
    
    # getting the indexes of the 10 most similar movies
    top_10_scores = score_series.iloc[1:11]
    
    for score_i, score_v in top_10_scores.items():
        place_name = df['placeName'].iloc[score_i]
        place_category = df['categories'].iloc[score_i]
        recommended_places.append({'Place':place_name, 
                                   'Catgory':place_category,
                                   'Similarity': score_v})
    
    return recommended_places

In [164]:
recommended_places_content_based = recommendations_content(111623070919810985923, df_cont_based, indices=indices)

In [165]:
recommended_places_content_based

[{'Place': 'Slows Bar-B-Q',
  'Catgory': "['Barbecue Restaurant']",
  'Similarity': 0.9999999999999998},
 {'Place': "Callear's R & R Bar B Que Restaurant",
  'Catgory': "['Barbecue Restaurant']",
  'Similarity': 0.9999999999999998},
 {'Place': 'Cozy Corner Restaurant',
  'Catgory': "['Barbecue Restaurant']",
  'Similarity': 0.9999999999999998},
 {'Place': 'Kickin Wings',
  'Catgory': "['Restaurant', 'Barbecue Restaurant']",
  'Similarity': 0.9999999999999998},
 {'Place': '98 Bar-B-Que',
  'Catgory': "['Barbecue Restaurant']",
  'Similarity': 0.9999999999999998},
 {'Place': "Bludso's BBQ",
  'Catgory': "['Restaurant', 'Barbecue Restaurant']",
  'Similarity': 0.9999999999999998},
 {'Place': 'Chuck Wagon BBQ',
  'Catgory': "['Barbecue Restaurant']",
  'Similarity': 0.9999999999999998},
 {'Place': 'Blue Tractor BBQ & Brewery',
  'Catgory': "['Barbecue Restaurant']",
  'Similarity': 0.9999999999999998},
 {'Place': 'Hong Kong BBQ & Dimsum',
  'Catgory': "['Barbecue Restaurant']",
  'Similari

### Helper method to find n closest places based on content+distance

In [158]:
# This function will filter the top recommendations and get the top suggestions
# bases on distance
def get_n_closest_places(score_series, 
                         df, 
                         src_lat, 
                         src_lon, 
                         distance_limit=10,
                         desired_similarity=0.5):
    # initializing the empty list of recommended places
    recommended_places = []
    counter = 0 # UpTo 10  for 10 recommendations to optimize the loop
    for score_i, score_v in score_series.items():
        try:
            dest_lat = df['place_lat'].iloc[score_i]
            dest_lon = df['place_long'].iloc[score_i]
            dis = calculate_distance(src_lat, src_lon, dest_lat, dest_lon) 
            if dis<=distance_limit and dis>0 and score_v>=desired_similarity:
                place_name = df['placeName'].iloc[score_i]
                place_category = df['categories'].iloc[score_i]
                recommended_places.append({'Place':place_name, 
                                           'Catgory':place_category,
                                           'Distance(In km)': dis,
                                          'Similarity': score_v})
                counter+=1
            
            if counter == 10:
                break
        except Exception as e:
            pass
        
    return recommended_places
        
    

### Recommendation based on Content and Distance

In [159]:
#  defining the function that takes in places 
# as input and returns the top 10 recommendations based on content and distance
def recommendations_content_distance(df, 
                                     gPlusPlaceId=109420033090810328045,
                                     indices=indices, 
                                     distance_limit=10,
                                     desired_similarity=0.5):
        
    #Calculating cordinates for input location
    input_place_location = df[['place_lat', 'place_long']].loc[df['gPlusPlaceId']==str(gPlusPlaceId)]
    input_place_lat = input_place_location['place_lat'].iloc[0]
    input_place_long = input_place_location['place_long'].iloc[0]
    
    # gettin the index of the places that matches the placeID
    idx = df.loc[df['gPlusPlaceId']==str(gPlusPlaceId)]
    idx = idx.index.values[0]
    
    #Calculate cosine similarity
    cosine_sim = cosine_similarity(count_matrix[idx], count_matrix)
    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_sim[0]).sort_values(ascending = False)
    # Filtering the similar placees based on distance to find 
    # top recommendations
    recommended_places = get_n_closest_places(score_series, 
                                              df, 
                                              input_place_lat, 
                                              input_place_long, 
                                              distance_limit=distance_limit,
                                              desired_similarity=desired_similarity)
         
    return recommended_places

In [172]:
# creating a Series for the categories so they are associated to an ordered numerical
# list I will use in the function to match the indexes
places_rec = recommendations_content_distance(df_cont_based, 
                                              gPlusPlaceId=109420033090810328045, 
                                              distance_limit=100,
                                              desired_similarity=0.8)

In [173]:
places_rec

[{'Place': 'Serious Pie',
  'Catgory': "['Pizza Restaurant', 'European Restaurant', 'Italian Restaurant']",
  'Distance(In km)': 9.767244807522811,
  'Similarity': 1.0},
 {'Place': "Campana's Italian Restaurant",
  'Catgory': "['Italian Restaurant', 'European Restaurant', 'Pizza Restaurant']",
  'Distance(In km)': 37.60876448729283,
  'Similarity': 1.0},
 {'Place': "Tommy's Pizza",
  'Catgory': "['Pizza Restaurant', 'European Restaurant', 'Italian Restaurant']",
  'Distance(In km)': 36.43096281239843,
  'Similarity': 1.0},
 {'Place': 'Zeeks Pizza',
  'Catgory': "['Pizza Restaurant', 'European Restaurant', 'Italian Restaurant']",
  'Distance(In km)': 6.3341311990192946,
  'Similarity': 1.0},
 {'Place': "Spiro's Pizza & Pasta",
  'Catgory': "['Pizza Restaurant', 'European Restaurant', 'Italian Restaurant']",
  'Distance(In km)': 42.04623853450258,
  'Similarity': 1.0},
 {'Place': 'Westside Pizza',
  'Catgory': "['European Restaurant', 'Italian Restaurant', 'Pizza Restaurant']",
  'Distan

### Evaluating content based model for user 100000032416892623125

### Exploring places visited by the user 100000032416892623125

In [169]:
df_cont_based[df_cont_based['gPlusUserId']=='100000032416892623125']

Unnamed: 0,rating,reviewerName,categories,gPlusPlaceId,gPlusUserId,user_lat,user_long,placeName,price,address,place_lat,place_long,bag_of_words
0,4.0,william spindler,"['Asian Restaurant', 'Chinese Restaurant']",106591714648856494903,100000032416892623125,,,Peking Chinese Restaurant,$$$,"['860 Main St', 'Red Bluff, CA 96080']",40.179159,-122.236162,asian restaurant chinese
1,5.0,william spindler,"['European Restaurant', 'Italian Restaurant', ...",109420033090810328045,100000032416892623125,,,Firehouse Pizza,$$,"['734 Main St', 'Red Bluff, CA 96080']",40.178074,-122.235234,european restaurant italian pizza
2,5.0,william spindler,['Barbecue Restaurant'],111623070919810985923,100000032416892623125,,,Two Buds Barbeque,,"['22825 Antelope Blvd', 'Red Bluff, CA 96080']",40.184955,-122.186557,barbecue restaurant
3,4.0,william spindler,['Restaurant'],113854191152597312098,100000032416892623125,,,Bud's Jolly Kone,,"['455 Antelope Blvd', 'Red Bluff, CA 96080']",40.185961,-122.209742,restaurant
4,5.0,william spindler,['Mexican Restaurant'],115827996910815192564,100000032416892623125,,,La Corona,$$,"['914 Walnut St', 'Red Bluff, CA 96080']",40.175064,-122.242574,mexican restaurant
5,5.0,william spindler,['Chinese Restaurant'],116585428624152564242,100000032416892623125,,,China Doll Chinese Restaurant,,"['182 S Main St', 'Red Bluff, CA 96080']",40.170748,-122.228931,chinese restaurant


### Offline evaluation of Recommender system

- Recommmended places to user 100000032416892623125 based on place(106591714648856494903 - Chinese restaurant) the user visited/reviewed
- Content based recommendation to 106591714648856494903 - Chinese restaurant within 5km and similarity greater than 0.8

In [174]:
places_rec = recommendations_content_distance(df_cont_based, 
                                              gPlusPlaceId=106591714648856494903, 
                                              distance_limit=5,
                                              desired_similarity=0.8)
places_rec

[{'Place': 'China Doll Restaurant',
  'Catgory': "['Asian Restaurant', 'Chinese Restaurant']",
  'Distance(In km)': 2.791367229541537,
  'Similarity': 1.0000000000000002},
 {'Place': 'Golden China Buffet',
  'Catgory': "['Chinese Restaurant', 'Asian Restaurant']",
  'Distance(In km)': 1.3429198553986461,
  'Similarity': 1.0000000000000002},
 {'Place': 'Colby Teriyaki & Wok',
  'Catgory': "['Asian Restaurant', 'Chinese Restaurant']",
  'Distance(In km)': 2.1942138662869652,
  'Similarity': 1.0000000000000002},
 {'Place': 'Beijing Garden Chinese',
  'Catgory': "['Asian Restaurant', 'Chinese Restaurant']",
  'Distance(In km)': 2.043718445403108,
  'Similarity': 1.0000000000000002},
 {'Place': 'Goodworld Chinese Restaurant',
  'Catgory': "['Chinese Restaurant', 'Asian Restaurant']",
  'Distance(In km)': 2.34663297016549,
  'Similarity': 1.0000000000000002},
 {'Place': 'Hunan Wok',
  'Catgory': "['Chinese Restaurant', 'Asian Restaurant']",
  'Distance(In km)': 4.386800995237523,
  'Similari

In this application we are not trying to predict the user's preference but trying to recommend the places which user will be interested to visit. For example, when a user visited an Asian restaurant(106591714648856494903) then we are trying to recommend similar places close to the places which the user might not know.

From the above example we see user (100000032416892623125) has visited two Asian/Chinese restaurants. For the sake of evaluation we only pass the placeID of the first restaurant and consider the second place as hidden and expect the recommender system to recommend the second place.

'China Doll Restaurant' is the second place which user (100000032416892623125)  visited and is also recommended by the application with a similarity of 1

{'Place': 'China Doll Restaurant',
  'Catgory': "['Asian Restaurant', 'Chinese Restaurant']",
  'Distance(In km)': 2.791367229541537,
  'Similarity': 1.0000000000000002},

In an offline evaluation of usage prediction we have four possible outcomes 
![image.png](attachment:image.png)

#### True positive (Recommended and Used)

In [None]:
{'Place': 'China Doll Restaurant',
  'Catgory': "['Asian Restaurant', 'Chinese Restaurant']",
  'Distance(In km)': 2.791367229541537,
  'Similarity': 1.0000000000000002}

#### False positive (Recommended but not used)
In the offline case we are forced to assume that unused items would have not used even if they had been recommended, this assumption may be false because the user might not be aware of these places and about its existance. But after the recommendation system exposed the places the user can decide to select it. Therefore in this case the number of False positives is over estimated

In [None]:
 {'Place': 'Golden China Buffet',
  'Catgory': "['Chinese Restaurant', 'Asian Restaurant']",
  'Distance(In km)': 1.3429198553986461,
  'Similarity': 1.0000000000000002}
 {'Place': 'Colby Teriyaki & Wok',
  'Catgory': "['Asian Restaurant', 'Chinese Restaurant']",
  'Distance(In km)': 2.1942138662869652,
  'Similarity': 1.0000000000000002},
 {'Place': 'Beijing Garden Chinese',
  'Catgory': "['Asian Restaurant', 'Chinese Restaurant']",
  'Distance(In km)': 2.043718445403108,
  'Similarity': 1.0000000000000002},
 {'Place': 'Goodworld Chinese Restaurant',
  'Catgory': "['Chinese Restaurant', 'Asian Restaurant']",
  'Distance(In km)': 2.34663297016549,
  'Similarity': 1.0000000000000002},
 {'Place': 'Hunan Wok',
  'Catgory': "['Chinese Restaurant', 'Asian Restaurant']",
  'Distance(In km)': 4.386800995237523,
  'Similarity': 1.0000000000000002},
 {'Place': 'Beijing Garden Chinese',
  'Catgory': "['Asian Restaurant', 'Chinese Restaurant']",
  'Distance(In km)': 2.043718445403108,
  'Similarity': 1.0000000000000002},
 {'Place': 'Fortune Inn',
  'Catgory': "['Asian Restaurant', 'Chinese Restaurant']",
  'Distance(In km)': 1.1433965039594702,
  'Similarity': 1.0000000000000002},
 {'Place': 'Happy Lake 1 Teriyaki Wok',
  'Catgory': "['Asian Restaurant', 'Chinese Restaurant']",
  'Distance(In km)': 2.3830457012800306,
  'Similarity': 1.0000000000000002},
 {'Place': 'Tasters Wok',
  'Catgory': "['Chinese Restaurant', 'Asian Restaurant']",
  'Distance(In km)': 4.199171869557294,
  'Similarity': 1.0000000000000002}]

We can count the number of examples that fall into each cell and compute the following:
![image-2.png](attachment:image-2.png)

#### Precision = 1/(1+9) = 0.1

#### Recall = 1/(1+0) = 1

#### False Positive Rate = 9/(9+0) = 1