# Building the Recommendation System

The purpose of this workbook is to develop a prediction model for any given user.

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
ratings_matrix = pd.read_pickle('./Dataset/ratings_matrix_new.pkl')

In [4]:
all_business = pd.read_pickle('./Dataset/business.pkl')
unique_business = pd.read_pickle('./Dataset/unique_business.pkl')
unique_user = pd.read_pickle('./Dataset/unique_user.pkl')

In [7]:
all_business.head()

Unnamed: 0,business_id,name,address,city,postal_code,latitude,longitude,stars,review_count
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,19107,39.955505,-75.155564,4.0,80
1,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,205 Race St,Philadelphia,19106,39.953949,-75.143226,4.0,245
2,ROeacJQwBeh05Rqg7F6TCg,BAP,1224 South St,Philadelphia,19147,39.943223,-75.162568,4.5,205
3,aPNXGTDkf-4bjhyMBQxqpQ,Craft Hall,901 N Delaware Ave,Philadelphia,19123,39.962582,-75.135657,3.5,65
4,ppFCk9aQkM338Rgwpl2F5A,Wawa,3604 Chestnut St,Philadelphia,19104,39.954573,-75.194894,3.0,56


In [8]:
biz_sort = all_business.sort_values('stars', ascending = False)
biz_sort.head(10)

Unnamed: 0,business_id,name,address,city,postal_code,latitude,longitude,stars,review_count
749,hxPnlWZmirx7neooZykmtg,Sutton's,1706 N 5th St,Philadelphia,19122,39.977159,-75.143673,5.0,74
996,UMHuKs1sO-wq3XqKaejXeA,Miss Rachel's Pantry,1938 S Chadwick St,Philadelphia,19145,39.926515,-75.173776,5.0,119
1436,TozkPEh-xhts3qfVeRFeRg,Bad Brother,726 N 24th St,Philadelphia,19130,39.968629,-75.176888,5.0,92
1498,Yz0fJyBkUF8VZBvwFswkRQ,Liberty Kitchen,1244 N Front St,Philadelphia,19122,39.970158,-75.136091,5.0,56
487,fq1yCVBgBB7s6V-D68NO1g,Cafe Mi Quang,3324 Kensington Ave,Philadelphia,19134,39.998429,-75.110254,5.0,69
1363,Ktg3ahIxk0JIkJwXAqu2ew,Hikari Sushi,"1040 N American St, Ste 701",Philadelphia,19123,39.967402,-75.141153,5.0,155
297,kDzp5FXnuG3Pwk6orhfl8A,Circles + Squares,2513 Tulip St,Philadelphia,19125,39.981163,-75.123262,5.0,103
808,TE2IEDNV0RcI6s1wTOP4fg,Tortilleria San Roman,951 S 9th St,Philadelphia,19147,39.937636,-75.158082,5.0,219
308,Pg2ZKh-Ss7CCpaF8MwNWYw,Antonio's Deli,1014 Federal St,Philadelphia,19147,39.934514,-75.161045,5.0,58
612,dUrtFfHBz15Q4GiyDFAqUw,New Ridge Brewing,6168 Ridge Ave,Philadelphia,19128,40.034931,-75.217059,5.0,54


In [9]:
# Review the ratings matrix
ratings_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1695,1696,1697,1698,1699,1700,1701,1702,1703,1704
0,5.0,,,,,,,,,,...,,,,,,,,,,
1,2.0,,,,,,,,,,...,,,,,,,,,,
2,5.0,,,,3.0,,,,,,...,,,,,,,,,,
3,5.0,,,,,,,,,,...,,,,,,,,,,
4,5.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168671,,,,,,,,,,,...,,,,,,,,,,
168672,,,,,,,,,,,...,,,,,,,,,,
168673,,,,,,,,,,,...,,,,,,,,,,
168674,,,,,,,,,,,...,,,,,,,,,,


Same as the fuction we made in "Build Rating Matrix"⬇️

In [10]:
def find_user_similarity(userA, userB, ratings_matrix):
    # Create a True/False list of businesses that were given a rating for each of the two users
    businesses_rated_by_userA = ~ratings_matrix.loc[userA, :].isna()
    businesses_rated_by_userB = ~ratings_matrix.loc[userB, :].isna()

    # Consolidate the two boolean lists into a single one
    businesses_rated_by_AB = businesses_rated_by_userA & businesses_rated_by_userB

    # Capture the rating values of both users for those businesses that were rated by both users
    # Also transform these values into a format suitable for the cosine_similarity function
    ratings_of_userA = ratings_matrix.loc[userA, businesses_rated_by_AB].values.reshape(1, -1)
    ratings_of_userB = ratings_matrix.loc[userB, businesses_rated_by_AB].values.reshape(1, -1)

    # Capture the similaritiy between the two users by comparing their ratings for the set of businesses that they have both provided a rating for
    similarity = cosine_similarity(ratings_of_userA, ratings_of_userB)[0][0]

    return similarity

In [11]:
'''
Create a function to calculate the user-item rating prediction based on cosine similarity, with the following two parameters:
 target_business = business_id value for business for whom rating is being predicted for
 target_user = user_id value for the user for whom rating is being predicted for
'''
def user_item_rating_prediction(target_user, target_business, ratings_matrix):

    similarities_to_target_user = []
    ratings_given_to_target_business = []

    # Create a list of all users that have provided a rating for the target business
    list_of_users_rating_target_business = list(ratings_matrix[~ratings_matrix.iloc[:, target_business].isna()].index)

    # Loop over every user in our target ratings matrix
    # Refer to each user as the 'other_user' since we know that our target user did not provide a rating for our target business and hence is not in this smaller data frame
    for other_user in list_of_users_rating_target_business:
        # To compensate for the value error that may occur when the two users we are comparing have 0 businesses that they have both rated together
        try:
            # Capture the cosine similarity between our target user and the current user from the list of user we are looping over
            similarity = find_user_similarity(target_user, other_user, ratings_matrix)
            similarities_to_target_user.append(similarity)
            ratings_given_to_target_business.append(ratings_matrix.loc[other_user, target_business])
        # If a value error is generated, we simply pass over to the next loop
        # Since we will not be appending no values to neither our list of similarities and list of ratings, we will not be impacting our final calculation
        except:
            pass

    # Calculate the weighted average of all ratings (for those users that have at least 1 business that they have rated together)
    return np.dot(ratings_given_to_target_business, similarities_to_target_user)/np.sum(similarities_to_target_user)

'\nCreate a function to calculate the user-item rating prediction based on cosine similarity, with the following two parameters:\n target_business = business_id value for business for whom rating is being predicted for\n target_user = user_id value for the user for whom rating is being predicted for\n'

The basic logic is the same as above

In [12]:
def find_business_similarity(businessA, businessB, ratings_matrix):

    users_who_rated_businessA = ~ratings_matrix.loc[:, businessA].isna()
    users_who_rated_businessB = ~ratings_matrix.loc[:, businessB].isna()
    
    users_who_rated_by_AB = users_who_rated_businessA & users_who_rated_businessB
    
    ratings_of_businessA = ratings_matrix.loc[users_who_rated_by_AB, businessA].values.reshape(1, -1)
    ratings_of_businessB = ratings_matrix.loc[users_who_rated_by_AB, businessB].values.reshape(1, -1)
    
    similarity = cosine_similarity(ratings_of_businessA, ratings_of_businessB)[0][0]
    
    return similarity

In [13]:
'''
Create a function to calculate the user-item rating prediction based on cosine similarity, with the following two parameters:
 target_business = business_id value for business for whom rating is being predicted for
 target_user = user_id value for the user for whom rating is being predicted for
'''
def item_item_rating_prediction(target_user, target_business, ratings_matrix):
   
    similarities_to_target_business = []
    ratings_given_by_target_user = []
    
    list_of_businesses_rated_by_target_user = list(ratings_matrix.loc[:, ~ratings_matrix.iloc[target_user, :].isna()].columns)
    
    for other_business in list_of_businesses_rated_by_target_user:
        try:
            similarity = find_business_similarity(target_business, other_business, ratings_matrix)
            similarities_to_target_business.append(similarity)
            ratings_given_by_target_user.append(ratings_matrix.loc[target_user, other_business])
        except:
            pass
    
    return np.dot(ratings_given_by_target_user, similarities_to_target_business)/np.sum(similarities_to_target_business)

'\nCreate a function to calculate the user-item rating prediction based on cosine similarity, with the following two parameters:\n target_business = business_id value for business for whom rating is being predicted for\n target_user = user_id value for the user for whom rating is being predicted for\n'

Check:

In [14]:
user_item_rating_prediction(22917, 2, ratings_matrix)

4.165900897401087

In [15]:
item_item_rating_prediction(22917, 2, ratings_matrix)

4.1541179179560315

In [16]:
businesses_in_range = pd.merge(all_business, unique_business, on = 'business_id', how = 'inner')
businesses_in_range.head()

Unnamed: 0,business_id,name,address,city,postal_code,latitude,longitude,stars,review_count,business_num
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,19107,39.955505,-75.155564,4.0,80,108
1,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,205 Race St,Philadelphia,19106,39.953949,-75.143226,4.0,245,81
2,ROeacJQwBeh05Rqg7F6TCg,BAP,1224 South St,Philadelphia,19147,39.943223,-75.162568,4.5,205,92
3,aPNXGTDkf-4bjhyMBQxqpQ,Craft Hall,901 N Delaware Ave,Philadelphia,19123,39.962582,-75.135657,3.5,65,162
4,ppFCk9aQkM338Rgwpl2F5A,Wawa,3604 Chestnut St,Philadelphia,19104,39.954573,-75.194894,3.0,56,46


# Make prediction

We use decimal degrees as the standard unit of measurement for latitude and longitude geographic coordinates.

After some research and experimentation, we decided to use a decimal degree distance of 0.015 to determine the range of restaurants nearby to limit the scope of our recommender system. This distance represents a straight distance of around 1.75 km in any one direction.

In [17]:
# Create a set of parameters (to use as an example) 

# default GPS coordinates 
lat_gps = 39.9861726
lon_gps = -75.1322293

# Example user for whom we want to generate ratings for 

ex_user_id = 'hDMM20MqoKVi4VMKamyfqQ'
 
# Determine the unique user number for the selected example user above
ex_user_num = unique_user[unique_user['user_id'] == ex_user_id]
user_num = ex_user_num.iloc[0, 0]

# Set a distance value (in Decimal Degrees)* see explanation above
gps_distance = 0.015

In [27]:
'''
Create a function which outputs a list of restaurants sorted by ratings predicted for a given user

Parameters:
 user_num: as per the matching index from our ratings matrix
 user_lat, user_lon: chosen geographic location to centre our recommendations around
 gps_distance: user inputted distance to determine geographic range, in decimal degrees
'''
def restaurant_predictions(user_num, aim_lat, aim_lon, gps_distance):
    
    # Using the given GPS coordinates and GPS distance range
    min_lat = aim_lat - gps_distance
    max_lat = aim_lat + gps_distance
    min_lon = aim_lon - gps_distance
    max_lon = aim_lon + gps_distance
    
    list_of_businesses_in_range = []
    
    # Looping over each record in our business df, select all the restaurants which fall within this range
    for record in all_business.index:
        
        # Capture the latitude and longitude coordinates for the current business we are looping over
        bus_lat = all_business.iloc[record, 5]
        bus_lon = all_business.iloc[record, 6]
        
        # If the GPS coordinates of the current business falls within our GPS range...
        if (bus_lat > min_lat) and (bus_lat < max_lat) and (bus_lon > min_lon) and (bus_lon < max_lon):
            list_of_businesses_in_range.append(record)
           
    # Given our list of index values for our selected businesses...
    # Create a new df containing only those selected businesses
    businesses_in_range = all_business.loc[list_of_businesses_in_range,:]
    
    # Insert a new column called 'user rating' filled with null values
    businesses_in_range.insert(7, 'user_rating', np.nan)
    
    businesses_in_range = pd.merge(businesses_in_range, unique_business, on = 'business_id', how = 'inner')
    
    # Loop over every record in our df containing the selected restaurants with our user-inputted GPS range
    for record in businesses_in_range.index:
        # Capture the business_num for the current business we are looping over in order to refer to our ratings matrix
        business_num = businesses_in_range.iloc[record, 9]
        # Determine the ratings prediction using the hybrid approach as determined in the previous notebook...
        # Weighted average using 80% of the item-item-rating and 20% of the user-item-rating
        ratings_prediction = item_item_rating_prediction(user_num, business_num, ratings_matrix)*0.9 + user_item_rating_prediction(user_num, business_num, ratings_matrix)*0.1
        # Capture the predicted rating for the given user into our df
        businesses_in_range.iloc[record, 7] = ratings_prediction
    
    # Sort our df containing those selected restaurants within our GPS range by the predicted user rating
    businesses_in_range.sort_values('user_rating', ascending = False, inplace = True)
    
    return businesses_in_range

'\nCreate a function which outputs a list of restaurants sorted by ratings predicted for a given user\n\nParameters:\n user_num: as per the matching index from our ratings matrix\n user_lat, user_lon: chosen geographic location to centre our recommendations around\n gps_distance: user inputted distance to determine geographic range, in decimal degrees\n'

In [28]:
%%time

recommendations = restaurant_predictions(user_num, lat_gps, lon_gps, gps_distance)

  return np.dot(ratings_given_by_target_user, similarities_to_target_business)/np.sum(similarities_to_target_business)
  return np.dot(ratings_given_to_target_business, similarities_to_target_user)/np.sum(similarities_to_target_user)
  return np.dot(ratings_given_by_target_user, similarities_to_target_business)/np.sum(similarities_to_target_business)
  return np.dot(ratings_given_to_target_business, similarities_to_target_user)/np.sum(similarities_to_target_user)
  return np.dot(ratings_given_by_target_user, similarities_to_target_business)/np.sum(similarities_to_target_business)
  return np.dot(ratings_given_to_target_business, similarities_to_target_user)/np.sum(similarities_to_target_user)
  return np.dot(ratings_given_by_target_user, similarities_to_target_business)/np.sum(similarities_to_target_business)
  return np.dot(ratings_given_to_target_business, similarities_to_target_user)/np.sum(similarities_to_target_user)
  return np.dot(ratings_given_by_target_user, similarities_to_ta

CPU times: user 10.7 s, sys: 191 ms, total: 10.9 s
Wall time: 11.4 s


In [29]:
# Show only the top 10 restaurants with the highest predicted user rating for the selected user 
recommendations.head(10)

Unnamed: 0,business_id,name,address,city,postal_code,latitude,longitude,user_rating,stars,review_count,business_num
52,hUmHwBQtGg0iH-PZakQJDw,Wm Mulherin's Sons,1355 N Front St,Philadelphia,19122,39.971709,-75.135062,5.0,4.5,610,1566
49,3IDTJXyPKz1vOcJQD1Cn9w,Thin & Crispy,2563 Trenton Ave,Philadelphia,19125,39.98312,-75.123012,5.0,4.5,56,1629
42,CLw-I3X3X8l1V80jxPv3AA,One Pound Cheesesteaks,2661 Kensington Ave,Philadelphia,19125,39.989582,-75.125543,5.0,3.5,51,1512
34,hNlvlGJsTXLJxeJ4RPsaeA,Alamodak Restaurant and Hookah Lounge,161 Cecil B Moore Ave,Philadelphia,19122,39.976354,-75.137436,5.0,4.0,98,1014
33,3UBG2rwjgP-6ifTAuKl3Gg,Steap and Grind,1619 Frankford Ave,Philadelphia,19125,39.97444,-75.132947,5.0,4.0,138,963
9,kDzp5FXnuG3Pwk6orhfl8A,Circles + Squares,2513 Tulip St,Philadelphia,19125,39.981163,-75.123262,5.0,5.0,103,358
11,cVV8GWVIe9BwyCOKwrFgPA,Castellino's,1255 E Palmer St,Philadelphia,19125,39.972337,-75.12904,5.0,5.0,72,300
32,1FCxJuEH_3tmxUDVPv1qfw,Poe's Sandwich Joint,1710 N 5th St,Philadelphia,19122,39.977303,-75.143602,5.0,4.0,122,1032
28,g0UI4VuBBTvNow0-HW58sA,Good Spoon Soupery,1400 N Front St,Philadelphia,19122,39.972032,-75.135421,5.0,4.5,138,838
20,m82r1MXvzm7dI0DF0kWURA,PrimoHoagies,1501 E Susquehanna Ave,Philadelphia,19125,39.975985,-75.126772,5.0,4.0,50,674


In [30]:
# delete irrelevant columns and save 'name', 'latitude', 'longitude' and 'star' as our output

import json

res = json.loads(recommendations.to_json())

response = []

biz_list = list(res['name'].keys())

result_limit = 10
for idx in range(result_limit):
    
    biz_idx = biz_list[idx]
    
    biz_info = {}
    
    biz_info["name"] = res["name"][biz_idx]
    
    biz_info["latitude"] = res["latitude"][biz_idx]
    
    biz_info["longtitude"] = res["longitude"][biz_idx]
    
    biz_info["star"] = res["stars"][biz_idx]

    response.append(biz_info)


print(json.dumps(response))

[{"name": "Wm Mulherin's Sons", "latitude": 39.971709, "longtitude": -75.135062, "star": 4.5}, {"name": "Thin & Crispy", "latitude": 39.9831195, "longtitude": -75.1230116, "star": 4.5}, {"name": "One Pound Cheesesteaks", "latitude": 39.9895823, "longtitude": -75.125543, "star": 3.5}, {"name": "Alamodak Restaurant and Hookah Lounge", "latitude": 39.9763544, "longtitude": -75.1374359, "star": 4.0}, {"name": "Steap and Grind", "latitude": 39.9744402, "longtitude": -75.132947, "star": 4.0}, {"name": "Circles + Squares", "latitude": 39.9811626537, "longtitude": -75.1232624799, "star": 5.0}, {"name": "Castellino's", "latitude": 39.9723373258, "longtitude": -75.129040464, "star": 5.0}, {"name": "Poe's Sandwich Joint", "latitude": 39.9773029, "longtitude": -75.1436018, "star": 4.0}, {"name": "Good Spoon Soupery", "latitude": 39.9720325, "longtitude": -75.1354214, "star": 4.5}, {"name": "PrimoHoagies", "latitude": 39.9759849699, "longtitude": -75.1267718478, "star": 4.0}]
