DATA CLEANING!

In [1]:
import tensorflow as tf
import pandas as pd
import keras
import random
from pymongo import MongoClient
from geopy.geocoders import Nominatim

### USE WHEN LIZ AND GLORIA FINISHED INITIAL PAGE ###
# get user data from mongodb database (specifically age and initial main_type preference)
# connection_string = "mongodb+srv://hangodb:hangodb@cluster0.phdgtft.mongodb.net/"
# dbname = "Hango"
# collection_name = "User Data"
# client = MongoClient(connection_string)
# db = client[dbname]
# collection = db[collection_name]
# query = {"email": email} 
# user_object = collection.find_one(query)
### USE WHEN LIZ AND GLORIA FINISHED INITIAL PAGE ###

### read in dataframe from the cleaned datafile ###
df = pd.read_json('Hango.Places.json')

### convert object id to str (Aidan's code) ###
df.rename(columns={'_id': 'place_id'}, inplace=True)
df['place_id'] = df['place_id'].apply(lambda x: x['$oid'])

### filter by age, main_type, and location (reduce calculations) ###
age = 20 #int(input("Enter age: ")) 
main_type = 'Food' #str(input('Enter main_type (Food, Entertainment, Nature/Recreation, Nightlife, Museum/Art, or Drinks): '))
main_type = main_type[0].upper() + main_type[1:].lower()
# N means age doesn't matter, Y means age does matter
# if underage, only keep the ones with N as age
if age < 21:
    df = df[df['age'] == 'N']
# only keep the main_types in df that are main_type
df = df[df['main_type'] == main_type]
# filter by location
app = Nominatim(user_agent="test2")
user_add = 'Garden Grove, CA'
address = app.geocode(user_add).raw
# get long and lat from data
user_loc = [float(address['lat']), float(address['lon'])]
# get radius
radius = 10/111 # radius of 10 miles to get enough places for initial recommendations
# find radius in context of user lat and lon
lat_least = user_loc[0]-radius
lat_most = user_loc[0]+radius
lon_least = user_loc[1]-radius
lon_most = user_loc[1]+radius
# only keep the ones within radius
lat_radius = (df['lat'] >= lat_least) & (df['lat'] <= lat_most)
df = df[lat_radius]
lon_radius = (df['lon'] >= lon_least) & (df['lon'] <= lon_most)
df = df[lon_radius]
# then drop it from the df (unnecessary)
df.drop(columns=['main_type', 'weblink', 'age'], inplace=True)

### hot-encode ###
# get unique subtypes (Aidan's code)
unique_subtypes = set(subtype for sublist in df['sub_types'] for subtype in sublist)
# create a DataFrame with one-hot encoding columns for subtypes (Aidan's code)
subtype_df = pd.DataFrame({subtype: df['sub_types'].apply(lambda x: 1 if subtype in x else 0) for subtype in unique_subtypes})
# concatenate the original DataFrame with the new subtype DataFrame (Aidan's code)
df = pd.concat([df, subtype_df], axis=1)
# drop original sub_types (unnecessary)
df.drop(columns='sub_types', inplace=True)

print(df.head())

                    place_id                          name        lat  \
26  65c7cd557f987a416ea45b68                     Good Food  33.833990   
91  65c7cd527f987a416ea45b22        Alberto's Mexican Food  33.788140   
92  65c7cd527f987a416ea45b26               Ben's Fast Food  33.809493   
93  65c7cd537f987a416ea45b2b  Sabrosada Fresh Mexican Food  33.735156   
96  65c7cd587f987a416ea45bb8      Castañeda's Mexican Food  33.685667   

           lon                                 address  rating  rating_amount  \
26 -117.915834  201 W Center Street Promenade, Anaheim     4.5             46   
91 -117.991934               8040 Chapman Ave, Stanton     4.0            481   
92 -117.895743                1560 S Lewis St, Anaheim     4.9             14   
93 -117.955091        15681 Brookhurst St, Westminster     4.3            912   
96 -117.954558   19071 Brookhurst St, Huntington Beach     4.2            538   

                       price  meal_delivery  store  restaurant  bar  cafe 

In [2]:
#remove places that have the same name but different locations (only keep the one that is closest to user location)
import math
#pythagorean theorem c = math.sqrt((x2-x1)**2 + (y2-y1)**2)
grouped = df.groupby('name')
removeList = []
d = {}
for name, group in grouped:
    if len(group) > 1:  # Only print if there are multiple rows with the same name
        for index, row in group.iterrows():
            distance = math.sqrt((row['lat']-user_loc[0])**2 + (row['lon']-user_loc[1])**2)
            d[distance] = index
        sorted_d = dict(sorted(d.items()))
        while len(sorted_d) > 1:
            removeList.append(sorted_d.popitem()[1])
            d.popitem()
        d.popitem()
df = df.drop(removeList)
df.drop(columns=['lat', 'lon'], inplace=True)

MODEL BUILDING!!!

In [3]:
### get rated matrix for the df ###
# use Bayesian average rating formula to make sure rating isn't inflated (from too little ratings or too much ratings)
C = df['rating'].mean()  # get avg rating of all ratings
m = df['rating_amount'].quantile(0.9)  # get quantile threshold for rating amount (basically removes the one with too little ratings in calculation)
df['weighted_rating'] = ((df['rating_amount'] / (df['rating_amount'] + m)) * df['rating'] + (m / (df['rating_amount'] + m)) * C).round(2)
# then drop rating and rating_amount from df
df.drop(columns=['rating', 'rating_amount'], inplace=True)


In [5]:
df.sort_values(by='weighted_rating', ascending=False, inplace=True)

#using most rated first
distance_df = df.drop(columns=['place_id', 'name', 'address', 'price', 'weighted_rating'])
most = distance_df.iloc[0]
distance_df.drop(distance_df.index[0], inplace=True)
print(most.name)

131


In [12]:
from scipy.spatial.distance import cosine

# subtract avg from list to ensure that missing values or 0's don't affect the distance 
mean_most = most.mean()
most = most * mean_most

# get the cosine distance for each row (0 indicates that the vectors are perfectly similar (i.e., they point in the same direction). 
# 1 indicates that the vectors are orthogonal (i.e., they are perpendicular to each other). 
# and 2 indicates that the vectors are perfectly dissimilar (i.e., they point in opposite directions)).
for index, row in distance_df.iterrows():
    mean_row = row.mean()
    row = row * mean_row
    cos_distance = cosine(most, row)
    distance_df.loc[index, 'distance'] = cos_distance

distance_df.sort_values(by='distance', ascending=True, inplace=True)

print(distance_df)

      restaurant  cafe  store  meal_delivery  food  meal_takeaway  bar  \
1646           1     0      0              0     1              1    0   
4175           1     0      0              0     1              1    0   
4104           1     0      0              0     1              1    0   
4110           1     0      0              0     1              1    0   
4108           1     0      0              0     1              1    0   
...          ...   ...    ...            ...   ...            ...  ...   
385            1     1      0              0     1              0    0   
1660           1     0      0              0     1              0    1   
167            1     0      0              0     1              0    1   
1651           1     0      1              0     1              0    0   
1629           1     1      1              0     1              0    0   

      distance  
1646  0.000000  
4175  0.000000  
4104  0.000000  
4110  0.000000  
4108  0.000000  
...      

In [13]:
#the most different place, least different place from rated first
len_distance_df = len(distance_df)
most_5_similar = distance_df.head(5)
least_5_similar = distance_df.tail(5)

#connect distance_df to original df to get place_id and name
print('5 places that are most similar to top rated place:')
for index, row in most_5_similar.iterrows():
    print(df.loc[index]['name'] + " at " + df.loc[index]['address'])

print()
print('5 places that are least similar to top rated place:')
for index, row in least_5_similar.iterrows():
    print(df.loc[index]['name'] + " at " + df.loc[index]['address'])


5 places that are most similar to top rated place:
Del Taco at 5856 Warner Ave, Huntington Beach
Taco Bell at 1600 W Katella Ave, Anaheim
Thien Loc Food To Go at 14328 Brookhurst St, Garden Grove
Dakao Food To Go at 14550 Brookhurst St, Westminster
Tan My Food To Go at 9362 Westminster Blvd., Westminster

5 places that are least similar to top rated place:
Munch Thai Food & Sweet Tea at 880 W Lincoln Ave, Anaheim
Mario's Mexican Food & Cantina at 18603 Main St, Huntington Beach
STACKED at 7490 Edinger Ave, Huntington Beach
Wienerschnitzel at 5966 Warner Ave, Huntington Beach
McDonald's at 1500 S Harbor Blvd, Anaheim


In [7]:
# # return best rated place first within user preferences (HOW TO GENERATE WEIGHTED MATRIX - DON'T NEED YET)
# not_sub = ['place_id', 'name', 'address', 'price', 'weighted_rating']
# for column_name in df.columns:
#     if column_name not in not_sub:
#         df[column_name] = df[column_name]*df['weighted_rating']
# df.drop(columns=['weighted_rating'], inplace=True)
# print(df)