DATA CLEANING!

In [8]:
import tensorflow as tf
import pandas as pd
import keras
import random
from pymongo import MongoClient
from geopy.geocoders import Nominatim

### USE WHEN LIZ AND GLORIA FINISHED INITIAL PAGE ###
# get user data from mongodb database (specifically age and initial main_type preference)
# connection_string = "mongodb+srv://hangodb:hangodb@cluster0.phdgtft.mongodb.net/"
# dbname = "Hango"
# collection_name = "User Data"
# client = MongoClient(connection_string)
# db = client[dbname]
# collection = db[collection_name]
# query = {"email": email} 
# user_object = collection.find_one(query)
### USE WHEN LIZ AND GLORIA FINISHED INITIAL PAGE ###

### read in dataframe from the cleaned datafile ###
df = pd.read_json('Hango.Places.json')

### convert object id to str (Aidan's code) ###
df.rename(columns={'_id': 'place_id'}, inplace=True)
df['place_id'] = df['place_id'].apply(lambda x: x['$oid'])

### filter by age, main_type, and location (reduce calculations) ###
age = 20 #int(input("Enter age: ")) 
main_type = 'Food' #str(input('Enter main_type (Food, Entertainment, Nature/Recreation, Nightlife, Museum/Art, or Drinks): '))
main_type = main_type[0].upper() + main_type[1:].lower()
# N means age doesn't matter, Y means age does matter
# if underage, only keep the ones with N as age
if age < 21:
    df = df[df['age'] == 'N']
# only keep the main_types in df that are main_type
df = df[df['main_type'] == main_type]
# filter by location
app = Nominatim(user_agent="test2")
user_add = 'Long Beach, CA'
address = app.geocode(user_add).raw
# get long and lat from data
user_loc = [float(address['lat']), float(address['lon'])]
# get radius
radius = 10/111 # radius of 10 miles to get enough places for initial recommendations
# find radius in context of user lat and lon
lat_least = user_loc[0]-radius
lat_most = user_loc[0]+radius
lon_least = user_loc[1]-radius
lon_most = user_loc[1]+radius
# only keep the ones within radius
lat_radius = (df['lat'] >= lat_least) & (df['lat'] <= lat_most)
df = df[lat_radius]
lon_radius = (df['lon'] >= lon_least) & (df['lon'] <= lon_most)
df = df[lon_radius]
# then drop it from the df (unnecessary)
df.drop(columns=['main_type', 'weblink', 'age'], inplace=True)

### hot-encode ###
# get unique subtypes (Aidan's code)
unique_subtypes = set(subtype for sublist in df['sub_types'] for subtype in sublist)
# create a DataFrame with one-hot encoding columns for subtypes (Aidan's code)
subtype_df = pd.DataFrame({subtype: df['sub_types'].apply(lambda x: 1 if subtype in x else 0) for subtype in unique_subtypes})
# concatenate the original DataFrame with the new subtype DataFrame (Aidan's code)
df = pd.concat([df, subtype_df], axis=1)
# drop original sub_types (unnecessary)
df.drop(columns='sub_types', inplace=True)

print(df.head())

                     place_id                     name        lat         lon  \
21   65c7cd4d7f987a416ea45aa3                 Del Taco  33.795876 -118.108282   
22   65c7cd4d7f987a416ea45ab0     Georgia's Restaurant  33.830491 -118.144792   
40   65c7cd4e7f987a416ea45aba  Lucky Chinese Fast Food  33.790799 -118.265913   
123  65c7cd4d7f987a416ea45aa4                Five Guys  33.854278 -118.138839   
160  65c7cd4d7f987a416ea45aa7                Taco Bell  33.795668 -118.107541   

                                 address  rating  rating_amount price  food  \
21       2201 Palo Verde Ave, Long Beach     4.1            611     1     1   
22      4101 McGowen St #155, Long Beach     4.2            650     2     1   
40   306 W Pacific Coast Hwy, Wilmington     4.5            196     2     1   
123         4625 Candlewood St, Lakewood     4.1            378     2     1   
160          6407 Stearns St, Long Beach     4.0            607     1     1   

     restaurant  store  bar  meal_take

In [9]:
#remove places that have the same name but different locations (only keep the one that is closest to user location)
import math
#pythagorean theorem c = math.sqrt((x2-x1)**2 + (y2-y1)**2)
grouped = df.groupby('name')
removeList = []
d = {}
for name, group in grouped:
    if len(group) > 1:  # Only print if there are multiple rows with the same name
        for index, row in group.iterrows():
            distance = math.sqrt((row['lat']-user_loc[0])**2 + (row['lon']-user_loc[1])**2)
            d[distance] = index
        sorted_d = dict(sorted(d.items()))
        while len(sorted_d) > 1:
            removeList.append(sorted_d.popitem()[1])
            d.popitem()
        d.popitem()
df = df.drop(removeList)
df.drop(columns=['lat', 'lon'], inplace=True)

Adalberto's Mexican Food
Del Taco
Jack in the Box
McDonald's


MODEL BUILDING!!!

In [3]:
### get rated matrix for the df ###
# use Bayesian average rating formula to make sure rating isn't inflated (from too little ratings or too much ratings)
C = df['rating'].mean()  # get avg rating of all ratings
m = df['rating_amount'].quantile(0.9)  # get quantile threshold for rating amount (basically removes the one with too little ratings in calculation)
df['weighted_rating'] = ((df['rating_amount'] / (df['rating_amount'] + m)) * df['rating'] + (m / (df['rating_amount'] + m)) * C).round(2)
# then drop rating and rating_amount from df
df.drop(columns=['rating', 'rating_amount'], inplace=True)


In [4]:
df.sort_values(by='weighted_rating', ascending=False, inplace=True)

#using most rated first
distance_df = df.drop(columns=['place_id', 'name', 'address', 'price', 'weighted_rating'])
most = distance_df.iloc[0]
distance_df.drop(distance_df.index[0], inplace=True)
print(most.name)

1546


In [5]:
from scipy.spatial.distance import cosine

# subtract avg from list to ensure that missing values or 0's don't affect the distance 
mean_most = most.mean()
most = most * mean_most

# get the cosine distance for each row (0 indicates that the vectors are perfectly similar (i.e., they point in the same direction). 
# 1 indicates that the vectors are orthogonal (i.e., they are perpendicular to each other). 
# and 2 indicates that the vectors are perfectly dissimilar (i.e., they point in opposite directions)).
for index, row in distance_df.iterrows():
    mean_row = row.mean()
    row = row * mean_row
    cos_distance = cosine(most, row)
    distance_df.loc[index, 'distance'] = cos_distance

distance_df.sort_values(by='distance', ascending=True, inplace=True)

print(distance_df)

      food  restaurant  store  bar  meal_takeaway  cafe  meal_delivery  \
1552     1           1      0    1              0     0              0   
1550     1           1      0    1              0     0              0   
1537     1           1      0    0              0     0              0   
1547     1           1      0    0              0     0              0   
472      1           1      0    0              0     0              0   
22       1           1      0    0              0     0              0   
1238     1           1      0    0              0     0              0   
1536     1           1      0    0              0     0              0   
1553     1           1      0    0              0     0              0   
1543     1           1      0    0              0     0              0   
1533     1           1      0    0              0     0              0   
1544     1           1      0    0              0     0              0   
1538     1           1      0    0    

In [6]:
#the most different place, least different place from rated first
len_distance_df = len(distance_df)
most_5_similar = distance_df.head(5)
least_5_similar = distance_df.tail(5)

#connect distance_df to original df to get place_id and name
print('5 places that are most similar to top rated place:')
for index, row in most_5_similar.iterrows():
    print(df.loc[index]['name'] + " at " + df.loc[index]['address'])

print()
print('5 places that are least similar to top rated place:')
for index, row in least_5_similar.iterrows():
    print(df.loc[index]['name'] + " at " + df.loc[index]['address'])


5 places that are most similar to top rated place:
Curley's Cafe at 1999 E Willow St, Signal Hill
Tacos La Revancha Kitchen & Beer at 2634 E Anaheim St, Long Beach
Plant Power Fast Food at 5095 CA-1, Long Beach
King Taco # 27 at 1841 Long Beach Blvd, Long Beach
Alberta's Mexican Food at 1770 W Pacific Coast Hwy, Long Beach

5 places that are least similar to top rated place:
Del Taco at 1801 E Willow St, Signal Hill
Jack in the Box at 652 Atlantic Ave, Long Beach
Tom's #1 World Famous Chili Burgers at 626 E Sepulveda Blvd, Carson
El Paisa Restaurant at 1640 Orange Ave, Long Beach
McDonald's at 1830 Long Beach Blvd, Long Beach


In [7]:
# # return best rated place first within user preferences (HOW TO GENERATE WEIGHTED MATRIX - DON'T NEED YET)
# not_sub = ['place_id', 'name', 'address', 'price', 'weighted_rating']
# for column_name in df.columns:
#     if column_name not in not_sub:
#         df[column_name] = df[column_name]*df['weighted_rating']
# df.drop(columns=['weighted_rating'], inplace=True)
# print(df)