DATA CLEANING!

In [76]:
import tensorflow as tf
import pandas as pd
import keras
import random
from pymongo import MongoClient
from geopy.geocoders import Nominatim

### USE WHEN LIZ AND GLORIA FINISHED INITIAL PAGE ###
# get user data from mongodb database (specifically age and initial main_type preference)
# connection_string = "mongodb+srv://hangodb:hangodb@cluster0.phdgtft.mongodb.net/"
# dbname = "Hango"
# collection_name = "User Data"
# client = MongoClient(connection_string)
# db = client[dbname]
# collection = db[collection_name]
# query = {"email": email} #change this query when actually using di
# user_object = collection.find_one(query)
### USE WHEN LIZ AND GLORIA FINISHED INITIAL PAGE ###

### read in dataframe from the cleaned datafile ###
df = pd.read_json('Hango.Places.json')

### convert object id to str (Aidan's code) ###
df.rename(columns={'_id': 'place_id'}, inplace=True)
df['place_id'] = df['place_id'].apply(lambda x: x['$oid'])

### filter by age, main_type, and location (reduce calculations) ###
age = 20 #int(input("Enter age: ")) 
main_type = 'Food' #str(input('Enter main_type (Food, Entertainment, Nature/Recreation, Nightlife, Museum/Art, or Drinks): '))
main_type = main_type[0].upper() + main_type[1:].lower()
# N means age doesn't matter, Y means age does matter
# if underage, only keep the ones with N as age
if age < 21:
    df = df[df['age'] == 'N']
# only keep the main_types in df that are main_type
df = df[df['main_type'] == main_type]
# filter by location
app = Nominatim(user_agent="test2")
user_add = 'Garden Grove, CA'
address = app.geocode(user_add).raw
# get long and lat from data
user_loc = [float(address['lat']), float(address['lon'])]
# get radius
radius = 10/111 # radius of 10 miles to get enough places for initial recommendations
# find radius in context of user lat and lon
lat_least = user_loc[0]-radius
lat_most = user_loc[0]+radius
lon_least = user_loc[1]-radius
lon_most = user_loc[1]+radius
# only keep the ones within radius
lat_radius = (df['lat'] >= lat_least) & (df['lat'] <= lat_most)
df = df[lat_radius]
lon_radius = (df['lon'] >= lon_least) & (df['lon'] <= lon_most)
df = df[lon_radius]
# then drop it from the df (unnecessary)
df.drop(columns=['main_type', 'weblink', 'age', 'lat', 'lon'], inplace=True)

### hot-encode ###
# get unique subtypes (Aidan's code)
unique_subtypes = set(subtype for sublist in df['sub_types'] for subtype in sublist)
# create a DataFrame with one-hot encoding columns for subtypes (Aidan's code)
subtype_df = pd.DataFrame({subtype: df['sub_types'].apply(lambda x: 1 if subtype in x else 0) for subtype in unique_subtypes})
# concatenate the original DataFrame with the new subtype DataFrame (Aidan's code)
df = pd.concat([df, subtype_df], axis=1)
# drop original sub_types (unnecessary)
df.drop(columns='sub_types', inplace=True)

print(df.head())

                     place_id                            name  \
139  65c7cd527f987a416ea45b1a  Raising Cane's Chicken Fingers   
143  65c7cd527f987a416ea45b1e                 Anaheim Food Co   
145  65c7cd527f987a416ea45b20                     Chick-fil-A   
147  65c7cd527f987a416ea45b22          Alberto's Mexican Food   
150  65c7cd527f987a416ea45b25                  Real Thai Food   

                            address  rating  rating_amount price  bar  \
139    3150 Harbor Blvd, Costa Mesa     4.4           4497     1    0   
143        1560 S Lewis St, Anaheim     4.3            184     2    0   
145   16388 Beach Blvd, Westminster     4.5           1700     1    0   
147       8040 Chapman Ave, Stanton     4.0            481     1    0   
150  9522 Chapman Ave, Garden Grove     4.6            247     1    0   

     meal_takeaway  food  store  meal_delivery  cafe  restaurant  
139              0     1      0              0     0           1  
143              0     1      0     

MODEL BUILDING!!!

In [77]:
### get rated matrix for the df ###
# use Bayesian average rating formula to make sure rating isn't inflated (from too little ratings or too much ratings)
C = df['rating'].mean()  # get avg rating of all ratings
m = df['rating_amount'].quantile(0.9)  # get quantile threshold for rating amount (basically removes the one with too little ratings in calculation)
df['weighted_rating'] = ((df['rating_amount'] / (df['rating_amount'] + m)) * df['rating'] + (m / (df['rating_amount'] + m)) * C).round(2)
# then drop rating and rating_amount from df
df.drop(columns=['rating', 'rating_amount'], inplace=True)


In [78]:
df.sort_values(by='weighted_rating', ascending=False, inplace=True)

#using most rated first
distance_df = df.drop(columns=['place_id', 'name', 'address', 'price', 'weighted_rating'])
most = distance_df.iloc[0]
distance_df.drop(distance_df.index[0], inplace=True)
print(most)

bar              0
meal_takeaway    1
food             1
store            0
meal_delivery    0
cafe             0
restaurant       1
Name: 184, dtype: int64


In [79]:
from scipy.spatial.distance import cosine

# subtract avg from list to ensure that missing values or 0's don't affect the distance 
mean_most = most.mean()
most = most * mean_most

# get the cosine distance for each row (0 indicates that the vectors are perfectly similar (i.e., they point in the same direction). 
# 1 indicates that the vectors are orthogonal (i.e., they are perpendicular to each other). 
# and 2 indicates that the vectors are perfectly dissimilar (i.e., they point in opposite directions)).
for index, row in distance_df.iterrows():
    mean_row = row.mean()
    row = row * mean_row
    cos_distance = cosine(most, row)
    distance_df.loc[index, 'distance'] = cos_distance

distance_df.sort_values(by='distance', ascending=True, inplace=True)

print(distance_df)

      bar  meal_takeaway  food  store  meal_delivery  cafe  restaurant  \
4110    0              1     1      0              0     0           1   
1637    0              1     1      0              0     0           1   
1729    0              1     1      0              0     0           1   
4175    0              1     1      0              0     0           1   
4104    0              1     1      0              0     0           1   
...   ...            ...   ...    ...            ...   ...         ...   
212     1              0     1      0              0     0           1   
1653    0              0     1      1              0     1           1   
4102    0              0     1      1              0     1           1   
4101    0              0     1      1              0     1           1   
1629    0              0     1      1              0     1           1   

      distance  
4110  0.000000  
1637  0.000000  
1729  0.000000  
4175  0.000000  
4104  0.000000  
...      

In [88]:
#the most different place, least different place from rated first
len_distance_df = len(distance_df)
most_5_similar = distance_df.head(5)
least_5_similar = distance_df.tail(5)

#connect distance_df to original df to get place_id and name
print('5 places that are most similar to top rated place:')
for index, row in most_5_similar.iterrows():
    print(df.loc[index]['name'] + " at " + df.loc[index]['address'])

print()
print('5 places that are most similar to top rated place:')
for index, row in least_5_similar.iterrows():
    print(df.loc[index]['name'] + " at " + df.loc[index]['address'])

# NEW PROBLEM ENCOUNTERED: remove places that have the same name but different locations (only keep the one that is closest)
# how to know which place is closer based on user location and delete the rest (a lot of iterations for sure)


5 places that are most similar to top rated place:
Dakao Food To Go at 14550 Brookhurst St, Westminster
China Wok at 12091 S Euclid St, Garden Grove
Tasty Zone Szechwan Cuisine at 3930 S Bristol St, Santa Ana
Taco Bell at 1600 W Katella Ave, Anaheim
Thien Loc Food To Go at 14328 Brookhurst St, Garden Grove

5 places that are most similar to top rated place:
STACKED at 7490 Edinger Ave, Huntington Beach
McDonald's at 18962 Brookhurst St, Fountain Valley
McDonald's at 7112 Westminster Blvd., Westminster
McDonald's at 16866 Beach Blvd, Huntington Beach
McDonald's at 1500 S Harbor Blvd, Anaheim


In [81]:
# # return best rated place first within user preferences (HOW TO GENERATE WEIGHTED MATRIX - DON'T NEED YET)
# not_sub = ['place_id', 'name', 'address', 'price', 'weighted_rating']
# for column_name in df.columns:
#     if column_name not in not_sub:
#         df[column_name] = df[column_name]*df['weighted_rating']
# df.drop(columns=['weighted_rating'], inplace=True)
# print(df)