DATA CLEANING!

In [60]:
import tensorflow as tf
import pandas as pd
import keras
import random
from pymongo import MongoClient
from geopy.geocoders import Nominatim
import gen_feedback

### USE WHEN LIZ AND GLORIA FINISHED INITIAL PAGE ###
# get user data from mongodb database (specifically age and initial main_type preference)
# connection_string = "mongodb+srv://hangodb:hangodb@cluster0.phdgtft.mongodb.net/"
# dbname = "Hango"
# collection_name = "User Data"
# client = MongoClient(connection_string)
# db = client[dbname]
# collection = db[collection_name]
# query = {"email": email} #change this query when actually using di
# user_object = collection.find_one(query)
### USE WHEN LIZ AND GLORIA FINISHED INITIAL PAGE ###

### read in dataframe from the cleaned datafile ###
df = pd.read_json('Hango.Places.json')

### convert object id to str (Aidan's code) ###
df.rename(columns={'_id': 'place_id'}, inplace=True)
df['place_id'] = df['place_id'].apply(lambda x: x['$oid'])

### filter by age, main_type, and location (reduce calculations) ###
age = 20 #int(input("Enter age: ")) 
main_type = 'Food' #str(input('Enter main_type (Food, Entertainment, Nature/Recreation, Nightlife, Museum/Art, or Drinks): '))
main_type = main_type[0].upper() + main_type[1:].lower()
# N means age doesn't matter, Y means age does matter
# if underage, only keep the ones with N as age
if age < 21:
    df = df[df['age'] == 'N']
# only keep the main_types in df that are main_type
df = df[df['main_type'] == main_type]
# filter by location
app = Nominatim(user_agent="test")
user_add = 'Garden Grove, CA'
address = app.geocode(user_add).raw
# get long and lat from data
user_loc = [float(address['lat']), float(address['lon'])]
# get radius
radius = 3/111
# find radius in context of user lat and lon
lat_least = user_loc[0]-radius
lat_most = user_loc[0]+radius
lon_least = user_loc[1]-radius
lon_most = user_loc[1]+radius
# only keep the ones within radius
lat_radius = (df['lat'] >= lat_least) & (df['lat'] <= lat_most)
df = df[lat_radius]
lon_radius = (df['lon'] >= lon_least) & (df['lon'] <= lon_most)
df = df[lon_radius]
# then drop it from the df (unnecessary)
df.drop(columns=['main_type', 'weblink', 'age', 'lat', 'lon'], inplace=True)

### hot-encode ###
# cet unique subtypes (Aidan's code)
unique_subtypes = set(subtype for sublist in df['sub_types'] for subtype in sublist)
# create a DataFrame with one-hot encoding columns for subtypes (Aidan's code)
subtype_df = pd.DataFrame({subtype: df['sub_types'].apply(lambda x: 1 if subtype in x else 0) for subtype in unique_subtypes})
# concatenate the original DataFrame with the new subtype DataFrame (Aidan's code)
df = pd.concat([df, subtype_df], axis=1)
# drop original sub_types (unnecessary)
df.drop(columns='sub_types', inplace=True)

print(df.head())

                      place_id                         name  \
150   65c7cd527f987a416ea45b25               Real Thai Food   
161   65c7cd537f987a416ea45b30   SEMBRÓ Mexican Street Food   
1252  65cbbd70e88f8c2f795c9292            Dimi's Food Stand   
1258  65cbbd72e88f8c2f795c92c3  La Costenita Mexican Food 2   
1630  65cbc271c72cccdf7a6454e6          Thai Famous Cuisine   

                                    address  rating  rating_amount  \
150          9522 Chapman Ave, Garden Grove     4.6            247   
161         12011 Chapman Ave, Garden Grove     3.9            476   
1252  10801 Garden Grove Blvd, Garden Grove     5.0              4   
1258     715 N Harbor Blvd # 106, Santa Ana     5.0              3   
1630        11891 S Euclid St, Garden Grove     4.3            934   

                         price  food  meal_takeaway  restaurant  
150                          1     1              0           1  
161                          1     1              0           1  
12

MODEL BUILDING!!!

In [61]:

### get rated matrix for the df ###
# use Bayesian average rating formula to make sure rating isn't inflated (from too little ratings or too much ratings)
C = df['rating'].mean()  # get avg rating of all ratings
m = df['rating_amount'].quantile(0.9)  # get quantile threshold for rating amount (basically removes the one with too little ratings in calculation)
df['weighted_rating'] = ((df['rating_amount'] / (df['rating_amount'] + m)) * df['rating'] + (m / (df['rating_amount'] + m)) * C).round(2)
# then drop rating and rating_amount from df
df.drop(columns=['rating', 'rating_amount'], inplace=True)


In [67]:
df = df.sort_values(by='weighted_rating', ascending=False)
print(df[0:2])

                      place_id                 name  \
150   65c7cd527f987a416ea45b25       Real Thai Food   
1630  65cbc271c72cccdf7a6454e6  Thai Famous Cuisine   

                              address price  food  meal_takeaway  restaurant  \
150    9522 Chapman Ave, Garden Grove     1     1              0           1   
1630  11891 S Euclid St, Garden Grove     1     1              0           1   

      weighted_rating  
150              4.34  
1630             4.27  


In [59]:
# # return best rated place first within user preferences (HOW TO GENERATE WEIGHTED MATRIX - DON'T NEED YET)
# not_sub = ['place_id', 'name', 'address', 'price', 'weighted_rating']
# for column_name in df.columns:
#     if column_name not in not_sub:
#         df[column_name] = df[column_name]*df['weighted_rating']
# df.drop(columns=['weighted_rating'], inplace=True)
# print(df)