In [1]:
from bs4 import BeautifulSoup
from helper_functions import scrape_menu_page, avg_wv_bow
import numpy as np
import pandas as pd
import re

import requests as req

In [2]:
# my orders requested data from Uber Eats
# if you order the same item at the same
# time, it will show as separate items
# at the same time- this can be desired
# or undesired for your analysis
df_order_details = pd.read_csv("Orders.csv") 

In [3]:
df_order_details.head(2)

Unnamed: 0,Order Time,Item Name
0,2022-04-13 16:23:26 +0000 UTC,Chicken / Pollo taco
1,2022-04-13 16:23:26 +0000 UTC,Chicken / Pollo taco


# break down common names

In [4]:
df_my_orders = df_order_details[["Order Time", "Item Name"]].drop_duplicates()


df_my_orders['items'] = df_my_orders['Item Name'].map(lambda x: x.lower())
df_my_orders['items'] = df_my_orders['items'].map(lambda x: re.sub(r'[\W_]+', ' ', x).split(' '))

In [5]:
df_my_orders.head()

Unnamed: 0,Order Time,Item Name,items
0,2022-04-13 16:23:26 +0000 UTC,Chicken / Pollo taco,"[chicken, pollo, taco]"
3,2022-04-13 16:23:26 +0000 UTC,Steak / Asada taco,"[steak, asada, taco]"
13,2022-04-13 16:23:26 +0000 UTC,Large jamaica,"[large, jamaica]"
16,2022-04-13 15:39:12 +0000 UTC,Sprite®,"[sprite, ]"
18,2022-04-13 15:39:12 +0000 UTC,Spicy Southwest Salad,"[spicy, southwest, salad]"


In [6]:
my_bow = [x for y in df_my_orders["items"] for x in y]
stop = {"and", "", "of", "1", "2", "3", "4", "$"}
my_bow = [x for x in my_bow if x not in stop]

# get restaurants

In [7]:
site = "https://www.ubereats.com/chicago"
res = req.get(site, headers={'User-Agent':'Mozilla/5.0'})
soup = BeautifulSoup(res.text, "lxml")

restaurants = []

for rest in soup.find_all("a"):
    text = rest.text
    link = rest['href']
    if "Delivery" in text or "delivery" in text or "Food" in text or "Sign" in text or "restaurants" in text or "dish" in link:
        continue
    restaurants.append((text, link))

restaurants = restaurants[19:-46]
restaurants = [x for x in restaurants if x[1][0] == '/']

In [8]:
addr_rm = []
seen = set()

for place, link in restaurants:
    if '(' not in place:
        if place not in seen:
            addr_rm.append((place, link))
            seen.add(place)
        continue
    new_place = place[:place.index('(') - 1]
    if new_place in seen:
        continue
    seen.add(new_place)
    addr_rm.append((new_place, link))


In [9]:
rest_menus = []

for place, ext in addr_rm:
    url = "https://www.ubereats.com" + ext
    menu_page_string = scrape_menu_page(url)
    if not menu_page_string:
        continue
    menu_page = BeautifulSoup(menu_page_string, "html.parser")
    
    for main in menu_page.find_all("main"):
        for span in main.find_all("span"):
            text = span.text
            if "$" in text or len(text) < 5 or len(text) > 50:
                continue
            rest_menus.append((place, span.text))

In [10]:
d_food = {}

for rest, item in rest_menus:
    if "\xa0\xa0•\xa0\xa0" in item or "•" in item:
        continue
    d_food[rest] = d_food.get(rest, [])
    d_food[rest].append(item)

In [11]:
df = pd.DataFrame(columns=["restaurant", "menu_bow"])

logistics = {'and', 'hours', 'address', 'more', 'tap', 'for', 'hours,', 'address,', 'more,', 'tap,', 'for,'}

for rest in d_food:
    restaurant, menu = rest, d_food[rest][1:] # first item is address
    menu_bow = " ".join(menu)
    menu_bow = menu_bow.split(" ")
    menu_bow = [term.lower() for term in menu_bow]
    menu_bow = [term for term in menu_bow if term not in logistics]
    df.loc[len(df)] = [restaurant, menu_bow]

In [12]:
df.head(20)

Unnamed: 0,restaurant,menu_bow
0,Paulie Gee's Chicago,"[the, chicago, vegan, bread, pudding, (v), by,..."
1,Shawarma Inn III,"[chicken, beef, shawarma, combination, entree,..."


In [13]:
# compute avg w2v vector by restaurant menu
df['avg_wv_bow'] = df['menu_bow'].map(lambda bow: avg_wv_bow(bow))

# compute my avg w2v vector
my_bow_wv = avg_wv_bow(my_bow)

# compute cosine similarity between each avg w2v vector by restaurant and my avg w2v
df['cos_sim'] = df['avg_wv_bow'].map(lambda x: np.dot(x, my_bow_wv) / np.linalg.norm(x) * np.linalg.norm(my_bow_wv))

# sort by score- the higher the better
df = df.sort_values(by=['cos_sim'], ascending=False)

In [14]:
pd.set_option('display.max_rows', 200)

df[["restaurant", "cos_sim"]].reset_index().drop(["index"], axis=1)

Unnamed: 0,restaurant,cos_sim
0,Shawarma Inn III,0.204374
1,Paulie Gee's Chicago,0.188324
