# Eating in Buenos Aires on a budget
## Skills: Python, Web scraping, Geopy, Folium

Download the Jupyter Notebook from [GitHub](https://nicomoriuala.github.io)

<a id='content' />

## Contents

1. [Introduction](#intro)
2. [Web Scraping](#scraping)
3. [Data Cleansing](#cleansing)
4. [Mapping](#mapping)
5. [To be defined](#volatility)

<a id='intro' />

## Introduction
[(back to top)](#content)

In [1]:
#Importing packages
import pandas as pd
import numpy as np
import re
import unidecode

from tqdm.notebook import tqdm
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

import folium
from folium.plugins import MarkerCluster

#display and HTML come from the public IPython.display module
#(importing display from IPython.core.display is deprecated)
from IPython.display import display, HTML
#Widen the notebook container to 90% of the browser window
display(HTML("<style>.container { width:90% !important; }</style>"))

<a id='scraping' />

## Web Scraping
[(back to top)](#content)

In [2]:
#Local chromedriver binary, the listing page to scrape and the cheap-restaurants filter checkbox
chromedriver_path = '/Users/nicolamori/Library/Application Support/binman_chromedriver/mac64/85.0.4183.38/chromedriver'
listing_url = 'https://www.tripadvisor.com.ar/Restaurants-g312741-Buenos_Aires_Capital_Federal_District.html'
cheap_filter_xpath = '//*[@id="component_47"]/div/div[6]/div[2]/div[1]/div/label/div/span'

#Start Chrome, open the listing page and pause before touching it
driver = webdriver.Chrome(chromedriver_path)
driver.get(listing_url)
time.sleep(5)

#Click the cheap-restaurants filter, then pause again before continuing
cheap_filter = driver.find_element_by_xpath(cheap_filter_xpath)
cheap_filter.click()
time.sleep(3)

In [3]:
%%time

#Empty list
total = []

#Number of pages
npages = int(driver.find_element_by_xpath('//*[@id="EATERY_LIST_CONTENTS"]/div[2]/div/div/a[6]').text)

for h in tqdm(range(npages), position = 0, desc='1st loop'):
    
    #Restaurants elements
    container = driver.find_elements_by_class_name('_15_ydu6b')
    num_page_items = len(container)
    
    for i in tqdm(range(num_page_items), position = 1, leave=False, desc='2nd loop'):
        #Record original tab
        window_before = driver.window_handles[0]
        
        #Open tab
        element = container[i]
        driver.execute_script("arguments[0].click();", element)
        
        #Switch to new tab
        window_after = driver.window_handles[1]
        driver.switch_to.window(window_after)
        time.sleep(1)
        
        #Extract data with covid19 section
        try: 
            driver.find_element_by_xpath('//*[@data-tab="TABS_COVID19"]')
            
            name = driver.find_element_by_xpath('//*[@class="_3a1XQ88S"]').text
            try:cuisines = driver.find_element_by_xpath('//*[@id="taplc_detail_overview_cards_0"]/div/div[2]/div/div[2]/div/div/div[2]/div[2]/div[2]').text
            except NoSuchElementException: cuisines = None  
            try: rank = driver.find_element_by_xpath('//*[@class="_15QfMZ2L"]').text
            except NoSuchElementException: rank = None  
            nreviews = driver.find_element_by_xpath('//*[@class="_3Wub8auF"]').text
            
            address = driver.find_element_by_xpath('//*[@class="_2saB_OSe"]').text
            try:phone = driver.find_element_by_xpath('//*[@id="taplc_top_info_0"]/div/div/div[3]/span[2]/span/span[2]/a').text
            except NoSuchElementException: website = None  
            try: website = driver.find_element_by_xpath('//*[@id="taplc_top_info_0"]/div/div/div[3]/span[3]/span/a').get_attribute("href")
            except NoSuchElementException: website = None  
            
            try:score = driver.find_element_by_xpath('//*[@id="taplc_detail_overview_cards_0"]/div/div[2]/div/div[1]/div/div[1]/div[1]/span[2]/span').get_attribute("class")
            except NoSuchElementException: score = None  
            try:food = driver.find_element_by_xpath('//*[@id="taplc_detail_overview_cards_0"]/div/div[2]/div/div[1]/div/div[3]/div[2]/div[1]/span[3]/span').get_attribute("class")
            except NoSuchElementException: food = None
            try:service = driver.find_element_by_xpath('//*[@id="taplc_detail_overview_cards_0"]/div/div[2]/div/div[1]/div/div[3]/div[2]/div[2]/span[3]/span').get_attribute("class")
            except NoSuchElementException: service = None
            try:value = driver.find_element_by_xpath('//*[@id="taplc_detail_overview_cards_0"]/div/div[2]/div/div[1]/div/div[3]/div[2]/div[3]/span[3]/span').get_attribute("class")
            except NoSuchElementException: value = None
        
        #Extract data without covid19 section
        except NoSuchElementException:        
            
            name = driver.find_element_by_xpath('//*[@class="_3a1XQ88S"]').text
            try: cuisines = driver.find_element_by_xpath('//*[@id="taplc_detail_overview_cards_0"]/div/div[1]/div/div[2]/div/div/div[2]/div[2]/div[2]').text
            except NoSuchElementException: cuisines = None
            try: rank = driver.find_element_by_xpath('//*[@class="_15QfMZ2L"]').text
            except NoSuchElementException: rank = None  
            nreviews = driver.find_element_by_xpath('//*[@class="_3Wub8auF"]').text
            
            address = driver.find_element_by_xpath('//*[@class="_2saB_OSe"]').text
            try:phone = driver.find_element_by_xpath('//*[@id="taplc_top_info_0"]/div/div/div[3]/span[2]/span/span[2]/a').text
            except NoSuchElementException: website = None  
            try: website = driver.find_element_by_xpath('//*[@id="taplc_top_info_0"]/div/div/div[3]/span[3]/span/a').get_attribute("href")
            except NoSuchElementException: website = None  
            
            try:score = driver.find_element_by_xpath('//*[@id="taplc_detail_overview_cards_0"]/div/div[1]/div/div[1]/div/div[1]/div[1]/span[2]/span').get_attribute("class")
            except NoSuchElementException: score = None  
            try:food = driver.find_element_by_xpath('//*[@id="taplc_detail_overview_cards_0"]/div/div[1]/div/div[1]/div/div[3]/div[2]/div[1]/span[3]/span').get_attribute("class")
            except NoSuchElementException: food = None
            try:service = driver.find_element_by_xpath('//*[@id="taplc_detail_overview_cards_0"]/div/div[1]/div/div[1]/div/div[3]/div[2]/div[2]/span[3]/span').get_attribute("class")
            except NoSuchElementException: service = None
            try:value = driver.find_element_by_xpath('//*[@id="taplc_detail_overview_cards_0"]/div/div[1]/div/div[1]/div/div[3]/div[2]/div[3]/span[3]/span').get_attribute("class")
            except NoSuchElementException: value = None
        
        #Add data to list
        new = ((name, cuisines, rank, nreviews, address, phone, website, score, food, service, value))
        total.append(new)
        
        #Close tab and switch to main window
        driver.close()
        driver.switch_to.window(window_before)
    
    #Switch to next page
    try:
        if h==0: element2 = driver.find_element_by_xpath('//*[@id="EATERY_LIST_CONTENTS"]/div[2]/div/a')
        else: element2 = driver.find_element_by_xpath('//*[@id="EATERY_LIST_CONTENTS"]/div[2]/div/a[2]')        
        
        driver.execute_script("arguments[0].click();", element2)
        time.sleep(3)
    except NoSuchElementException: driver.close()

#List to dataframe
df = pd.DataFrame(total,columns=['name', 'cuisines', 'rank', 'nreviews', 'address', 'phone', 'website', 'score', 'food', 'service', 'value'])

df.to_csv('affordable_restaurantss.tsv', sep='\t', index=False)

HBox(children=(FloatProgress(value=0.0, description='1st loop', max=28.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='2nd loop', max=30.0, style=ProgressStyle(description_widt…




KeyboardInterrupt: 

<a id='cleansing' />

## Data Cleansing
[(back to top)](#content)

In [11]:
#Load the scraped table and turn every field into text for the string clean-up cells
scraped_file = 'affordable_restaurantss.tsv'
restaurants = pd.read_csv(scraped_file, sep='\t').astype(str)

restaurants.head(2)

Unnamed: 0,name,cuisines,rank,nreviews,address,phone,website,score,food,service,value
0,La Choripanería,"Argentina, Latina, Comida rápida, Parrilla",#16 de 5.004 restaurantes en Buenos Aires,316 opiniones,"Bolívar 954 Local 42, Mercado de San Telmo, Bu...",+54 11 4244-9478,,ui_bubble_rating bubble_45,ui_bubble_rating bubble_50,ui_bubble_rating bubble_45,ui_bubble_rating bubble_45
1,Javi's Parrilla Restaurant,"Parrillada, Argentina, Parrilla",#29 de 5.004 restaurantes en Buenos Aires,157 opiniones,"Chile 800 Esquina Piedras, Buenos Aires C1098A...",+54 11 3402-7743,http://www.facebook.com/Javisrestaurantargentina/,ui_bubble_rating bubble_45,ui_bubble_rating bubble_45,ui_bubble_rating bubble_50,ui_bubble_rating bubble_45


In [12]:
#Scoring columns hold CSS class names such as 'ui_bubble_rating bubble_45'
rating_columns = ('score', 'food', 'service', 'value')
css_prefix = 'ui_bubble_rating bubble_'

for column in rating_columns:
    bubbles = restaurants[column].str.replace(css_prefix, '', regex=False)
    #'45' becomes 4.5; anything that is not a number becomes NaN
    restaurants[column] = pd.to_numeric(bubbles, errors='coerce').div(10)

In [13]:
#Number of reviews
#The site writes numbers with '.' as thousands separator (see '5.004' in the rank text),
#so '1.234 opiniones' must become 1234, not 1.234. Only the leading number is kept,
#which also handles labels other than ' opiniones' (presumably a singular form exists - TODO confirm).
review_digits = (restaurants['nreviews']
                 .astype(str)
                 .str.extract(r'(\d[\d.]*)', expand=False)
                 .str.replace('.', '', regex=False))
restaurants['nreviews'] = pd.to_numeric(review_digits, errors='coerce')

#Keep only restaurants with at least one review; copy so later column assignments
#work on an independent frame instead of a slice
restaurants = restaurants[restaurants['nreviews'] > 0].copy()

In [14]:
#Addresses format and removing not normalized rows
#Raw string: "\d" inside a normal string literal is an invalid escape sequence
STREET_AND_NUMBER = re.compile(r"(.*?)\d+")
STREET_PREFIXES = ('Calle ', 'Avenida ', 'Av. ')


def normalize_address(raw):
    """Return '<street> <number>, CABA, Argentina' for a scraped address.

    Everything up to and including the first run of digits is kept and the
    street-type prefixes are removed. Addresses without any digit are
    returned as 'Not normalized'.
    """
    match = STREET_AND_NUMBER.search(raw)
    if match is None:
        return 'Not normalized'

    street = match.group() + ', CABA, Argentina'
    for prefix in STREET_PREFIXES:
        street = street.replace(prefix, '')
    return street


#assign builds a new frame, so this also works when restaurants is a filtered slice
restaurants = restaurants.assign(
    new_address=[normalize_address(raw) for raw in restaurants['address']])

restaurants = restaurants[restaurants['new_address'] != 'Not normalized'].copy()

In [15]:
#Rank format: drop the city suffix and keep only the '#N de TOTAL ' part
city_suffix = 'restaurantes en Buenos Aires'
restaurants['rank'] = restaurants['rank'].str.replace(city_suffix, '', regex=False)

restaurants.head(2)

Unnamed: 0,name,cuisines,rank,nreviews,address,phone,website,score,food,service,value,new_address
0,La Choripanería,"Argentina, Latina, Comida rápida, Parrilla",#16 de 5.004,316.0,"Bolívar 954 Local 42, Mercado de San Telmo, Bu...",+54 11 4244-9478,,4.5,5.0,4.5,4.5,"Bolívar 954, CABA, Argentina"
1,Javi's Parrilla Restaurant,"Parrillada, Argentina, Parrilla",#29 de 5.004,157.0,"Chile 800 Esquina Piedras, Buenos Aires C1098A...",+54 11 3402-7743,http://www.facebook.com/Javisrestaurantargentina/,4.5,4.5,5.0,4.5,"Chile 800, CABA, Argentina"


<a id='mapping' />

## Mapping
[(back to top)](#content)

In [2]:
#Obtaining coordinates from Nominatim, with at least one second between requests
geolocator = Nominatim(user_agent="restaurants_coord")
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)


def to_point(found):
    """Return the geocoder result's point as a plain tuple, or None when there is no result."""
    if found:
        return tuple(found.point)
    return None


#Register progress_apply on pandas objects so the lookups show a progress bar
tqdm.pandas()
normalized_addresses = restaurants['new_address']
restaurants['location'] = normalized_addresses.progress_apply(geocode)

restaurants['point'] = restaurants['location'].map(to_point)

In [4]:
#Save the geocoded table as a tab-separated file, without the index
geocoded_file = 'affordable_restaurantss2.tsv'
restaurants.to_csv(geocoded_file, sep='\t', index=False)

In [5]:
#Reload the geocoded table from the tab-separated file
restaurants = pd.read_csv('affordable_restaurantss2.tsv', delimiter='\t')

In [6]:
#Discard the rows that have no coordinates stored in 'point'
restaurants = restaurants.dropna(subset=['point'])

In [7]:
#Formatting coordinates: 'point' is text like '(lat, lon, alt)', split it into its three parts
coords = restaurants['point'].str.split(",", expand=True)
coords.columns = ['latitude', 'longitude', 'altitude']

#The first part still carries the opening parenthesis
coords['latitude'] = coords['latitude'].str.replace('(', '', regex=False)

for axis in ('latitude', 'longitude'):
    coords[axis] = pd.to_numeric(coords[axis], errors='coerce')

#Replace the raw address/point columns with numeric latitude and longitude (altitude is not kept)
restaurants = (restaurants
               .drop(['address', 'point'], axis=1)
               .join(coords[['latitude', 'longitude']])
               .reset_index(drop=True))

restaurants.head(2)

Unnamed: 0,name,cuisines,rank,nreviews,phone,website,score,food,service,value,new_address,location,latitude,longitude
0,La Choripanería,"Argentina, Latina, Comida rápida, Parrilla",#16 de 5.004,316.0,+54 11 4244-9478,,4.5,5.0,4.5,4.5,"Bolívar 954, CABA, Argentina","954, Bolívar, San Telmo, Comuna 1, Ciudad Autó...",-34.619072,-58.372906
1,Javi's Parrilla Restaurant,"Parrillada, Argentina, Parrilla",#29 de 5.004,157.0,+54 11 3402-7743,http://www.facebook.com/Javisrestaurantargentina/,4.5,4.5,5.0,4.5,"Chile 800, CABA, Argentina","800, Chile, Monserrat, Comuna 1, Buenos Aires,...",-34.616397,-58.377319


In [8]:
m = folium.Map(location=[-34.6083,-58.4212], tiles='cartodbpositron', zoom_start=12)


marker_cluster = MarkerCluster(name='cheap restaurants', overlay=True, control=False, icon_create_function=None)

POPUP_TEMPLATE = ('<b>Name:</b> {} <br><b>Cuisines:</b> {} <br><b>Rank:</b> {} <br><b>Reviews:</b> {} '
                  '<br><b>Score:</b> {}  <br><b>Address:</b> {} <br><b>Phone:</b> {} <br><b>Website:</b> {}')


def is_missing(value):
    """True for NaN/None and for the text forms ('nan', 'None', '') they take after astype(str)."""
    return pd.isna(value) or str(value) in ('', 'nan', 'None')


def popup_html(row):
    """Build the popup HTML for one restaurant row.

    The website is rendered as a clickable link showing its own URL; a
    missing website or phone is shown as '-' instead of a link to 'nan'.
    """
    if is_missing(row['website']):
        website = '-'
    else:
        website = '<a href="{0}" target="_blank">{0}</a>'.format(row['website'])

    phone = '-' if is_missing(row['phone']) else row['phone']

    #Name is transliterated to ASCII and stripped of non-word characters, as before
    name = re.sub(r'[^\w]', ' ', unidecode.unidecode(str(row['name'])))

    return POPUP_TEMPLATE.format(name,
                                 row['cuisines'],
                                 row['rank'],
                                 row['nreviews'],
                                 row['score'],
                                 row['new_address'],
                                 phone,
                                 website)


#iterrows does not depend on the index being a fresh 0..n-1 range
for _, row in restaurants.iterrows():
    #Coordinates that failed numeric conversion are NaN and cannot be placed on the map
    if pd.isna(row['latitude']) or pd.isna(row['longitude']):
        continue

    marker = folium.Marker(location=(row['latitude'], row['longitude']))
    folium.Popup(popup_html(row), max_width=500).add_to(marker)
    marker_cluster.add_child(marker)

marker_cluster.add_to(m)

folium.LayerControl().add_to(m)

m

In [None]:
#https://python-visualization.github.io/folium/plugins.html#folium-plugins
#https://nbviewer.jupyter.org/github/python-visualization/folium/blob/master/examples/MarkerCluster.ipynb

# TODO
## popup format
## try layers on the map