In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

#impoer selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
import time

In [2]:
base_url = 'https://www.booking.com'

In [3]:
df = pd.read_csv('../data/raw/booking_urls.csv')
df.head()

Unnamed: 0,name,link
0,New York,/attractions/searchresults/us/new-york.en-gb.h...
1,Las Vegas,/attractions/searchresults/us/las-vegas.en-gb....
2,Key West,/attractions/searchresults/us/key-west.en-gb.h...
3,San Diego,/attractions/searchresults/us/san-diego.en-gb....
4,Miami,/attractions/searchresults/us/miami.en-gb.html...


In [4]:
# load url in selenium
driver = webdriver.Chrome()

In [5]:
def scrape_single_attraction(city_name, soup):
    name = soup.find('h3', class_='css-jv2qn6')
    name = name.text if name is not None else ""
    description = soup.find('div', class_='css-6k49yo')
    description = description.text if description is not None else ""
    duration = soup.find('div', class_='a53cbfa6de css-j786b1')
    duration = duration.text if duration is not None else ""
    rating = soup.find('span', class_='a53cbfa6de css-35ezg3')
    rating = rating.text if rating is not None else ""
    reviews = soup.find('span', class_='a53cbfa6de')
    reviews = reviews.text if reviews is not None else ""
    price = soup.find('div', class_='e1eebb6a1e css-13pzcpe')
    price = price.text if price is not None else ""
    link = soup.find('a', class_='css-i6rjpg')
    link = link['href'] if link is not None else ""
    
    attraction = {
        'city': city_name,
        'name': name,
        'description': description,
        'duration': duration,
        'rating': rating,
        'reviews': reviews,
        'price': price,
        'link': link,
    }
    return attraction

In [6]:
# soup = BeautifulSoup(driver.page_source, 'html.parser')

In [7]:
# get by css selector from soup
# soup.select('button.a83ed08757.c21c56c305.bf0537ecb5.f671049264.d2529514af.af7297d90d')

In [8]:
def scrape_attractions_city(city_name, city_url):
    driver.get(city_url)
    wait = WebDriverWait(driver, 10)

    while True:
        try:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(5)
            see_more = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'div.css-1f31mt9 button')))
            driver.execute_script("arguments[0].scrollIntoView();", see_more)
            driver.execute_script("arguments[0].click();", see_more)
            time.sleep(5) 
            continue
        except TimeoutException:
            print('No more button to show')
            break
        except Exception as e:
            print(e)
            break
        
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    cards = soup.find_all('div', class_='b817090550 b736e9e3f4')
    city_attractions = []
    for card in cards:
        attraction = scrape_single_attraction(city_name, card)
        city_attractions.append(attraction)
    
    return city_attractions


In [9]:
all_attractions = []
for i, destination in df.iterrows():
    city_name = destination['name']
    city_url = base_url + destination['link']
    city_attraction = scrape_attractions_city(city_name, city_url)
    for attraction in city_attraction:
        all_attractions.append(attraction)

No more button to show


In [10]:
# convert attractions to a dataframe
attractions_df = pd.DataFrame(all_attractions)
attractions_df.head()

Unnamed: 0,city,name,description,duration,rating,reviews,price,link
0,New York,SUMMIT One Vanderbilt Tickets,Digital art installations and views of New Yor...,Duration: 2 hours,4.7,4.7,CAD 63.57,/attractions/us/prcmokyuz7um-summit-one-vander...
1,New York,9/11 Memorial & Museum Admission,Chance to visit a memorial and museum that's d...,,4.8,4.8,CAD 36.66,/attractions/us/prgstbyhf5aj-911-memorial-muse...
2,New York,New York CityPASS,A sightseeing pass to explore Big Apple attrac...,,4.5,4.5,CAD 198.23,/attractions/us/prj7dhkaezyz-new-york-city-pas...
3,New York,One-hour Sightseeing Yacht Cruise,A narrated cruise with views of the Statue of ...,,3.8,3.8,CAD 21.72,/attractions/us/prulfebtv6ii-one-hour-sightsee...
4,New York,SUMMIT One Vanderbilt Ticket,Opportunity to go on an immersive multisensory...,,4.5,4.5,CAD 63.57,/attractions/us/pr3zhwuklbbp-summit-one-vander...


In [11]:
len(attractions_df)

1271

In [12]:
attractions_df.to_csv('../data/raw/city_attractions.csv', index=False)