In [1]:

#
import re
import time
import pandas as pd

#
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException

#
usa_cities = pd.read_csv("../data/usa_cities.csv")
usa_states = pd.read_csv("../data/usa_states.csv")

In [2]:
# This function takes a state and the corresponding url and returns a list of dicts with each state's touristic attractions
def scrap_state_attractions(state: str, url: str, report=True) -> list:
    state_attractions = pd.DataFrame()
    failed_states = [] # to store errors while scrapping

    # Instanciate and configurate driver
    chrome_options = selenium.webdriver.chrome.options.Options()
    chrome_options.add_argument('--disable-infobars') # unables images loading
    driver = webdriver.Chrome(options=chrome_options)

    # Connect to url and wait to load
    driver.get(url) if report else None
    print(state, url)
    time.sleep(5)

    try:
        for page in range(10):
            print(f"{page+1}/10") if report else None

            # Find each attraction box
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight*0.85);") # scroll to load page and buttons
            wait = WebDriverWait(driver, 10) # 
            attraction_divs = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "jemSU"))) # find each attraction box

            for div in attraction_divs:
                try:
                    url = div.find_element(By.CLASS_NAME ,"BMQDV")
                    parts = div.text.split("\n")
                    if 15 > len(parts) > 2 and url:
                        if parts[0] == '2023':
                            del parts[0]
                            print(parts[0]) if report else None
                            print(parts[1]) if report else None
                            print(parts[2]) if report else None
                            attractions = pd.DataFrame({
                                "attraction": [parts[0]],
                                "n_reviews": [parts[1]],
                                "categories": [parts[2]]})
                            state_attractions = pd.concat([state_attractions, attractions], ignore_index=True)
                
                except:
                    pass
            
            next_page_button = driver.find_element(By.XPATH, '/html/body/div[1]/main/div[1]/div/div/div[3]/div/div[2]/div[2]/div[2]/div/div/div[2]/div/div[2]/div/div/section[40]/div/div[1]/div/div[1]/div[2]')
            next_page_button.click()
            time.sleep(2)
    
    except:
        failed_states.append(state)
        print(f"{state} state added to failed states.") if report else None

    # Close browser and return list with hotels data
    driver.quit()
    return state_attractions

Oregon

In [3]:
# Scrape
oregon_attractions = scrap_state_attractions("Oregon", "https://www.tripadvisor.com.ar/Attractions-g28958-Activities-oa0-Oregon.html", report=False)

Oregon https://www.tripadvisor.com.ar/Attractions-g28958-Activities-oa0-Oregon.html


In [4]:
oregon_attractions

In [None]:
# Transform
oregon_attractions['attraction'] = oregon_attractions['attraction'].str.replace(r'^\d+\.\s*', '', regex=True) # Process attraction
oregon_attractions['categories'] = oregon_attractions['categories'].str.replace(' • ', ', ') # Process categories

# Explore and export
personalised_graphics.missing_values_heatmap(oregon_attractions, "oregon_attractions")
oregon_attractions.to_csv("data/tripadvisor/oregon_attractions.csv")

Utah

In [6]:
# Scrape
utah_attractions = scrapping.scrap_state_attractions("Utah", "https://www.tripadvisor.com.ar/Attractions-g28965-Activities-a_allAttractions.true-Utah.html", report=False)

# Transform
utah_attractions['attraction'] = utah_attractions['attraction'].str.replace(r'^\d+\.\s*', '', regex=True) # Process attraction
utah_attractions['categories'] = utah_attractions['categories'].str.replace(' • ', ', ') # Process categories

# Explore and export
personalised_graphics.missing_values_heatmap(utah_attractions, "utah_attractions")
utah_attractions.to_csv("data/tripadvisor/utah_attractions.csv")

Utah https://www.tripadvisor.com.ar/Attractions-g28965-Activities-a_allAttractions.true-Utah.html


KeyError: 'attraction'

California

In [7]:
# Scrape
california_attractions = scrapping.scrap_state_attractions("California", "https://www.tripadvisor.com.ar/Attractions-g28926-Activities-oa0-California.html", report=False)

# Transform
california_attractions['attraction'] = california_attractions['attraction'].str.replace(r'^\d+\.\s*', '', regex=True) # Process attraction
california_attractions['categories'] = california_attractions['categories'].str.replace(' • ', ', ') # Process categories

# Explore and export
personalised_graphics.missing_values_heatmap(utah_attractions, "california_attractions")
california_attractions.to_csv("data/tripadvisor/california_attractions.csv")

California https://www.tripadvisor.com.ar/Attractions-g28926-Activities-oa0-California.html


KeyError: 'attraction'

In [None]:
#
west_coast_states = [[],
                #[],
                #["Arizona", "https://www.tripadvisor.com.ar/Attractions-g28924-Activities-oa0-Arizona.html"],
                #["Nevada", "https://www.tripadvisor.com.ar/Attractions-g28949-Activities-oa0-Nevada.html"],
                #["Oregon", "https://www.tripadvisor.com.ar/Attractions-g1438848-Activities-oa0-Oregon_Coast_Oregon.html"],
                #["Washington", "https://www.tripadvisor.com.ar/Attractions-g28968-Activities-oa0-Washington.html"],
                ["Idaho", "https://www.tripadvisor.com.ar/Attractions-g28933-Activities-oa0-Idaho.html"]]

#

Attractions atributes

In [23]:
# This function takes a state and the corresponding url and returns a list of dicts with each state's touristic attractions
def scrap_attractions_attribute(urls: str, report=True) -> list:
    
    attributes = [] # to store data while scrapping
    failed_urls = [] # to store errors while scrapping

    # Instanciate and configurate driver
    chrome_options = Options()
    chrome_options.add_argument('--headless') # unables GUI
    chrome_options.add_argument('--disable-infobars') # unables images loading
    driver = webdriver.Chrome(options=chrome_options)

    for i, url in enumerate(urls):
        print(f'{i+1}/{len(urls)}')
        try:         
            # 
            driver.get(url)
            time.sleep(5)
            #
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight*0.85);") # scroll to load page and buttons
            path_element = driver.find_element(By.CLASS_NAME, 'oPZZx')
            html = path_element.get_attribute("outerHTML")
            matches = re.search(r'center=([\d.-]+),([\d.-]+)', html)
            #
            if matches:
                attributes.append({"url": url, "latitud": matches.group(1), "longitude": matches.group(2)})
            else:
                attributes.append({"url": url, "latitud": matches.group(1), "longitude": matches.group(2)})
                
        except:
            attributes.append({"url": url, "latitud": None, "longitude": None})

    return attributes

# Example
#utah_attractions = pd.read_csv("data/etl/raw_Utah_attractions.csv")
#urls = utah_attractions["reviews_url"].tolist()
#data = pd.DataFrame(scrap_attractions_attribute(urls))
#data.to_csv('hotel_attributes.csv', index=False)

#data

In [9]:
# scrape attractions attributes
utah_attractions = pd.read_csv("data/booking/raw_utah_attractions.csv")
urls = utah_attractions["reviews_url"].tolist()
utah_attractions_atributes = pd.DataFrame(scrap_attractions_attribute(urls))
utah_attractions_atributes

FileNotFoundError: [Errno 2] No such file or directory: 'data/booking/raw_utah_attractions.csv'