#### ATTRACTIONS EXTRACTION PROCESS

This notebook was used to manually run the scripts that scrape the most popular tourist attractions in each US state.

From the scrapping.py file we use scrape_state_attractions. This function takes a state and its corresponding URL and returns that state's tourist attractions as a list of records, which the notebook then loads into a pandas DataFrame:

- attributes
- scores

The data was scraped from the [Tripadvisor](https://www.tripadvisor.com.ar/) website, from pages like this one: [Tripadvisor attractions in California](https://www.tripadvisor.com.ar/Attractions-g28926-Activities-oa0-California.html).

In [1]:
import scrapping
import pandas as pd

# Reference tables loaded from the shared data folder.
# usa_states supplies the "state" -> "state_id" lookup used while scraping.
usa_states = pd.read_csv(filepath_or_buffer="../files/data/usa_states.csv")
usa_cities = pd.read_csv(filepath_or_buffer="../files/data/usa_cities.csv")

In [2]:
#
import time

import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import WebDriverException

from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service as ChromeService

def scrape_state_attractions(state: str, url: str, report=True, max_pages: int = 10) -> list:
    """Scrape the attraction cards listed on a Tripadvisor state page.

    Opens ``url`` in a Chrome session, walks up to ``max_pages`` result pages
    and collects one record per attraction card whose first text line is
    ``'2023'``.

    Args:
        state: State name. It must match a value in the ``state`` column of the
            module-level ``usa_states`` frame so its ``state_id`` can be found.
        url: Address of the first results page for that state.
        report: When True, print a message if the state could not be scraped
            to completion.
        max_pages: Maximum number of result pages to visit (defaults to the
            previous fixed value of 10).

    Returns:
        A list of dicts with the keys ``state_id``, ``attraction``,
        ``categories`` and ``reviews_url``. Records gathered before a failure
        are still returned.
    """
    # Resolve the state id once, up front, instead of once per card. An unknown
    # state used to fail silently on every card; now it is reported and no
    # browser is launched for it.
    matching_ids = usa_states.loc[usa_states["state"] == state, "state_id"]
    if matching_ids.empty:
        if report:
            print(f"{state} state added to failed states.") # Console report
        return []
    state_id = matching_ids.iloc[0]

    states_attractions = [] # records collected so far

    # Instantiate and configure the driver.
    # Add the '--headless' argument here to run without a visible browser window.
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--disable-infobars') # hide Chrome's info bars
    driver = webdriver.Chrome(options=chrome_options)

    try:
        # Connect to the url and give the page time to load
        driver.get(url)
        time.sleep(5)
        wait = WebDriverWait(driver, 10)

        try:
            for _ in range(max_pages):
                # Scroll down so the page loads its cards and pagination buttons
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight*0.85);")
                attraction_divs = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "jemSU"))) # find each attraction box

                for div in attraction_divs:
                    try:
                        link = div.find_element(By.CLASS_NAME, "BMQDV")
                        parts = div.text.split("\n")
                        # NOTE(review): only cards whose first text line is '2023' are
                        # kept; every other card is skipped. Confirm this filter is intended.
                        if not (2 < len(parts) < 15) or parts[0] != '2023':
                            continue
                        fields = parts[1:] # drop the leading '2023' line
                        if len(fields) < 3:
                            continue # not enough lines to hold name and categories
                        record = {"state_id": state_id, "attraction": fields[0], "categories": fields[2], "reviews_url": link.get_attribute('href')}
                    except WebDriverException:
                        # Best effort: skip cards without a link or that went stale
                        continue
                    states_attractions.append(record)

                # The absolute XPath is tied to the current page layout; when it stops
                # matching (including on the last page), the lookup raises and ends the loop.
                next_page_button = driver.find_element(By.XPATH, '/html/body/div[1]/main/div[1]/div/div/div[3]/div/div[2]/div[2]/div[2]/div/div/div[2]/div/div[2]/div/div/section[40]/div/div[1]/div/div[1]/div[2]')
                next_page_button.click()

        except Exception:
            # Keep whatever was collected; KeyboardInterrupt is no longer swallowed.
            if report:
                print(f"{state} state added to failed states.") # Console report
    finally:
        # Always close the browser, even if loading the page raised
        driver.quit()

    return states_attractions


WEST - Pacific

In [3]:
# West Pacific states paired with the Tripadvisor listing page scraped for each.
pacific = [["Alaska", "https://www.tripadvisor.com.ar/Attractions-g28923-Activities-oa0-Alaska.html"],
                ["Hawaii", "https://www.tripadvisor.com.ar/Attractions-g29217-Activities-oa0-Island_of_Hawaii_Hawaii.html"],  # NOTE(review): URL names the "Island of Hawaii" listing, not a state-wide one - confirm
                ["California", "https://www.tripadvisor.com.ar/Attractions-g28926-Activities-oa0-California.html"],
                ["Oregon", "https://www.tripadvisor.com.ar/Attractions-g1438848-Activities-oa0-Oregon_Coast_Oregon.html"],  # NOTE(review): URL names the "Oregon Coast" listing, not a state-wide one - confirm
                ["Washington", "https://www.tripadvisor.com.ar/Attractions-g28968-Activities-oa0-Washington.html"]]

# Scrape each state into its own frame. report=False silences the scraper, so
# states that came back empty are listed explicitly here.
state_frames = [
    pd.DataFrame(scrapping.scrape_state_attractions(state_name, state_url, report=False))
    for state_name, state_url in pacific
]
empty_states = [name for (name, _), frame in zip(pacific, state_frames) if frame.empty]
if empty_states:
    print(f"No attractions scraped for: {', '.join(empty_states)}")

# Stop before the transform/export when nothing was scraped, instead of failing
# later with a KeyError on the missing 'attraction' column.
scraped_frames = [frame for frame in state_frames if not frame.empty]
if not scraped_frames:
    raise RuntimeError("No West Pacific attractions were scraped; nothing to export.")

# Single concat instead of growing the frame inside the loop.
pacific_attractions = pd.concat(scraped_frames, ignore_index=True)

# Transform
pacific_attractions = pacific_attractions.assign(
    attraction=lambda df: df["attraction"].str.replace(r'^\d+\.\s*', '', regex=True),  # drop the leading "N. " prefix
    categories=lambda df: df["categories"].str.replace(' • ', ', ', regex=False),  # bullet-separated -> comma-separated
    region="West Pacific",
)

# Export
pacific_attractions.to_csv("../files/data/tripadvisor/pacific_attractions.csv")

In [14]:
# Done for Washington only, appending the data to pacific_attractions.csv,
# because of Tripadvisor blocking issues.
pacific = [["Washington", "https://www.tripadvisor.com.ar/Attractions-g28968-Activities-oa0-Washington.html"]]

state_frames = [
    pd.DataFrame(scrapping.scrape_state_attractions(state_name, state_url, report=False))
    for state_name, state_url in pacific
]

# Stop before touching the CSV when nothing was scraped, instead of failing
# later with a KeyError on the missing 'attraction' column.
scraped_frames = [frame for frame in state_frames if not frame.empty]
if not scraped_frames:
    raise RuntimeError("No Washington attractions were scraped; pacific_attractions.csv left unchanged.")

pacific_attractions = pd.concat(scraped_frames, ignore_index=True)

# Transform
pacific_attractions = pacific_attractions.assign(
    attraction=lambda df: df["attraction"].str.replace(r'^\d+\.\s*', '', regex=True),  # drop the leading "N. " prefix
    categories=lambda df: df["categories"].str.replace(' • ', ', ', regex=False),  # bullet-separated -> comma-separated
    region="West Pacific",
)

# Load existing data. The file may or may not carry a saved index column
# ("Unnamed: 0") depending on which cell wrote it last, so accept both layouts
# rather than assuming the first column is always the index.
existing_data = pd.read_csv("../files/data/tripadvisor/pacific_attractions.csv")
existing_data = existing_data.drop(columns=["Unnamed: 0"], errors="ignore")

# Append new data; dropping exact duplicate rows keeps the cell safe to re-run.
combined_data = (
    pd.concat([existing_data, pacific_attractions], ignore_index=True)
    .drop_duplicates(ignore_index=True)
)

# Export
combined_data.to_csv("../files/data/tripadvisor/pacific_attractions.csv", index=False)

In [16]:
# Read the exported file back to inspect it. Forward slashes keep the path
# valid on every OS (the backslash form only resolved on Windows).
pacifics = pd.read_csv('../files/data/tripadvisor/pacific_attractions.csv')
pacifics

Unnamed: 0,state_id,attraction,categories,reviews_url,region
0,AK,Mendenhall Glacier Visitor Center,"Centros de información turística, Formaciones ...",https://www.tripadvisor.com.ar/Attraction_Revi...,West Pacific
1,AK,Fountainhead Antique Auto Museum,Museos especializados,https://www.tripadvisor.com.ar/Attraction_Revi...,West Pacific
2,AK,Tracy Arm Fjord,Masas de agua,https://www.tripadvisor.com.ar/Attraction_Revi...,West Pacific
3,AK,Alaska Wildlife Conservation Center,Áreas de naturaleza y vida silvestre,https://www.tripadvisor.com.ar/Attraction_Revi...,West Pacific
4,AK,Denali,Montañas,https://www.tripadvisor.com.ar/Attraction_Revi...,West Pacific
...,...,...,...,...,...
275,WA,Mount St. Helens,"Montañas, Volcanes",https://www.tripadvisor.com.ar/Attraction_Revi...,West Pacific
276,WA,Bill & Melinda Gates Foundation Discovery Center,Museos especializados,https://www.tripadvisor.com.ar/Attraction_Revi...,West Pacific
277,WA,Waterfront Park,Parques,https://www.tripadvisor.com.ar/Attraction_Revi...,West Pacific
278,WA,Seattle Japanese Garden,Jardines,https://www.tripadvisor.com.ar/Attraction_Revi...,West Pacific


WEST - Mountain

In [18]:
# Mountain states paired with the Tripadvisor listing page scraped for each.
mountain = [["Arizona", "https://www.tripadvisor.com.ar/Attractions-g28924-Activities-oa0-Arizona.html"],
                ["Colorado", "https://www.tripadvisor.com.ar/Attractions-g28927-Activities-oa0-Colorado.html"],
                ["Idaho", "https://www.tripadvisor.com.ar/Attractions-g28933-Activities-oa0-Idaho.html"],
                ["Montana", "https://www.tripadvisor.com.ar/Attractions-g28947-Activities-oa0-Montana.html"],
                ["Nevada", "https://www.tripadvisor.com.ar/Attractions-g28949-Activities-oa0-Nevada.html"],
                ["New Mexico", "https://www.tripadvisor.com.ar/Attractions-g28952-Activities-oa0-New_Mexico.html"],
                ["Utah", "https://www.tripadvisor.com.ar/Attractions-g28965-Activities-oa0-Utah.html"],
                ["Wyoming", "https://www.tripadvisor.com.ar/Attractions-g28973-Activities-oa0-Wyoming.html"]]

# Scrape each state into its own frame. report=False silences the scraper, so
# states that came back empty are listed explicitly here.
state_frames = [
    pd.DataFrame(scrapping.scrape_state_attractions(state_name, state_url, report=False))
    for state_name, state_url in mountain
]
empty_states = [name for (name, _), frame in zip(mountain, state_frames) if frame.empty]
if empty_states:
    print(f"No attractions scraped for: {', '.join(empty_states)}")

# Stop before the transform/export when nothing was scraped, instead of failing
# later with a KeyError on the missing 'attraction' column.
scraped_frames = [frame for frame in state_frames if not frame.empty]
if not scraped_frames:
    raise RuntimeError("No Mountain attractions were scraped; nothing to export.")

# Single concat instead of growing the frame inside the loop.
mountain_attractions = pd.concat(scraped_frames, ignore_index=True)

# Transform
mountain_attractions = mountain_attractions.assign(
    attraction=lambda df: df["attraction"].str.replace(r'^\d+\.\s*', '', regex=True),  # drop the leading "N. " prefix
    categories=lambda df: df["categories"].str.replace(' • ', ', ', regex=False),  # bullet-separated -> comma-separated
    region="Mountain",
)

# Export
mountain_attractions.to_csv("../files/data/tripadvisor/mountain_attractions.csv")

SOUTH - West South Central

In [28]:
# West South Central states paired with the Tripadvisor listing page scraped for each.
west_south_central = [["Arkansas", "https://www.tripadvisor.com.ar/Attractions-g28925-Activities-oa0-Arkansas.html"],
                ["Louisiana", "https://www.tripadvisor.com.ar/Attractions-g28939-Activities-oa0-Louisiana.html"],
                ["Oklahoma", "https://www.tripadvisor.com.ar/Attractions-g28957-Activities-oa0-Oklahoma.html"],
                ["Texas", "https://www.tripadvisor.com.ar/Attractions-g28964-Activities-oa0-Texas.html"]]

# Scrape each state into its own frame. report=False silences the scraper, so
# states that came back empty are listed explicitly here.
state_frames = [
    pd.DataFrame(scrapping.scrape_state_attractions(state_name, state_url, report=False))
    for state_name, state_url in west_south_central
]
empty_states = [name for (name, _), frame in zip(west_south_central, state_frames) if frame.empty]
if empty_states:
    print(f"No attractions scraped for: {', '.join(empty_states)}")

# Stop before the transform/export when nothing was scraped, instead of failing
# later with a KeyError on the missing 'attraction' column.
scraped_frames = [frame for frame in state_frames if not frame.empty]
if not scraped_frames:
    raise RuntimeError("No West South Central attractions were scraped; nothing to export.")

# Single concat instead of growing the frame inside the loop.
west_south_central_attractions = pd.concat(scraped_frames, ignore_index=True)

# Transform
west_south_central_attractions = west_south_central_attractions.assign(
    attraction=lambda df: df["attraction"].str.replace(r'^\d+\.\s*', '', regex=True),  # drop the leading "N. " prefix
    categories=lambda df: df["categories"].str.replace(' • ', ', ', regex=False),  # bullet-separated -> comma-separated
    region="West South Central",
)

# Export
west_south_central_attractions.to_csv("../files/data/tripadvisor/west_south_central_attractions.csv")

SOUTH - East South Central

In [27]:
# East South Central states paired with the Tripadvisor listing page scraped for each.
east_south_central = [["Alabama", "https://www.tripadvisor.com.ar/Attractions-g28922-Activities-oa0-Alabama.html"],
                ["Kentucky", "https://www.tripadvisor.com.ar/Attractions-g28938-Activities-oa0-Kentucky.html"],
                ["Mississippi", "https://www.tripadvisor.com.ar/Attractions-g28945-Activities-oa0-Mississippi.html"],
                ["Tennessee", "https://www.tripadvisor.com.ar/Attractions-g28963-Activities-oa0-Tennessee.html"]]

# Scrape each state into its own frame. report=False silences the scraper, so
# states that came back empty are listed explicitly here.
state_frames = [
    pd.DataFrame(scrapping.scrape_state_attractions(state_name, state_url, report=False))
    for state_name, state_url in east_south_central
]
empty_states = [name for (name, _), frame in zip(east_south_central, state_frames) if frame.empty]
if empty_states:
    print(f"No attractions scraped for: {', '.join(empty_states)}")

# Stop before the transform/export when nothing was scraped, instead of failing
# later with a KeyError on the missing 'attraction' column.
scraped_frames = [frame for frame in state_frames if not frame.empty]
if not scraped_frames:
    raise RuntimeError("No East South Central attractions were scraped; nothing to export.")

# Single concat instead of growing the frame inside the loop.
east_south_central_attractions = pd.concat(scraped_frames, ignore_index=True)

# Transform
east_south_central_attractions = east_south_central_attractions.assign(
    attraction=lambda df: df["attraction"].str.replace(r'^\d+\.\s*', '', regex=True),  # drop the leading "N. " prefix
    categories=lambda df: df["categories"].str.replace(' • ', ', ', regex=False),  # bullet-separated -> comma-separated
    region="East South Central",
)

# Export
east_south_central_attractions.to_csv("../files/data/tripadvisor/east_south_central_attractions.csv")

SOUTH - South Atlantic

In [26]:
# South Atlantic states paired with the Tripadvisor listing page scraped for each.
south_atlantic = [["Delaware", "https://www.tripadvisor.com.ar/Attractions-g28929-Activities-oa0-Delaware.html"],
                ["Florida", "https://www.tripadvisor.com.ar/Attractions-g28930-Activities-oa0-Florida.html"],
                ["Georgia", "https://www.tripadvisor.com.ar/Attractions-g28931-Activities-oa0-Georgia.html"],
                ["South Carolina", "https://www.tripadvisor.com.ar/Attractions-g28961-Activities-oa0-South_Carolina.html"],
                ["North Carolina", "https://www.tripadvisor.com.ar/Attractions-g28954-Activities-oa0-North_Carolina.html"],
                ["Maryland", "https://www.tripadvisor.com.ar/Attractions-g28941-Activities-oa0-Maryland.html"],
                ["Virginia", "https://www.tripadvisor.com.ar/Attractions-g28967-Activities-oa0-Virginia.html"],
                ["West Virginia", "https://www.tripadvisor.com.ar/Attractions-g28971-Activities-oa0-West_Virginia.html"],
                ["District of Columbia", "https://www.tripadvisor.com.ar/Attractions-g28970-Activities-oa0-Washington_DC_District_of_Columbia.html"]]

# Scrape each state into its own frame. report=False silences the scraper, so
# states that came back empty are listed explicitly here.
state_frames = [
    pd.DataFrame(scrapping.scrape_state_attractions(state_name, state_url, report=False))
    for state_name, state_url in south_atlantic
]
empty_states = [name for (name, _), frame in zip(south_atlantic, state_frames) if frame.empty]
if empty_states:
    print(f"No attractions scraped for: {', '.join(empty_states)}")

# Stop before the transform/export when nothing was scraped, instead of failing
# later with a KeyError on the missing 'attraction' column.
scraped_frames = [frame for frame in state_frames if not frame.empty]
if not scraped_frames:
    raise RuntimeError("No South Atlantic attractions were scraped; nothing to export.")

# Single concat instead of growing the frame inside the loop.
south_atlantic_attractions = pd.concat(scraped_frames, ignore_index=True)

# Transform
south_atlantic_attractions = south_atlantic_attractions.assign(
    attraction=lambda df: df["attraction"].str.replace(r'^\d+\.\s*', '', regex=True),  # drop the leading "N. " prefix
    categories=lambda df: df["categories"].str.replace(' • ', ', ', regex=False),  # bullet-separated -> comma-separated
    region="South Atlantic",
)

# Export
south_atlantic_attractions.to_csv("../files/data/tripadvisor/south_atlantic_attractions.csv")

MIDWEST - West North Central

In [3]:
# West North Central states paired with the Tripadvisor listing page scraped for each.
west_north_central = [["North Dakota", "https://www.tripadvisor.com.ar/Attractions-g28955-Activities-oa0-North_Dakota.html"],
                ["South Dakota", "https://www.tripadvisor.com.ar/Attractions-g28962-Activities-oa0-South_Dakota.html"],
                ["Iowa", "https://www.tripadvisor.com.ar/Attractions-g28936-Activities-oa0-Iowa.html"],
                ["Kansas", "https://www.tripadvisor.com.ar/Attractions-g28937-Activities-oa0-Kansas.html"],
                ["Minnesota", "https://www.tripadvisor.com.ar/Attractions-g28944-Activities-oa0-Minnesota.html"],
                ["Missouri", "https://www.tripadvisor.com.ar/Attractions-g28946-Activities-oa0-Missouri.html"],
                ["Nebraska", "https://www.tripadvisor.com.ar/Attractions-g28948-Activities-oa0-Nebraska.html"]]

# Scrape each state into its own frame. report=False silences the scraper, so
# states that came back empty are listed explicitly here.
state_frames = [
    pd.DataFrame(scrapping.scrape_state_attractions(state_name, state_url, report=False))
    for state_name, state_url in west_north_central
]
empty_states = [name for (name, _), frame in zip(west_north_central, state_frames) if frame.empty]
if empty_states:
    print(f"No attractions scraped for: {', '.join(empty_states)}")

# Stop before the transform/export when nothing was scraped, instead of failing
# later with a KeyError on the missing 'attraction' column.
scraped_frames = [frame for frame in state_frames if not frame.empty]
if not scraped_frames:
    raise RuntimeError("No West North Central attractions were scraped; nothing to export.")

# Single concat instead of growing the frame inside the loop.
west_north_central_attractions = pd.concat(scraped_frames, ignore_index=True)

# Transform
west_north_central_attractions = west_north_central_attractions.assign(
    attraction=lambda df: df["attraction"].str.replace(r'^\d+\.\s*', '', regex=True),  # drop the leading "N. " prefix
    categories=lambda df: df["categories"].str.replace(' • ', ', ', regex=False),  # bullet-separated -> comma-separated
    region="West North Central",
)

# Export
west_north_central_attractions.to_csv("../files/data/tripadvisor/west_north_central_attractions.csv")

MIDWEST - East North Central

In [23]:
# East North Central states paired with the Tripadvisor listing page scraped for each.
east_north_central = [["Illinois", "https://www.tripadvisor.com.ar/Attractions-g28934-Activities-oa0-Illinois.html"],
                ["Indiana", "https://www.tripadvisor.com.ar/Attractions-g28935-Activities-oa0-Indiana.html"],
                ["Michigan", "https://www.tripadvisor.com.ar/Attractions-g28943-Activities-oa0-Michigan.html"],
                ["Ohio", "https://www.tripadvisor.com.ar/Attractions-g28956-Activities-oa0-Ohio.html"],
                ["Wisconsin", "https://www.tripadvisor.com.ar/Attractions-g28972-Activities-oa0-Wisconsin.html"]]

# Scrape each state into its own frame. report=False silences the scraper, so
# states that came back empty are listed explicitly here.
state_frames = [
    pd.DataFrame(scrapping.scrape_state_attractions(state_name, state_url, report=False))
    for state_name, state_url in east_north_central
]
empty_states = [name for (name, _), frame in zip(east_north_central, state_frames) if frame.empty]
if empty_states:
    print(f"No attractions scraped for: {', '.join(empty_states)}")

# Stop before the transform/export when nothing was scraped, instead of failing
# later with a KeyError on the missing 'attraction' column.
scraped_frames = [frame for frame in state_frames if not frame.empty]
if not scraped_frames:
    raise RuntimeError("No East North Central attractions were scraped; nothing to export.")

# Single concat instead of growing the frame inside the loop.
east_north_central_attractions = pd.concat(scraped_frames, ignore_index=True)

# Transform
east_north_central_attractions = east_north_central_attractions.assign(
    attraction=lambda df: df["attraction"].str.replace(r'^\d+\.\s*', '', regex=True),  # drop the leading "N. " prefix
    categories=lambda df: df["categories"].str.replace(' • ', ', ', regex=False),  # bullet-separated -> comma-separated
    region="East North Central",
)

# Export
east_north_central_attractions.to_csv("../files/data/tripadvisor/east_north_central_attractions.csv")

NORTHEAST - New England

In [21]:
# New England states paired with the Tripadvisor listing page scraped for each.
new_england = [["Connecticut", "https://www.tripadvisor.com.ar/Attractions-g28928-Activities-oa0-Connecticut.html"],
                ["Maine", "https://www.tripadvisor.com.ar/Attractions-g28940-Activities-oa0-Maine.html"],
                ["Massachusetts", "https://www.tripadvisor.com.ar/Attractions-g28942-Activities-oa0-Massachusetts.html"],
                ["New Hampshire", "https://www.tripadvisor.com.ar/Attractions-g28950-Activities-oa0-New_Hampshire.html"],
                ["Rhode Island", "https://www.tripadvisor.com.ar/Attractions-g28960-Activities-oa0-Rhode_Island.html"],
                ["Vermont", "https://www.tripadvisor.com.ar/Attractions-g28966-Activities-oa0-Vermont.html"]]

# Scrape each state into its own frame. report=False silences the scraper, so
# states that came back empty are listed explicitly here.
state_frames = [
    pd.DataFrame(scrapping.scrape_state_attractions(state_name, state_url, report=False))
    for state_name, state_url in new_england
]
empty_states = [name for (name, _), frame in zip(new_england, state_frames) if frame.empty]
if empty_states:
    print(f"No attractions scraped for: {', '.join(empty_states)}")

# Stop before the transform/export when nothing was scraped, instead of failing
# later with a KeyError on the missing 'attraction' column.
scraped_frames = [frame for frame in state_frames if not frame.empty]
if not scraped_frames:
    raise RuntimeError("No New England attractions were scraped; nothing to export.")

# Single concat instead of growing the frame inside the loop.
new_england_attractions = pd.concat(scraped_frames, ignore_index=True)

# Transform
new_england_attractions = new_england_attractions.assign(
    attraction=lambda df: df["attraction"].str.replace(r'^\d+\.\s*', '', regex=True),  # drop the leading "N. " prefix
    categories=lambda df: df["categories"].str.replace(' • ', ', ', regex=False),  # bullet-separated -> comma-separated
    region="New England",
)

# Export
new_england_attractions.to_csv("../files/data/tripadvisor/new_england_attractions.csv")

NORTHEAST - Mid Atlantic

In [22]:
# Mid Atlantic states paired with the Tripadvisor listing page scraped for each.
mid_atlantic = [["New Jersey", "https://www.tripadvisor.com.ar/Attractions-g28951-Activities-oa0-New_Jersey.html"],
                ["New York", "https://www.tripadvisor.com.ar/Attractions-g60763-Activities-oa0-New_York_City_New_York.html"],  # NOTE(review): URL names the "New York City" listing, not a state-wide one - confirm
                ["Pennsylvania", "https://www.tripadvisor.com.ar/Attractions-g28959-Activities-oa0-Pennsylvania.html"]]

# Scrape each state into its own frame. report=False silences the scraper, so
# states that came back empty are listed explicitly here.
state_frames = [
    pd.DataFrame(scrapping.scrape_state_attractions(state_name, state_url, report=False))
    for state_name, state_url in mid_atlantic
]
empty_states = [name for (name, _), frame in zip(mid_atlantic, state_frames) if frame.empty]
if empty_states:
    print(f"No attractions scraped for: {', '.join(empty_states)}")

# Stop before the transform/export when nothing was scraped, instead of failing
# later with a KeyError on the missing 'attraction' column.
scraped_frames = [frame for frame in state_frames if not frame.empty]
if not scraped_frames:
    raise RuntimeError("No Mid Atlantic attractions were scraped; nothing to export.")

# Single concat instead of growing the frame inside the loop.
mid_atlantic_attractions = pd.concat(scraped_frames, ignore_index=True)

# Transform
mid_atlantic_attractions = mid_atlantic_attractions.assign(
    attraction=lambda df: df["attraction"].str.replace(r'^\d+\.\s*', '', regex=True),  # drop the leading "N. " prefix
    categories=lambda df: df["categories"].str.replace(' • ', ', ', regex=False),  # bullet-separated -> comma-separated
    region="Mid Atlantic",
)

# Export
mid_atlantic_attractions.to_csv("../files/data/tripadvisor/mid_atlantic_attractions.csv")