# Scraping
First, I import all the libraries I'll need for the scraping.

In [14]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException
import re

Now I scrape the website. I collect the listing data, store it in a pandas DataFrame, and then save it to a .csv file.

In [29]:
# Scrape Airbnb search results for Barcelona (name, price, rating) into a
# pandas DataFrame and save them to 'airbnb_barcelona.csv'.
#
# Listings without a parsable price or rating are skipped. Scraping stops at
# MAX_PROPERTIES rows, when there is no "Next" link, or on a page-level error;
# whatever was collected up to that point is still written to the CSV.

MAX_PROPERTIES = 100   # stop once this many listings have been collected
PAGE_LOAD_SECONDS = 5  # fixed pause that gives each results page time to render
COLUMNS = ['Name', 'Price in USD', 'Rating out of 5']

# Compiled once instead of being looked up for every listing.
PRICE_RE = re.compile(r"\$\d+")     # first "$<digits>" in the price row
RATING_RE = re.compile(r"\d+,\d+")  # comma-decimal rating such as "4,85"

# Initialize the Selenium WebDriver
opts = Options()
opts.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
opts.add_argument("--headless")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opts)

# Airbnb search results page for Barcelona
url = "https://www.airbnb.com/s/Barcelona--Spain/homes"

properties = []

# try/finally guarantees the headless browser is closed even when scraping
# fails part-way; otherwise the Chrome process would be leaked.
try:
    driver.get(url)
    time.sleep(PAGE_LOAD_SECONDS)

    while len(properties) < MAX_PROPERTIES:
        try:
            names = driver.find_elements(By.XPATH, '//div[@data-testid="listing-card-title"]')
            prices = driver.find_elements(By.XPATH, '//div[@data-testid="price-availability-row"]')
            ratings = driver.find_elements(By.XPATH, '//div[@class="t1a9j9y7 atm_da_1ko3t4y atm_dm_kb7nvz atm_fg_h9n0ih dir dir-ltr"]')
        except Exception as e:
            # Stop instead of retrying: retrying the same page with no state
            # change would loop forever. Rows collected so far are kept.
            print(f"Error extracting data from the page: {e}")
            break

        # NOTE(review): the three lists are paired by position, so a card that
        # lacks a rating element presumably shifts later ratings onto the
        # wrong listing -- confirm against the page structure.
        # zip() stops at the shortest list, so a missing element can no longer
        # raise IndexError.
        for name_el, price_el, rating_el in zip(names, prices, ratings):
            try:
                name = name_el.text.replace('"', "").strip()
                price_match = PRICE_RE.search(price_el.text)
                rating_match = RATING_RE.search(rating_el.text)
            except StaleElementReferenceException as e:
                # The element left the DOM after it was located; skip only
                # this listing rather than the whole page.
                print(f"Error extracting data for one listing: {e}")
                continue

            # Keep only listings where both price and rating could be parsed.
            if price_match is None or rating_match is None:
                continue

            properties.append({
                'Name': name,
                'Price in USD': price_match.group(0),
                # Normalise the comma decimal separator to a dot.
                'Rating out of 5': rating_match.group(0).replace(",", "."),
            })

            # Enforce the cap exactly instead of finishing the current page.
            if len(properties) >= MAX_PROPERTIES:
                break

        if len(properties) >= MAX_PROPERTIES:
            break

        # Click the "Next" link (labelled "Siguiente" on the page) to load
        # more listings; stop when it is missing or no longer attached.
        try:
            next_button = driver.find_element(By.XPATH, '//a[@aria-label="Siguiente"]')
            next_button.click()
        except (NoSuchElementException, StaleElementReferenceException) as e:
            print("No more pages or error navigating to next page:", e)
            break

        # Wait for the next page to load before extracting data
        time.sleep(PAGE_LOAD_SECONDS)
finally:
    # Close the browser to end the session
    driver.quit()

# Explicit columns keep the CSV header present even when nothing was scraped.
df = pd.DataFrame(properties, columns=COLUMNS)

# Save the DataFrame to a CSV file
df.to_csv('airbnb_barcelona.csv', index=False)
print("Data saved to airbnb_barcelona.csv")

Data saved to airbnb_barcelona.csv
