# **Airbnb Listings Scraper**
<hr style="border:1px solid">

This notebook scrapes the data from Airbnb listings using BeautifulSoup and Selenium.

**Input:** A .csv file with a `url` column containing the URLs of the listings to be scraped. These URLs were collected from the Airbnb search results returned when a province is entered in the Airbnb search bar.

**Output:** Timestamp-named CSV files, each containing the scraped data for one batch of listings (one row per listing).

### Package Imports

In [1]:
import requests
from selenium import webdriver
import random, time
import pandas as pd
import datetime
import csv

### <font color=Grey> Functions that get the details from Airbnb </font>

In [2]:
def get_listing_data(url, driver, sleep_min, sleep_max):
    '''
    Scrapes the data of a single Airbnb listing.

    Four pages are opened in turn: the main listing page, then the reviews,
    amenities and location pages whose urls are derived by create_urls().
    After every page load the function sleeps for a random whole number of
    seconds while waiting for the page to load.

    Args:
        url: url of the Airbnb listing to be scraped
        driver: Selenium webdriver instance used to load and query the pages
        sleep_min: minimum sleep value after each page load (integer seconds)
        sleep_max: maximum sleep value after each page load (integer seconds,
            inclusive because it is passed to random.randint)

    Returns:
        A dictionary with fields as dictionary keys and scraped details as values.
        The keys 'type_airbnb', 'gbbb', 'price_per_night', 'latitude' and
        'longitude' are only present when a usable element was found.
    '''

    def find(key):
        # The generic find_elements(by, value) call is used because the
        # find_elements_by_xpath shortcut is not available in newer Selenium releases.
        return driver.find_elements("xpath", get_elem_xpath(key))

    def texts(key):
        # Visible text of every element matching the xpath of the given field.
        return [element.text for element in find(key)]

    def open_page(page_url):
        # Loads a page and waits a random time for it to finish loading.
        driver.get(page_url)
        time.sleep(random.randint(sleep_min, sleep_max))

    listing_data = {}

    # Creates the other urls to be scraped from the base url
    amenities_url, reviews_url, location_url = create_urls(url)

    # Includes the url as a column. URLs will serve as the unique identifier of each listing
    listing_data['url'] = url

    # Opens the main listing page.
    open_page(url)

    # Scrapes the title, location (address), and rating of the listing
    listing_data['title'] = texts('title')
    listing_data['location'] = texts('location')
    listing_data['rating'] = texts('rating')

    # Scrapes the type of listing (private room/ shared room / entire villa etc.).
    # Only the part before "hosted" is kept; the last matching element wins.
    for element in find('type_airbnb'):
        listing_data["type_airbnb"] = element.text.split("hosted")[0]

    # Scrapes the description tags (superhost, enhanced clean, entire home, etc.)
    listing_data['desc_tags'] = texts('desc_tags')

    # Also the description tags but using a different class in case it wasn't
    # picked up by the previous block of code.
    listing_data['desc_tags2'] = texts('desc_tags_2')

    # Scrapes the number of guests, bedrooms, beds, and bathrooms. They sit on
    # the second line of the element text, separated by "·".
    for element in find('gbbb'):
        lines = element.text.split("\n")
        # Skip elements without a second line instead of raising IndexError,
        # which would make the caller drop the whole listing.
        if len(lines) > 1:
            listing_data["gbbb"] = lines[1].split("·")

    # Scrapes the price per night; the first character is dropped
    # (presumably the currency symbol -- confirm against the page).
    for element in find('price_per_night'):
        listing_data["price_per_night"] = element.text[1:]

    # Scrapes details about the host
    listing_data['host_tags'] = texts('host_tags')

    # Scrapes details about the response rate and response time of the host
    listing_data['host_response'] = texts('host_response')

    # Opens the reviews page
    open_page(reviews_url)

    # Scrapes the ratings per category; each label becomes its own key
    review_labels = find('review_label')
    review_values = find('review_value')
    for label, value in zip(review_labels, review_values):
        listing_data[label.text] = value.text

    # Opens the amenities page
    open_page(amenities_url)

    # Scrapes the amenities values, keeping only the first line of each entry
    # and only the amenities that are currently available
    available_amenities = []
    for element in find('amenities'):
        amenity_name = element.text.split("\n")[0]
        if "Unavailable" not in amenity_name:
            available_amenities.append(amenity_name)
    listing_data["amenities"] = available_amenities

    # Opens the locations page
    open_page(location_url)

    # Gets the map coordinates from the link whose href contains "maps?ll";
    # the text between the first "=" and the next "&" is read as "lat,lon".
    for link in driver.find_elements("xpath", '//a[@target="_blank"]'):
        href = link.get_attribute("href")
        # Anchors without an href return None; skip them instead of failing.
        if not href or "maps?ll" not in href:
            continue
        href_parts = href.split("=")
        if len(href_parts) < 2:
            continue
        lat_lon = href_parts[1].split("&")[0].split(",")
        if len(lat_lon) >= 2:
            listing_data["latitude"] = lat_lon[0]
            listing_data["longitude"] = lat_lon[1]

    return listing_data

In [3]:
def get_elem_xpath(key):
    '''
    Builds the xpath expression used to locate the element(s) of a field.

    Args:
        key: name of the field to be scraped (for example 'title' or 'amenities')

    Returns:
        xpath string of the form //<tag>[@class='<class>'], i.e. the class
        attribute has to match the stored value exactly.

    Raises:
        KeyError: if key is not one of the known fields.
    '''
    # Each field maps to the (tag, class attribute) pair of the element holding it.
    # NOTE(review): 'host_tags' shares its class with 'type_airbnb' and
    # 'host_response' shares its class with 'desc_tags' -- confirm this is intended.
    selectors = {
        'title': ('h1', '_14i3z6h'),
        'location': ('span', '_169len4r'),
        'rating': ('span', '_mvp6j0g'),
        'type_airbnb': ('div', '_xcsyj0'),
        'desc_tags': ('div', '_1qsawv5'),
        'desc_tags_2': ('div', 't1bchdij dir dir-ltr'),
        'gbbb': ('div', '_tqmy57'),
        'price_per_night': ('span', '_pgfqnw'),
        'host_tags': ('div', '_xcsyj0'),
        'host_response': ('div', '_1qsawv5'),
        'review_label': ('div', '_y1ba89'),
        'review_value': ('span', '_4oybiu'),
        'amenities': ('div', '_vzrbjl'),
    }
    tag, class_value = selectors[key]
    return "//{}[@class='{}']".format(tag, class_value)

In [4]:
def create_urls(url):
    '''
    Creates the other urls to be scraped from the base url.

    The query string and fragment are removed from the listing url, together
    with any trailing slash, before the page suffixes are appended, so that
    a url such as ".../rooms/123/?x=1" does not produce ".../rooms/123//amenities".

    Args:
        url: url of the Airbnb listing to be scraped

    Returns:
        A tuple of three urls, in this order: the amenities page, the reviews
        page and the location page.
    '''
    base_url = url.split("?", 1)[0].split("#", 1)[0].rstrip("/")
    return base_url + '/amenities', base_url + '/reviews', base_url + '/location'

In [5]:
def save_dataset(df):
    """
    Writes one batch of scraped data to a csv file in the working directory.

    Args:
        df: the scraped records; anything accepted by pd.DataFrame()

    The file is named after the current local time, formatted as
    month-day-year-hour-minute-second, with a ".csv" extension.
    """
    timestamp = datetime.datetime.now().strftime("%m-%d-%Y-%H-%M-%S")
    pd.DataFrame(df).to_csv("{}.csv".format(timestamp))

In [6]:
def scrape_airbnb(df, index_start, index_end, rows_per_file, min_sleep, max_sleep, chromedriver_path):
    """
    Scrapes the data from the Airbnb website

    Listings are scraped one by one with get_listing_data(). The collected
    rows are written with save_dataset() every rows_per_file urls, and any
    rows still pending when the loop ends are written as a final, smaller
    batch. A listing that raises an exception is reported and skipped so
    that one bad page does not stop the run.

    Args:
        df: table with a 'url' column holding the urls to be scraped
        index_start: from the list of urls, select the index where you want to start scraping
        index_end: from the list of urls, select the index where you want to end scraping (exclusive)
        rows_per_file: number of urls processed per output file; a file holds
            fewer rows when some listings of its batch failed
        min_sleep: minimum sleep value while waiting for the page to load (in whole seconds)
        max_sleep: maximum sleep value while waiting for the page to load
            (in whole seconds, inclusive -- see random.randint)
        chromedriver_path: path of the chromedriver in the local computer

    Returns:
        None. This function outputs the scraped data in separate csv files.

    """

    dataset = []
    urls = df['url'][index_start:index_end]
    last_index = len(urls) - 1

    # NOTE(review): passing the driver path positionally is the older Selenium
    # calling convention; confirm it matches the installed Selenium version.
    driver = webdriver.Chrome(chromedriver_path)

    try:
        for position, url in enumerate(urls):
            print("\nScraping listing data " + str(position) + " out of " + str(last_index))
            try:
                new_data = get_listing_data(url, driver, min_sleep, max_sleep)
                print(new_data)
                dataset.append(new_data)
                if (position + 1) % rows_per_file == 0:
                    save_dataset(dataset)
                    print("Saved")
                    dataset = []
            except Exception as exc:
                # Best effort: report what failed and carry on with the next url.
                print("An exception occurred while scraping " + str(url) + ": " + repr(exc))

        # Flush the rows of the last, incomplete batch; without this they were lost.
        if dataset:
            save_dataset(dataset)
            print("Saved")
    finally:
        # quit() ends the whole browser session, and the finally block makes
        # sure this also happens when the run is interrupted.
        driver.quit()

### <font color=Grey> Loads the url list </font>

In [7]:
# Reads the csv with the listing urls (expected next to the notebook) and
# previews the first five rows.
df_all = pd.read_csv(filepath_or_buffer='listings_urls.csv')
df_all.head(5)

Unnamed: 0,title,url
0,Studio Type The Grass Residence,https://www.airbnb.com/rooms/45858507?previous...
1,Abby's Place,https://www.airbnb.com/rooms/45569962?previous...
2,Balai Ni Kuya Condo-Feel Apartment Unit,https://www.airbnb.com/rooms/39305420?previous...
3,"1 Bedroom w/ Balcony at Trees Residences, Fair...",https://www.airbnb.com/rooms/23222443?previous...
4,Condo Village at Trees Residences,https://www.airbnb.com/rooms/22454349?previous...


### <font color=Grey> User options </font>

In [12]:
# Index in the list of urls where scraping starts
index_start = 0

# Index in the list of urls where scraping ends (exclusive)
index_end = 5

# Number of listings data each output file will contain
rows_per_file = 5

# Minimum sleep value while waiting for the page to load (in whole seconds)
min_sleep = 18

# Maximum sleep value while waiting for the page to load (in whole seconds).
# The bound is inclusive: the value is handed to random.randint.
max_sleep = 22

# Path of the chromedriver in the local computer. In a raw string a single
# backslash is already literal, so the separators must not be doubled.
chromedriver_path = r'C:\Users\Edward Apostol\Downloads\chromedriver.exe'

### <font color=Grey> Running the scraper </font>

In [13]:
# Runs the scraper with the options chosen above; results are written to csv files.
scrape_airbnb(
    df=df_all,
    index_start=index_start,
    index_end=index_end,
    rows_per_file=rows_per_file,
    min_sleep=min_sleep,
    max_sleep=max_sleep,
    chromedriver_path=chromedriver_path,
)


Scraping listing data 0 out of 4
{'url': 'https://www.airbnb.com/rooms/45858507?previous_page_section_name=1000&federated_search_id=771f9784-bd44-4af7-92d5-7e2b678c8f1f', 'title': ['Studio Type The Grass Residence'], 'location': ['', 'Quezon City, Metro Manila, Philippines'], 'rating': [], 'type_airbnb': 'Entire condominium ', 'desc_tags': ['Entire home', 'Enhanced Clean'], 'desc_tags2': [], 'gbbb': ['2 guests ', ' Studio ', ' 0 beds ', ' 1 bath'], 'price_per_night': '500', 'host_tags': ['Entire condominium hosted by Motel'], 'host_response': ['Entire home', 'Enhanced Clean'], 'amenities': ['Hair dryer', 'Shampoo', 'Iron', 'Hangers', 'Essentials', 'TV', 'Heating', 'Air conditioning', 'Smoke alarm', 'Fire extinguisher', 'Wifi', 'Dedicated workspace', 'Kitchen', 'Gym', 'Pool', 'Long term stays allowed'], 'latitude': '14.65975', 'longitude': '121.02906'}

Scraping listing data 1 out of 4
{'url': 'https://www.airbnb.com/rooms/45569962?previous_page_section_name=1000&federated_search_id=77