In [31]:
import requests
import pprint
from bs4 import BeautifulSoup

pp = pprint.PrettyPrinter(indent=4)

# Get listing info
# URL = 'https://www.airbnb.com/s/homes?search_type=pagination&query=Boston%2C%20MA&checkin=2020-08-16&checkout=2020-08-22&adults=2&items_offset=0'
def getListings(city, state, checkin, checkout, adults='1', page='0', search_type='pagination'):
    # Build the URL
    baseurl = 'https://www.airbnb.com/s/homes?'

    # Add pagination
    items_offset = str(int(page) * 20)
    URL = baseurl + 'search_type=' + search_type + '&items_offset=' + items_offset
    
    # Add location
    query = city + '%2C%20' + state
    URL = URL + '&query=' + query
    
    # Add logistics
    URL = URL + '&checkin=' + checkin + '&checkout=' + checkout + '&adults=' + adults
    
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')

    listings = []

    links = soup.find_all('a')
    # GET LISTING NAME AND URL
    counter = 0
    for link in links:
        # We just want to add real listings, not all link names
        if link.get('data-check-info-section'):
            listing_name = link.get('aria-label')
            url = 'https://www.airbnb.com' + link.get('href')
            listings.append({'listing_name': listing_name, 'url': url})
            counter += 1

    # GET TOTAL PRICE
    spans = soup.find_all('span')
    counter = 0
    for span in spans:
        text = span.get_text()
        if text and 'total' in text:
            total = text.replace('$', '')
            total = total.replace(' total', '')
            listings[counter]['total_price'] = total
            counter += 1

    
    # GET SUPERHOST, LISTING_TYPE, RATING, NUM_REVIEWS
    divs = soup.find_all('div')
    counter = 0
    for div in divs:
        if counter < len(listings) and div.get_text() == listings[counter]['listing_name']:
            is_superhost = 'False'
            listing_type = ''
            rating = None
            num_reviews = '0'

            listing_info = div.previous_sibling
            for child in listing_info:
                if 'Entire ' in child.get_text() or 'Private ' in child.get_text():
                    listing_type = child.get_text()
                elif 'SUPERHOST' in child.get_text():
                    is_superhost = 'True'
                elif '(' and ')' in child.get_text():
                    for c in child:
                        split_rating = c.get_text().split()
                        rating = split_rating[0]
                        num_reviews = split_rating[1].replace('(', '')
                        num_reviews = num_reviews.replace(')', '')

            listings[counter]['is_superhost'] = is_superhost
            listings[counter]['listing_type'] = listing_type
            listings[counter]['rating'] = rating
            listings[counter]['num_reviews'] = num_reviews

            counter += 1

    # GET PRICE PER NIGHT, AMENITIES, HOUSING_INFO
    counter = 0
    for span in spans:
        text = span.get_text()
        if text and '/ night' in text and 'total' not in text:
            price_per_night = None
            amenities = []
            housing_info = []

            # Some have a discounted price so we only want the actual price per night
            price_per_night = text.rsplit('$', 1)[1]
            price_per_night = price_per_night.replace(' / night', '')
            print('ppn', price_per_night)

            # Gets amenities like Wifi/Kitching/Free Parking
            amenities = span.parent.parent.parent.previous_sibling.get_text()
            amenities = amenities.split(' · ')

            # Gets guests, bedrooms, baths
            housing_info = span.parent.parent.parent.previous_sibling.previous_sibling.get_text()
            housing_info = housing_info.split(' · ')

            listings[counter]['price_per_night'] = price_per_night      
            listings[counter]['amenities'] = amenities
            listings[counter]['housing_info'] = housing_info

            counter += 1

    return listings

In [34]:
soup = getListings('Boston', 'MA', '2020-08-16', '2020-08-22', adults='2', page='2')
pp.pprint(soup)

ppn 134
<div class="_1ulsev2" style="margin-top:4px">Wifi<span aria-hidden="true"> · </span>Kitchen</div>
ppn 120
<div class="_1ulsev2" style="margin-top:4px">Free parking<span aria-hidden="true"> · </span>Wifi</div>
ppn 193
<div class="_1ulsev2" style="margin-top:4px">Free parking<span aria-hidden="true"> · </span>Wifi<span aria-hidden="true"> · </span>Kitchen</div>
ppn 89
<div class="_1ulsev2" style="margin-top:4px">Wifi<span aria-hidden="true"> · </span>Kitchen</div>
ppn 95
<div class="_1ulsev2" style="margin-top:4px">Wifi<span aria-hidden="true"> · </span>Kitchen</div>
ppn 73
<div class="_1ulsev2" style="margin-top:4px">Wifi<span aria-hidden="true"> · </span>Kitchen</div>
ppn 112
<div class="_1ulsev2" style="margin-top:4px">Wifi<span aria-hidden="true"> · </span>Kitchen</div>
ppn 91
<div class="_1ulsev2" style="margin-top:4px">Free parking<span aria-hidden="true"> · </span>Wifi<span aria-hidden="true"> · </span>Kitchen</div>
ppn 140
<div class="_1ulsev2" style="margin-top:4px">Free

In [24]:
def getSoup(city, state, checkin, checkout, adults='1', page='0', search_type='pagination'):
    # Build the URL
    baseurl = 'https://www.airbnb.com/s/homes?'

    # Add pagination
    items_offset = str(int(page) * 20)
    URL = baseurl + 'search_type=' + search_type + '&items_offset=' + items_offset
    
    # Add location
    query = city + '%2C%20' + state
    URL = URL + '&query=' + query
    
    # Add logistics
    URL = URL + '&checkin=' + checkin + '&checkout=' + checkout + '&adults=' + adults
    print(URL)
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, 'html.parser')

    return soup

soup = getSoup('Boston', 'MA', '2020-08-16', '2020-08-22', adults='2', page='1')


https://www.airbnb.com/s/homes?search_type=pagination&items_offset=20&query=Boston%2C%20MA&checkin=2020-08-16&checkout=2020-08-22&adults=2


In [26]:
links = soup.find_all('a')
listings = []
# GET LISTING NAME
for link in links:
    # We just want to add real listings, not all link names
    if link.get('data-check-info-section'):
        listings.append({'listing_name': link.get('aria-label')})

# GET SUPERHOST, LISTING_TYPE, RATING, NUM_REVIEWS
divs = soup.find_all('div')
counter = 0
for div in divs:
    if counter < len(listings) and div.get_text() == listings[counter]['listing_name']:
        is_superhost = 'False'
        listing_type = ''
        rating = None
        num_reviews = '0'
        
        listing_info = div.previous_sibling
        for child in listing_info:
            if 'Entire ' in child.get_text() or 'Private ' in child.get_text():
                listing_type = child.get_text()
            elif 'SUPERHOST' in child.get_text():
                is_superhost = 'True'
            elif '(' and ')' in child.get_text():
                for c in child:
                    split_rating = c.get_text().split()
                    rating = split_rating[0]
                    num_reviews = split_rating[1].replace('(', '')
                    num_reviews = num_reviews.replace(')', '')
        
        listings[counter]['is_superhost'] = is_superhost
        listings[counter]['listing_type'] = listing_type
        listings[counter]['rating'] = rating
        listings[counter]['num_reviews'] = num_reviews
        
        counter += 1
            
# GET TOTAL PRICE
spans = soup.find_all('span')
counter = 0
for span in spans:
    text = span.get_text()
    if text and 'total' in text:
        total = text.replace('$', '')
        total = total.replace(' total', '')
        listings[counter]['total_price'] = total
        counter += 1

# GET PRICE PER NIGHT, AMENITIES, HOUSING_INFO
counter = 0
for span in spans:
    text = span.get_text()
    if text and '/ night' in text and 'total' not in text:
        price_per_night = None
        amenities = []
        housing_info = []

        # Some have a discounted price so we only want the actual price per night
        price_per_night = text.rsplit('$', 1)[1]
        price_per_night = price_per_night.replace(' / night', '')
        
        # Gets amenities like Wifi/Kitching/Free Parking
        amenities = span.parent.parent.parent.previous_sibling.get_text()
        amenities = amenities.split(' · ')
        
        # Gets gusts, bedrooms, baths
        housing_info = span.parent.parent.parent.previous_sibling.previous_sibling.get_text()
        housing_info = housing_info.split(' · ')
        
#         # Gets is_superhost, listing_type, rating, and num_reviews
#         listing_info = span.parent.parent.parent.previous_sibling.previous_sibling.previous_sibling.previous_sibling.children
#         for child in listing_info:
#             if 'Entire ' in child.get_text() or 'Private ' in child.get_text():
#                 listing_type = child.get_text()
#             elif 'SUPERHOST' in child.get_text():
#                 is_superhost = 'True'
#             elif '(' and ')' in child.get_text():
#                 for c in child:
#                     split_rating = c.get_text().split()
#                     rating = split_rating[0]
#                     num_reviews = split_rating[1].replace('(', '')
#                     num_reviews = num_reviews.replace(')', '')

        listings[counter]['price_per_night'] = price_per_night      
        listings[counter]['amenities'] = amenities
        listings[counter]['housing_info'] = housing_info

        counter += 1


pp.pprint(listings)

AttributeError: 'NoneType' object has no attribute 'get_text'