In [291]:
import pprint
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup

# Get listing info
# URL = 'https://www.airbnb.com/s/homes?search_type=pagination&query=Boston%2C%20MA&checkin=2020-08-16&checkout=2020-08-22&adults=2&items_offset=0'
def _build_search_url(city, state, checkin, checkout, adults, page, search_type):
    """Assemble the Airbnb search URL, percent-encoding every caller-supplied value."""
    def encode(value):
        # safe='' so that spaces, commas, '&' and '/' inside a value cannot
        # break the query string (e.g. city='New York').
        return quote(str(value), safe='')

    # The search is requested in steps of 20 results per page.
    items_offset = int(page) * 20
    return (
        'https://www.airbnb.com/s/homes?'
        + 'search_type=' + encode(search_type)
        + '&items_offset=' + str(items_offset)
        + '&query=' + encode(city) + '%2C%20' + encode(state)
        + '&checkin=' + encode(checkin)
        + '&checkout=' + encode(checkout)
        + '&adults=' + encode(adults)
    )


def _step(node, attribute, count):
    """Follow `attribute` (e.g. 'parent') `count` times; None if the chain ends early."""
    for _ in range(count):
        if node is None:
            return None
        node = getattr(node, attribute, None)
    return node


def _parse_listing_details(span, text):
    """Extract the per-listing fields that sit around a '/ night' price span."""
    details = {
        'price_per_night': None,
        'amenities': [],
        'housing_info': [],
        'is_superhost': 'False',
        'listing_type': '',
        'rating': None,
        'num_reviews': '0',
    }

    # Some have a discounted price, so only the text after the last '$' is the
    # actual price per night. [-1] keeps the whole text if there is no '$'.
    details['price_per_night'] = text.rsplit('$', 1)[-1].replace(' / night', '')

    # The other fields are located relative to the price span's great-grandparent:
    # 1 sibling back = amenities, 2 back = housing info, 4 back = listing info.
    anchor = _step(span, 'parent', 3)
    amenities_node = _step(anchor, 'previous_sibling', 1)
    housing_node = _step(amenities_node, 'previous_sibling', 1)
    info_node = _step(housing_node, 'previous_sibling', 2)

    # Amenities like Wifi / Kitchen / Free parking
    if amenities_node is not None:
        details['amenities'] = amenities_node.get_text().split(' · ')

    # Guests, bedrooms, beds, baths
    if housing_node is not None:
        details['housing_info'] = housing_node.get_text().split(' · ')

    # is_superhost, listing_type, rating and num_reviews
    for child in getattr(info_node, 'children', ()):
        child_text = child.get_text()
        if 'Entire ' in child_text or 'Private ' in child_text:
            details['listing_type'] = child_text
        elif 'SUPERHOST' in child_text:
            details['is_superhost'] = 'True'
        elif '(' in child_text and ')' in child_text:
            # Both parentheses must be tested explicitly: "'(' and ')' in s"
            # only checks for ')'.
            for c in child:
                parts = c.get_text().split()
                if len(parts) < 2:
                    # Not of the form "<rating> (<count>)"; leave the defaults.
                    continue
                details['rating'] = parts[0]
                details['num_reviews'] = parts[1].replace('(', '').replace(')', '')

    return details


def getListings(city, state, checkin, checkout, adults='1', page='0', search_type='pagination'):
    """Fetch one page of Airbnb search results and return the parsed listings.

    Args:
        city, state: Location; joined as "city, state" in the query. Pass raw
            (not pre-encoded) text; values are percent-encoded here.
        checkin, checkout: Dates as strings, e.g. '2020-08-16'.
        adults: Number of adults (string or int).
        page: Zero-based result page; multiplied by 20 to get items_offset.
        search_type: Value of the search_type query parameter.

    Returns:
        A list with one dict per <a> element carrying a
        'data-check-info-section' attribute, holding 'listing_name' and 'url'.
        'total_price' is added for each span whose text contains 'total', and
        'price_per_night', 'amenities', 'housing_info', 'is_superhost',
        'listing_type', 'rating', 'num_reviews' for each span whose text
        contains '/ night'. Prices and details are paired with listings purely
        by position (first match -> first listing), so this assumes the page
        renders them in the same order. Surplus matches are ignored.

    Raises:
        requests.exceptions.RequestException: if the HTTP request fails or
            times out.
    """
    url = _build_search_url(city, state, checkin, checkout, adults, page, search_type)

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    # GET LISTING NAME AND URL
    # We just want to add real listings, not all links on the page.
    listings = []
    for link in soup.find_all('a'):
        if link.get('data-check-info-section'):
            listings.append({
                'listing_name': link.get('aria-label'),
                'url': 'https://www.airbnb.com' + (link.get('href') or ''),
            })

    span_texts = [(span, span.get_text()) for span in soup.find_all('span')]

    # GET TOTAL PRICE
    total_texts = [text for _, text in span_texts if text and 'total' in text]
    for listing, text in zip(listings, total_texts):
        listing['total_price'] = text.replace('$', '').replace(' total', '')

    # GET PRICE PER NIGHT, AMENITIES, HOUSING_INFO, SUPERHOST, LISTING_TYPE, RATING, NUM_REVIEWS
    nightly_spans = [
        (span, text)
        for span, text in span_texts
        if text and '/ night' in text and 'total' not in text
    ]
    # zip() stops at the shorter sequence, so extra price spans cannot index
    # past the end of `listings`.
    for listing, (span, text) in zip(listings, nightly_spans):
        listing.update(_parse_listing_details(span, text))

    return listings

In [292]:
pp = pprint.PrettyPrinter(indent=4)
# Store the parsed listings under their own name: reusing `soup` here would
# overwrite the BeautifulSoup object that other cells call .find_all() on.
boston_listings = getListings('Boston', 'MA', '2020-08-16', '2020-08-22', adults='2', page='2')
pp.pprint(boston_listings)

[   {   'amenities': ['Wifi'],
        'housing_info': ['2 guests', '1 bedroom', '1 bed', '1 bath'],
        'is_superhost': 'True',
        'listing_name': 'Minutes to/from Logan - Ocean view',
        'listing_type': 'Entire guest suite',
        'num_reviews': '81',
        'price_per_night': '96',
        'rating': '4.84',
        'total_price': '668',
        'url': 'https://www.airbnb.com/rooms/22631832?adults=2&check_in=2020-08-16&check_out=2020-08-22&previous_page_section_name=1000&federated_search_id=2906afcd-a567-4328-92b5-7c674368dac4'},
    {   'amenities': ['Wifi', 'Kitchen'],
        'housing_info': ['2 guests', '1 bedroom', '1 bed', '1 bath'],
        'is_superhost': 'False',
        'listing_name': 'On the Hill|Historic Boston|Studio #3',
        'listing_type': 'Entire apartment',
        'num_reviews': '27',
        'price_per_night': '102',
        'rating': '4.37',
        'total_price': '799',
        'url': 'https://www.airbnb.com/rooms/19309861?adults=2&check_in=

In [218]:
def getSoup(city, state, checkin, checkout, adults='1', page='0', search_type='pagination'):
    """Download one page of Airbnb search results and return it parsed.

    Args:
        city, state: Location; joined as "city, state" in the query. Pass raw
            (not pre-encoded) text; values are percent-encoded here.
        checkin, checkout: Dates as strings, e.g. '2020-08-16'.
        adults: Number of adults (string or int).
        page: Zero-based result page; multiplied by 20 to get items_offset.
        search_type: Value of the search_type query parameter.

    Returns:
        A BeautifulSoup document built from the response body with the
        'html.parser' backend. The requested URL is printed before the fetch.

    Raises:
        requests.exceptions.RequestException: if the HTTP request fails or
            times out.
    """
    def encode(value):
        # safe='' so that spaces, commas, '&' and '/' inside a value cannot
        # break the query string (e.g. city='New York').
        return quote(str(value), safe='')

    # Add pagination
    items_offset = str(int(page) * 20)
    URL = (
        'https://www.airbnb.com/s/homes?'
        + 'search_type=' + encode(search_type)
        + '&items_offset=' + items_offset
        # Add location ("city, state" with the comma and space pre-encoded)
        + '&query=' + encode(city) + '%2C%20' + encode(state)
        # Add logistics
        + '&checkin=' + encode(checkin)
        + '&checkout=' + encode(checkout)
        + '&adults=' + encode(adults)
    )
    print(URL)

    # Use a separate name for the response so the `page` argument is not
    # shadowed. The timeout keeps a stalled connection from hanging the notebook.
    response = requests.get(URL, timeout=30)
    return BeautifulSoup(response.content, 'html.parser')

# Fetch the second page of results (page is zero-based) for a two-adult stay in Boston.
soup = getSoup(
    city='Boston',
    state='MA',
    checkin='2020-08-16',
    checkout='2020-08-22',
    adults='2',
    page='1',
)


https://www.airbnb.com/s/homes?search_type=pagination&items_offset=20&query=Boston%2C%20MA&checkin=2020-08-16&checkout=2020-08-22&adults=2


In [283]:
# Scratch version of the parsing done in getListings, run against the
# module-level `soup`. Prices and details are paired with listings purely by
# position (first match -> first listing), so this assumes the page renders
# them in the same order.
links = soup.find_all('a')
listings = []
# GET LISTING NAME
for link in links:
    # We just want to add real listings, not all links on the page
    if link.get('data-check-info-section'):
        listings.append({'listing_name': link.get('aria-label')})

spans = soup.find_all('span')
span_texts = [(span, span.get_text()) for span in spans]

# GET TOTAL PRICE
total_texts = [text for _, text in span_texts if text and 'total' in text]
# zip() stops at the shorter sequence, so surplus matches cannot index past
# the end of `listings`.
for listing, text in zip(listings, total_texts):
    listing['total_price'] = text.replace('$', '').replace(' total', '')

# GET PRICE PER NIGHT, AMENITIES, HOUSING_INFO, SUPERHOST, LISTING_TYPE, RATING, NUM_REVIEWS
nightly_spans = [
    (span, text)
    for span, text in span_texts
    if text and '/ night' in text and 'total' not in text
]
for listing, (span, text) in zip(listings, nightly_spans):
    is_superhost = 'False'
    listing_type = ''
    rating = None
    num_reviews = '0'

    # Some have a discounted price, so only the text after the last '$' is the
    # actual price per night. [-1] keeps the whole text if there is no '$'.
    price_per_night = text.rsplit('$', 1)[-1].replace(' / night', '')

    # The other fields are located relative to the price span's great-grandparent:
    # 1 sibling back = amenities, 2 back = housing info, 4 back = listing info.
    anchor = span.parent.parent.parent
    amenities_node = anchor.previous_sibling
    housing_node = amenities_node.previous_sibling
    info_node = housing_node.previous_sibling.previous_sibling

    # Amenities like Wifi / Kitchen / Free parking
    amenities = amenities_node.get_text().split(' · ')

    # Guests, bedrooms, beds, baths
    housing_info = housing_node.get_text().split(' · ')

    # is_superhost, listing_type, rating and num_reviews
    for child in info_node.children:
        child_text = child.get_text()
        if 'Entire ' in child_text or 'Private ' in child_text:
            listing_type = child_text
        elif 'SUPERHOST' in child_text:
            is_superhost = 'True'
        elif '(' in child_text and ')' in child_text:
            # Both parentheses must be tested explicitly: "'(' and ')' in s"
            # only checks for ')'.
            for c in child:
                parts = c.get_text().split()
                if len(parts) < 2:
                    # Not of the form "<rating> (<count>)"; leave the defaults.
                    continue
                rating = parts[0]
                num_reviews = parts[1].replace('(', '').replace(')', '')

    listing.update({
        'price_per_night': price_per_night,
        'amenities': amenities,
        'housing_info': housing_info,
        'is_superhost': is_superhost,
        'listing_type': listing_type,
        'rating': rating,
        'num_reviews': num_reviews,
    })


pp.pprint(listings)

[   {   'amenities': ['Wifi', 'Kitchen'],
        'housing_info': ['2 guests', 'Studio', '1 bed', '1 bath'],
        'is_superhost': 'False',
        'listing_name': 'Stylish Downtown Studio in the SouthEnd, #8',
        'listing_type': 'Entire apartment',
        'num_reviews': '97',
        'price_per_night': '82',
        'rating': '4.59',
        'total_price': '673'},
    {   'amenities': ['Wifi', 'Kitchen'],
        'housing_info': ['3 guests', '1 bedroom', '2 beds', '1 bath'],
        'is_superhost': 'True',
        'listing_name': 'Sunny and cozy home near one end of freedom trail',
        'listing_type': 'Entire house',
        'num_reviews': '233',
        'price_per_night': '129',
        'rating': '4.91',
        'total_price': '933'},
    {   'amenities': ['Free parking', 'Wifi', 'Kitchen'],
        'housing_info': ['5 guests', '2 bedrooms', '3 beds', '1 bath'],
        'is_superhost': 'True',
        'listing_name': 'Park view 2 (private entrance & free parking)',
      