# Scraping Hotel Ratings on Tripadvisor

In this homework we will practice web scraping. Let's get some basic information for each hotel in Boston.

On each hotel page, scrape the Traveler ratings. **(10 pts)**

![Information to be scraped](traveler_ratings.png)

Save the data in "traverler_ratings.csv" in the following format:

hotel_name, rating, count

In [None]:
from bs4 import BeautifulSoup
import sys
import csv
import time 
import requests

# Site root; every relative path scraped below is appended to this prefix.
base_url = "http://www.tripadvisor.com"
# Browser-like User-Agent header value sent with each HTTP request.
user_agent = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/41.0.2272.76 Safari/537.36"
)

""" STEP 3 """
def get_hotellist_page(city_url, count):
    """Fetch one page of the hotel listing and keep a copy on disk.

    ``city_url`` is a site-relative path that is appended to ``base_url``.
    ``count`` numbers the saved copy, which is written to the current
    working directory as ``hotelist-<count>.html`` (UTF-8).

    Returns the page's HTML text.
    """
    target = base_url + city_url
    # Pause for half a second so consecutive requests are spaced out.
    time.sleep(0.5)
    page_text = requests.get(target, headers={'User-Agent': user_agent}).text
    # Keep a local copy of the raw page next to the notebook.
    out_name = 'hotelist-' + str(count) + '.html'
    with open(out_name, "w", encoding='utf-8') as out_file:
        out_file.write(page_text)
    return page_text
    
def get_page(page_url):
    """Fetch a single page from the site after a short pause.

    ``page_url`` is a site-relative path (for example the link taken from
    a listing title) that is appended to ``base_url``.

    Returns the response body as text.
    """
    # Pause for half a second so consecutive requests are spaced out.
    time.sleep(0.5)
    reply = requests.get(base_url + page_url,
                         headers={'User-Agent': user_agent})
    return reply.text

""" STEP 4 """
def parse_hotellist_page(html):
    """Parse one hotel-list page and record each hotel's traveler ratings.

    For every hotel box on the page, the hotel's own page is fetched with
    get_page() and one row (hotel_name, rating, count) is appended to
    'traverler_ratings.csv' for each <li> found under the element with
    id "ratingFilter".

    Side effects on module globals:
        data        -- each hotel's relative URL is appended.
        hotel_count -- increased by the number of hotel boxes on the page.
        continues   -- set to False when there is no further page to visit.

    Args:
        html: HTML text of a hotel-list page, as returned by
            get_hotellist_page().

    Returns:
        The relative URL of the next list page, or None when there is no
        next page (``continues`` is then False).
    """
    global continues
    global data
    global hotel_count

    # Name the parser explicitly so bs4 neither guesses nor warns; "lxml"
    # is the parser the warning in the recorded output recommends.
    soup = BeautifulSoup(html, "lxml")

    # newline='' is what the csv module expects of files it writes to;
    # utf-8 keeps non-ASCII hotel names from failing on narrow locales.
    with open('traverler_ratings.csv', 'a', newline='', encoding='utf-8') as h:
        headers = ['hotel_name', 'rating', 'count']
        writer = csv.DictWriter(h, fieldnames=headers, lineterminator='\n')

        hotel_boxes = soup.select('div.listing.easyClear.p13n_imperfect')
        hotel_count += len(hotel_boxes)
        for hotel_box in hotel_boxes:
            title_div = hotel_box.find('div', {'class': 'listing_title'})
            link = hotel_box.find('a')
            hotel_url = link.get('href') if link is not None else None
            if title_div is None or not hotel_url:
                # Without a title or a link there is nothing to fetch.
                print("Skipping a hotel box without title or link")
                continue
            hotel_name = title_div.find(text=True)
            data.append(hotel_url)

            soup2 = BeautifulSoup(get_page(hotel_url), "lxml")

            # The ratings block can be absent (e.g. a hotel page laid out
            # differently); skip that hotel instead of crashing on None.
            cols_rating = soup2.find(id="ratingFilter")
            if cols_rating is None:
                print("No traveler ratings found for %s" % hotel_url)
                continue

            for r in cols_rating.find_all('li'):
                label = r.find("div", {"class": "row_label"})
                if label is None:
                    continue
                rating = label.get_text()

                # The count is the first span whose text is non-empty once
                # trailing whitespace/newlines are removed.
                count = ''
                for span in r.find_all('span'):
                    stripped_txt = span.text.rstrip()
                    if stripped_txt != '':
                        count = stripped_txt
                        break

                writer.writerow({'hotel_name': hotel_name,
                                 'rating': rating,
                                 'count': count})

    # Work out where to go next; every exit that yields no URL also clears
    # ``continues`` so the caller's loop cannot request a None URL.
    div = soup.find("div", {"class": "unified pagination standard_pagination"})
    if div is None:
        print("No pagination block found; stopping")
        continues = False
        return None

    if div.find('span', {'class': 'nav next ui_button disabled'}):
        print("We reached last page")
        continues = False
        return None

    for href in div.find_all('a', href=True):
        if href.find(text=True) == 'Next':
            print("Next url is %s" % href['href'])
            return href['href']

    # Pagination exists but carries no "Next" link: treat as the last page.
    print("No Next link found; stopping")
    continues = False
    return None

# Relative URL of the first hotel-list page for the city being scraped.
city_url = '/Hotels-g60745-Boston_Massachusetts-Hotels.html'
# Running page number; used to name the saved copy of each list page.
c = 0

# Create (or truncate) the output file and write the header row once.
# newline='' is what the csv module expects of files it writes to.
with open('traverler_ratings.csv', 'w', newline='', encoding='utf-8') as h:
    headers = ['hotel_name', 'rating', 'count']
    writer = csv.DictWriter(h, fieldnames=headers, lineterminator='\n')
    writer.writeheader()

# Flag cleared by parse_hotellist_page() when the last page is reached.
continues = True
# Relative URLs of every hotel seen; filled by parse_hotellist_page().
data = []
# Total number of hotel boxes seen across all list pages.
hotel_count = 0

# Also stop when the parser hands back no URL: it returns None whenever it
# cannot find a "Next" link, and requesting base_url + None would raise.
while continues and city_url:
    c += 1
    html = get_hotellist_page(city_url, c)
    city_url = parse_hotellist_page(html)
print('# of hotels: {}'.format(hotel_count))



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


Next url is /Hotels-g60745-oa30-Boston_Massachusetts-Hotels.html#ACCOM_OVERVIEW
Next url is /Hotels-g60745-oa60-Boston_Massachusetts-Hotels.html#ACCOM_OVERVIEW
We reached last page
# of hotels: 82


-------

Next, scrape all the reviews of each hotel for the star ratings of the following attributes: Value, Location, Sleep Quality, Rooms, Cleanliness, Service. Note that some reviews may not have attribute ratings and some may only have some of the attributes. **(25 pts)**

![Information to be scraped](attribute_ratings.png)

Save the data in "attribute_ratings.csv" in the following format:

hotel_name, review_id, attribute, star_value

In [None]:
def get_page_fast(page_url):
    """Fetch a page from the site immediately, with no pause beforehand.

    ``page_url`` is a site-relative path appended to ``base_url``.

    Returns the response body as text.
    """
    reply = requests.get(base_url + page_url,
                         headers={'User-Agent': user_agent})
    return reply.text

""" STEP 4 """
def parse_reviewlist_page(html):
    """Parse one page of a hotel's reviews and record attribute ratings.

    For every review box on the page that has a quote link, the review's
    own page is fetched with get_page_fast() and one row
    (hotel_name, review_id, attribute, star_value) is appended to
    'attribute_ratings.csv' for each rated attribute. Reviews without a
    rating list, or flagged as having no ratings, contribute no rows.

    Side effects on module globals:
        review_count -- increased by the number of review boxes on the page.
        continues    -- set to False when there is no further page to visit.

    Args:
        html: HTML text of a hotel review page, as returned by get_page().

    Returns:
        The relative URL of the next review page of the same hotel, or
        None when there is no next page (``continues`` is then False).
    """
    global continues
    # for output testing
    global review_count

    # Name the parser explicitly so bs4 neither guesses nor warns; "lxml"
    # is the parser the warning in the recorded output recommends.
    soup = BeautifulSoup(html, "lxml")

    heading = soup.find(id="HEADING")
    if heading is None:
        # Without the heading this is not a usable hotel page; stop paging
        # through this hotel rather than crash on None.
        print("No hotel heading found; stopping")
        continues = False
        return None
    hotel_name = heading.get_text()

    # newline='' is what the csv module expects of files it writes to;
    # utf-8 keeps non-ASCII hotel names from failing on narrow locales.
    with open('attribute_ratings.csv', 'a', newline='', encoding='utf-8') as h:
        headers = ['hotel_name', 'review_id', 'attribute', 'star_value']
        writer = csv.DictWriter(h, fieldnames=headers, lineterminator='\n')

        review_boxes = soup.find_all("div", {"class": "reviewSelector"})
        review_count += len(review_boxes)
        for review_box in review_boxes:
            # The box's id attribute doubles as the lookup key on the
            # review's own page.
            review_id = review_box.get('id')
            if not review_id:
                continue

            # The quote link leads to the page carrying the attribute ratings.
            quote = review_box.find('div', {'class': 'quote'})
            if quote is None:
                continue
            quote_link = quote.find('a')
            quote_url = quote_link.get('href') if quote_link is not None else None
            if not quote_url:
                continue

            soup2 = BeautifulSoup(get_page_fast(quote_url), "lxml")
            feat_review = soup2.find(id=review_id)
            if feat_review is None:
                # The review is not present on the page that was fetched.
                continue

            rating_list = feat_review.find('div', {'class': 'rating-list'})
            if rating_list is None:
                continue
            # Reviews explicitly marked as unrated contribute nothing.
            if rating_list.find('span', {'class': 'noRatings'}) is not None:
                continue

            for ans in rating_list.find_all('li', {'class': 'recommend-answer'}):
                sprite_rating = ans.find('span', {'class': 'rate'})
                label = ans.find('div')
                img = sprite_rating.find('img') if sprite_rating is not None else None
                if label is None or img is None:
                    continue
                attribute = label.get_text()
                # The star value is carried in the image's alt text.
                star_val = img.get('alt', '')
                if not star_val:
                    continue

                # NOTE(review): the slices below assume a fixed layout --
                # two leading and one trailing character around the hotel
                # name, a 7-character prefix on the review id (presumably
                # "review_"), and the star count as the first character of
                # the alt text. Confirm against the live markup.
                writer.writerow({'hotel_name': hotel_name[2:-1],
                                 'review_id': review_id[7:],
                                 'attribute': attribute,
                                 'star_value': star_val[0]})

    # Work out where to go next; every exit that yields no URL also clears
    # ``continues`` so the caller's loop cannot request a None URL.
    div = soup.find("div", {"class": "unified pagination "})
    if div is None:
        # A page with no pagination block has nowhere further to go.
        print("No pagination block found; stopping")
        continues = False
        return None

    if div.find('span', {'class': 'nav next disabled'}):
        print("We reached last page")
        continues = False
        return None

    for href in div.find_all('a', href=True):
        if href.find(text=True) == 'Next':
            print("Next url is %s" % href['href'])
            return href['href']

    # Pagination exists but carries no "Next" link: treat as the last page.
    print("No Next link found; stopping")
    continues = False
    return None


# Create (or truncate) the output file and write the header row once.
# newline='' is what the csv module expects of files it writes to.
with open('attribute_ratings.csv', 'w', newline='', encoding='utf-8') as h:
    headers = ['hotel_name', 'review_id', 'attribute', 'star_value']
    writer = csv.DictWriter(h, fieldnames=headers, lineterminator='\n')
    writer.writeheader()

# Walk through every review page of every hotel URL collected in ``data``.
for hotel_url in data:
    # Number of review boxes seen for this hotel; updated by
    # parse_reviewlist_page().
    review_count = 0
    # Flag cleared by parse_reviewlist_page() when the last page is reached.
    continues = True
    # ``hotel_url`` is rebound to each following page of the same hotel;
    # this does not disturb the for-loop's own iteration over ``data``.
    # Also stop when the parser hands back no URL, since requesting
    # base_url + None would raise.
    while continues and hotel_url:
        hotel_html = get_page(hotel_url)
        hotel_url = parse_reviewlist_page(hotel_html)
    print('# of english reviews: {}'.format(review_count))



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


Next url is /Hotel_Review-g60745-d94344-Reviews-or10-Marriott_Vacation_Club_Pulse_at_Custom_House_Boston-Boston_Massachusetts.html#REVIEWS


-------