# Web Scraper for trip advisor

## Prerequisites

In [None]:
pip install selenium

## Imports

In [37]:
import csv
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import NoSuchElementException

## Constants

In [38]:
URL_RESTAURANTS = "https://www.tripadvisor.com/Restaurants-g189473-Thessaloniki_Thessaloniki_Region_Central_Macedonia.html"
URL_ONE_CAFE = "https://www.tripadvisor.com/Restaurant_Review-g189473-d3807291-Reviews-To_Tsai_Thessaloniki-Thessaloniki_Thessaloniki_Region_Central_Macedonia.html"

PATH = "chromedriver.exe"

## Custom tools

#### Elements Class - Contains all the element identifiers

In [48]:
class element:
    '''
    This is a class that contains all the elements we will need to scrape data from TripAdvisor
    '''
    COOKIES_ACCEPT_BUTTON = (By.ID, "onetrust-accept-btn-handler")
    SHOW_MORE_ESTABLISHMENT_TYPES_BUTTON = (By.CLASS_NAME, "fdmYH")
    RESTAURANTS_ESTABLISHMENT_TYPE_CHECKBOX = (By.ID, 'checkbox_3')
    RESTAURANTS_ESTABLISHMENT_TYPE_BUTTON = (By.XPATH, f".//label[@for='{RESTAURANTS_ESTABLISHMENT_TYPE_CHECKBOX[1]}']")
    COFFEE_AND_TEA_ESTABLISHMENT_TYPE_CHECKBOX = (By.ID, 'checkbox_6')
    COFFEE_AND_TEA_ESTABLISHMENT_TYPE_BUTTON = (By.XPATH, f".//label[@for='{COFFEE_AND_TEA_ESTABLISHMENT_TYPE_CHECKBOX[1]}']")
    BARS_ESTABLISHMENT_TYPE_CHECKBOX = (By.ID, 'checkbox_241')
    BARS_ESTABLISHMENT_TYPE_BUTTON = (By.XPATH, f".//label[@for='{BARS_ESTABLISHMENT_TYPE_CHECKBOX[1]}']")

#### Establishments Class

In [49]:
class establishment:
    def __init__(self,checkboxElement,buttonElement,clickWhenStateIs):
        self.checkboxElement = checkboxElement
        self.buttonElement = buttonElement
        self.clickWhenStateIs = clickWhenStateIs

#### Scraper Class

In [50]:
class TripAdvisorScraper:
    def __init__(self, path, url):
        '''
        Constructor
        '''
        self.driver = webdriver.Chrome(PATH)
        self.url = url

    def getElementObject(self,element):
        '''
        This method is used to gather an element defined in the elements class
        '''
        return self.driver.find_element(element[0],element[1])

    def waitForElement(self,element):
        '''
        This method is used to gather an element defined in the elements class but it also waits until the element is presented
        '''
        try:
            return WebDriverWait(self.driver, 20).until(EC.presence_of_element_located(element))
        except:
            return None

    def open_browser(self):
        '''
        This method is used to navigate to the URL that was instructed to the scraper durring initialization
        '''
        self.driver.get(self.url) #Load URL
        try:
            cookies = WebDriverWait(self.driver, 20).until(
                EC.presence_of_element_located(element.COOKIES_ACCEPT_BUTTON)
            )
            cookies.click()
        except:
            print("No cookies! Impolite! Meh...")

    def loop_at_rest(self):
        num_page = 2
        for i in range(0, num_page):
            num_items = 1
            for j in range(1, num_items+1):
                time.sleep(10)
                xpath = ".//div[@data-test='"+ str(j)+"_list_item']"
                item = self.driver.find_element(By.XPATH, xpath)
                url = item.find_element_by_xpath(".//a[@class='bHGqj Cj b']").get_attribute("href")
                self.driver.get(url)
                self.get_reviews()

    def select_establishments(self,establishments:list):
        '''
        This method is used to select establishment types on TripAdvisor
        Parameters:
        establistmentElements :
            Gets a list of objects with three keys, checkbox, initialState and button :
                'checkbox'      : The checkbox element to check (Type: element object from element class)
                'button'        : The button element to click in order to change the state (Type: element object from element class)
                'initialState'  : The initial state of the checkbox (Type: boolean, True/False)
        '''

        #Expand list of establishments:
        try:
            #there are other elements with the same class name, but this is the first one
            more_button = self.waitForElement(element.SHOW_MORE_ESTABLISHMENT_TYPES_BUTTON)
            more_button.click()
        except:
            pass

        for e in establishments:
            if type(e) != establishment: continue
            try:
                establishment_checkbox = self.getElementObject(e.checkboxElement).is_selected()
                if establishment_checkbox == e.clickWhenStateIs:
                    self.getElementObject(e.buttonElement).click()
            except:
                pass

    def get_reviews(self):
        try:
            cookies = WebDriverWait(self.driver, 20).until(
                EC.presence_of_element_located((By.ID, "onetrust-accept-btn-handler"))
            )

            cookies.click()
        except:
            print("No cookies! Impolite! Meh...")
        #num_page = 10
        review_table = WebDriverWait(self.driver, 20).until(
            EC.presence_of_element_located((By.ID, "taplc_location_reviews_list_resp_rr_resp_0"))
        )
        try:
            reviews_no = self.driver.find_element(By.CLASS_NAME, "reviews_header_count").text
            reviews_no = reviews_no.replace("(", "")
            reviews_no = int(reviews_no.replace(")", ""))
            num_page = int(reviews_no / 10)
            print("page = ",num_page)
            k=0
            for i in range(0, num_page):
                # expand the review
                time.sleep(4)
                container = self.driver.find_elements(By.XPATH,".//div[@class='review-container']")
                print("container ", len(container))
                for j in range(len(container)):
                    try:
                        try:
                            time.sleep(2)
                            more = container[j].find_element_by_xpath("//span[@class='taLnk ulBlueLinks']").text
                            if more == "More":
                                container[j].find_element_by_xpath("//span[@class='taLnk ulBlueLinks']").click()
                        except StaleElementReferenceException:
                            print("Comment ",j, " could be longer, but I will let that pass..." )

                        title = container[j].find_element_by_xpath(".//span[@class='noQuotes']").text
                        date = container[j].find_element_by_xpath(".//span[contains(@class, 'ratingDate')]").get_attribute("title")
                        rating = container[j].find_element_by_xpath(".//span[contains(@class, 'ui_bubble_rating bubble_')]").get_attribute("class").split("_")[3]
                        review = container[j].find_element_by_xpath(".//p[@class='partial_entry']").text.replace("\n", " ")
                        print(title)
                        print(date)
                        print(rating)
                        print(review)
                        print("-----------")
                    except NoSuchElementException:
                        print("Didn't found item ",j, ", but it should be there.")
                k=i
                # change the page
                self.driver.find_element(By.XPATH,'.//a[@class="nav next ui_button primary"]').click()
        except NoSuchElementException:
            print("Oops, unlucky search!")
        except TimeoutException:
            print("You run out of time!")
        finally:
            for i in (0,k-1):
                self.driver.execute_script("window.history.go(-1)")

    def quit_browser(self):
        self.driver.quit()


In [47]:
TripAdvisorScraper = TripAdvisorScraper(PATH,URL_RESTAURANTS)
TripAdvisorScraper.open_browser()
TripAdvisorScraper.select_establishments(
    establishments=[
        establishment(element.RESTAURANTS_ESTABLISHMENT_TYPE_CHECKBOX,element.RESTAURANTS_ESTABLISHMENT_TYPE_BUTTON,True),
        establishment(element.COFFEE_AND_TEA_ESTABLISHMENT_TYPE_CHECKBOX,element.COFFEE_AND_TEA_ESTABLISHMENT_TYPE_BUTTON,False),
        establishment(element.BARS_ESTABLISHMENT_TYPE_CHECKBOX,element.BARS_ESTABLISHMENT_TYPE_BUTTON,False)
    ]
)
TripAdvisorScraper.loop_at_rest()
# TripAdvisorScraper.get_reviews()
# TripAdvisorScraper.quit_browser()

TypeError: 'scraper' object is not callable

In [None]:
scraper.quit_browser()