# Web Scraper for TripAdvisor

## Prerequisites

In [None]:
pip install selenium

## Imports

In [12]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import ElementClickInterceptedException

import numpy as np
import json

## Constants

In [13]:
# Listing page handed to TripAdvisorScraper as its start URL in the final cell.
URL_RESTAURANTS = "https://www.tripadvisor.com/Restaurants-g189473-Thessaloniki_Thessaloniki_Region_Central_Macedonia.html"
# Review page of a single place; not referenced by the code visible in this notebook.
URL_ONE_CAFE = "https://www.tripadvisor.com/Restaurant_Review-g189473-d3807291-Reviews-To_Tsai_Thessaloniki-Thessaloniki_Thessaloniki_Region_Central_Macedonia.html"

# chromedriver executable, passed as the `path` argument when TripAdvisorScraper is constructed.
PATH = "chromedriver.exe"

## Custom tools

#### Elements Class - Contains all the element identifiers

In [14]:
class element:
    '''
    This is a class that contains all the elements we will need to scrape data from TripAdvisor

    Every attribute is a (By strategy, selector) tuple. TripAdvisorScraper passes index 0 as the
    locator strategy and index 1 as the selector to find_element / find_elements / WebDriverWait.
    '''
    COOKIES_ACCEPT_BUTTON = (By.ID, "onetrust-accept-btn-handler")
    # NOTE(review): short class names such as "fdmYH" look auto-generated and may change
    # between site releases - verify against the live page before relying on them.
    SHOW_MORE_ESTABLISHMENT_TYPES_BUTTON = (By.CLASS_NAME, "fdmYH")

    #Establishment Type elements:
    # Each *_BUTTON locator targets the <label> whose `for` attribute equals the id of the
    # matching *_CHECKBOX, so the id is written once and reused through the f-string.
    RESTAURANTS_ESTABLISHMENT_TYPE_CHECKBOX = (By.ID, 'checkbox_3')
    RESTAURANTS_ESTABLISHMENT_TYPE_BUTTON = (By.XPATH, f".//label[@for='{RESTAURANTS_ESTABLISHMENT_TYPE_CHECKBOX[1]}']")
    COFFEE_AND_TEA_ESTABLISHMENT_TYPE_CHECKBOX = (By.ID, 'checkbox_6')
    COFFEE_AND_TEA_ESTABLISHMENT_TYPE_BUTTON = (By.XPATH, f".//label[@for='{COFFEE_AND_TEA_ESTABLISHMENT_TYPE_CHECKBOX[1]}']")
    BARS_ESTABLISHMENT_TYPE_CHECKBOX = (By.ID, 'checkbox_241')
    BARS_ESTABLISHMENT_TYPE_BUTTON = (By.XPATH, f".//label[@for='{BARS_ESTABLISHMENT_TYPE_CHECKBOX[1]}']")


    # "&1" is a placeholder: waitForElement replaces it with the first entry of placeHolderValues
    # (loop_at_rest passes the 1-based position of the list item).
    LIST_ITEM = (By.XPATH,".//div[@data-test='&1_list_item']")
    LIST_ITEM_URL = (By.XPATH,".//a[@class='bHGqj Cj b']")
    REVIEW_TABLE = (By.ID, "taplc_location_reviews_list_resp_rr_resp_0")
    REVIEWS_COUNT = (By.CLASS_NAME, "reviews_header_count")
    REVIEW_CONTAINER = (By.XPATH,".//div[@class='review-container']")
    # NOTE(review): unlike the other XPath locators this one starts with "//" instead of ".//".
    # In XPath "//" selects from the document root, so passing a rootItem probably does not
    # restrict the search to that review container - confirm whether this is intended.
    EXPAND_REVIEW_BUTTON = (By.XPATH,"//span[@class='taLnk ulBlueLinks']")

    # Fields read from a single review container by get_reviews.
    REVIEW_TITLE = (By.XPATH,".//span[@class='noQuotes']")
    REVIEW_DATE = (By.XPATH,".//span[contains(@class, 'ratingDate')]")
    REVIEW_RATING = (By.XPATH,".//span[contains(@class, 'ui_bubble_rating bubble_')]")
    REVIEW_TEXT = (By.XPATH,".//p[@class='partial_entry']")
    DATE_OF_VISIT = (By.XPATH, ".//div[@class='prw_rup prw_reviews_stay_date_hsx']")
    NEXT_PAGE_IN_REVIEWS = (By.XPATH,'.//a[@class="nav next ui_button primary"]')

    # Reviewer pop-up: get_reviewer_info clicks REVIEWER_IMAGE, then reads the overlay.
    REVIEWER_IMAGE = (By.XPATH, './/div[@class="prw_rup prw_reviews_member_info_resp"]')
    REVIEWER_POP_UP_CONTAINER = (By.CLASS_NAME, "ui_overlay")
    REVIEWER_NAME = (By.XPATH, './/h3[@class="username reviewsEnhancements"]')
    REVIEWER_AGE_TOWN = (By.XPATH, './/ul[@class="memberdescriptionReviewEnhancements"]')
    REVIEWER_PROFILE = (By.XPATH, './/a[contains(@href, "/Profile/")]')
    REVIEWER_ENCHANCEMENTS = (By.XPATH, './/ul[@class="countsReviewEnhancements"]/li/span[@class="badgeTextReviewEnhancements"]')

    POI_NAME = (By.CLASS_NAME, 'fHibz') # Place Of Interest

    # Close button of the reviewer pop-up (get_reviewer_info looks it up inside the overlay).
    CLOSE_X = (By.XPATH, './/div[@class="ui_close_x"]')

#### Establishments Class - Used along with the select_establishments method of TripAdvisorScraper

In [15]:
class establishment:
    '''
    Describes one establishment-type filter for TripAdvisorScraper.select_establishments

    checkboxElement  : locator of the checkbox whose selected state is inspected
    buttonElement    : locator of the element that is clicked to toggle the checkbox
    clickWhenStateIs : the click happens only when the checkbox state equals this boolean
    '''
    def __init__(self,checkboxElement,buttonElement,clickWhenStateIs):
        # Store the three settings unchanged; select_establishments reads them back by name.
        (self.checkboxElement,
         self.buttonElement,
         self.clickWhenStateIs) = (checkboxElement, buttonElement, clickWhenStateIs)

#### General Tools Class

In [16]:
class tools:
    '''
    Small helpers that do not need a browser
    '''
    @staticmethod
    def contains_number(inputString):
        '''
        Returns True when at least one character of inputString is a digit, otherwise False
        '''
        for symbol in inputString:
            if symbol.isdigit():
                return True
        return False

#### TripAdvisor Scraper Class

In [17]:
class TripAdvisorScraper:
    '''
    Selenium-driven scraper: opens a listing page, toggles establishment-type filters and
    prints the reviews (plus reviewer details) of the places found in the list.
    '''

    # Number of reviews assumed per review page when working out how many pages to visit.
    REVIEWS_PER_PAGE = 10

    def __init__(self, path, url):
        '''
        Constructor
        Parameters:
        path : location of the chromedriver executable, handed to webdriver.Chrome
        url  : page that open_browser loads
        '''
        # Use the argument, not the module-level PATH constant, so the caller picks the driver.
        self.driver = webdriver.Chrome(path)
        self.url = url

    @staticmethod
    def _fill_placeholders(value, placeHolderValues):
        '''
        Returns value with "&1", "&2", ... replaced by placeHolderValues[0], [1], ...
        (1-based, matching selectors such as "&1_list_item"). None or an empty list changes nothing.
        '''
        if not placeHolderValues:
            return value
        # Highest index first, so that "&1" cannot eat the prefix of "&10".
        for position in range(len(placeHolderValues), 0, -1):
            value = value.replace("&" + str(position), str(placeHolderValues[position - 1]))
        return value

    @staticmethod
    def _parse_count(text):
        '''
        Returns the integer held by the first space-separated token of text, ignoring any
        non-digit characters ("(195)" -> 195, "1,234 Contributions" -> 1234); 0 when no digit is found.
        '''
        digits = "".join(ch for ch in text.split(' ')[0] if ch.isdecimal())
        return int(digits) if digits else 0

    @staticmethod
    def _count_pages(reviews_no, per_page=10):
        '''
        Returns how many pages are needed for reviews_no reviews (rounded up, so a partial
        last page is included)
        '''
        return -(-reviews_no // per_page)

    @staticmethod
    def _parse_member_description(text):
        '''
        Extracts the fields found in a reviewer description text.
        Returns a dict holding any of the keys 'location', 'age' and 'sex'; keys that
        cannot be read from the text are simply absent.
        '''
        # When the text spans several lines only the second line is inspected.
        if '\n' in text:
            text = text.split('\n')[1]
        if 'From' in text:
            return {"location": text.replace('From ', '')}
        if 'from' not in text:
            return {}
        parts = text.split(' from ')
        if len(parts) < 2:
            return {}
        head = parts[0]
        found = {"location": parts[1]}
        if ' ' in head:
            # Two words in front of "from": the first is the age, the second the sex.
            words = text.split(' ')
            found["age"] = words[0]
            found["sex"] = words[1]
        elif tools.contains_number(head):
            found["age"] = head
        else:
            found["sex"] = head.lower()
        return found

    def _resolve(self, locator, rootItem, placeHolderValues):
        '''
        Returns (search root, By strategy, final selector) for a locator tuple of the element class
        '''
        root = self.driver if rootItem is None else rootItem
        return root, locator[0], self._fill_placeholders(locator[1], placeHolderValues)

    def getElementObjects(self, element, rootItem = None,placeHolderValues:list=None):
        '''
        This method is used to gather all matching elements
        Parameters:
        element           : (By strategy, selector) tuple from the element class
        rootItem          : object to search in; the driver when omitted
        placeHolderValues : values substituted for "&1", "&2", ... inside the selector
        Returns the list produced by find_elements (empty when nothing matches).
        '''
        root, by, value = self._resolve(element, rootItem, placeHolderValues)
        return root.find_elements(by, value)

    def getElementObject(self, element, rootItem = None,placeHolderValues:list=None):
        '''
        This method is used to gather an element defined in the elements class
        Parameters are the same as for getElementObjects.
        Whatever find_element raises when nothing matches is propagated to the caller.
        '''
        root, by, value = self._resolve(element, rootItem, placeHolderValues)
        return root.find_element(by, value)

    def waitForElement(self,element,rootItem = None, placeHolderValues:list=None):
        '''
        This method is used to gather an element defined in the elements class but it also waits until the element is presented
        Waits up to 20 seconds. Returns the element, or None when it did not show up or the lookup failed.
        '''
        root, by, value = self._resolve(element, rootItem, placeHolderValues)
        try:
            return WebDriverWait(root, 20).until(EC.presence_of_element_located((by, value)))
        except Exception:
            # Best effort by design: every lookup failure maps to None, while
            # KeyboardInterrupt / SystemExit are left alone so the run can be stopped.
            return None

    def acceptCookies(self):
        '''
        Clicks the cookie consent button when it is present; prints a note otherwise
        '''
        cookies = self.waitForElement(element.COOKIES_ACCEPT_BUTTON)
        if cookies is None:
            print("No cookies! Impolite! Meh...")
            return
        try:
            cookies.click()
        except Exception:
            print("No cookies! Impolite! Meh...")

    def open_browser(self):
        '''
        This method is used to navigate to the URL that was given to the scraper during initialization
        '''
        self.driver.get(self.url) #Load URL
        self.acceptCookies()

    def select_establishments(self,establishments:list):
        '''
        This method is used to select establishment types on TripAdvisor
        Parameters:
        establishments :
            A list of establishment objects; entries of any other type are skipped. For each one:
                checkboxElement  : The checkbox element to check (Type: element object from element class)
                buttonElement    : The button element to click in order to change the state (Type: element object from element class)
                clickWhenStateIs : The button is clicked only when the checkbox state equals this value (Type: boolean, True/False)
        '''

        #Expand list of establishments:
        #there are other elements with the same class name, but this is the first one
        more_button = self.waitForElement(element.SHOW_MORE_ESTABLISHMENT_TYPES_BUTTON)
        if more_button is not None:
            try:
                more_button.click()
            except Exception as error:
                print("Could not expand the establishment types:", type(error).__name__)

        #Select establishment types:
        for e in establishments:
            if not isinstance(e, establishment):
                continue
            try:
                is_checked = self.getElementObject(e.checkboxElement).is_selected()
                if is_checked == e.clickWhenStateIs:
                    self.getElementObject(e.buttonElement).click()
                    time.sleep(1) #sometimes the click on the cafes filter did not register in time
            except Exception as error:
                # Best effort: one filter that cannot be toggled must not stop the others.
                print("Could not toggle an establishment type:", type(error).__name__)

    def loop_at_rest(self, num_page=2, num_items=1):
        '''
        Visits the places of the current listing page and prints their reviews
        Parameters:
        num_page  : how many passes over the list are made (default 2)
        num_items : how many list items are opened per pass (default 1)
        '''
        time.sleep(2)
        geolocation = ((self.driver.current_url).split('-')[1]).replace('g', '')
        print(geolocation)
        # NOTE(review): nothing here moves the listing to its next page, so every pass
        # presumably reads the same items again - confirm and add list pagination if needed.
        for i in range(0, num_page):
            for j in range(1, num_items+1):
                time.sleep(10)
                item = self.waitForElement(element.LIST_ITEM, placeHolderValues=[str(j)] )
                if item is None:
                    # Without the item the URL lookup would search the whole document instead.
                    print("List item ", j, " did not show up, skipping it.")
                    continue
                url = self.getElementObject(element.LIST_ITEM_URL, rootItem=item).get_attribute("href")
                self.driver.get(url)
                self.get_reviews(geolocation)

    def _print_review(self, j, review):
        '''
        Expands (when possible) and prints one review container together with its reviewer data
        '''
        try:
            try:
                time.sleep(2) #<= We need to think about this waiting time {!}
                more = self.getElementObject(element.EXPAND_REVIEW_BUTTON,rootItem=review)
                if more.text == "More":
                    more.click()
            except NoSuchElementException:
                pass # nothing to expand
            except (StaleElementReferenceException, ElementClickInterceptedException):
                print("Comment ",j, " could be longer, but I will let that pass..." )

            print('review_data:')
            review_title = self.getElementObject(element.REVIEW_TITLE,rootItem=review).text
            review_date = self.getElementObject(element.REVIEW_DATE,rootItem=review).get_attribute("title")
            # the fourth "_"-separated piece of the class attribute is the rating value
            review_rating = self.getElementObject(element.REVIEW_RATING,rootItem=review).get_attribute("class").split("_")[3]
            review_text = self.getElementObject(element.REVIEW_TEXT,rootItem=review).text.replace("\n", " ")
            date_of_visit = self.getElementObject(element.DATE_OF_VISIT,rootItem=review).text.replace("Date of visit: ", "")
            print(review_title)
            print(review_date)
            print(review_rating)
            print(review_text)
            print(date_of_visit)
            try:
                print(" reviewer_data:")
                reviewer_json = self.get_reviewer_info(review)
                print(reviewer_json)
            except ElementClickInterceptedException:
                print("cannot click for some reason... but I am trying... ")
            print("-----------")
        except NoSuchElementException:
            print("Didn't found item ",j, ", but it should be there.")
        except StaleElementReferenceException:
            # The page re-rendered while the review was being read; skip it instead of aborting.
            print("Item ",j, " went stale while it was being read, skipping it.")

    def get_reviews(self, geolocation):
        '''
        Prints every review of the place that is currently open, then goes one step back in history
        Parameters:
        geolocation : the geo id taken from the listing URL; used to cut the place id out of the current URL
        '''
        self.acceptCookies()
        poi_name = self.getElementObject(element.POI_NAME).text
        print(poi_name)
        # Only used as a wait: give the review table time to appear before reading anything.
        self.waitForElement(element.REVIEW_TABLE)
        # d_location is a unique key for the POI
        poi_location = ((self.driver.current_url.split(geolocation+'-')[1]).split('-')[0]).replace('d', '')
        print("poi data:")
        print(poi_location)
        try:
            reviews_no = self._parse_count(self.getElementObject(element.REVIEWS_COUNT).text)
            # Round up so that the last, partially filled page is visited as well.
            num_page = self._count_pages(reviews_no, self.REVIEWS_PER_PAGE)
            print("page = ",num_page)
            for i in range(0, num_page):
                time.sleep(4) #<= We need to think about this waiting time {!}
                container = self.getElementObjects(element.REVIEW_CONTAINER)
                print("container ", len(container))
                for j, review in enumerate(container):
                    self._print_review(j, review)
                # change the page (there is no next page after the last one)
                if i + 1 < num_page:
                    self.getElementObject(element.NEXT_PAGE_IN_REVIEWS).click()
        except NoSuchElementException:
            print("Oops, unlucky search!")
        except TimeoutException:
            print("You run out of time!")

        time.sleep(2)
        self.driver.execute_script("window.history.go(-1)") #go back

    def _read_reviewer_popup(self, reviewer_container):
        '''
        Reads the open reviewer pop-up and returns the reviewer data as a dict.
        Everything except the name is optional and falls back to 0 (counts) or NaN (other fields).
        '''
        reviewer_name = self.getElementObject(element.REVIEWER_NAME, reviewer_container).text

        contributions = helpful_votes = cities_visited = photos = 0
        try:
            for enchancement in self.getElementObjects(element.REVIEWER_ENCHANCEMENTS):
                badge = enchancement.text
                if 'Contribution' in badge:
                    contributions = self._parse_count(badge)
                elif 'Help' in badge:
                    helpful_votes = self._parse_count(badge)
                elif 'visit' in badge:
                    cities_visited = self._parse_count(badge)
                elif 'Photo' in badge:
                    photos = self._parse_count(badge)
        except Exception:
            pass # optional data: keep whatever was read so far

        # Default first, so a missing profile link cannot leave the name unbound.
        reviewer_handle = np.nan
        try:
            href = self.getElementObject(element.REVIEWER_PROFILE, reviewer_container).get_attribute('href')
            reviewer_handle = href.split('/Profile/')[1]
        except (NoSuchElementException, StaleElementReferenceException, AttributeError, IndexError):
            pass # no usable profile link: the handle stays NaN

        # it is not always there but we will probably need it as nan for the sentiment analysis part (or not)
        details = {"location": np.nan, "age": np.nan, "sex": np.nan}
        try:
            for description in self.getElementObjects(element.REVIEWER_AGE_TOWN):
                details.update(self._parse_member_description(description.text))
        except Exception:
            pass # optional data: keep whatever was read so far

        return {
            "handle":reviewer_handle,
            "name":reviewer_name,
            "age":details["age"],
            "sex":details["sex"],
            "location":details["location"],
            "contributions":contributions,
            "helpful_votes":helpful_votes,
            "cities_visited":cities_visited,
            "photos":photos
        }

    def get_reviewer_info(self, container):
        '''
        Opens the reviewer pop-up of a review container, reads it and closes it again
        Parameters:
        container : the review container whose reviewer image is clicked
        Returns the reviewer data as a JSON string (json.dumps of a dict).
        Lookup / click errors of the image, the name or the close button are propagated.
        '''
        time.sleep(2)
        self.getElementObject(element.REVIEWER_IMAGE,rootItem=container).click() # open pop up
        time.sleep(2)
        reviewer_container = self.waitForElement(element.REVIEWER_POP_UP_CONTAINER)
        try:
            reviewer_object = self._read_reviewer_popup(reviewer_container)
        except Exception:
            # Do not leave the overlay open: it would cover the next reviewer image.
            try:
                self.getElementObject(element.CLOSE_X,rootItem=reviewer_container).click()
            except Exception:
                pass # closing is best effort here; the original error is the one to report
            raise
        self.getElementObject(element.CLOSE_X,rootItem=reviewer_container).click() # close pop up
        return json.dumps(reviewer_object)

    def quit_browser(self):
        '''
        Closes the browser by calling quit on the driver
        '''
        self.driver.quit()


In [18]:
# Keep the instance under its own name: binding it to "TripAdvisorScraper" would replace the
# class, and running this cell a second time would then fail.
scraper = TripAdvisorScraper(PATH,URL_RESTAURANTS)
try:
    scraper.open_browser()
    scraper.select_establishments(
        establishments=[
            establishment(element.RESTAURANTS_ESTABLISHMENT_TYPE_CHECKBOX,element.RESTAURANTS_ESTABLISHMENT_TYPE_BUTTON,True),
            establishment(element.COFFEE_AND_TEA_ESTABLISHMENT_TYPE_CHECKBOX,element.COFFEE_AND_TEA_ESTABLISHMENT_TYPE_BUTTON,False),
            establishment(element.BARS_ESTABLISHMENT_TYPE_CHECKBOX,element.BARS_ESTABLISHMENT_TYPE_BUTTON,False)
        ]
    )
    scraper.loop_at_rest()
finally:
    # Always close the browser, also when scraping fails or the cell is interrupted.
    scraper.quit_browser()

  self.driver = webdriver.Chrome(PATH)


189473
To Tsai Thessaloniki
poi data:
3807291
page =  19
container  10
review_data:
Delicious tea, for tea lovers
December 19, 2021
50
If you are a tea lover , this is the place to visit in Thessaloniki. Great variety, nice atmosphere, perfect service.
December 2021
 reviewer_data:
126 Contributions
126
20 Cities visited
84 Helpful votes
{"handle": "stefanosb114", "name": "Stefanos B", "age": NaN, "sex": NaN, "location": "Thessaloniki, Greece", "contributions": 126, "helpful_votes": 84, "cities_visited": 20, "photos": 0}
-----------
review_data:
Awesome place!
June 20, 2021
50
The staff was amazing, the tea sublime and the ambience great! I got the yerba mate and then came back the next day for an iced vanilla matcha latte. Both teas were so good I didn't feel like doing anything except drinking and enjoying the experience
June 2021
 reviewer_data:
2 Contributions
2
1 City visited
{"handle": "deesvs", "name": "Dee", "age": NaN, "sex": NaN, "location": "Thessaloniki Region, Greece", "co

KeyboardInterrupt: 