# Web Scraper for trip advisor

## Prerequisites

In [None]:
pip install selenium

## Imports

In [1]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import ElementClickInterceptedException

import numpy as np
import json

import pymongo

## Constants

In [2]:
URL_RESTAURANTS = "https://www.tripadvisor.com/Restaurants-g189473-Thessaloniki_Thessaloniki_Region_Central_Macedonia.html"
URL_ONE_CAFE = "https://www.tripadvisor.com/Restaurant_Review-g189473-d3807291-Reviews-To_Tsai_Thessaloniki-Thessaloniki_Thessaloniki_Region_Central_Macedonia.html"

PATH = "chromedriver.exe"

## Custom tools

#### Elements Class - Contains all the element identifiers

In [3]:
class element:
    '''
    This is a class that contains all the elements we will need to scrape data from TripAdvisor
    '''
    COOKIES_ACCEPT_BUTTON = (By.ID, "onetrust-accept-btn-handler")
    SHOW_MORE_ESTABLISHMENT_TYPES_BUTTON = (By.CLASS_NAME, "fdmYH")

    #Establishment Type elements:
    RESTAURANTS_ESTABLISHMENT_TYPE_CHECKBOX = (By.ID, 'checkbox_3')
    RESTAURANTS_ESTABLISHMENT_TYPE_BUTTON = (By.XPATH, f".//label[@for='{RESTAURANTS_ESTABLISHMENT_TYPE_CHECKBOX[1]}']")
    COFFEE_AND_TEA_ESTABLISHMENT_TYPE_CHECKBOX = (By.ID, 'checkbox_6')
    COFFEE_AND_TEA_ESTABLISHMENT_TYPE_BUTTON = (By.XPATH, f".//label[@for='{COFFEE_AND_TEA_ESTABLISHMENT_TYPE_CHECKBOX[1]}']")
    BARS_ESTABLISHMENT_TYPE_CHECKBOX = (By.ID, 'checkbox_241')
    BARS_ESTABLISHMENT_TYPE_BUTTON = (By.XPATH, f".//label[@for='{BARS_ESTABLISHMENT_TYPE_CHECKBOX[1]}']")
    NUMBER_OF_ESTABLISHMENTS = (By.XPATH, ".//span[@class='ffdhf b']")
    NEXT_PAGE_IN_ESTABLISHMENTS = (By.XPATH, './/a[@class="nav next rndBtn ui_button primary taLnk"]')


    LIST_ITEM = (By.XPATH,".//div[@data-test='&1_list_item']")
    LIST_ITEM_URL = (By.XPATH,".//a[@class='bHGqj Cj b']")
    REVIEW_TABLE = (By.ID, "taplc_location_reviews_list_resp_rr_resp_0")
    REVIEWS_COUNT = (By.CLASS_NAME, "reviews_header_count")
    REVIEW_CONTAINER = (By.XPATH,".//div[@class='review-container']")
    EXPAND_REVIEW_BUTTON = (By.XPATH,"//span[@class='taLnk ulBlueLinks']")

    REVIEW_TITLE = (By.XPATH,".//span[@class='noQuotes']")
    REVIEW_DATE = (By.XPATH,".//span[contains(@class, 'ratingDate')]")
    REVIEW_RATING = (By.XPATH,".//span[contains(@class, 'ui_bubble_rating bubble_')]")
    REVIEW_TEXT = (By.XPATH,".//p[@class='partial_entry']")
    DATE_OF_VISIT = (By.XPATH, ".//div[@class='prw_rup prw_reviews_stay_date_hsx']")
    NEXT_PAGE_IN_REVIEWS = (By.XPATH,'.//a[@class="nav next ui_button primary"]')

    REVIEWER_IMAGE = (By.XPATH, './/div[@class="prw_rup prw_reviews_member_info_resp"]')
    REVIEWER_POP_UP_CONTAINER = (By.CLASS_NAME, "ui_overlay")
    REVIEWER_NAME = (By.XPATH, './/h3[@class="username reviewsEnhancements"]')
    REVIEWER_AGE_TOWN = (By.XPATH, './/ul[@class="memberdescriptionReviewEnhancements"]')
    REVIEWER_PROFILE = (By.XPATH, './/a[contains(@href, "/Profile/")]')
    REVIEWER_ENCHANCEMENTS = (By.XPATH, './/ul[@class="countsReviewEnhancements"]/li/span[@class="badgeTextReviewEnhancements"]')

    POI_NAME = (By.CLASS_NAME, 'fHibz') # Place Of Interest

    CLOSE_X = (By.XPATH, './/div[@class="ui_close_x"]')

#### Establishments Class - Used to along with the select_establishments method of TripAdvisorScrapper

In [4]:
class establishment:
    '''
    This is a class used to create an establishment type passed on TripAdvisor class in order to make the correct establishment type selections
    '''
    def __init__(self,checkboxElement,buttonElement,clickWhenStateIs):
        '''
        Constructor:
            checkboxElement     : The establishment type checkbox element
            buttonElement       : The establishment type button element that is pressed depending on "clickWhenStateIs" value
            clickWhenStateIs    : Checks if the checkboxElement state is equal to 'clickWhenStateIs' and if so buttonElement is clicked else nothing is clicked
        '''
        self.checkboxElement = checkboxElement
        self.buttonElement = buttonElement
        self.clickWhenStateIs = clickWhenStateIs

#### General Tools Class

In [5]:
class tools:
    '''
    This class contains some static general purpose tools needed
    '''
    @staticmethod
    def contains_number(inputString):
        '''
        Check if the inputString contains any numeric values
        '''
        return any(char.isdigit() for char in inputString)

#### Database management class

In [6]:
class Database:
    '''
    This class is used to manage the data handling of a Mongo database
    '''
    def __init__(self,mongo_uri,db_name,col_name):
        '''
        Constructor:
            mongo_uri   : the mongoDB URI to connect to
            db_name     : the mongo database to use
            col_name    : the mongo collection of the database to use
        '''
        try:
            self.client = pymongo.MongoClient(mongo_uri)
            self.database = self.client[db_name]
            self.collection = self.database[col_name]
        except Exception as exc:
            raise exc
    def addDocument(self,document):
        '''
        Insert one document in the mongo collection in use
        '''
        #To add an existence check based on key (to be designed) - {!}
        self.collection.insert_one(document)

#### TripAdvisor Scraper Class

In [7]:
class TripAdvisorScraper:
    '''
    This is a scraper for TripAdvisor, it can be used to gather review data from the site.
    '''
    def __init__(self, chrome_driver_path, url, database: Database):
        '''
        Constructor :
            chrome_driver_path : the local path of chrome driver
            url : the TripAdvisor URL to start from
            database : the database instance used to gather the data
        '''
        self.driver = webdriver.Chrome(chrome_driver_path)
        self.url = url
        self.database = database

    def getElementObjects(self, element, rootItem = None,placeHolderValues:list=None):
        '''
        This method is used to gather all matching elements
        '''
        if rootItem == None:
            rootItem = self.driver
        value = element[1]
        if placeHolderValues != None:
            for i in range(0,len(placeHolderValues)):
                value = value.replace("&" + str(i),placeHolderValues[i])
        return rootItem.find_elements(element[0],value)

    def getElementObject(self, element, rootItem = None,placeHolderValues:list=None):
        '''
        This method is used to gather an element defined in the elements class
        '''
        if rootItem == None:
            rootItem = self.driver
        value = element[1]
        if placeHolderValues != None:
            for i in range(0,len(placeHolderValues)):
                value = value.replace("&" + str(i),placeHolderValues[i])
        return rootItem.find_element(element[0],value)

    def waitForElement(self,element,rootItem = None, placeHolderValues:list=None):
        '''
        This method is used to gather an element defined in the elements class but it also waits until the element is presented
        '''
        if rootItem == None:
            rootItem = self.driver
        value = element[1]
        if placeHolderValues != None:
            for i in range(0,len(placeHolderValues)):
                value = value.replace("&" + str(i+1),placeHolderValues[i])
        try:
            return WebDriverWait(rootItem, 20).until(EC.presence_of_element_located((element[0],value)))
        except:
            return None

    def acceptCookies(self):
        try:
            cookies = self.waitForElement(element.COOKIES_ACCEPT_BUTTON)
            cookies.click()
        except:
            print("No cookies! Impolite! Meh...")

    def open_browser(self):
        '''
        This method is used to navigate to the URL that was instructed to the scraper durring initialization
        '''
        self.driver.get(self.url) #Load URL
        self.acceptCookies()

    def select_establishments(self,establishments:list):
        '''
        This method is used to select establishment types on TripAdvisor
        Parameters:
        establistmentElements :
            Gets a list of objects with three keys, checkbox, initialState and button :
                'checkbox'      : The checkbox element to check (Type: element object from element class)
                'button'        : The button element to click in order to change the state (Type: element object from element class)
                'initialState'  : The initial state of the checkbox (Type: boolean, True/False)
        '''

        #Expand list of establishments:
        try:
            #there are other elements with the same class name, but this is the first one
            more_button = self.waitForElement(element.SHOW_MORE_ESTABLISHMENT_TYPES_BUTTON)
            more_button.click()
        except:
            pass

        #Select establishment types:
        for e in establishments:
            if type(e) != establishment: continue
            try:
                establishment_checkbox = self.getElementObject(e.checkboxElement).is_selected()
                if establishment_checkbox == e.clickWhenStateIs:
                    self.getElementObject(e.buttonElement).click()
                    time.sleep(1) #merikes fores den prolabene na kanei click sta cafes
            except:
                pass

    def gather_data(self):
        '''
        Gather data from TripAdvisor
        '''
        time.sleep(2)
        geolocation = ((self.driver.current_url).split('-')[1]).replace('g', '')
        number_of_establishments = int(self.getElementObject(element.NUMBER_OF_ESTABLISHMENTS).text)
        k=1
        while k <= number_of_establishments:
            try:
                item = self.waitForElement(element.LIST_ITEM, placeHolderValues=[str(k)] )
                url = self.getElementObject(element.LIST_ITEM_URL, rootItem=item).get_attribute("href")
                print(url)
                k = k + 1
            except NoSuchElementException:
                pass
            if k%30==0:
                try:
                    self.getElementObject(element.NEXT_PAGE_IN_ESTABLISHMENTS).click()
                except:
                    print("its over")
                    break                          

    def get_reviews(self, geolocation):
        '''
        Get reviews from TripAdvisor
        '''
        self.acceptCookies()
        review_table = self.waitForElement(element.REVIEW_TABLE)
        # d_location is a unique key for the POI
        try:
            reviews_no = self.getElementObject(element.REVIEWS_COUNT).text
            reviews_no = int(reviews_no.replace("(", "").replace(")", ""))
            num_page = int(reviews_no / 10)
            k=0
            for i in range(0, num_page):
                # expand the review
                time.sleep(4) #<= We need to think about this waiting time {!}
                container = self.getElementObjects(element.REVIEW_CONTAINER)
                print("container ", len(container))
                for j in range(len(container)):
                    review_object = {}
                    review_object["poi_name"] = self.getElementObject(element.POI_NAME).text
                    review_object["poi_location_id"] = ((self.driver.current_url.split(geolocation+'-')[1]).split('-')[0]).replace('d', '')
                    try:
                        try:
                            time.sleep(2) #<= We need to think about this waiting time {!}
                            more = self.getElementObject(element.EXPAND_REVIEW_BUTTON,rootItem=container[j])
                            if more != None:
                                if more.text == "More":
                                    more.click()
                        except StaleElementReferenceException:
                            print("Comment ",j, " could be longer, but I will let that pass..." )

                        review_object["id"]=container[j].get_attribute("data-reviewid")
                        review_object["title"] = self.getElementObject(element.REVIEW_TITLE,rootItem=container[j]).text
                        review_object["date"] = self.getElementObject(element.REVIEW_DATE,rootItem=container[j]).get_attribute("title")
                        review_object["review_rating"] = self.getElementObject(element.REVIEW_RATING,rootItem=container[j]).get_attribute("class").split("_")[3]
                        review_object["text"] = self.getElementObject(element.REVIEW_TEXT,rootItem=container[j]).text.replace("\n", " ")
                        review_object["date_of_visit"] = self.getElementObject(element.DATE_OF_VISIT,rootItem=container[j]).text.replace("Date of visit: ", "")
                        try:
                            reviewer_json = self.get_reviewer_info(container[j])
                        except ElementClickInterceptedException:
                            print("cannot click for some reason... but I am trying... ")

                        review_object["reviewer"]=reviewer_json
                        review_json = json.dumps(review_object)
                        print(review_json)
                        self.database.addDocument(review_object) # insert to mongodb
                        print("-----------")
                    except NoSuchElementException:
                        print("Didn't found item ",j, ", but it should be there.")
                k=i
                # change the page
                self.getElementObject(element.NEXT_PAGE_IN_REVIEWS).click()
        except NoSuchElementException:
            print("Oops, unlucky search!")
        except TimeoutException:
            print("You run out of time!")

        time.sleep(2)
        self.driver.execute_script("window.history.go(-1)") #go back

    def get_reviewer_info(self, container):
        '''
        Get reviewer info from a review
        '''
        reviewer_object = {}
        time.sleep(2)
        self.getElementObject(element.REVIEWER_IMAGE,rootItem=container).click() # open pop up
        time.sleep(2)
        reviewer_container = self.waitForElement(element.REVIEWER_POP_UP_CONTAINER)
        reviewer_object["name"] = self.getElementObject(element.REVIEWER_NAME, reviewer_container).text
        try:
            reviewer_object["handle"] = (self.getElementObject(element.REVIEWER_PROFILE, reviewer_container).get_attribute('href')).split('/Profile/')[1]
        except:
            pass

        try:
            reviewer_other_info = self.getElementObjects(element.REVIEWER_AGE_TOWN)
            for reviewer in reviewer_other_info:
                if '\n' in reviewer.text:
                    text = (reviewer.text).split('\n')[1]
                else:
                    text = reviewer.text
                if 'From' in text:
                    reviewer_object["location"] = text.replace('From ', '')
                elif 'from' in text:
                    reviewer_object["location"] = text.split(' from ')[1]
                    if ' ' in text.split(' from ')[0]:
                        reviewer_object["age"] = text.split(' ')[0]
                        reviewer_object["sex"] = text.split(' ')[1]
                    else:
                        if tools.contains_number((text).split(' from ')[0]):
                            reviewer_object["age"] = (text).split(' from ')[0]
                        else:
                            reviewer_object["sex"] = ((text).split(' from ')[0]).lower()
        except:
            pass

        try:
            reviewer_enchancements = self.getElementObjects(element.REVIEWER_ENCHANCEMENTS)
            for enchancement in reviewer_enchancements:
                if 'Contribution' in enchancement.text:
                    reviewer_object["contributions"] = int(enchancement.text.split(' ')[0])
                elif 'Help' in enchancement.text:
                    reviewer_object["helpful_votes"] = int(enchancement.text.split(' ')[0])
                elif 'visit' in enchancement.text:
                    reviewer_object["cities_visited"] = int(enchancement.text.split(' ')[0])
                elif 'Photo' in enchancement.text:
                    reviewer_object["photo"] = int(enchancement.text.split(' ')[0])
        except:
            pass

        self.getElementObject(element.CLOSE_X,rootItem=reviewer_container).click() # close pop up
        return reviewer_object

    def quit_browser(self):
        '''
        Close browser
        '''
        self.driver.quit()


In [8]:
#Connect to mongoDB database :
database = Database("mongodb://localhost:27017/","trip_advisor","reviews")
#Scrape data from TripAdvisor :
tripAdvisorScraper = TripAdvisorScraper(PATH,URL_RESTAURANTS,database)
tripAdvisorScraper.open_browser()
tripAdvisorScraper.select_establishments(
    establishments=[
        establishment(element.RESTAURANTS_ESTABLISHMENT_TYPE_CHECKBOX,element.RESTAURANTS_ESTABLISHMENT_TYPE_BUTTON,True),
        establishment(element.COFFEE_AND_TEA_ESTABLISHMENT_TYPE_CHECKBOX,element.COFFEE_AND_TEA_ESTABLISHMENT_TYPE_BUTTON,False),
        establishment(element.BARS_ESTABLISHMENT_TYPE_CHECKBOX,element.BARS_ESTABLISHMENT_TYPE_BUTTON,False)
    ]
)
tripAdvisorScraper.gather_data()
# TripAdvisorScraper.get_reviews()
tripAdvisorScraper.quit_browser()

  self.driver = webdriver.Chrome(chrome_driver_path)


https://www.tripadvisor.com/Restaurant_Review-g189473-d3807291-Reviews-To_Tsai_Thessaloniki-Thessaloniki_Thessaloniki_Region_Central_Macedonia.html
https://www.tripadvisor.com/Restaurant_Review-g189473-d10798846-Reviews-Koukos-Thessaloniki_Thessaloniki_Region_Central_Macedonia.html
https://www.tripadvisor.com/Restaurant_Review-g189473-d7088226-Reviews-Menta_Cafe_Bar-Thessaloniki_Thessaloniki_Region_Central_Macedonia.html
https://www.tripadvisor.com/Restaurant_Review-g189473-d17535220-Reviews-Valenio-Thessaloniki_Thessaloniki_Region_Central_Macedonia.html
https://www.tripadvisor.com/Restaurant_Review-g189473-d8386847-Reviews-The_Blue_Cup-Thessaloniki_Thessaloniki_Region_Central_Macedonia.html
https://www.tripadvisor.com/Restaurant_Review-g189473-d8759001-Reviews-JOIN_Juice_Bars-Thessaloniki_Thessaloniki_Region_Central_Macedonia.html
https://www.tripadvisor.com/Restaurant_Review-g189473-d5461549-Reviews-Sugar_Angel-Thessaloniki_Thessaloniki_Region_Central_Macedonia.html
https://www.tripa