In [7]:
# -*- coding: utf-8 -*-
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import traceback
from datetime import datetime, timedelta

pd.set_option('display.max_colwidth', None)

GM_WEBPAGE = 'https://www.google.com/maps/'
MAX_WAIT = 25
MAX_RETRY = 5
MAX_SCROLLS = 40

class GoogleMapsScraper:

    def __init__(self, debug=False):
        self.debug = debug
        self.driver = self.__getDriver()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        if exc_type is not None:
            traceback.print_exception(exc_type, exc_value, tb)

        self.driver.close()
        self.driver.quit()

        return True

    def sortBy(self, url, ind):
        self.driver.get(url)
        
        self.__clickOnCookieAgreement()

        wait = WebDriverWait(self.driver, MAX_WAIT)

        # Dropdown-Menu öffnen
        clicked = False
        tries = 0
        while not clicked and tries < MAX_RETRY:
            try:
                menu_bt = wait.until(EC.element_to_be_clickable((By.XPATH, '//button[@data-value=\'Sort\']')))
                menu_bt.click()

                clicked = True
                time.sleep(3)
            except Exception as e:
                tries += 1

            # die Dropdown-Menu konnte nicht geöffnet werden
            if tries == MAX_RETRY:
                return -1

        # die absteigende Sortieroption wird ausgewählt
        recent_rating_bt = self.driver.find_elements_by_xpath('//div[@role=\'menuitemradio\']')[ind]
        recent_rating_bt.click()

        # Warten auf das Laden der Bewertung (Ajax-Call)
        time.sleep(5)

        return 0

    def getPlaces(self, method='urls'):

        df_places = pd.DataFrame()

        if method == 'urls':
            pass

        df_places = df_places[['search_point_url', 'href', 'name', 'rating', 'num_reviews', 'close_time', 'other']]
        df_places.to_csv('output/places_wax.csv', index=False)
        self.driver.quit()

    # Triggert das Scrollen, das dann das Laden des nächsten Abschnitts der Bewertungen auslöst.
    # Erweitert dann die Bewertungen, fragt das DOM ab und iteriert durch jede Bewertung, um sie zu parsen
    # Gibt die Liste der geparsten Bewertungen zurück.
    def getReviews(self, offset):
        # warten, bis der nächste Teil der Bewertungen geladen sind (Ajax)
        time.sleep(4)

        self.__scroll()

        # Bewertungstext öffnen
        self.__expandReviews()

        # Bewertungen parsen
        response = BeautifulSoup(self.driver.page_source, 'html.parser')
        
        rblock = response.find_all('div', class_='jftiEf fontBodyMedium')
        
        parsed_reviews = []
        for index, review in enumerate(rblock):
            if index >= offset:
                parsed_reviews.append(self.__parse(review))
        return parsed_reviews


    def __parse(self, review):
        item = {}

        try:
            id_review = review['data-review-id']
        except Exception as e:
            id_review = None

        try:
            review_text = self.__filterString(review.find('span', class_='wiI7pd').text)
        except Exception as e:
            review_text = None

        try:
            rating = float(review.find('span', class_='kvMYJc')['aria-label'].split(' ')[1])
        except Exception as e:
            rating = None

        try:
            relative_date = review.find('span', class_='rsqaWe').text
        except Exception as e:
            relative_date = None

        item['id_review'] = id_review
        item['caption'] = review_text
        item['relative_date'] = relative_date
        item['rating'] = rating

        return item


    def __parsePlace(self, response):
        place = {}
        try:
            place['overall_rating'] = float(response.find('div', class_='gm2-display-2').text.replace(',', '.'))
        except:
            place['overall_rating'] = 'NOT FOUND'

        try:
            place['n_reviews'] = int(response.find('div', class_='gm2-caption').text.replace('.', '').replace(',','').split(' ')[0])
        except:
            place['n_reviews'] = 0
        
        return place

    # Inhalt der Bewertung öffnen - klickt auf jede Expand-Button, die dann den ellipsenförmigen Text ersetzt
    # XPath verwenden, um komplette Bewertungen zu laden
    def __expandReviews(self):
        links = self.driver.find_elements_by_xpath('//button[@jsaction="pane.review.expandReview"]')
        for l in links:
            l.click()
        wait = WebDriverWait(self.driver, 1) # 2
            
    # Scroll simulieren, um den nächsten Teil von Bewertungen zu erhalten
    def __scroll(self):
        scrollable_div = self.driver.find_element_by_css_selector('div.m6QErb.DxyBCb.kA9KIf.dS8AEf')
        self.driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', scrollable_div)


    def __getDriver(self):
        options = Options()

        if not self.debug:
            options.add_argument("--headless")
        else:
            options.add_argument("--window-size=1366,768")

        options.add_argument("--disable-notifications")
        options.add_argument("--lang=en-GB")
        input_driver = webdriver.Chrome(executable_path=ChromeDriverManager().install(), options=options)

        input_driver.get(GM_WEBPAGE)
        return input_driver

    # Cookies akzeptieren klick
    def __clickOnCookieAgreement(self):
        try:
            agree = WebDriverWait(self.driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, '//span[contains(text(), "Reject all")]')))
            agree.click()

            return True
        except:
            return False

    # util-Funktion zum Entfernen von Sonderzeichen
    def __filterString(self, str):
        strOut = str.replace('\r', ' ').replace('\n', ' ').replace('\t', ' ')
        return strOut

In [8]:
collection = [];
class ReviewGatherer:
    def __init__(self, url_file, from_date, max_items, debug):

        # Url laden
        with open(url_file, 'r') as furl:
            self.urls = [u[:-1] for u in furl]

        # Min. Datum den Bewertungen zu scrapen
        self.min_date_review = datetime.strptime(from_date, '%Y-%m-%d')

        self.max_items = max_items
        
        self.debug = debug

    def scrapeGMReviews(self):
        # init scraper and incremental add reviews
        with GoogleMapsScraper(debug=self.debug) as scraper:
            for url in self.urls:
                try:
                    error = scraper.sortBy(url, 1)
                    
                    if error == 0:
                        stop = False
                        offset = 0
                        n_new_reviews = 0
                        while not stop:
                            rlist = scraper.getReviews(offset)
                            for r in rlist:
                                # Bewertungsdatum erfassen und mit der min_date_review vergleichen
                                r['timestamp'] = self.__parseRelativeDate(r['relative_date'])
                                stop = self.__stop(r)
                                if not stop:
                                    collection.append(r)
                                    n_new_reviews += 1
                                else:
                                    break
                            offset += len(rlist)
                    else:
                        print('Sortieren von Bewertungen fehlgeschlagen für {}'.format(url))
                except Exception as e:
                    print('Exception: {}'.format(e))
                    
        return(collection)


    def __parseRelativeDate(self, string_date):
        curr_date = datetime.now()
        split_date = string_date.split(' ')

        n = split_date[0]
        delta = split_date[1]

        if delta == 'year':
            return curr_date - timedelta(days=365)
        elif delta == 'years':
            return curr_date - timedelta(days=365 * int(n))
        elif delta == 'month':
            return curr_date - timedelta(days=30)
        elif delta == 'months':
            return curr_date - timedelta(days=30 * int(n))
        elif delta == 'week':
            return curr_date - timedelta(weeks=1)
        elif delta == 'weeks':
            return curr_date - timedelta(weeks=int(n))
        elif delta == 'day':
            return curr_date - timedelta(days=1)
        elif delta == 'days':
            return curr_date - timedelta(days=int(n))
        elif delta == 'hour':
            return curr_date - timedelta(hours=1)
        elif delta == 'hours':
            return curr_date - timedelta(hours=int(n))
        elif delta == 'minute':
            return curr_date - timedelta(minutes=1)
        elif delta == 'minutes':
            return curr_date - timedelta(minutes=int(n))
        elif delta == 'moments':
            return curr_date - timedelta(seconds=1)


    def __stop(self, r):
        review_list = [x for x in collection if x['id_review'] == r['id_review']];
        is_old_review = review_list[0] if len(review_list) != 0 else None
        
        if is_old_review is None and r['timestamp'] >= self.min_date_review and len(collection) < self.max_items:
            return False
        else:
            return True

In [9]:
gatherer = ReviewGatherer("urls.text", '2011-01-01', 500, True)

try:
    collection = gatherer.scrapeGMReviews()
except Exception as e:
    print('Not handled error: {}'.format(e))

Exception: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=110.0.5481.177)

Not handled error: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=110.0.5481.177)



In [10]:
df_collection = pd.DataFrame.from_records(collection)

In [11]:
df_reviews = df_collection[['caption', 'rating', 'timestamp']]
df_reviews.to_csv('../data/reviews.csv', sep=';', encoding='utf-8')

In [12]:
df_reviews.loc[df_reviews['caption'] != ''].head()

Unnamed: 0,caption,rating,timestamp
0,"Große vegane Auswahl, außergewöhnliche und leckere Kombinationen (Translated by Google) Large vegan selection, unusual and delicious combinations",5.0,2023-03-04 23:49:58.817043
1,Wie immer ein excellenter burger und nette Bedienungen (Translated by Google) …,5.0,2023-03-04 00:49:58.817090
2,"The food is very good, I would recommend",4.0,2023-03-04 00:49:58.817094
4,"Mega cooles Ambiente und Konzept die Bürger sind auch geschmacklich sehr lecker. Man kann seiner Bürger individuelle auf seine Bedürfnisse anpassen ( Allergien/vegetarisch/Co) Was echt ein bisschen besser laufen könnte sind die Angestellten. Wir haben gefüllt 15 min zum bezahlen warten müssen( haben fast unsere Bahn verpasst )…. und beim bestellen wurden wir auch erst dran genommen , als wir erneut auf uns aufmerksam gemacht haben. Also, ist ein hipper und lecker Laden. Nur man muss als Kunde auf sich aufmerksam machen, dann quatschen die unter sich vielleicht nicht mehr so viel, aber ansonsten Top. Achja … in vielen Läden ist Mayo und Ketchup „meist kostenlos“, hier zahlt man für jede Soße … schade … Kenne das eigentlich anders, aber Jedermanns Eigen-Konzept … daher 🤷🏻‍♀️ (Translated by Google) Super cool ambience and concept the citizens are also very tasty. You can customize your citizens to your needs (allergies/vegetarian/co) What could really go a little better are the employees. We had to wait 15 minutes to pay (almost missed our train)... and when ordering, we were only taken when we drew attention to ourselves again. So, is a hip and delicious place. You just have to draw attention to yourself as a customer, then maybe they don't chat as much among themselves, but otherwise great. Oh yeah... many stores have mayo and ketchup ""usually free"", here you pay for every sauce ... too bad ... I actually know it differently, but everyone's own concept... therefore 🤷🏻 ♀️",4.0,2023-02-26 00:49:58.817099
5,"Super sympathische Mitarbeiter, eine wirklich freundliche und gemütliche Atmosphäre und fantastisches Essen! Wir kommen ganz sicher öfter. Vielen Dank für den tollen Nachmittag! (Translated by Google) Super personable staff, a really friendly and cozy atmosphere and fantastic food! We will definitely come more often. Thank you for the great afternoon!",5.0,2023-02-26 00:49:58.817102
