In [70]:
# -*- coding: utf-8 -*-
import pandas as pd
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from datetime import datetime
import time
import re
import logging
import traceback
import numpy as np
import itertools


GM_WEBPAGE = 'https://www.google.com/maps/'
MAX_WAIT = 20
MAX_RETRY = 5
MAX_SCROLLS = 40

class GoogleMapsScraper:

    def __init__(self, debug=False):
        self.debug = debug
        self.driver = self.__get_driver()
        self.logger = self.__get_logger()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        if exc_type is not None:
            traceback.print_exception(exc_type, exc_value, tb)

        self.driver.close()
        self.driver.quit()

        return True

    def sort_by(self, url, ind):

        self.driver.get(url)
        self.__click_on_cookie_agreement()

        wait = WebDriverWait(self.driver, MAX_WAIT)

        # open dropdown menu
        clicked = False
        tries = 0
        while not clicked and tries < MAX_RETRY:
            try:
                menu_bt = wait.until(EC.element_to_be_clickable((By.XPATH, '//button[@data-value=\'Sort\']')))
                menu_bt.click()

                clicked = True
                time.sleep(3)
            except Exception as e:
                tries += 1
                self.logger.warn('Failed to click sorting button')

            # failed to open the dropdown
            if tries == MAX_RETRY:
                return -1

        #  element of the list specified according to ind
        recent_rating_bt = self.driver.find_elements_by_xpath('//div[@role=\'menuitemradio\']')[ind]
        recent_rating_bt.click()

        # wait to load review (ajax call)
        time.sleep(5)

        return 0

    def get_places(self, method='urls', keyword_list=None):

        df_places = pd.DataFrame()

        if method == 'urls':
            # search_point_url = row['url']  # TODO:
            pass
        if method == 'squares':
            search_point_url_list = self._gen_search_points_from_square(keyword_list=keyword_list)
        else:
            # search_point_url = f"https://www.google.com/maps/search/{row['keyword']}/@{str(row['longitude'])},{str(row['latitude'])},{str(row['zoom'])}z"
            # TODO:
            pass

        for i, search_point_url in enumerate(search_point_url_list):

            if (i+1) % 10 == 0:
                print(f"{i}/{len(search_point_url_list)}")
                df_places = df_places[['search_point_url', 'href', 'name', 'rating', 'num_reviews', 'close_time', 'other']]
                df_places.to_csv('output/places_wax.csv', index=False)


            try:
                self.driver.get(search_point_url)
            except NoSuchElementException:
                self.driver.quit()
                self.driver = self.__get_driver()
                self.driver.get(search_point_url)

            # Gambiarra to load all places into the page
            scrollable_div = self.driver.find_element_by_css_selector(
                "div.siAUzd-neVct.section-scrollbox.cYB2Ge-oHo7ed.cYB2Ge-ti6hGc > div[aria-label*='Results for']")
            for i in range(10):
                self.driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', scrollable_div)

            # Get places names and href
            # time.sleep(2)
            response = BeautifulSoup(self.driver.page_source, 'html.parser')
            div_places = response.select('div[jsaction] > a[href]')
            # print(len(div_places))
            for div_place in div_places:
                place_info = {
                    'search_point_url': search_point_url.replace('https://www.google.com/maps/search/', ''),
                    'href': div_place['href'],
                    'name': div_place['aria-label'],
                    'rating': None,
                    'num_reviews': None,
                    'close_time': None,
                    'other': None
                }

                df_places = df_places.append(place_info, ignore_index=True)
        df_places = df_places[['search_point_url', 'href', 'name', 'rating', 'num_reviews', 'close_time', 'other']]
        df_places.to_csv('output/places_wax.csv', index=False)
        self.driver.quit()

    def _gen_search_points_from_square(self, keyword_list=None):
        # TODO: Generate search points from corners of square

        keyword_list = [] if keyword_list is None else keyword_list

        square_points = pd.read_csv('input/square_points.csv')

        cities = square_points['city'].unique()

        search_urls = []

        for city in cities:

            df_aux = square_points[square_points['city'] == city]
            latitudes = np.linspace(df_aux['latitude'].min(), df_aux['latitude'].max(), num=20)
            longitudes = np.linspace(df_aux['longitude'].min(), df_aux['longitude'].max(), num=20)
            coordinates_list = list(itertools.product(latitudes, longitudes, keyword_list))

            search_urls += [f"https://www.google.com/maps/search/{coordinates[2]}/@{str(coordinates[1])},{str(coordinates[0])},{str(15)}z"
             for coordinates in coordinates_list]

        return search_urls



    def get_reviews(self, offset):
        # scroll to load reviews

        # wait for other reviews to load (ajax)
        time.sleep(4)

        self.__scroll()


        # expand review text
        self.__expand_reviews()

        # parse reviews
        response = BeautifulSoup(self.driver.page_source, 'html.parser')
        # TODO: Subject to changes
        rblock = response.find_all('div', class_='jftiEf fontBodyMedium')
        parsed_reviews = []
        for index, review in enumerate(rblock):
            if index >= offset:
                parsed_reviews.append(self.__parse(review))

                # logging to std out
                print(self.__parse(review))
        return parsed_reviews


    def get_account(self, url):
        self.driver.get(url)

        # ajax call also for this section
        time.sleep(4)

        resp = BeautifulSoup(self.driver.page_source, 'html.parser')

        place_data = self.__parse_place(resp)
        return place_data


    def __parse(self, review):
        item = {}


        try:
            # TODO: Subject to changes
            id_review = review['data-review-id']
        except Exception as e:
            id_review = None

        try:
            # TODO: Subject to changes
            username = review['aria-label']
        except Exception as e:
            username = None

        try:
            # TODO: Subject to changes
            review_text = self.__filter_string(review.find('span', class_='wiI7pd').text)
        except Exception as e:
            review_text = None

        try:
            # TODO: Subject to changes
            rating = float(review.find('span', class_='kvMYJc')['aria-label'].split(' ')[1])
        except Exception as e:
            rating = None

        try:
            # TODO: Subject to changes
            relative_date = review.find('span', class_='rsqaWe').text
        except Exception as e:
            relative_date = None

        try:
            n_reviews_photos = review.find('div', class_='section-review-subtitle').find_all('span')[1].text
            metadata = n_reviews_photos.split('\xe3\x83\xbb')
            if len(metadata) == 3:
                n_photos = int(metadata[2].split(' ')[0].replace('.', ''))
            else:
                n_photos = 0

            idx = len(metadata)
            n_reviews = int(metadata[idx - 1].split(' ')[0].replace('.', ''))

        except Exception as e:
            n_reviews = 0
            n_photos = 0

        try:
            user_url = review.find('a')['href']
        except Exception as e:
            user_url = None

        item['id_review'] = id_review
        item['caption'] = review_text

        # depends on language, which depends on geolocation defined by Google Maps
        # custom mapping to transform into date should be implemented
        item['relative_date'] = relative_date

        # store datetime of scraping and apply further processing to calculate
        # correct date as retrieval_date - time(relative_date)
        item['retrieval_date'] = datetime.now()
        item['rating'] = rating
        item['username'] = username
        item['n_review_user'] = n_reviews
        item['n_photo_user'] = n_photos
        item['url_user'] = user_url

        return item


    def __parse_place(self, response):
        place = {}
        try:
            place['overall_rating'] = float(response.find('div', class_='gm2-display-2').text.replace(',', '.'))
        except:
            place['overall_rating'] = 'NOT FOUND'

        try:
            place['n_reviews'] = int(response.find('div', class_='gm2-caption').text.replace('.', '').replace(',','').split(' ')[0])
        except:
            place['n_reviews'] = 0
        
        return place

    # expand review description
    def __expand_reviews(self):
        # use XPath to load complete reviews
        # TODO: Subject to changes
        links = self.driver.find_elements_by_xpath('//button[@jsaction="pane.review.expandReview"]')
        for l in links:
            l.click()
        time.sleep(2)


    def __scroll(self):
        # TODO: Subject to changes
        scrollable_div = self.driver.find_element_by_css_selector('div.m6QErb.DxyBCb.kA9KIf.dS8AEf')
        self.driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', scrollable_div)
        #self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")


    def __get_logger(self):
        print('logger')
        # create logger
        logger = logging.getLogger('googlemaps-scraper')
        logger.setLevel(logging.DEBUG)

        # create console handler and set level to debug
        fh = logging.FileHandler('gm-scraper.log')
        fh.setLevel(logging.DEBUG)

        # create formatter
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

        # add formatter to ch
        fh.setFormatter(formatter)

        # add ch to logger
        logger.addHandler(fh)
        print('logger return')
        return logger


    def __get_driver(self, debug=False):
        print('get driver')
        options = Options()

        if not self.debug:
            options.add_argument("--headless")
        else:
            options.add_argument("--window-size=1366,768")

        options.add_argument("--disable-notifications")
        options.add_argument("--lang=en-GB")
        input_driver = webdriver.Chrome(executable_path=ChromeDriverManager().install(), options=options)

         # click on google agree button so we can continue (not needed anymore)
         # EC.element_to_be_clickable((By.XPATH, '//span[contains(text(), "I agree")]')))
        input_driver.get(GM_WEBPAGE)
        print('get driver return')
        return input_driver

    # cookies agreement click
    def __click_on_cookie_agreement(self):
        print('cookie')
        try:
            agree = WebDriverWait(self.driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, '//span[contains(text(), "Reject all")]')))
            agree.click()

            # back to the main page
            # self.driver.switch_to_default_content()
            print('cookie click success')
            return True
        except:
            print('cookie click fail')
            return False

    # util function to clean special characters
    def __filter_string(self, str):
        strOut = str.replace('\r', ' ').replace('\n', ' ').replace('\t', ' ')
        return strOut

In [73]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from datetime import datetime, timedelta
import argparse
import logging
import sys

collenction = [];
class Monitor:
    def __init__(self, url_file, from_date):

        # load urls file
        with open(url_file, 'r') as furl:
            self.urls = [u[:-1] for u in furl]

        # min date review to scrape
        self.min_date_review = datetime.strptime(from_date, '%Y-%m-%d')

        # logging
        self.logger = self.__get_logger()

    def scrape_gm_reviews(self):
        # init scraper and incremental add reviews
        # TO DO: pass logger as parameter to log into one single file?
        with GoogleMapsScraper(debug=True) as scraper:
            for url in self.urls:
                print(url)
                try:
                    #ind = {'most_relevant' : 0 , 'newest' : 1, 'highest_rating' : 2, 'lowest_rating' : 3 }
                    error = scraper.sort_by(url, 1)
                    
                    if error == 0:
                        stop = False
                        offset = 0
                        n_new_reviews = 0
                        while not stop:
                            rlist = scraper.get_reviews(offset)
                            print(rlist)
                            #for r in rlist:
                                # calculate review date and compare to input min_date_review
                            #    r['timestamp'] = self.__parse_relative_date(r['relative_date'])
                            #    stop = self.__stop(r, collection)
                            #    if not stop:
                            #        collection.insert_one(r)
                            #        n_new_reviews += 1
                            #    else:
                            #        break
                            offset += len(rlist)

                        # log total number
                        self.logger.info('{} : {} new reviews'.format(url, n_new_reviews))
                    else:
                        self.logger.warning('Sorting reviews failed for {}'.format(url))

                except Exception as e:
                    print("Exception")
                    exc_type, exc_obj, exc_tb = sys.exc_info()
                    fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]

                    print('{}: {}, {}, {}'.format(url, exc_type, fname, exc_tb.tb_lineno))


    def __parse_relative_date(self, string_date):
        curr_date = datetime.now()
        split_date = string_date.split(' ')

        n = split_date[0]
        delta = split_date[1]

        if delta == 'year':
            return curr_date - timedelta(days=365)
        elif delta == 'years':
            return curr_date - timedelta(days=365 * int(n))
        elif delta == 'month':
            return curr_date - timedelta(days=30)
        elif delta == 'months':
            return curr_date - timedelta(days=30 * int(n))
        elif delta == 'week':
            return curr_date - timedelta(weeks=1)
        elif delta == 'weeks':
            return curr_date - timedelta(weeks=int(n))
        elif delta == 'day':
            return curr_date - timedelta(days=1)
        elif delta == 'days':
            return curr_date - timedelta(days=int(n))
        elif delta == 'hour':
            return curr_date - timedelta(hours=1)
        elif delta == 'hours':
            return curr_date - timedelta(hours=int(n))
        elif delta == 'minute':
            return curr_date - timedelta(minutes=1)
        elif delta == 'minutes':
            return curr_date - timedelta(minutes=int(n))
        elif delta == 'moments':
            return curr_date - timedelta(seconds=1)


    def __stop(self, r, collection):
        is_old_review = collection.find_one({'id_review': r['id_review']})
        if is_old_review is None and r['timestamp'] >= self.min_date_review:
            return False
        else:
            return True

    def __get_logger(self):
        # create logger
        logger = logging.getLogger('monitor')
        logger.setLevel(logging.DEBUG)
        # create console handler and set level to debug
        fh = logging.FileHandler('monitor.log')
        fh.setLevel(logging.DEBUG)
        # create formatter
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        # add formatter to ch
        fh.setFormatter(formatter)
        # add ch to logger
        logger.addHandler(fh)

        return logger

    #parser = argparse.ArgumentParser(description='Monitor Google Maps places')
    #parser.add_argument('--i', type=str, default='urls.txt', help='target URLs file')
    #parser.add_argument('--from-date', type=str) # start date in format: YYYY-MM-DD

    #args = parser.parse_args()

    

In [74]:
monitor = Monitor("urls.text", '2021-01-01')

try:
    monitor.scrape_gm_reviews()
except Exception as e:
    monitor.logger.error('Not handled error: {}'.format(e))

get driver
get driver return
logger
logger return
https://www.google.com/maps/place/Super+Bro's/@50.1217539,8.6802749,15z/data=!4m2!3m1!1s0x0:0xc088ec1873216106?sa=X&ved=2ahUKEwiiyr7V89v8AhU5hv0HHZ17ArQQ_BJ6BQiFARA
cookie
cookie click success


  self.logger.warn('Failed to click sorting button')


get reviews
scroll start
scroll end
expand reviews
expand reviews end
parse
parse return
parse
parse return
{'id_review': 'ChZDSUhNMG9nS0VJQ0FnSURCeWIza1FBEAE', 'caption': '', 'relative_date': 'an hour ago', 'retrieval_date': datetime.datetime(2023, 1, 23, 1, 12, 3, 122426), 'rating': 5.0, 'username': 'H S', 'n_review_user': 0, 'n_photo_user': 0, 'url_user': 'https://www.google.com/maps/contrib/115518838229771297793/reviews?hl=en-US'}
parse
parse return
parse
parse return
{'id_review': 'ChZDSUhNMG9nS0VJQ0FnSURCaWN6emZREAE', 'caption': '', 'relative_date': '8 hours ago', 'retrieval_date': datetime.datetime(2023, 1, 23, 1, 12, 3, 123355), 'rating': 5.0, 'username': 'Felipe Lopez', 'n_review_user': 0, 'n_photo_user': 0, 'url_user': 'https://www.google.com/maps/contrib/118229155488862556280/reviews?hl=en-US'}
parse
parse return
parse
parse return
{'id_review': 'ChZDSUhNMG9nS0VJQ0FnSURCNFlmM1ZBEAE', 'caption': 'Sehr geil  (Translated by Google) very cool', 'relative_date': 'a day ago', 'ret

scroll start
scroll end
expand reviews
expand reviews end
parse
parse return
parse
parse return
{'id_review': 'ChdDSUhNMG9nS0VJQ0FnSURCOUtXNTVnRRAB', 'caption': '', 'relative_date': 'a week ago', 'retrieval_date': datetime.datetime(2023, 1, 23, 1, 12, 9, 714981), 'rating': 5.0, 'username': 'András Sz', 'n_review_user': 0, 'n_photo_user': 0, 'url_user': 'https://www.google.com/maps/contrib/115955041507470202119/reviews?hl=en-US'}
parse
parse return
parse
parse return
{'id_review': 'ChdDSUhNMG9nS0VJQ0FnSURCNU1XdDdBRRAB', 'caption': 'Pizza Mozarella. Frische Zutaten. Der Boden aber zu labbig, dafür am Rand reichlich Teig. Die Wartezeit war kurz. Für den stolzen Preis ein durchwachsenes Produkt. …', 'relative_date': 'a week ago', 'retrieval_date': datetime.datetime(2023, 1, 23, 1, 12, 9, 715857), 'rating': 3.0, 'username': 'MrAbraxason', 'n_review_user': 0, 'n_photo_user': 0, 'url_user': 'https://www.google.com/maps/contrib/112073951757001021508/reviews?hl=en-US'}
parse
parse return
parse


scroll start
scroll end
expand reviews
expand reviews end
return get reviews
[]
get reviews
scroll start
scroll end
expand reviews
expand reviews end
return get reviews
[]
get reviews
scroll start
scroll end
expand reviews
expand reviews end
return get reviews
[]
get reviews
scroll start
scroll end
expand reviews
expand reviews end
return get reviews
[]
get reviews
scroll start
scroll end
expand reviews
expand reviews end
return get reviews
[]
get reviews
scroll start
scroll end
expand reviews
expand reviews end
return get reviews
[]
get reviews
scroll start
scroll end
expand reviews
expand reviews end
return get reviews
[]
get reviews
scroll start
scroll end
expand reviews
expand reviews end
return get reviews
[]
get reviews
scroll start
scroll end
expand reviews
expand reviews end
return get reviews
[]
get reviews
scroll start
scroll end
expand reviews
expand reviews end
return get reviews
[]
get reviews
scroll start
scroll end
expand reviews
expand reviews end
parse
parse return
par

scroll start
scroll end
expand reviews
expand reviews end
parse
parse return
parse
parse return
{'id_review': 'ChdDSUhNMG9nS0VJQ0FnSUQtaDZTNmxRRRAB', 'caption': '', 'relative_date': 'a month ago', 'retrieval_date': datetime.datetime(2023, 1, 23, 1, 13, 27, 24133), 'rating': 4.0, 'username': 'Jia-wei Chiu', 'n_review_user': 0, 'n_photo_user': 0, 'url_user': 'https://www.google.com/maps/contrib/112977440490690424206/reviews?hl=en-US'}
parse
parse return
parse
parse return
{'id_review': 'ChZDSUhNMG9nS0VJQ0FnSUQtMjctV0hREAE', 'caption': '', 'relative_date': 'a month ago', 'retrieval_date': datetime.datetime(2023, 1, 23, 1, 13, 27, 24941), 'rating': 5.0, 'username': 'MP4.Oktober', 'n_review_user': 0, 'n_photo_user': 0, 'url_user': 'https://www.google.com/maps/contrib/103777670302255651469/reviews?hl=en-US'}
parse
parse return
parse
parse return
{'id_review': 'ChZDSUhNMG9nS0VJQ0FnSUQtMjdfVVRREAE', 'caption': '', 'relative_date': 'a month ago', 'retrieval_date': datetime.datetime(2023, 1, 23,

scroll start
scroll end
expand reviews
expand reviews end
return get reviews
[]
get reviews
scroll start
scroll end
expand reviews
expand reviews end
return get reviews
[]
get reviews
scroll start
scroll end
expand reviews
expand reviews end
return get reviews
[]
get reviews


Traceback (most recent call last):
  File "/var/folders/8f/b5rl_hw50bn0_k4xrt7mtm7w0000gn/T/ipykernel_46877/548901151.py", line 37, in scrape_gm_reviews
    rlist = scraper.get_reviews(offset)
  File "/var/folders/8f/b5rl_hw50bn0_k4xrt7mtm7w0000gn/T/ipykernel_46877/3596845422.py", line 166, in get_reviews
    time.sleep(4)
KeyboardInterrupt
