In [22]:
import re
import json
import time
import dataclasses
from typing import Optional

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import *
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC

In [23]:
# User-agent string handed to Chrome through the "user-agent=" launch argument.
USER = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"

# Google Maps search-results URL (query "spain+hotels", hl=en) that the browser opens first.
REVIEW_LINK = "https://www.google.com/maps/search/spain+hotels/@31.774513,-15.9297508,5z/data=!3m1!4b1?hl=en&entry=ttu"

In [4]:
@dataclasses.dataclass
class Review:
    """A single scraped review.

    All values are stored as the raw strings read from the page. The two
    ``*_datetime_utc`` fields are not derived by the scraper and are filled
    with ``None``, hence their ``Optional`` annotation.
    """
    review_id: str
    author_link: str
    author_title: str
    author_id: str
    author_image: str
    review_text: str
    owner_answer: str
    # Relative timestamp text of the owner's reply (e.g. "... ago"); "" when there is no reply.
    owner_answer_timestamp: str
    owner_answer_timestamp_datetime_utc: Optional[str]
    review_link: str
    # Rating text in "<n>/<m>" form, as captured from the review header.
    review_rating: str
    # Relative timestamp text of the review (e.g. "... ago").
    review_timestamp: str
    review_datetime_utc: Optional[str]
    review_likes: str
    # URLs of the photos attached to the review; a fresh list per instance.
    review_img_url: list[str] = dataclasses.field(default_factory=list)

@dataclasses.dataclass
class Scores:
    """Review counts of a place, one field per star level.

    Each value is kept as a string, as captured from the page.
    """
    one_star: str
    two_stars: str
    three_stars: str
    four_stars: str
    five_stars: str
    
@dataclasses.dataclass
class Place:
    """A place taken from the search results, plus the review data scraped for it.

    The first five fields come from the result card. The remaining fields
    start empty and are filled in when the place's own page is visited.
    """
    name: str
    google_id: str
    rating: str
    reviews_total: str
    location_link: str
    # URL of the place page after its reviews tab has been opened.
    reviews_link: Optional[str] = None
    # The "data=" portion extracted from reviews_link.
    reviews_id: Optional[str] = None
    # Per-star review counts; None until they have been scraped.
    reviews_per_score: Optional[Scores] = None
    # Scraped reviews; a fresh list per instance.
    reviews: list[Review] = dataclasses.field(default_factory=list)

In [24]:
# Build the Chrome options used for the scraping session.
options = webdriver.ChromeOptions()

for chrome_argument in (
    "--disable-infobars",
    "--no-sandbox",
    "--start-maximized",
    "--ignore-gpu-blocklist",
    "--single-process",
    "--disable-dev-shm-usage",
    # "--headless=new",
    f"user-agent={USER}",
    "--incognito",
    "--disable-blink-features=AutomationControlled",
    "--log-level=3",
    "--disable-extensions",
    "--lang=en-GB",
):
    options.add_argument(chrome_argument)

options.add_experimental_option('useAutomationExtension', False)
options.add_experimental_option("excludeSwitches", ["enable-automation"])

# "none": navigation calls return without waiting for the page to finish
# loading, so later cells have to wait for the elements they need.
options.set_capability('pageLoadStrategy', 'none')

chrome_service = Service()
# chrome_service.creation_flags = CREATE_NO_WINDOW

# Start Chrome with a bounded number of attempts. The previous
# `while True: ... except: pass` loop would spin forever (and hide the reason)
# when the driver could not be started at all.
MAX_LAUNCH_ATTEMPTS = 3
LAUNCH_RETRY_DELAY = 2  # seconds between attempts

for launch_attempt in range(1, MAX_LAUNCH_ATTEMPTS + 1):
    try:
        browser = webdriver.Chrome(service=chrome_service,
                                   options=options)
        break

    except (WebDriverException, OSError) as launch_error:
        print(f"Chrome launch attempt {launch_attempt}/{MAX_LAUNCH_ATTEMPTS} "
              f"failed: {launch_error}")

        time.sleep(LAUNCH_RETRY_DELAY)
else:
    raise RuntimeError(
        f"Could not start Chrome after {MAX_LAUNCH_ATTEMPTS} attempts"
    )

# Register the extra chromium "send_command" endpoint on the command executor.
browser.command_executor._commands["SEND_COMMAND"] = (
    "POST", "/session/$sessionId/chromium/send_command"
)

In [25]:
# Open the search-results page.
# NOTE(review): the session is created with pageLoadStrategy "none", so this call
# presumably returns before the page has finished loading — confirm before relying
# on elements being present immediately afterwards.
browser.get(REVIEW_LINK)

In [26]:
# Wait (up to 30 s) for the results feed to appear instead of looking it up once:
# with pageLoadStrategy "none" the page may still be loading when this cell runs,
# and an immediate find_element would raise NoSuchElementException.
feed = WebDriverWait(browser, 30).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "div[role='feed']"))
)

In [8]:
# Scroll the results feed to the bottom until no more results are loaded.
# The number of elements matching the selector below is used as the progress
# indicator; scrolling stops once that count stayed unchanged on 5 checks in a row.
previous_len, trials = 0, 0

while True:
    businesses = feed.find_elements(By.CSS_SELECTOR,
                                    'div[class="TFQHme "]')

    if len(businesses) == previous_len:
        if trials == 4:
            break

        trials += 1

        # Give a slow network extra time before the next check.
        time.sleep(10)
    else:
        # Progress was made: start counting stalls from zero again, so that
        # occasional slow loads spread over a long list do not add up and end
        # the scroll early (same policy as the review-loading loop).
        trials = 0

    previous_len = len(businesses)

    feed.send_keys(Keys.END)

    time.sleep(5)

In [32]:
# Look the results feed up again and take its direct children.
businesses_cont = browser.find_element(By.CSS_SELECTOR, "div[role='feed']")
businesses = businesses_cont.find_elements(By.XPATH, "./*")

# Consume children up to and including the first one whose role is
# "presentation"; everything after that marker is a candidate result card.
remaining_children = iter(businesses)

start_found = any(
    child.get_attribute("role") == "presentation"
    for child in remaining_children
)

# After the marker, keep only the children that carry no class attribute.
companies: list[WebElement] = [
    child
    for child in remaining_children
    if not child.get_attribute("class")
]

print(len(companies))

6


In [25]:
# Sanity check of the card-parsing regex on the first result card:
# group 1 = text before the rating line, group 2 = rating,
# group 3 = the count inside the parentheses (displayed below).
re.search(r"(.+?)\n(\d+\.?\d?)\s*\(([\d,]+)", 
                companies[0].text,
                flags=re.I|re.DOTALL).group(3)

'2,956'

In [30]:
# Scratch cell: bringing the third result card into view. The commented-out
# lines are alternative approaches that are currently disabled.

# ActionChains(browser).move_to_element_with_offset(companies[2], 0, 50).perform()

# desired_pos = 100

# browser.execute_script("window.scrollBy(0, arguments[0]);", desired_pos)

element = companies[2]

# browser.execute_script("return arguments[0].scrollIntoView(0, document.documentElement.scrollHeight);", element

# Evaluates to the element's x/y position (shown below).
# NOTE(review): per its name the property scrolls the element into view first — confirm.
element.location_once_scrolled_into_view

{'x': 72, 'y': 211}

In [21]:
# Send a PAGE_DOWN key press to the results feed element.
feed.send_keys(Keys.PAGE_DOWN)

In [16]:
# Turn every result card into a Place. Cards whose text does not contain a
# "<name>\n<rating> (<count>" header, or that have no link mentioning the
# name, are skipped.
places: list[Place] = []

for company in companies:
    details = re.search(r"(.+?)\n(\d+\.?\d?)\s*\(([\d,]+)",
                        company.text,
                        flags=re.I|re.DOTALL)

    if not details:
        continue

    name = details.group(1)

    rating = details.group(2)

    reviews = details.group(3)

    # Reset per card: otherwise a card without a matching link would silently
    # reuse the link of the previous card (or raise NameError on the first one).
    place_link_tag = None

    for link in company.find_elements(By.TAG_NAME, "a"):
        # Anchors without an aria-label yield None; treat that as "no match".
        link_label = link.get_attribute("aria-label") or ""

        # The name is literal text, not a pattern: escape it so characters
        # such as "(", "+" or "&" in a place name cannot break the search.
        if re.search(re.escape(name), link_label, flags=re.I):
            place_link_tag = link

            break

    if place_link_tag is None:
        continue

    location_link = place_link_tag.get_attribute("href")

    jslog: str = place_link_tag.get_attribute("jslog") or ""

    google_id_re = re.search(r"metadata:(.+)", jslog, re.I)

    # Keep the place even when no metadata id is exposed.
    google_id = google_id_re.group(1) if google_id_re else ""

    place = Place(name=name,
                  google_id=google_id,
                  rating=rating,
                  reviews_total=reviews,
                  location_link=location_link)

    places.append(place)


In [17]:
# Open the first place in a new tab. The URL is passed as a script argument
# instead of being formatted into the JavaScript source, so quotes or
# backslashes in the link cannot break (or alter) the script.
browser.execute_script('window.open(arguments[0], "_blank");',
                       places[0].location_link)

In [20]:
# Switch the driver's focus to the first entry of window_handles.
browser.switch_to.window(browser.window_handles[0])

In [21]:
browser.title

'spain hotels - Google Maps'

In [17]:
places[-1]

Place(name='Palacio de los Duques Gran Meliá - The Leading Hotels of the World', google_id='WyIwYWhVS0V3aXYtc24xbV91QkF4WDlTUEVESGVTS0M4OFE4QmNJdFFJb0FBIixudWxsLDNd', rating='4.7', reviews_total='1,934', location_link='https://www.google.com/maps/place/Palacio+de+los+Duques+Gran+Meli%C3%A1+-+The+Leading+Hotels+of+the+World/data=!4m10!3m9!1s0xd42287bbb349661:0x97dbe6a71867fcc6!5m2!4m1!1i2!8m2!3d40.4197035!4d-3.7096772!16s%2Fg%2F1tr9pqw6!19sChIJYZY0u3soQg0RxvxnGKfm25c?authuser=0&hl=en&rclk=1', reviews_link=None, reviews_id=None, reviews_per_score=None, reviews=[])

In [33]:
def locate(selector: str,
           method: By,
           driver: Optional[WebElement]=None,
           multiple: Optional[bool]=False,
           breakout: Optional[bool]=True,
           retries: int=3,
           delay: float=10) -> WebElement|list[WebElement]|None:
    """Find one element (or all matching elements), retrying on WebDriver errors.

    Args:
        selector: Locator value, interpreted according to ``method``.
        method: Locator strategy (one of the ``By`` constants).
        driver: Search root. Defaults to the global ``browser``.
        multiple: When true, use ``find_elements`` and return a list.
        breakout: When true, give up after ``retries`` retries and return
            ``None``; when false, keep retrying indefinitely.
        retries: Number of retries after the first failed attempt.
        delay: Seconds to sleep between attempts.

    Returns:
        The element, or the list of elements when ``multiple`` is true, or
        ``None`` when every attempt failed and ``breakout`` is true.
    """
    if driver is None:
        driver = browser

    trials = 0

    while True:
        try:
            if multiple:
                return driver.find_elements(method, selector)

            return driver.find_element(method, selector)

        # Only Selenium errors (missing/stale element, ...) are worth retrying;
        # programming errors should surface immediately instead of being
        # retried and turned into a silent None.
        except WebDriverException:
            if breakout and trials >= retries:
                return None

            trials += 1

            time.sleep(delay)
            print(selector)

# Star digit (as text) -> name of the matching Scores field.
mappings = {
    "1": "one_star",
    "2": "two_stars",
    "3": "three_stars",
    "4": "four_stars",
    "5": "five_stars",
}

In [34]:
# Matches the image URL inside a CSS url(...) value, whether the quotes around
# it appear as real quote characters or as the HTML entity "&quot;".
_STYLE_URL_RE = re.compile(r"""(http.+?)(?:&quot;|["')])""")


def _fetch_share_link(button: WebElement, attempts: int = 3) -> str:
    """Open a review's "Share" dialog, read the link it shows and close it again.

    Makes at most ``attempts`` tries and returns "" (or the last link that
    was read) when the dialog could not be handled, instead of looping forever.
    """
    review_link = ""

    for _ in range(attempts):
        try:
            # Accessed for its scroll-into-view side effect before clicking.
            button.location_once_scrolled_into_view

            button.click()

            time.sleep(2)

            input_tag = locate(
                method=By.CSS_SELECTOR,
                selector='input[jsaction="pane.copyLink.clickInput"]')

            if input_tag is None:
                continue

            review_link = input_tag.get_attribute("value") or ""

            close_button = locate(selector='button[aria-label="Close"]',
                                  method=By.CSS_SELECTOR)

            if close_button is None:
                continue

            close_button.click()

            time.sleep(2)

            return review_link

        except WebDriverException:
            continue

    return review_link


def get_review(review_element: WebElement,
               container: Optional[WebElement] = None) -> Optional[Review]:
    """Extract one review from its element on the reviews tab.

    Args:
        review_element: Element of a single review (carries ``data-review-id``).
        container: Unused; kept (now optional) so existing calls keep working.

    Returns:
        The populated ``Review``, or ``None`` when the author/rating header of
        the review could not be parsed (the element text is printed in that
        case). Parts that are missing on the page are returned as "" (likes
        as "0") rather than raising.
    """
    # Accessed for its scroll-into-view side effect.
    review_element.location_once_scrolled_into_view

    review_id = review_element.get_attribute("data-review-id")

    review_text_tag = locate(driver=review_element,
                             selector=f"div[id='{review_id}']",
                             method=By.CSS_SELECTOR)

    # No text element was found for this review: keep an empty text.
    review_text = ""

    if review_text_tag is not None:
        try:
            review_text_tag.find_element(
                By.CSS_SELECTOR,
                'button[aria-label="See more"]').click()

            time.sleep(1)

        except WebDriverException:
            # No "See more" button: the text is already fully shown.
            pass

        review_text = review_text_tag.text

    element_text = review_element.text

    author_re = re.search(r"([\w\s'\-]+)\n.*?(\d+)\s*reviews?.*?\n(\d{1,2}/\d{1,2})\n([\w\s]+?ago)",
                          element_text,
                          flags=re.DOTALL|re.I)

    if not author_re:
        print(element_text)
        return None

    name = author_re.group(1)
    review_rating = author_re.group(3)
    review_posted = author_re.group(4)

    buttons = locate(
        method=By.CSS_SELECTOR,
        driver=review_element,
        multiple=True,
        selector=f'button[data-review-id="{review_id}"]'
    ) or []

    # Defaults for everything that is only filled in when the matching
    # button exists; previously a missing button meant an unbound local.
    author_link = ""
    author_id = ""
    author_image_link = ""
    review_link = ""
    likes = "0"
    review_images = []

    for button in buttons:
        button_text = button.get_attribute("aria-label") or ""

        # Author avatar button: profile link, avatar image and contributor id.
        if re.search(r"photo\s+of", button_text, re.I):
            author_link = button.get_attribute("data-href") or ""

            image = locate(method=By.TAG_NAME,
                           selector="img",
                           driver=button)

            if image is not None:
                author_image_link = image.get_attribute("src") or ""

            author_id_re = re.search(r"contrib/(.+)/review", author_link)

            if author_id_re:
                author_id = author_id_re.group(1)

        # Share button: the review's link is only shown inside the dialog.
        if re.search(r"Share", button_text, re.I):
            print(button_text)

            review_link = _fetch_share_link(button)

        # Like button: the count is in the title; an empty count means 0.
        title = button.get_attribute("title") or ""

        likes_re = re.search(r"(\d*)\s*like", title, re.I)

        if likes_re:
            likes = likes_re.group(1).strip() or "0"

        # Photo buttons carry the image URL in their inline style.
        if button.get_attribute("data-photo-index"):
            style = button.get_attribute("style") or ""

            image_re = _STYLE_URL_RE.search(style)

            if image_re:
                review_images.append(image_re.group(1))

    response_re = re.search(r"Response\s+from\s+the\s+owner\s*([\w\s]+ago)\s(.+)",
                            element_text,
                            flags=re.I|re.DOTALL)

    if response_re:
        owner_answer_timestamp = response_re.group(1)
        owner_answer = response_re.group(2)
    else:
        owner_answer_timestamp = ""
        owner_answer = ""

    return Review(review_id=review_id,
                  author_link=author_link,
                  author_title=name,
                  author_id=author_id,
                  author_image=author_image_link,
                  review_text=review_text,
                  owner_answer=owner_answer,
                  owner_answer_timestamp=owner_answer_timestamp,
                  owner_answer_timestamp_datetime_utc=None,
                  review_link=review_link,
                  review_rating=review_rating,
                  review_timestamp=review_posted,
                  review_datetime_utc=None,
                  review_likes=likes,
                  review_img_url=review_images)



In [35]:
def load_more_reviews(reviews_container: WebElement, delay: float = 5) -> None:
    """Send END to the reviews container, then pause so more reviews can load.

    Args:
        reviews_container: Element that receives the END key press.
        delay: Seconds to wait after the key press (default 5).
    """
    reviews_container.send_keys(Keys.END)
    time.sleep(delay)

In [36]:
# Upper bound on the number of reviews collected per place.
MAX_REVIEWS = 200

# For the first three places: read the per-star review counts, open the
# reviews tab, scroll until enough reviews are loaded, then extract them.
for place in places[:3]:
    browser.get(place.location_link)

    time.sleep(3)

    # Per-star histogram. Retried because the rows may not be rendered yet;
    # the loop stops at the first success instead of scraping three times.
    for _ in range(3):
        try:
            score_tags = locate(
                selector="tr[role='img']",
                method=By.CSS_SELECTOR,
                multiple=True
            )

            scores = {}

            for tag in score_tags:
                score_text: str = tag.get_attribute("aria-label")

                scores_re = re.search(r"(\d)\s*\w+,\s*([\d,]+)", score_text)

                scores[mappings[scores_re.group(1)]] = scores_re.group(2)

            place.reviews_per_score = Scores(**scores)

            break

        # Missing/unparsable rows show up as one of these; wait and retry.
        except (WebDriverException, AttributeError, KeyError, TypeError):
            time.sleep(5)

    buttons = locate(
        selector='button[role="tab"]',
        method=By.CSS_SELECTOR,
        multiple=True
    ) or []

    for button in buttons:
        aria_label = button.get_attribute("aria-label")

        if not aria_label or not re.search(r"reviews", aria_label, re.I):
            continue

        button.click()

        time.sleep(2)

        place.reviews_link = browser.current_url

        reviews_id_re = re.search(r"data=(.+)\?", place.reviews_link)

        if reviews_id_re:
            place.reviews_id = reviews_id_re.group(1)

        refine_reviews = locate(
            selector='div[aria-label="Refine reviews"]',
            method=By.CSS_SELECTOR,
        )

        if refine_reviews is None:
            print(f"Reviews panel not found for {place.name}; skipping its reviews")
            break

        # The scrollable container is the parent of the "Refine reviews" element.
        reviews_container = locate(selector="..",
                                   method=By.XPATH,
                                   driver=refine_reviews)

        if reviews_container is None:
            print(f"Reviews container not found for {place.name}; skipping its reviews")
            break

        len_reviews, trials = 0, 0

        while True:
            load_more_reviews(reviews_container)

            review_tags = locate(
                method=By.CSS_SELECTOR,
                selector='div[class="jftiEf fontBodyMedium "]',
                multiple=True) or []

            # Enough reviews are loaded.
            if len(review_tags) >= MAX_REVIEWS:
                break

            # Nothing new was loaded: allow a few stalled rounds, then stop.
            if len(review_tags) == len_reviews:
                if trials == 4:
                    break

                trials += 1

                time.sleep(10)

                continue

            len_reviews = len(review_tags)

            trials = 0

        for review_tag in review_tags[:MAX_REVIEWS]:
            review = get_review(review_tag, reviews_container)

            if review is not None:
                place.reviews.append(review)

        # Only the first tab mentioning "reviews" is processed.
        break
        
    

input[jsaction="pane.copyLink.clickInput"]
input[jsaction="pane.copyLink.clickInput"]
Share Shared Account's review.
Share Ruth Coulson's review.
Share Dan Kit's review.
Share JHon Cas's review.
Share Andre's review.
Share Sarah Harding's review.
Share Neal Smith's review.
Share FWM's review.
Share Matthew Hall's review.
Share Patrick Farrell's review.
Share Rory Gleeson's review.
Share Christof Byers's review.
Share Christian Andersson's review.
Share John James's review.
Share Champagne's review.
Share Natalie 44118's review.
Share Vernon Beacham's review.
Share Sue Bonsor's review.
Share Insert Text Here with Daro's review.
Share Stefan Beutler's review.
Share Chantal van der Giessen's review.
Share Bernadette Keegan's review.
Share Ryan Leishman's review.
Share Claire Davies's review.
Share Svein-Magne Tunli's review.
Share rachel kirk's review.
Share Ian Johnson's review.
Share Steve Crawford's review.
Share Roy Grew's review.
Share Ken H's review.
Share Matti Räty's review.
Share

In [2]:
# Re-run review extraction for the first place on the elements currently held
# in `review_tags`. get_review takes the reviews container as its second
# argument; it was missing here, which raises TypeError.
# NOTE(review): if these elements were already processed for this place, running
# the cell appends the same reviews a second time.
place = places[0]

for review_tag in review_tags[:200]:
    review = get_review(review_tag, reviews_container)

    if review is not None:
        place.reviews.append(review)

NameError: name 'places' is not defined

In [104]:
dataclasses.asdict(place)

{'name': 'Hotel Riu Papayas',
 'google_id': 'WyIwYWhVS0V3aWdfdGlVanZtQkF4VkRUNlFFSFZWY0MyQVE4QmNJOFJRb0FBIl0=',
 'rating': '4.5',
 'reviews_total': '3,383',
 'location_link': 'https://www.google.com/maps/place/Hotel+Riu+Papayas/data=!4m10!3m9!1s0xc3f62bde181f7a9:0xdadf70ce344183aa!5m2!4m1!1i2!8m2!3d27.763355!4d-15.572008!16s%2Fg%2F1tfsd2qt!19sChIJqfeB4b1iPwwRqoNBNM5w39o?authuser=0&hl=en&rclk=1',
 'reviews_link': 'https://www.google.com/maps/place/Hotel+Riu+Papayas/@27.7633552,-15.5720078,17z/data=!3m1!5s0xc3f62bde871caeb:0x84dd1809654d84d!4m11!3m10!1s0xc3f629454909933:0xdadf70ce344183aa!5m2!4m1!1i2!8m2!3d27.7633552!4d-15.5720078!9m1!1b1!16s%2Fg%2F1tfsd2qt?authuser=0&hl=en&entry=ttu',
 'reviews_id': '!3m1!5s0xc3f62bde871caeb:0x84dd1809654d84d!4m11!3m10!1s0xc3f629454909933:0xdadf70ce344183aa!5m2!4m1!1i2!8m2!3d27.7633552!4d-15.5720078!9m1!1b1!16s%2Fg%2F1tfsd2qt',
 'reviews_per_score': {'one_star': '66',
  'two_stars': '48',
  'three_stars': '195',
  'four_stars': '871',
  'five_stars': '2

In [37]:
# Plain-dict version of every place (nested dataclasses are converted too).
results = [dataclasses.asdict(place) for place in places]

In [38]:
results[0]

{'name': 'Radisson Blu Resort & Spa, Gran Canaria Mogan',
 'google_id': 'WyIwYWhVS0V3alMyLXFReGZtQkF4WERWcVFFSFlGNEFyOFE4QmNJeEJVb0FBIl0=',
 'rating': '4.5',
 'reviews_total': '2,956',
 'location_link': 'https://www.google.com/maps/place/Radisson+Blu+Resort+%26+Spa,+Gran+Canaria+Mogan/data=!4m10!3m9!1s0xc4080eac79be159:0xa558d1157a97ada2!5m2!4m1!1i2!8m2!3d27.825872!4d-15.757701!16s%2Fg%2F11by_fbkzz!19sChIJWeGbx-qAQAwRoq2XehXRWKU?authuser=0&hl=en&rclk=1',
 'reviews_link': 'https://www.google.com/maps/place/Radisson+Blu+Resort+%26+Spa,+Gran+Canaria+Mogan/@27.8258724,-15.7577009,17z/data=!4m11!3m10!1s0xc408093da9b4087:0xa558d1157a97ada2!5m2!4m1!1i2!8m2!3d27.8258724!4d-15.7577009!9m1!1b1!16s%2Fg%2F11by_fbkzz?authuser=0&hl=en&entry=ttu',
 'reviews_id': '!4m11!3m10!1s0xc408093da9b4087:0xa558d1157a97ada2!5m2!4m1!1i2!8m2!3d27.8258724!4d-15.7577009!9m1!1b1!16s%2Fg%2F11by_fbkzz',
 'reviews_per_score': {'one_star': '79',
  'two_stars': '55',
  'three_stars': '148',
  'four_stars': '585',
  'five_

In [39]:
# Write the results as UTF-8 JSON. The encoding is set explicitly so the file
# does not depend on the platform default, and ensure_ascii=False keeps
# accented names readable instead of \uXXXX escapes.
with open("sample.json", "w", encoding="utf-8") as file:
    json.dump(results, file, indent=4, ensure_ascii=False)