In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
import re
import requests
from bs4 import BeautifulSoup
from selenium.common.exceptions import TimeoutException
import json

In [2]:
def change_language(driver):
    """Switch the Google Maps interface language to English (United States).

    Clicks three elements in order, waiting up to 10 seconds for each one to
    become clickable: the "菜單" (menu) button, the language settings button,
    and the "English (United States)" link. If an element never becomes
    clickable, the WebDriverWait timeout error propagates to the caller.
    """
    # NOTE: the last literal appears to carry invisible Unicode directional
    # marks around the label text; keep it byte-for-byte or the match breaks.
    click_sequence = (
        "//button[contains(@aria-label, '菜單')]",                 # menu button
        "//button[contains(@jsaction, 'settings.languages')]",    # language entry
        "//a[contains(text(), '‪English (United States)‬')]",       # English option
    )

    for xpath in click_sequence:
        target = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, xpath))
        )
        target.click()

In [3]:
def prepare_webpage(driver, scroll_times=2, scroll_pause=1, timeout=10):
    """Open the Reviews tab, scroll the review pane and expand truncated reviews.

    The XPaths match on the English labels 'Reviews' and 'More', so the page
    must already be displayed in English.

    Args:
        driver: Selenium WebDriver currently showing a place page.
        scroll_times: Number of END key presses sent to the scrollable pane.
            Each press scrolls to the bottom; defaults to 2 (the previous
            hard-coded value).
        scroll_pause: Seconds to sleep after each END key press (default 1).
        timeout: Seconds each explicit wait may take (default 10).

    Raises:
        TimeoutException: if the Reviews button or the scrollable pane does
            not become clickable within `timeout`. The absence of any "More"
            button is tolerated and does not raise.
    """
    wait = WebDriverWait(driver, timeout)

    # Find the "Reviews" button and click on it
    reviews_button = wait.until(
        EC.element_to_be_clickable((By.XPATH, "//button[contains(@aria-label, 'Reviews')]"))
    )
    reviews_button.click()

    # Scroll down the webpage. The absolute XPath is tied to the page's current
    # DOM layout and will need updating if that layout changes.
    scroll = wait.until(
        EC.element_to_be_clickable(
            (By.XPATH, "/html/body/div[3]/div[9]/div[9]/div/div/div[1]/div[2]/div/div[1]/div/div/div[3]")
        )
    )
    for _ in range(scroll_times):
        scroll.send_keys(Keys.END)
        time.sleep(scroll_pause)

    try:
        # Wait for the "More" buttons to be present, then expand every review
        more_buttons = wait.until(
            EC.presence_of_all_elements_located((By.XPATH, "//button[contains(text(), 'More')]"))
        )
        for button in more_buttons:
            button.click()
    except TimeoutException:
        # Deliberate best-effort: no "More" button appeared within the
        # timeout, so there is nothing to expand.
        pass

### Sample the relevant data

In [4]:
# Load the place table; the relative path is resolved against the current
# working directory. Later cells read its "coordinates", "url", "location"
# and "location_id" columns.
google_info = pd.read_csv("googlemaps_info.csv")

### Sample based on general location

In [5]:
# Bounding box for the Western District sample. The filter cell compares
# latitude against North/South and longitude against East/West, inclusively.
boundaries = {
    "North": 22.3,    # upper latitude bound
    "South": 22.27,   # lower latitude bound
    "East": 114.14,   # upper longitude bound
    "West": 114.11,   # lower longitude bound
}

In [6]:
# Keep only the places whose coordinates fall inside `boundaries`
# (inclusive on all four sides).
#
# The "coordinates" column holds dict-like text written with single quotes,
# e.g. "{'lat': 22.28, 'lng': 114.13}", so the quotes are swapped to make it
# valid JSON before parsing.
parsed_coords = google_info["coordinates"].map(
    lambda text: json.loads(text.replace("'", "\""))
)
latitudes = parsed_coords.map(lambda coord: coord["lat"])
longitudes = parsed_coords.map(lambda coord: coord["lng"])

# A boolean mask is aligned on the frame's own index, so this works whatever
# index `google_info` carries (the previous row counter assumed 0..n-1 labels).
# Series.between is inclusive on both ends by default.
in_bounds = (
    latitudes.between(boundaries["South"], boundaries["North"])
    & longitudes.between(boundaries["West"], boundaries["East"])
)
western_places = google_info[in_bounds].reset_index(drop=True)

In [7]:
# Display the filtered table to sanity-check the bounding-box sample.
western_places

Unnamed: 0,location_id,location,rating,num_rating,price_level,url,types,coordinates,top5reviews
0,id_33,starbucks 香港大學綜合大樓地下2號舖,3.8,93,2,https://maps.google.com/?cid=17336784663903124052,"['cafe', 'food', 'store', 'point_of_interest',...","{'lat': 22.2831435, 'lng': 114.1358022}","[{'author_name': 'Coey', 'author_url': 'https:..."
1,id_48,starbucks The Henry*,3.9,145,2,https://maps.google.com/?cid=7986681234539092847,"['cafe', 'store', 'food', 'point_of_interest',...","{'lat': 22.2873098, 'lng': 114.1385283}","[{'author_name': 'Jack Poon', 'author_url': 'h..."
2,id_93,starbucks Imperial Kennedy,3.7,220,2,https://maps.google.com/?cid=7983266056879862977,"['cafe', 'food', 'point_of_interest', 'store',...","{'lat': 22.2825974, 'lng': 114.1287074}","[{'author_name': 'Mugi Yamamoto', 'author_url'..."


### Extract the reviews from the Google Maps web pages

In [8]:
# Scrape the review pane of every sampled place.
# Results are collected in two parallel lists: `allreview_list[k]` is one review
# serialised as a JSON string and `location_id_list[k]` is the place it belongs to.
url_link = western_places["url"].tolist()

allreview_list = []
location_id_list = []

# Resolve the chromedriver binary once instead of once per URL.
chromedriver_path = ChromeDriverManager().install()

# Element that contains the review data (tied to the page's current DOM layout).
REVIEW_PANE_XPATH = '//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[3]/div[9]'

for url, location, location_id in zip(
    url_link, western_places["location"], western_places["location_id"]
):
    print(f'Extract for - {location}')

    # A fresh browser is started for every place, as before.
    driver = webdriver.Chrome(service=ChromeService(chromedriver_path))
    try:
        driver.get(url)

        change_language(driver)
        prepare_webpage(driver)

        # Start from empty lists for every place: if the pane is not found,
        # nothing is recorded instead of reusing the previous place's elements.
        user_list, rating_list, date_list, review_list = [], [], [], []
        for pane in driver.find_elements(By.XPATH, REVIEW_PANE_XPATH):
            user_list.extend(pane.find_elements(By.CLASS_NAME, "WEBjve"))
            rating_list.extend(pane.find_elements(By.CLASS_NAME, "kvMYJc"))
            date_list.extend(pane.find_elements(By.CLASS_NAME, "rsqaWe"))
            review_list.extend(pane.find_elements(By.CLASS_NAME, "wiI7pd"))

        # The four lists are paired by position. Flag a length mismatch, since
        # the pairing may then be misaligned and zip() stops at the shortest list.
        if len({len(user_list), len(rating_list), len(date_list), len(review_list)}) > 1:
            print(
                f'  Warning: unequal element counts for {location} '
                f'(users={len(user_list)}, ratings={len(rating_list)}, '
                f'dates={len(date_list)}, reviews={len(review_list)})'
            )

        # Extract the data while the browser is still open (element text is read live).
        for user, rating, date, review in zip(user_list, rating_list, date_list, review_list):
            allreview_list.append(json.dumps({
                # Drops the first two words of the aria-label
                # (presumably "Photo of <name>" -- TODO confirm).
                "author_name": user.get_attribute("aria-label").split(" ", 2)[2],
                "rating": rating.get_attribute("aria-label"),
                "review_date": date.text,
                "review": review.text,
            }))
            location_id_list.append(location_id)
    finally:
        # Close the browser window even when a step above raises.
        driver.quit()


data_dict = {"location_id": location_id_list, "moreReview": allreview_list}
data_df = pd.DataFrame(data=data_dict)

Extract for - starbucks  香港大學綜合大樓地下2號舖
Extract for - starbucks The Henry*
Extract for - starbucks Imperial Kennedy


### Save the extracted reviews to a new CSV file

In [9]:
# Write the review table (columns "location_id" and "moreReview") to disk;
# index=False leaves the DataFrame's row index out of the file.
data_df.to_csv("googlemaps_review.csv", index=False)