In [1]:
# install selenium
!pip install -U selenium

Collecting selenium
  Downloading selenium-4.29.0-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Using cached trio-0.29.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.1-py3-none-any.whl.metadata (5.1 kB)
Collecting websocket-client~=1.8 (from selenium)
  Using cached websocket_client-1.8.0-py3-none-any.whl.metadata (8.0 kB)
Collecting attrs>=23.2.0 (from trio~=0.17->selenium)
  Downloading attrs-25.1.0-py3-none-any.whl.metadata (10 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Using cached sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Using cached outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting sniffio>=1.3.0 (from trio~=0.17->selenium)
  Downloading sniffio-1.3.1-py3-none-any.whl.metadata (3.9 kB)
Collecting cffi>=1.14 (from trio~=0.17->selenium)
  Using cached cffi-1.17.1-cp312-cp312-wi


[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# imports
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains as AC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select, WebDriverWait  
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException 
from timeit import default_timer as timer

## Data Gathering

In [3]:
# set playstore url of the app and the system path to chromedriver
htmlpro = 'https://play.google.com/store/apps/details?id=ai.replika.app&hl=en'
# set file path
path = r"C:\Users\jiyaa\chromedriver.exe"

In [8]:
# use the Service class
service = Service(executable_path=path)
driver = webdriver.Chrome(service=service)

In [9]:
# wait for the window to open 
time.sleep(15)

In [10]:
# open glovo app site on playstore
driver.get(htmlpro)

In [11]:
# maximize window to full screen
driver.maximize_window()

In [12]:
# get home window
home_window = driver.current_window_handle

In [13]:
# create a function for clickable elements 
def klick(xpath):
    """
    A private function to find and click 
    on any clickable element on a webpage.
    Its argument should be a valid XPath of
    the corresponding element.
    If the XPath is invalid, it returns a 
    TimeoutException after ten seconds.
    
    e.g. klick('//div[@class="dropdown"]')
    """
    wait = WebDriverWait(driver, 10)
    locate = wait.until(EC.element_to_be_clickable((By.XPATH, xpath)))
    locate.click()

In [16]:
# click "ratings and review" button
klick('/html/body/c-wiz[2]/div/div/div[1]/div/div[2]/div/div[1]/div[1]/c-wiz[5]/section/header/div/div[3]/span/div/button/i')

In [17]:
# wait for the new window to open 
time.sleep(15)

In [20]:
def scrape_name(driver_name, container):
    """
    A private function to extract the name of the app reviewer.
    Its first argument should be the assigned variable name to 
    a WebElement object. While the second argument should be a
    list datatype for storing the extracted name.
    """
    reviewer_name = driver_name.find_element(By.CLASS_NAME, "X5PpBb").text
    container.append(reviewer_name)

In [21]:
def scrape_rating(driver_name, container):
    """
    A private function to extract the rating count of the app 
    review.
    Its first argument should be the assigned variable name to 
    a WebElement object. While the second argument should be a
    list datatype for storing the extracted ratings.
    """
    star_rating = driver_name.find_element(By.CLASS_NAME, "iXRFPc").get_attribute("aria-label").split(None, 2)[1]
    container.append(star_rating)

In [22]:
def scrape_date(driver_name, container):
    """
    A private function to extract the date of review.
    Its first argument should be the assigned variable name to 
    a WebElement object. While the second argument should be a
    list datatype for storing the extracted date.
    """
    review_date = driver_name.find_element(By.CLASS_NAME, "bp9Aid").text
    container.append(review_date)

In [23]:
def scrape_review(driver_name, container):
    """
    A private function to extract the body of the review.
    Its first argument should be the assigned variable name to 
    a WebElement object. While the second argument should be a
    list datatype for storing the extracted date.
    """
    review_note = driver_name.find_element(By.CLASS_NAME, "h3YV2d").text
    container.append(review_note)

In [24]:
def scrape_num(driver_name, container):
    """
    A private function to extract the number of people that
    found the review helpful.
    Its first argument should be the assigned variable name to 
    a WebElement object. While the second argument should be a
    list datatype for storing the extracted number.
    """
    try:
        yes_num = driver_name.find_element(By.CLASS_NAME, "AJTPZc").text.split(None, 2)[0]
        container.append(yes_num)
        
    except NoSuchElementException:
        container.append("None")

In [25]:
# create a dictionary to store extracted data
app_review = {
    'name': [],
    'star_rating': [],
    'date': [],
    'review': [],
    'people': [],
    'device': []
}

In [26]:
def scrape_all(driver_name):
    """
    A private function to extract all data from a review 
    including the name of the reviewer; the star rating given;
    the date of review; the body of the review; and the number
    of people that found the review to be helpful.
    Its only argument should be the assigned variable name to 
    a WebElement object.
    """
    scrape_name(driver_name, app_review['name'])
    scrape_rating(driver_name, app_review['star_rating'])
    scrape_date(driver_name, app_review['date'])
    scrape_review(driver_name, app_review['review'])
    scrape_num(driver_name, app_review['people'])

In [None]:
# set xpaths for the dropdown arrow used for filtering by phone, chromebook, or tablet
menu = '//*[@id="formFactor_{}"]/div[2]/i'

# set xpaths for the menu review options:'Phone', 'Chromebook', and 'Tablet'
xbase = '//*[@id="yDmH0d"]/div[5]/div[2]/div/div/div/div/div[2]/div[2]/div/div/span[{}]'

In [28]:
def scroll_down(action, section):
    action.move_to_element(section).click().send_keys(Keys.PAGE_DOWN).perform()   

In [29]:
def terminal_scroll(limit):
    actions = AC(driver)
    while True:
        try:
            reviews_section = driver.find_element(By.XPATH, '//*[@id="yDmH0d"]/div[5]/div[2]/div/div/div/div/div[2]/div/div[2]/div[{}]'.format(limit))
            scroll_down(actions, reviews_section)
            print(limit)
            time.sleep(7)
            limit += 20
            scroll_down(actions, reviews_section)
            check = limit + 1
            
        except NoSuchElementException:
            try:
                while limit <= check :
                    reviews_section = driver.find_element(By.XPATH, '//*[@id="yDmH0d"]/div[5]/div[2]/div/div/div/div/div[2]/div/div[2]/div[{}]'.format(check))
                    scroll_down(actions, reviews_section)
                    time.sleep(7)
                    scroll_down(actions, reviews_section)
                    check += 1
                    
            except NoSuchElementException:
                # debugging
                print(limit, check)
                break

In [30]:
def get_phone_reviews(x):
    time.sleep(45)
    terminal_scroll(x)
    reviews = driver.find_elements(By.CLASS_NAME, "RHo1pe")
    for review in reviews:
        scrape_all(review)
        app_review['device'].append('Phone')
    # click dropdown arrow    
    klick(menu.format(2))
    # select tablet reviews
    try:
        klick(xbase.format(3))
        
    except TimeoutException:
        klick(xbase.format(2))

In [31]:
def get_tablet_reviews(x):
    time.sleep(45)
    terminal_scroll(x)
    reviews = driver.find_elements(By.CLASS_NAME, "RHo1pe")
    for review in reviews:
        scrape_all(review)
        app_review['device'].append('Tablet')
    # click dropdown arrow
    klick(menu.format(3))
    # select chromebook reviews//*[@id="formFactor_3"]/div[2]/i
    klick(xbase.format(2))
    time.sleep(45)
    next_reviews = driver.find_elements(By.CLASS_NAME, "RHo1pe")
    # use if-else to check if tablet or chromebook was selected 
    if reviews[0] == next_reviews[0]:
        print('No Chromebook reviews')
        
    else:
        print("Run 'get_chromebook_reviews'")

In [32]:
def get_chromebook_reviews(x):
    time.sleep(45)
    terminal_scroll(x)
    reviews = driver.find_elements(By.CLASS_NAME, "RHo1pe")
    for review in reviews:
        scrape_all(review)
        app_review['device'].append('Chromebook')
    # click dropdown arrow
    klick(menu.format(5))
    # select phone reviews
    klick(xbase.format(1))
    time.sleep(45)

In [33]:
get_phone_reviews(20)

UnboundLocalError: cannot access local variable 'check' where it is not associated with a value

In [34]:
get_tablet_reviews(8)

UnboundLocalError: cannot access local variable 'check' where it is not associated with a value

In [45]:
driver.quit()

In [35]:
# store the gathered data into a pandas dataframe
df = pd.DataFrame(app_review)
df

Unnamed: 0,name,star_rating,date,review,people,device


In [44]:
# save data to csv file
df.to_csv('learn-html-pro.csv', index=False)