In [18]:
import time, random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.relative_locator import locate_with

from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException

In [12]:
# Chrome options object; it is handed to webdriver.Chrome when the driver is created.
options = Options()

In [13]:
# Pass Chrome's --start-maximized switch so the browser window opens maximized.
options.add_argument('--start-maximized')

In [14]:
# Tunable settings for the browser session.
JOBS_URL = 'http://web.workwisse.com/jobs'
IMPLICIT_WAIT_SECONDS = 2

# Launch Chrome with the configured options, let element lookups poll for up
# to IMPLICIT_WAIT_SECONDS before failing, then open the job listing page.
driver = webdriver.Chrome(options=options)
driver.implicitly_wait(IMPLICIT_WAIT_SECONDS)
driver.get(JOBS_URL)

## Task

### CSS Selectors
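These are very handy for hard-to-find elements, which arise in situations where:
- The HTML element only has a generic attribute, such as `class`.
- The search must be limited to an element's inner HTML (an XPath locator that starts with `//` matches elements anywhere on the page, even when it is called on a child element; it must start with `.//` to stay inside that element).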

The general form, followed by a concrete example, is:
```html
"<html-tag>[<attribute-name>='<attribute-value>']"
"p[class='mb-0 fz14 list-inline-item mb5-sm pe-1']"
```

In [15]:
def analyse_page(page_count: int, driver, max_retries: int = 2, card_delay: float = 1.0) -> list[dict]:
    """Scrape every job card on the page currently loaded in ``driver``.

    Args:
        page_count: Page number stored on each record as ``page_number``.
        driver: Selenium WebDriver already showing the listing page.
        max_retries: How many times the cards are re-located and re-read after
            a ``StaleElementReferenceException`` before the error is re-raised.
        card_delay: Seconds to pause before reading each card (0 disables it).

    Returns:
        One dict per job card with the keys ``title``, ``url``, ``location``,
        ``price`` and ``page_number``.

    Raises:
        StaleElementReferenceException: If the cards are still stale after
            ``max_retries`` re-reads.
        NoSuchElementException: Propagated from ``find_element`` when a card
            lacks one of the expected child elements.
    """
    card_xpath = '//div[@class="col-md-6 col-lg-12"]'
    attempts = max(max_retries, 0) + 1

    for attempt in range(1, attempts + 1):
        # Start from scratch on every attempt so a retry never duplicates rows.
        data = []
        try:
            # A stale reference surfaces when an element is *used*, not when it
            # is located, so the whole read loop is guarded. The cards are
            # re-located on each attempt rather than refreshing the browser,
            # because a refresh returns to the first page.
            for job in driver.find_elements(By.XPATH, card_xpath):
                if card_delay > 0:
                    time.sleep(card_delay)
                title_bar = job.find_element(By.CSS_SELECTOR, 'h5[class="title mb-3"]')
                data.append({
                    'title': title_bar.text,
                    'url': title_bar.find_element(By.TAG_NAME, 'a').get_attribute('href'),
                    'location': job.find_element(By.CSS_SELECTOR, "p[class='mb-0 fz14 list-inline-item mb5-sm pe-1']").text,
                    'price': job.find_element(By.CSS_SELECTOR, "div[class='text-lg-end']").text,
                    'page_number': page_count,
                })
        except StaleElementReferenceException:
            if attempt == attempts:
                raise
            continue
        return data

In [16]:
# WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'h5[class="title mb-3"]')))

### Pagination

In [None]:
# Walk the listing page by page, accumulating every scraped record in `data`.
data = []
page_count = 1
has_next_page = True
while has_next_page:
    page_content = analyse_page(page_count, driver)
    data += page_content

    # Each pagination item carries its page number in a `page-num` attribute,
    # so the following page is addressed directly by number. (Clicking the
    # "fas fa-angle-right" arrow icon is an alternative way to advance.)
    next_page_selector = f'li[page-num="{page_count + 1}"]'
    try:
        driver.find_element(By.CSS_SELECTOR, next_page_selector).click()
    except NoSuchElementException:
        # No item for the next number: treated as having reached the last page.
        has_next_page = False
    else:
        page_count += 1
        # Randomised pause before scraping the newly opened page.
        time.sleep(random.uniform(1.5, 3))

In [21]:
# Shut down the WebDriver session now that scraping is finished.
driver.quit()

#### Alternative Navigator Location Strategy

##### Page Numbers as attribute values

In [None]:
# driver.find_element(By.CSS_SELECTOR, 'li[page-num="2"]')

##### Others

In [None]:
# navigator_bar = driver.find_element(By.CSS_SELECTOR, "ul[class='page_navigation']")
# navigators = navigator_bar.find_elements(By.TAG_NAME, 'li')
# navigators[-1].click()

In [None]:
# for navigator in driver.find_elements(By.CSS_SELECTOR, "li[class='page-item bg-white ']"):
#     next_page = navigator.find_element(By.CSS_SELECTOR, "a[class='page-link ']")
#     if next_page == None:
#         raise Exception("No next page")

In [None]:
# Number of job records collected in `data`.
len(data)

## Save Data

In [19]:
import json

In [20]:
from pathlib import Path

# Persist the scraped records as JSON Lines: one JSON object per line.
output_path = Path("data/workwisse.jsonl")
# open() in "w" mode does not create missing parent folders and would raise
# FileNotFoundError, so make sure the target directory exists first.
output_path.parent.mkdir(parents=True, exist_ok=True)

with output_path.open("w", encoding="utf-8") as f:
    for record in data:
        json.dump(record, f)
        f.write("\n")       # Critical newline character: terminates each record