In [1]:
import time
import re
import json
from pathlib import Path

import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [2]:
# Extract url from settings.json
settings_path = Path("../settings.json")
# read_text() opens *and closes* the file (the previous open().read() left the
# handle dangling); the encoding is pinned so the result does not depend on
# the platform's default code page.
settings = json.loads(settings_path.read_text(encoding="utf-8"))
url = settings["URL1"]

In [3]:
# Initialize the WebDriver (Edge; use webdriver.Chrome() instead for Chrome)
driver = webdriver.Edge()

# Open the homepage
driver.get(url)

try:
    # Explicitly wait (up to 10 s) for the element with id "cookiescript_close"
    # to become clickable instead of sleeping a fixed time: this returns as soon
    # as the element is ready and still copes with a slow page.
    # NOTE(review): presumably this is the cookie-banner close button — confirm.
    element = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.ID, "cookiescript_close")),
        message='element with id "cookiescript_close" did not become clickable',
    )
    element.click()  # Perform the click action
    print("Element clicked successfully!")

except Exception as e:
    # Best effort: report the problem and let the notebook continue
    print(f"Error occurred while finding/clicking the element: {e}")

The msedgedriver version (128.0.2739.67) detected in PATH at C:\Program Files\webdrivers\msedgedriver.exe might not be compatible with the detected MicrosoftEdge version (129.0.2792.52); currently, msedgedriver 129.0.2792.52 is recommended for MicrosoftEdge 129.*, so it is advised to delete the driver in PATH and retry


Element clicked successfully!


In [4]:
def parse_offers_count(text):
    """Return the first integer found in ``text``, or ``None`` if there is none.

    A single whitespace character (regular, non-breaking, narrow, ...) followed
    by exactly three digits is treated as a thousands separator, so
    ``"1 234 offers"`` yields ``1234``.
    """
    match = re.search(r"\d+(?:\s\d{3})*(?!\d)", text)
    if match is None:
        return None
    # Strip *every* kind of whitespace the pattern may have matched; removing
    # only " " would leave e.g. a non-breaking space behind and make int() fail.
    return int(re.sub(r"\s", "", match.group()))


# Stays None when the element is missing or its text contains no number
offers_number = None

try:
    # Use the full class name to locate the element, waiting until it is
    # present in the DOM rather than assuming the page has already rendered it
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "MuiTab-iconWrapper"))
    )

    # Get the text content of the element
    offers_text = element.text

    offers_number = parse_offers_count(offers_text)
    if offers_number is not None:
        print("Number of offers:", offers_number)
    else:
        print(f"No number of offers found in element text: {offers_text!r}")

except Exception as e:
    print(f"Error occurred while finding the element: {e}")

Number of offers: 179


In [5]:
# Init the dataframe with offers
offers_df = pd.DataFrame(
    columns=["data_index", "link", "position_name", "company_name"]
)

# Locate the parent div that holds the offer rows (matched by its data-test-id)
parent_div = driver.find_element(By.CSS_SELECTOR, '[data-test-id="virtuoso-item-list"]')

# Fetch the currently rendered rows once; the height of the first one is used
# later as the per-offer scroll step.
rendered_offers = parent_div.find_elements(By.XPATH, "./div")
if not rendered_offers:
    # Fail with a readable message instead of a bare IndexError
    raise RuntimeError("No offers are rendered in the list; cannot measure the offer height")

# Retrieve the height of a single offer, window height, whole page height and current scroll position
order_height = rendered_offers[0].size["height"]
window_height = driver.execute_script("return window.innerHeight;")
document_height = driver.execute_script("return document.body.scrollHeight;")
current_scroll_position = driver.execute_script("return window.scrollY;")

In [6]:
# Collect every offer from the list: read the rows currently in the DOM,
# scroll further down so more rows load, and repeat until the bottom of the
# page is reached.
offer_columns = ["data_index", "link", "position_name", "company_name"]

# data-index values already stored, so rows are never duplicated (also when
# the cell is re-run); a set makes the membership check O(1).
seen_indices = set(offers_df["data_index"])
# Rows gathered in this run; turned into a DataFrame once, at the end
new_offers = []

try:
    while True:
        # Find all child divs inside the parent div
        child_divs = parent_div.find_elements(By.XPATH, "./div")

        # Loop through each child div and find nested a tags with links
        for child_div in child_divs:
            # Extract the 'data-index' attribute from each child div
            data_index = child_div.get_attribute("data-index")
            if data_index in seen_indices:
                continue

            inner_div_1 = child_div.find_element(By.TAG_NAME, "div")
            inner_div_2 = inner_div_1.find_element(By.TAG_NAME, "div")
            # Extract the href (link) from the <a> tag inside the inner div
            link = inner_div_2.find_element(By.TAG_NAME, "a").get_attribute("href")

            # Read the text once and reuse it for both fields
            text_lines = inner_div_2.text.split("\n")
            position_name = text_lines[0]
            # NOTE(review): the company name is taken from the 4th text line,
            # as before; cards with fewer lines now yield None instead of
            # aborting the whole scrape with an IndexError.
            company_name = text_lines[3] if len(text_lines) > 3 else None

            seen_indices.add(data_index)
            new_offers.append([data_index, link, position_name, company_name])

            print(data_index)
            print(link)
            print(position_name)
            print(company_name)
            print()

        # The page height can change while rows load, so re-read it before
        # testing for the bottom. The test runs *after* scraping so the rows
        # revealed by the last scroll are not skipped.
        document_height = driver.execute_script("return document.body.scrollHeight;")
        if current_scroll_position + window_height >= document_height:
            break

        # Scroll height is an order height times number of orders
        scroll_height = order_height * len(child_divs)
        # Scroll so that more offers load
        driver.execute_script(f"window.scrollBy(0, {scroll_height});")

        # Give the page time to scroll and render before measuring the position
        time.sleep(2)

        previous_scroll_position = current_scroll_position
        current_scroll_position = driver.execute_script("return window.scrollY;")
        if current_scroll_position == previous_scroll_position:
            # The page did not move, so there is nothing further to load;
            # stop instead of looping forever.
            break
finally:
    # Runs on normal exit and on interruption (e.g. KeyboardInterrupt), so
    # everything scraped so far ends up in offers_df either way.
    if new_offers:
        new_offers_df = pd.DataFrame(new_offers, columns=offer_columns)
        if offers_df.empty:
            offers_df = new_offers_df
        else:
            offers_df = pd.concat([offers_df, new_offers_df], ignore_index=True)

0
https://justjoin.it/offers/allegro-research-engineer-nlp--warszawa-ai
Research Engineer (NLP)
Allegro

1
https://justjoin.it/offers/allegro-machine-learning-engineer-machine-learning-research-lab--warszawa-ai
Machine Learning Engineer (Machine Learning Research Lab)
Allegro

2
https://justjoin.it/offers/golance-devops-engineer-with-some-backend--poland-remote--ai
DevOps Engineer (with some Backend)
goLance

3
https://justjoin.it/offers/moderna-poland-sp-z-o-o--principal-systems-engineer-ariba--warszawa-ai
Principal Systems Engineer (Ariba)
MODERNA POLAND SP. Z O.O.

4
https://justjoin.it/offers/grape-up-ai-engineer-wroclaw-ai
AI Engineer
Grape Up

5
https://justjoin.it/offers/aptiv-services-poland-s-a--expert-data-scientist---machine-learning-krakow-ai
Expert Data Scientist - Machine Learning
Aptiv Services Poland S.A.

6
https://justjoin.it/offers/aptiv-services-poland-s-a--ai-ml-engineer-planning-behavior-adas-krakow-ai-8ea8db64
AI/ML Engineer Planning & Behavior ADAS
Aptiv Service

KeyboardInterrupt: 

In [None]:
# Display the offers collected by the scraping loop (last expression of the cell)
offers_df

In [8]:
# Close the browser: quit() shuts down the WebDriver session created earlier,
# so `driver` cannot be used for further page interaction after this cell.
driver.quit()