# In [61]:
import csv, requests, time
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Scrape every job posting from the PulteGroup Workday careers site with a
# headless Chrome browser and save the result as a CSV file.

outputfile = 'pultegroup_jobs.csv'

# The URL of the job list page was found by inspecting the careers site with
# the browser developer tools (F12).
url = 'https://pultegroup.wd1.myworkdayjobs.com/en-US/PGI/jobs'

# Set to an integer to stop after that many pages (handy while debugging);
# None scrapes every page.
maxPages = None

# Column order of the CSV. Passing it to the DataFrame also guarantees a
# header row when no job could be scraped.
csvColumns = ["Job Title", "Job Location", "Link", "Post Date", "Requisition ID"]


def _lastText(card, cssSelector):
    """Return the textContent of the last element inside `card` that matches
    `cssSelector`, or '' when nothing matches, the text is empty, or the
    lookup fails."""
    try:
        matches = card.find_elements(By.CSS_SELECTOR, cssSelector)
        if not matches:
            return ''
        return matches[-1].get_attribute('textContent') or ''
    except Exception:
        # Best effort: a missing or stale element must not abort the scrape.
        return ''


def _jobTitle(card):
    """Return the job title of `card`, or '' when it cannot be read.

    The title is the part of the anchor text before the first ' - '; when a
    card holds several anchors the last one with non-empty text wins.
    """
    try:
        title = ''
        for anchor in card.find_elements(By.TAG_NAME, 'a'):
            text = anchor.get_attribute('textContent')
            if text:
                title = text.split(' - ')[0]
        return title
    except Exception:
        return ''


def _jobLink(card):
    """Return the href of the first anchor in `card`, or '' when unavailable."""
    try:
        return card.find_element(By.TAG_NAME, 'a').get_attribute('href') or ''
    except Exception:
        return ''


def _scrapeCard(card):
    """Build one CSV row (a dict keyed by the `csvColumns` names) from a job card.

    Every field is computed independently and defaults to '', so a field
    missing on one card can never inherit the value of the previous card.
    """
    return {
        "Job Title": _jobTitle(card),
        "Job Location": _lastText(card, 'div[data-automation-id="locations"] dd'),
        "Link": _jobLink(card),
        "Post Date": _lastText(card, 'div[data-automation-id="postedOn"] dd'),
        "Requisition ID": _lastText(card, 'ul[data-automation-id="subtitle"] li'),
    }


options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

jobList = list()
pageNum = 1

try:
    # Open the website and give the page time to load
    driver.get(url)
    time.sleep(3)

    while maxPages is None or pageNum <= maxPages:
        # All the job cards are <li> tags with class name "css-1q2dra3"; the
        # title, location, link, post date and job ID are located inside each
        # card through their tag or data-automation-id.
        cards = driver.find_elements(By.CSS_SELECTOR, 'li[class="css-1q2dra3"]')
        if not cards:
            break

        jobList.extend(_scrapeCard(card) for card in cards)

        # Try to load the next page by clicking the "Next" button
        try:
            nextBtn = driver.find_element(By.CSS_SELECTOR, 'button[aria-label="next"]')
            if not nextBtn.is_enabled():
                # A disabled button means this was the last page; clicking it
                # would only re-scrape the same cards.
                break
            # Scroll the button into view first so the click is not intercepted
            driver.execute_script("arguments[0].scrollIntoView();", nextBtn)
            time.sleep(1)
            nextBtn.click()
        except Exception:
            # No usable "Next" button: treat it as the end of the list.
            break

        pageNum += 1
        time.sleep(3)
finally:
    # Always shut the browser down, even when scraping fails half way.
    driver.quit()

# Save the job data to a CSV file
jobs = pd.DataFrame(jobList, columns=csvColumns)
jobs.to_csv(outputfile, index=False, encoding='utf-8-sig')
            
