In [100]:
import time
from urllib.parse import quote_plus

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Module-level Chrome session, started as soon as this cell runs.
# NOTE(review): IndeedClient.__init__ starts its own webdriver.Chrome(), and nothing
# in the visible cells uses this `driver` -- confirm it is still needed; otherwise it
# only leaves an extra browser window open.
driver = webdriver.Chrome()

In [None]:
# Element id of the container that holds the job cards on a search-results page
# (IndeedClient looks up the same id string when it lists the postings).
all_job_cards_id = 'mosaic-provider-jobcards'

# NOTE(review): presumably the element id of a popup that should be dismissed before
# scraping; it is not referenced in the visible cells -- confirm it is still needed.
close_card_id = "mosaic-desktopserpjapopup"

In [108]:
class IndeedClient:
    """Scrape job postings from sg.indeed.com search results with Selenium.

    Creating an instance launches a Chrome session. ``scrape_job_listings``
    walks the result pages for each search term, clicks every job card and
    records what the detail pane shows. Results are kept in ``self.output``
    as ``{search_term: {field: [values, ...]}}``; the per-field lists are
    parallel, i.e. index ``i`` of every list describes the same posting.

    The client is single-use: ``scrape_job_listings`` always shuts the
    browser down when it finishes, whether or not an error occurred.
    """

    # Fields recorded for every posting; each maps to one list in the result dict.
    RESULT_FIELDS = ('company_name', 'company_url', 'job_title', 'apply_now_url', 'description')
    # Element id of the container holding the job cards on a results page.
    JOB_CARDS_ID = 'mosaic-provider-jobcards'
    # Element id of the pane that shows the details of the clicked posting.
    JOB_PANE_ID = 'jobsearch-ViewjobPaneWrapper'
    # A results page whose card list has a different number of <li> items than
    # this is treated as the last page.
    FULL_PAGE_ITEM_COUNT = 18
    # List positions that are never clicked or scraped.
    # NOTE(review): presumably non-job filler slots in the list -- confirm.
    SKIPPED_ITEM_INDEXES = (5, 11, 17)

    def __init__(self):
        """Start a Chrome session and set up the waits used while scraping."""
        self.driver = webdriver.Chrome()
        # NOTE(review): an implicit wait and an explicit WebDriverWait are both
        # configured; Selenium's documentation advises against mixing the two.
        self.driver.implicitly_wait(5)
        self.wait = WebDriverWait(self.driver, 5)
        self.output = {}

    @classmethod
    def _empty_results(cls) -> dict:
        """Return a fresh ``{field: []}`` dict with one empty list per result field."""
        return {field: [] for field in cls.RESULT_FIELDS}

    def __get_job_listings(self, job, output_dic=None) -> dict:
        """Scrape every results page for ``job``.

        Args:
            job: Search term, e.g. ``'software engineer'``.
            output_dic: Optional result dict to fill in place (one list per
                entry in ``RESULT_FIELDS``). A new one is created when omitted.

        Returns:
            The result dict, with one entry appended to every list per
            scraped posting.

        Raises:
            selenium.common.exceptions.NoSuchElementException: if the job
                card container is missing from a results page.
        """
        if output_dic is None:
            output_dic = self._empty_results()
        has_next_page = True
        page_number = 0

        while has_next_page:
            self.driver.get(self.generate_job_listing_url(job, page_number))

            job_postings_div = self.driver.find_element(By.ID, self.JOB_CARDS_ID)
            job_postings_list = job_postings_div.find_element(By.TAG_NAME, "ul").find_elements(By.XPATH, './li')

            # A page that is not "full" is the last one; it is still scraped below.
            if len(job_postings_list) != self.FULL_PAGE_ITEM_COUNT:
                has_next_page = False

            for index, posting in enumerate(job_postings_list):
                if index not in self.SKIPPED_ITEM_INDEXES:
                    self.__scrape_desired_information_from_page(posting, output_dic)

            page_number += 1
            # Pause between page loads only; nothing follows the last page.
            if has_next_page:
                time.sleep(5)

        return output_dic

    def __scrape_desired_information_from_page(self, posting, output_dict):
        """Click one job card and append its details to ``output_dict``.

        All fields are read first and appended together at the end, so an
        error while reading a posting never leaves the parallel lists with
        different lengths.

        Args:
            posting: The ``<li>`` WebElement of the job card to open.
            output_dict: Result dict with one list per ``RESULT_FIELDS`` entry.
        """
        self.driver.execute_script("arguments[0].scrollIntoView(true);", posting)
        time.sleep(2)

        # Show the job posting information on the right side of the screen
        posting.click()
        # NOTE(review): the pane element may already be present from the previously
        # clicked card, so this wait can return before the new posting has been
        # rendered -- confirm whether a staleness/text-change wait is needed.
        posting_description = self.wait.until(EC.presence_of_element_located((By.ID, self.JOB_PANE_ID)))

        # Company name and URL both come from the first link inside the pane.
        company_link = posting_description.find_element(By.TAG_NAME, 'a')
        # Keep only the part of the aria-label before the first " (".
        company_name = company_link.get_attribute('aria-label').split(' (')[0]
        company_url = company_link.get_attribute("href")

        # Job title: ".//" keeps the XPath relative to the pane; a leading "//"
        # would search the whole document instead. Only the first text line is kept.
        title_heading = posting_description.find_element(
            By.XPATH, ".//h2[@data-testid='jobsearch-JobInfoHeader-title']"
        )
        job_title = title_heading.find_element(By.TAG_NAME, 'span').text.split('\n')[0]
        print(job_title)

        # Apply Now URL: recorded as None when the pane has no apply container/button.
        try:
            apply_container = posting_description.find_element(By.ID, 'applyButtonLinkContainer')
            apply_now_url = apply_container.find_element(By.TAG_NAME, 'button').get_attribute('href')
        except NoSuchElementException:
            apply_now_url = None

        # Job Description
        description = posting_description.find_element(By.ID, 'jobDescriptionText').text

        # Append everything in one go so every list grows by exactly one entry.
        output_dict['company_name'].append(company_name)
        output_dict['company_url'].append(company_url)
        output_dict['job_title'].append(job_title)
        output_dict['apply_now_url'].append(apply_now_url)
        output_dict['description'].append(description)

    def scrape_job_listings(self, lst_of_jobs: list[str]):
        """Scrape all search terms in ``lst_of_jobs`` into ``self.output``.

        The result dict of each term is registered in ``self.output`` before
        scraping starts, so postings collected so far survive an error or a
        manual interrupt. Exceptions are printed and end the run early;
        ``KeyboardInterrupt`` is not an ``Exception`` and still propagates.
        The browser is shut down in every case.
        """
        try:
            for job in lst_of_jobs:
                results = self._empty_results()
                self.output[job] = results
                self.__get_job_listings(job, results)
                time.sleep(5)
        except Exception as e:
            print(e)
        finally:
            self.close()

    def perform_initial_cleanups(self):
        """Open a first results page, wait three seconds, then call ``clear_popups``."""
        self.driver.get(self.generate_job_listing_url('software engineer', 0))
        time.sleep(3)
        self.clear_popups()

    def clear_popups(self):
        """Reload the current page and wait five seconds.

        NOTE(review): presumably the reload is what gets rid of first-visit
        popups -- confirm against the live site.
        """
        self.driver.refresh()
        time.sleep(5)

    def generate_job_listing_url(self, job, page_number):
        """Build the search URL for ``job`` at the zero-based ``page_number``.

        Runs of whitespace in ``job`` collapse to a single ``+`` and every
        other reserved character is percent-encoded, so terms such as
        ``'C++ developer'`` survive as written. ``start`` advances by 10 per page.
        """
        query = quote_plus(" ".join(job.split()))
        base_url = f'https://sg.indeed.com/jobs?q={query}&l=Singapore&radius=10&fromage=1&start={page_number * 10}'
        return base_url

    def get_scraped_items(self):
        """Return the ``{search_term: {field: [values, ...]}}`` dict collected so far."""
        return self.output

    def close(self):
        """End the browser session.

        ``quit()`` is used rather than ``close()``: ``close()`` only closes the
        current window, while ``quit()`` also ends the WebDriver session and
        its driver process.
        """
        self.driver.quit()

In [109]:
# Build a client; IndeedClient.__init__ launches a Chrome window.
test_driver = IndeedClient()

# Load a first results page and refresh it once before scraping.
test_driver.perform_initial_cleanups()
# Scrape every results page for the search term. scrape_job_listings closes the
# browser in its `finally` block, so a new IndeedClient is needed for another run.
test_driver.scrape_job_listings(['software engineer'])


Software Engineer, React/React Native
Preinstalled Image Engineer/PRISM Admin - Software Systems Engineering
College Intern - Embedded Software Engineer
Senior Software Engineer (Front-End)
Founding Software Engineer
Software Engineer
Full Stack JavaScript developer
LLM Developer


KeyboardInterrupt: 

In [103]:
# Display everything collected so far: a dict keyed by search term, each value a dict
# of lists (company_name, company_url, job_title, apply_now_url, description).
test_driver.get_scraped_items()

{'software engineer': {'company_name': ['NodeFlair',
   'Hewlett Packard',
   'Hewlett Packard',
   'NodeFlair',
   'Cleric',
   'A2000 Solutions Pte Ltd',
   'NodeFlair',
   'TECHNOLOGY SERVICES GROUP PTE. LTD.',
   'SERVITA PRIVATE LIMITED',
   'Cleric',
   'NodeFlair',
   'Activate Interactive Pte Ltd',
   'NodeFlair',
   'Capgemini',
   'BYTECENTURE CONSULTING PTE. LTD.',
   'SERVITA PRIVATE LIMITED',
   'Michael Page',
   'NodeFlair',
   'NodeFlair',
   'Beyondsoft International',
   'Dyson',
   'Capgemini',
   'RECRUITPEDIA PTE. LTD.',
   'NodeFlair',
   '5 HEALTH PTE. LTD.',
   'CAREERNEXUS PTE. LTD.',
   'RECRUITPEDIA PTE. LTD.',
   'BioQuest Advisory',
   'RECRUITPEDIA PTE. LTD.',
   'JDA WMS PTE. LTD.',
   'BioQuest Advisory',
   'BioQuest Advisory',
   'ScienTec Consulting',
   'SAP',
   'NodeFlair',
   'BYTECENTURE CONSULTING PTE. LTD.',
   'SAP',
   'NodeFlair',
   'ScienTec Consulting',
   'Capgemini',
   'NIVABIZ PTE. LTD.',
   'ScienTec Consulting',
   'ScienTec Consult