In [1]:
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException, TimeoutException
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

def get_jobs(keyword, num_jobs, verbose):
    
    '''Gathers jobs as a dataframe, scraped from Glassdoor'''
    
    # Initializing the webdriver
    options = webdriver.ChromeOptions()
    
    # Uncomment the line below if you'd like to scrape without a new Chrome window every time.
    # options.add_argument('headless')
    
    # Change the path to where chromedriver is in your home folder.
    service = Service("/Users/Kedar/DATA ANALYTICS/data_science_salary_project/chromedriver.exe")
    
    driver = webdriver.Chrome(service=service, options=options)
    driver.set_window_size(1120, 1000)

    url = f'https://www.glassdoor.com/Job/jobs.htm?sc.keyword={keyword}&locT=C&locId=1147401&locKeyword=San%20Francisco,%20CA&jobType=all&fromAge=-1&minSalary=0&includeNoSalaryJobs=true&radius=100&cityId=-1&minRating=0.0&industryId=-1&sgocId=-1&seniorityType=all&companyId=-1&employerSizes=0&applicationType=0&remoteWorkType=0'
    driver.get(url)
    jobs = []

    wait = WebDriverWait(driver, 10)

    while len(jobs) < num_jobs:  # If true, should be still looking for new jobs.

        # Let the page load
        time.sleep(4)

        # Check for the "Sign Up" prompt
        try:
            sign_up_element = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, "selected")))
            sign_up_element.click()
        except (ElementClickInterceptedException, TimeoutException):
            pass

        time.sleep(1)

        # Close modal if present
        try:
            close_modal = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, "ModalStyle__xBtn___29PT9")))
            close_modal.click()
        except (NoSuchElementException, TimeoutException):
            pass

        # Check for job listings
        job_buttons = driver.find_elements(By.CLASS_NAME, "JobsList_wrapper__EyUF6")
        print(job_buttons)
        if not job_buttons:
            print("No job elements found on this page.")
            break

        for job_button in job_buttons:
            
            if len(jobs) >= num_jobs:
                break

            job_button.click()
            time.sleep(1)
            collected_successfully = False
            
            while not collected_successfully:
                try:
                    company_name = driver.find_element(By.XPATH,'.//span[@class="EmployerProfile_compactEmployerName__LE242"]').text
                    location = driver.find_element(By.XPATH,'.//div[@class="JobCard_location__rCz3x"]').text
                    job_title = driver.find_element(By.XPATH,'.//a[@class="JobCard_jobTitle___7I6y"]').text
                    job_description = driver.find_element(By.XPATH,'.//div[@class="JobCard_location__rCz3x"]').text
                    salary_estimate = driver.find_element(By.XPATH, './/div[@class="JobCard_salaryEstimate__arV5J"]').text
                    rating = driver.find_element(By.XPATH, './/div[@class="EmployerProfile_ratingContainer__ul0Ef"]').text
                    collected_successfully = True
                except Exception as e:
                    print(f"Error collecting job details: {e}")
                    time.sleep(5)

            try:
                salary_estimate = driver.find_element(By.XPATH, './/div[@class="JobCard_salaryEstimate__arV5J"]').text
                print(salary_estimate)
                
            except NoSuchElementException:
                salary_estimate = -1

            try:
                rating = driver.find_element(By.XPATH, './/div[@class="EmployerProfile_ratingContainer__ul0Ef"]').text
                print(rating)
            except NoSuchElementException:
                rating = -1

            if verbose:
                print("Job Title: {}".format(job_title))
                print("Salary Estimate: {}".format(salary_estimate))
                print("Job Description: {}".format(job_description[:500]))
                print("Rating: {}".format(rating))
                print("Company Name: {}".format(company_name))
                print("Location: {}".format(location))

            try:
                driver.find_element(By.XPATH, './/div[@class="tab" and @data-tab-type="overview"]').click()

                try:
                    headquarters = driver.find_element(By.XPATH, './/div[@class="infoEntity"]//label[text()="Headquarters"]//following-sibling::*').text
                except NoSuchElementException:
                    headquarters = -1

                try:
                    size = driver.find_element(By.XPATH, './/div[@class="infoEntity"]//label[text()="Size"]//following-sibling::*').text
                except NoSuchElementException:
                    size = -1

                try:
                    founded = driver.find_element(By.XPATH, './/div[@class="infoEntity"]//label[text()="Founded"]//following-sibling::*').text
                except NoSuchElementException:
                    founded = -1

                try:
                    type_of_ownership = driver.find_element(By.XPATH, './/div[@class="infoEntity"]//label[text()="Type"]//following-sibling::*').text
                except NoSuchElementException:
                    type_of_ownership = -1

                try:
                    industry = driver.find_element(By.XPATH, './/div[@class="infoEntity"]//label[text()="Industry"]//following-sibling::*').text
                except NoSuchElementException:
                    industry = -1

                try:
                    sector = driver.find_element(By.XPATH, './/div[@class="infoEntity"]//label[text()="Sector"]//following-sibling::*').text
                except NoSuchElementException:
                    sector = -1

                try:
                    revenue = driver.find_element(By.XPATH, './/div[@class="infoEntity"]//label[text()="Revenue"]//following-sibling::*').text
                except NoSuchElementException:
                    revenue = -1

                try:
                    competitors = driver.find_element(By.XPATH, './/div[@class="infoEntity"]//label[text()="Competitors"]//following-sibling::*').text
                except NoSuchElementException:
                    competitors = -1

            except NoSuchElementException:
                headquarters = -1
                size = -1
                founded = -1
                type_of_ownership = -1
                industry = -1
                sector = -1
                revenue = -1
                competitors = -1

            if verbose:
                print("Headquarters: {}".format(headquarters))
                print("Size: {}".format(size))
                print("Founded: {}".format(founded))
                print("Type of Ownership: {}".format(type_of_ownership))
                print("Industry: {}".format(industry))
                print("Sector: {}".format(sector))
                print("Revenue: {}".format(revenue))
                print("Competitors: {}".format(competitors))
                print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")

            jobs.append({
                "Job Title": job_title,
                "Salary Estimate": salary_estimate,
                "Job Description": job_description,
                "Rating": rating,
                "Company Name": company_name,
                "Location": location,
                "Headquarters": headquarters,
                "Size": size,
                "Founded": founded,
                "Type of Ownership": type_of_ownership,
                "Industry": industry,
                "Sector": sector,
                "Revenue": revenue,
                "Competitors": competitors
            })

        try:
            next_button = wait.until(EC.element_to_be_clickable((By.XPATH, './/li[@class="next"]//a')))
            next_button.click()
        except (NoSuchElementException, TimeoutException):
            print("Scraping terminated before reaching target number of jobs. Needed {}, got {}.".format(num_jobs, len(jobs)))
            break

    return pd.DataFrame(jobs)  # This line converts the dictionary object into a pandas DataFrame.


In [2]:
# This line will open a new chrome window and start the scraping.
df = get_jobs("data scientist", 5, False)
print(df)

[<selenium.webdriver.remote.webelement.WebElement (session="7bc24341c2d9b9b90a6faf73c45c3488", element="f.44FEAAEEAA746A30CBA56594EBCDBBFC.d.CF1BAD41D862808B6880141AA3A95BFB.e.25")>]
Edwards Lifesciences
California
Senior Principal, Statistical Programming
California
$145K - $205K (Employer est.)
4.0
[<selenium.webdriver.remote.webelement.WebElement (session="7bc24341c2d9b9b90a6faf73c45c3488", element="f.44FEAAEEAA746A30CBA56594EBCDBBFC.d.CF1BAD41D862808B6880141AA3A95BFB.e.25")>]
Edwards Lifesciences
California
Senior Principal, Statistical Programming
California
$145K - $205K (Employer est.)
4.0
[<selenium.webdriver.remote.webelement.WebElement (session="7bc24341c2d9b9b90a6faf73c45c3488", element="f.44FEAAEEAA746A30CBA56594EBCDBBFC.d.CF1BAD41D862808B6880141AA3A95BFB.e.25")>]
Edwards Lifesciences
California
Senior Principal, Statistical Programming
California
$145K - $205K (Employer est.)
4.0


KeyboardInterrupt: 