In [21]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from datetime import datetime, timedelta

def _build_search_url(job_title, location, radius, days_ago):
    """Return the SimplyHired search URL for the given query and filters.

    The job title and location are percent-encoded with ``quote_plus`` so that
    characters such as ``+``, ``&`` or ``#`` (e.g. "C++ Developer") cannot
    corrupt the query string. Spaces are still encoded as ``+``.
    """
    # Local import keeps this block self-contained (no file-level import change).
    from urllib.parse import quote_plus

    base_url = "https://www.simplyhired.com/search"
    # NOTE(review): the fixed "s=d&sr=50&t=30" parameters are carried over
    # unchanged from the original URL; confirm whether `sr`/`t` overlap with
    # the `radius`/`posted` filters appended below.
    return (
        f"{base_url}?q={quote_plus(job_title)}&l={quote_plus(location)}"
        "&s=d&sr=50&t=30"
        # Radius and posting-age filters now follow the function arguments.
        f"&radius={radius}&posted={days_ago}"
        # Sort the job listings by date.
        "&sort=date"
    )


def _extract_job_details(driver):
    """Read (qualifications, description, link) from the currently open job.

    Each field falls back to a placeholder when it cannot be read, so one
    missing element does not discard the whole job.
    """
    try:
        qualification_elements = driver.find_elements(
            By.XPATH,
            "//ul[@class='chakra-wrap__list css-19lo6pj']/li/span[@data-testid='viewJobQualificationItem']",
        )
        qualifications = [qual.text for qual in qualification_elements]
    except Exception:
        qualifications = []

    try:
        description = driver.find_element(
            By.XPATH, "//div[@data-testid='viewJobBodyJobFullDescriptionContent']"
        ).text
    except Exception:
        description = "No description available"

    try:
        link = driver.current_url
    except Exception:
        link = "No link available"

    return qualifications, description, link


def scrape_simplyhired_jobs(job_title="Data Scientist", location="United States", radius=10, days_ago=30):
    """Scrape SimplyHired search results into a DataFrame.

    Opens a Chrome browser, loads the search results for ``job_title`` in
    ``location``, clicks every job on each results page and records its
    qualifications, description and the browser URL at that moment.

    Args:
        job_title: Search query for the job role.
        location: Search location.
        radius: Value sent as the ``radius`` URL parameter.
        days_ago: Value sent as the ``posted`` URL parameter.

    Returns:
        A pandas DataFrame with the columns "Job Description",
        "Qualifications" (items joined with "; ") and "Job Link".
        Errors on individual jobs or pages are printed and skipped, so the
        frame may be partial or empty.
    """
    # Set up Selenium WebDriver
    chrome_options = Options()
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=chrome_options)

    # Initialize result lists
    job_qualifications = []
    job_descriptions = []
    job_links = []

    # try/finally guarantees the browser is closed even if loading or
    # scraping raises; otherwise the Chrome process would be leaked.
    try:
        driver.maximize_window()
        driver.get(_build_search_url(job_title, location, radius, days_ago))
        time.sleep(5)

        # Find the total number of pages; fall back to 5 when the pagination
        # links are missing or their text is not a number.
        try:
            pagination_elements = driver.find_elements(By.XPATH, "//a[@aria-label='Page number']")
            if pagination_elements:
                total_pages = int(pagination_elements[-1].text)  # Last page number
                print(f"Total pages available: {total_pages}")
            else:
                total_pages = 5
        except Exception as e:
            print(f"Error finding total pages: {e}")
            total_pages = 5

        pages_to_scrape = total_pages
        print("pages to scrape:", pages_to_scrape)

        # Pagination loop
        current_page = 1
        while current_page <= pages_to_scrape:
            print(f"Scraping page {current_page}...")

            try:
                # Find all jobs on the current page
                job_elements = driver.find_elements(By.XPATH, "//*[@id='job-list']/li")
                for i in range(1, len(job_elements) + 1):
                    try:
                        # Re-locate the job by position on every iteration
                        # instead of reusing the elements found above.
                        job_xpath = f"//*[@id='job-list']/li[{i}]"
                        driver.find_element(By.XPATH, job_xpath).click()
                        time.sleep(3)  # Allow the job details to load

                        qualifications, description, link = _extract_job_details(driver)

                        job_qualifications.append(qualifications)
                        job_descriptions.append(description)
                        job_links.append(link)

                    except Exception as e:
                        print(f"Error processing job {i}: {e}")
                        continue

                # Move to the next page if applicable
                if current_page < pages_to_scrape:
                    try:
                        next_button = driver.find_element(By.XPATH, "//a[contains(@aria-label, 'Next')]")
                        next_button.click()
                        time.sleep(5)
                        current_page += 1
                    except Exception as e:
                        print(f"Error navigating to next page: {e}")
                        break
                else:
                    print("Reached the limit of pages to scrape.")
                    break

            except Exception as e:
                print(f"Error processing page {current_page}: {e}")
                break
    finally:
        driver.quit()

    # Create a DataFrame to store the data
    df = pd.DataFrame({
        "Job Description": job_descriptions,
        "Qualifications": ["; ".join(qual) for qual in job_qualifications],
        "Job Link": job_links
    })

    return df

def main():
    """Prompt for job titles and locations, scrape each pair, and save a CSV.

    Reads two comma-separated lists from standard input, calls
    ``scrape_simplyhired_jobs`` for every (job title, location) combination,
    concatenates the resulting DataFrames and writes them to
    ``simplyhired_jobs.csv`` in the current working directory.
    """
    # Single source of truth for the output path, so the confirmation message
    # always names the file that was actually written.
    output_path = "simplyhired_jobs.csv"

    # Multiple job roles and locations input
    job_titles_input = input("Enter the job titles separated by commas (e.g., Data Scientist, Software Engineer): ").strip().split(',')
    location_input = input("Enter the locations separated by commas (e.g., New York, California): ").strip().split(',')

    job_titles_input = [title.strip() for title in job_titles_input]
    location_input = [loc.strip() for loc in location_input]

    # Iterate over each combination of job role and location
    all_job_data = []
    for job_title in job_titles_input:
        for location in location_input:
            print(f"Scraping jobs for '{job_title}' in '{location}'...")
            df = scrape_simplyhired_jobs(job_title=job_title, location=location)
            all_job_data.append(df)

    # Combine all results into one DataFrame
    final_df = pd.concat(all_job_data, ignore_index=True)

    # Save the combined DataFrame to a CSV file
    final_df.to_csv(output_path, index=False)
    print(f"All job details saved to {output_path}")

# Run the interactive scraper only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Enter the job titles separated by commas (e.g., Data Scientist, Software Engineer):  Data Scientist
Enter the locations separated by commas (e.g., New York, California):  New York, Dallas


Scraping jobs for 'Data Scientist' in 'New York'...
pages to scrape: 5
Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Reached the limit of pages to scrape.
Scraping jobs for 'Data Scientist' in 'Dallas'...
pages to scrape: 5
Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Reached the limit of pages to scrape.
All job details saved to simplyhired_all_jobs.csv
