In [7]:
!pip install selenium
!pip install pandas

Collecting pandas
  Downloading pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.2.0-cp312-cp312-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl (11.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.4/11.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading numpy-2.2.0-cp312-cp312-macosx_14_0_arm64.whl (5.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.1/5.1 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading tzdata-2024.2-py2.py3-none-any.whl (346 kB)
Installing collected packages: tzdata, numpy, pandas
Successfully installed numpy-2.2.0 pandas-2.2.3 tzdata-2024.2


In [9]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import pandas as pd
import time
import random
import os

def create_driver():
    """Start a Chrome WebDriver configured to look less like an automated browser.

    Returns:
        A started ``webdriver.Chrome`` session. The caller owns it and must
        call ``quit()`` when finished.

    Raises:
        Whatever Selenium raises when Chrome cannot be started or the stealth
        script cannot be registered; in the latter case the browser is closed
        before the exception propagates.
    """
    chrome_options = Options()

    # Reduce the most common automation fingerprints.
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)

    # Uncomment if you want to run without GUI
    # chrome_options.add_argument("--headless")

    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    # No explicit Service is passed; Selenium resolves the chromedriver binary itself.
    # service = Service('/path/to/chromedriver')  # Use this to pin a specific driver
    driver = webdriver.Chrome(options=chrome_options)

    try:
        # execute_script() only patches the document loaded at call time, so the
        # override disappeared on the first driver.get(). Registering the script
        # through CDP re-applies it to every new document before page scripts run.
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"},
        )
    except Exception:
        # Do not leak a running browser when post-start configuration fails.
        driver.quit()
        raise

    return driver



# Column order of every frame returned by scrape_ziprecruiter_jobs, including empty ones.
_JOB_COLUMNS = ['Title', 'Company', 'Location', 'Salary', 'Job URL']

# CSS selectors tried in order until one matches the job cards on a results page.
_JOB_CARD_SELECTORS = (
    ".job_result_two_pane",
    "[class*='job_result']",
    "div[data-testid='job-list-item']",
)


def _find_job_cards(driver, timeout=10):
    """Return ``(cards, selector)`` for the first selector that matches, else ``([], None)``."""
    for selector in _JOB_CARD_SELECTORS:
        try:
            cards = WebDriverWait(driver, timeout).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector))
            )
        except TimeoutException:
            continue
        if cards:
            return cards, selector
    return [], None


def _extract_job_record(job):
    """Read title, company, location, salary and link from one job-card element."""
    title_elem = job.find_element(By.CSS_SELECTOR, "h2 a")
    company_elem = job.find_element(By.CSS_SELECTOR, "[data-testid='job-card-company']")
    location_elem = job.find_element(By.CSS_SELECTOR, "[data-testid='job-card-location']")

    # Salary is optional on a card; every other field is required.
    try:
        salary = job.find_element(By.XPATH, ".//p[contains(text(), '$')]").text
    except NoSuchElementException:
        salary = "Salary not specified"

    return {
        'Title': title_elem.text,
        'Company': company_elem.text,
        'Location': location_elem.text,
        'Salary': salary,
        'Job URL': title_elem.get_attribute("href"),
    }


def scrape_ziprecruiter_jobs(base_url, max_pages=5):
    """Scrape up to ``max_pages`` result pages of a ZipRecruiter search.

    Args:
        base_url: Search URL. ``page=<n>`` is appended as a query parameter.
        max_pages: Number of result pages to request, starting at page 1.

    Returns:
        pandas.DataFrame with the columns Title, Company, Location, Salary and
        Job URL, one row per job card that could be parsed. The columns are
        present even when nothing was found. If a page-level error interrupts
        the run, the rows collected up to that point are still returned.
    """
    driver = create_driver()
    all_jobs_data = []

    # Append the page parameter correctly whether or not the URL already has a query.
    separator = "&" if "?" in base_url else "?"

    try:
        for page_num in range(1, max_pages + 1):
            url = f"{base_url}{separator}page={page_num}"
            print(f"Scraping page {page_num}: {url}")

            driver.get(url)

            # Randomised pause so the page can render and requests look less mechanical.
            time.sleep(random.uniform(3, 5))

            job_listings, selector = _find_job_cards(driver)
            if not job_listings:
                # Keep a screenshot so an empty page can be diagnosed afterwards.
                driver.save_screenshot(f'debug_screenshot_page_{page_num}.png')
                print(f"No job listings found on page {page_num}")
                continue
            print(f"Found {len(job_listings)} jobs using selector: {selector}")

            for job in job_listings:
                try:
                    all_jobs_data.append(_extract_job_record(job))
                except Exception as job_error:
                    # One malformed card must not abort the rest of the page.
                    print(f"Error extracting individual job on page {page_num}: {job_error}")

            # Small delay between pages to avoid rate limiting.
            time.sleep(random.uniform(2, 4))

    except Exception as e:
        # Report the failure but fall through so already-scraped rows are kept.
        print(f"An error occurred: {e}")

    finally:
        # Always close the browser
        driver.quit()

    return pd.DataFrame(all_jobs_data, columns=_JOB_COLUMNS)
 

def main():
    """Prompt for job titles and a page count, scrape each title, and save one CSV per title.

    Files are written to the ``ziprecruiter_jobs`` folder (created if missing)
    as ``zipRecruiter_<title>.csv``. Blank entries and repeated titles in the
    input are skipped so no search is run twice.

    Raises:
        ValueError: if the page count entered is not an integer.
    """
    # Imported here because the cell's import block does not provide it.
    from urllib.parse import quote_plus

    # Get user inputs
    raw_titles = input("Enter job titles separated by commas (e.g., Data Analyst, Data Engineer): ").split(",")
    # dict.fromkeys removes duplicates while keeping the order the user typed.
    job_titles = list(dict.fromkeys(title.strip() for title in raw_titles if title.strip()))
    max_pages = int(input("Enter the maximum number of pages to scrape for each title: "))

    # Create a folder to save the CSV files
    output_folder = "ziprecruiter_jobs"
    os.makedirs(output_folder, exist_ok=True)

    for job_title in job_titles:
        # quote_plus encodes spaces as '+' and also escapes characters such as
        # '+', '&' and '#', which would otherwise corrupt the query string.
        job_title_url = quote_plus(job_title)
        base_url = f"https://www.ziprecruiter.com/jobs-search?search={job_title_url}&location=Remote+%28USA%29"

        # Scrape jobs
        print(f"Scraping jobs for '{job_title}'...")
        jobs_df = scrape_ziprecruiter_jobs(base_url, max_pages)
        jobs_df['Job Name'] = job_title

        # Path separators in a title would point outside the output folder.
        safe_title = job_title.replace(' ', '_').replace('/', '_').replace('\\', '_')
        file_name = f"zipRecruiter_{safe_title}.csv"
        file_path = os.path.join(output_folder, file_name)
        jobs_df.to_csv(file_path, index=False)
        print(f"Saved: {file_path}")

    print("Scraping completed. All files saved in the folder:", output_folder)

if __name__ == "__main__":
    main()

Enter job titles separated by commas (e.g., Data Analyst, Data Engineer):  Data Analyst, Business Analyst, Data Engineer, Machine Learning Engineer, Data Engineer, Data Scientist, Software Engineer
Enter the maximum number of pages to scrape for each title:  3


Scraping jobs for 'Data Analyst'...
Scraping page 1: https://www.ziprecruiter.com/jobs-search?search=Data+Analyst&location=Remote+%28USA%29&page=1
Found 20 jobs using selector: .job_result_two_pane
Scraping page 2: https://www.ziprecruiter.com/jobs-search?search=Data+Analyst&location=Remote+%28USA%29&page=2
No job listings found on page 2
Scraping page 3: https://www.ziprecruiter.com/jobs-search?search=Data+Analyst&location=Remote+%28USA%29&page=3
No job listings found on page 3
Saved: ziprecruiter_jobs/zipRecruiter_Data_Analyst.csv
Scraping jobs for 'Business Analyst'...
Scraping page 1: https://www.ziprecruiter.com/jobs-search?search=Business+Analyst&location=Remote+%28USA%29&page=1
Found 20 jobs using selector: .job_result_two_pane
Scraping page 2: https://www.ziprecruiter.com/jobs-search?search=Business+Analyst&location=Remote+%28USA%29&page=2
Found 20 jobs using selector: .job_result_two_pane
Scraping page 3: https://www.ziprecruiter.com/jobs-search?search=Business+Analyst&locatio

In [10]:
import os
import pandas as pd
import re
import numpy as np

def combine_csv_files(folder_path):
    """Read every ``.csv`` file in ``folder_path`` and stack them into one DataFrame.

    Files are read in sorted name order so the row order is reproducible
    regardless of how the filesystem lists the directory.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        pandas.DataFrame with a fresh 0..n-1 index, or an empty DataFrame when
        the folder contains no CSV files.
    """
    frames = []

    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith('.csv'):
            continue
        frames.append(pd.read_csv(os.path.join(folder_path, file_name)))
        print(f"Added: {file_name}")

    # pd.concat raises on an empty list, so handle "no CSVs" explicitly.
    if not frames:
        return pd.DataFrame()

    # One concat at the end instead of re-copying the growing frame per file.
    return pd.concat(frames, ignore_index=True)

# Load every per-title CSV into one frame. "ziprecruiter_jobs" is the folder
# name the scraping cell's main() writes to.
folder_path = "ziprecruiter_jobs"  # Replace with your folder path
df = combine_csv_files(folder_path)

# Display the first rows of the combined DataFrame
df.head()


Added: zipRecruiter_Business_Analyst.csv
Added: zipRecruiter_Software_Engineer.csv
Added: zipRecruiter_Data_Engineer.csv
Added: zipRecruiter_Machine_Learning_Engineer.csv
Added: zipRecruiter_Data_Scientist.csv
Added: zipRecruiter_Data_Analyst.csv


Unnamed: 0,Title,Company,Location,Salary,Job URL,Job Name
0,Sr. RevOps Business Analyst,ZipRecruiter,"Santa Monica, CA",$108K - $150K / yr,https://www.ziprecruiter.com/k/l/AAJWhSUtFb29f...,Business Analyst
1,JD Edwards/Application Business Analyst - REMO...,CyberCoders,"New York, NY",$130K - $170K / yr,https://www.ziprecruiter.com/k/l/AAJFV8y9lttzn...,Business Analyst
2,Business Analyst,Birddog Traffic Control,"Los Angeles, CA",$80K - $120K / yr,https://www.ziprecruiter.com/k/l/AAKRF12sJwNHG...,Business Analyst
3,Trust & Custody Business Analyst (Matrix Trust...,Enterprise Iron,"Colorado Springs, CO",Salary not specified,https://www.ziprecruiter.com/k/l/AAJxJTcOJcOhN...,Business Analyst
4,Junior Level Business AnalystNEW!,Technology Talent Network LLC,"Los Angeles, CA",$27 - $30 / hr,https://www.ziprecruiter.com/k/l/AAKcl4PzasoKr...,Business Analyst


In [114]:
def extract_location_details(location):
    """Split a location string such as ``"Santa Monica, CA"`` into city and state.

    Args:
        location: Raw location text; a trailing ``" • Remote"`` marker is ignored.

    Returns:
        pandas.Series of two values ``[city, state]``:
        * ``[city, two-letter code]`` when the text contains ``", XX"`` where
          ``XX`` is a standalone two-letter token;
        * ``[location, NaN]`` when no such token is present (e.g. ``"Remote"``
          or ``"Paris, France"``);
        * ``[NaN, NaN]`` for missing or non-string input.
    """
    # Anything that is not a string (None, NaN, numbers) carries no location.
    if not isinstance(location, str):
        return pd.Series([np.nan, np.nan])

    # Remove ' • Remote'
    location = location.replace(' • Remote', '').strip()

    # \b makes the code a whole token, so longer words are not cut down to
    # their first two letters.
    match = re.match(r'(.*),\s*([A-Za-z]{2})\b', location)
    if match is None:
        return pd.Series((location, np.nan))

    city, state = match.groups()
    return pd.Series((city.strip(), state))
# Add 'City' and 'State' columns derived from 'Location'; apply() expands the
# two-element Series returned for each row into the two target columns.
df[['City', 'State']] = df['Location'].apply(extract_location_details)

In [115]:
# Preview the frame, which now includes the City and State columns.
df.head()

Unnamed: 0,Title,Company,Location,Salary,Job URL,Job Name,City,State
0,Sr. RevOps Business Analyst,ZipRecruiter,"Santa Monica, CA",$108K - $150K / yr,https://www.ziprecruiter.com/k/l/AAK6WGDGyOSZO...,business analyst,Santa Monica,CA
1,Business Systems Analyst Consultant,PNC Financial Services Group,"Dallas, TX",Salary not specified,https://www.ziprecruiter.com/k/l/AAJKzNteE3qye...,business analyst,Dallas,TX
2,Business AnalystNEW!,Technology Talent Network LLC,"Paterson, NJ",$27 - $30 / hr,https://www.ziprecruiter.com/k/l/AALqW8lMxvWMv...,business analyst,Paterson,NJ
3,"Epicor Analyst, Epicor Admin, Epicor Business ...",CyberCoders,"Compton, CA",$80K - $110K / yr,https://www.ziprecruiter.com/k/l/AAKwo9cSP62mv...,business analyst,Compton,CA
4,Business Analyst,Birddog Traffic Control,"Los Angeles, CA",$80K - $120K / yr,https://www.ziprecruiter.com/k/l/AALRJqToY61qH...,business analyst,Los Angeles,CA


In [116]:
def _parse_salary_amount(token):
    """Convert one salary token such as '$108K', '$108.5K' or '$95,000' to a float."""
    cleaned = token.replace('$', '').replace(',', '').strip()
    if cleaned[-1:] in ('K', 'k'):
        # Multiply rather than append '000' so decimals like 108.5K stay correct;
        # rounding to cents removes binary floating-point noise.
        return round(float(cleaned[:-1]) * 1000, 2)
    return float(cleaned)


def _yearly_salary_stats(salary, hours_per_week, weeks_per_year):
    """Return [min, max, midpoint] yearly pay for one salary string, or three NaNs."""
    missing = [np.nan, np.nan, np.nan]
    if not isinstance(salary, str):
        return missing

    if '/ hr' in salary:
        unit, hourly = '/ hr', True
    elif '/ yr' in salary:
        unit, hourly = '/ yr', False
    else:
        return missing  # Unspecified salary or a pay period that is not handled

    try:
        amounts = [_parse_salary_amount(part) for part in salary.replace(unit, '').split(' - ')]
    except ValueError:
        return missing  # Unexpected formatting

    if hourly:
        amounts = [rate * hours_per_week * weeks_per_year for rate in amounts]

    return [amounts[0], amounts[-1], np.mean(amounts)]


def process_salary_column(df, hours_per_week=40, weeks_per_year=52):
    """Add yearly salary columns parsed from the text in ``df['Salary']``.

    Recognised formats are hourly (``"$27 - $30 / hr"``) and yearly
    (``"$108K - $150K / yr"``, ``"$95,000 / yr"``); a single amount is used as
    both minimum and maximum. Anything else yields NaN.

    Args:
        df: Frame with a ``Salary`` column. It is modified in place.
        hours_per_week: Hours used to annualise hourly rates.
        weeks_per_year: Weeks used to annualise hourly rates.

    Returns:
        The same ``df`` object with float columns ``min_yearly``,
        ``max_yearly`` and ``median_yearly`` (the mean of the parsed amounts,
        i.e. the midpoint of a two-value range).
    """
    columns = ['min_yearly', 'max_yearly', 'median_yearly']
    rows = [_yearly_salary_stats(value, hours_per_week, weeks_per_year) for value in df['Salary']]

    # Naming the columns up front keeps the shape (n, 3) even when df has no rows.
    parsed = pd.DataFrame(rows, columns=columns, index=df.index, dtype=float)
    for column in columns:
        df[column] = parsed[column]

    return df

# Add the min_yearly / max_yearly / median_yearly columns parsed from 'Salary'.
df = process_salary_column(df)

# Drop the raw text columns now that they have been parsed. errors='ignore'
# skips any listed name that is not present in the frame.
df = df.drop(columns=['Location', 'Salary','Job Type'], errors='ignore')

# Preview the updated dataset
df.head()


Unnamed: 0,Title,Company,Job URL,Job Name,City,State,min_yearly,max_yearly,median_yearly
0,Sr. RevOps Business Analyst,ZipRecruiter,https://www.ziprecruiter.com/k/l/AAK6WGDGyOSZO...,business analyst,Santa Monica,CA,108000.0,150000.0,129000.0
1,Business Systems Analyst Consultant,PNC Financial Services Group,https://www.ziprecruiter.com/k/l/AAJKzNteE3qye...,business analyst,Dallas,TX,,,
2,Business AnalystNEW!,Technology Talent Network LLC,https://www.ziprecruiter.com/k/l/AALqW8lMxvWMv...,business analyst,Paterson,NJ,56160.0,62400.0,59280.0
3,"Epicor Analyst, Epicor Admin, Epicor Business ...",CyberCoders,https://www.ziprecruiter.com/k/l/AAKwo9cSP62mv...,business analyst,Compton,CA,80000.0,110000.0,95000.0
4,Business Analyst,Birddog Traffic Control,https://www.ziprecruiter.com/k/l/AALRJqToY61qH...,business analyst,Los Angeles,CA,80000.0,120000.0,100000.0


In [76]:
# Show only the identifying columns and the link for each job.
df[['Title','Company','Job URL']]

Unnamed: 0,Title,Company,Job URL
0,Data AnalystNEW!,CoverPoint Partners,https://www.ziprecruiter.com/k/l/AAIFDlNP18oI3...
1,Securities Data Management Analyst,MassMutual,https://www.ziprecruiter.com/k/l/AALIYmircDA1R...
2,Data Research Analyst,AuctionIQ,https://www.ziprecruiter.com/k/l/AAIwPaOK0kY9P...
3,Financial Data Analyst,Race Telecommunications LLC,https://www.ziprecruiter.com/k/l/AAIP9oklVYBxz...
4,Data AnalystNEW!,NTT DATA,https://www.ziprecruiter.com/k/l/AALadYZfzHhq1...


# Work in progress

In [109]:
# NOTE(review): `extract_job_description` is not defined in any visible cell; the
# only description scraper in view is `extract_education_and_preferred_experience`,
# defined in the next cell. Presumably this call predates a rename; confirm
# before relying on a fresh Run All.
# Only the first URL is processed (head(1)); `res` is read by the file-writing cell.
for i in df['Job URL'].head(1):
    res = extract_job_description(i)

In [108]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def extract_education_and_preferred_experience(url):
    """Fetch a job page and return its education and preferred-experience bullet lists.

    A new Chrome session is opened for the call and closed before returning.

    Args:
        url: Address of the job posting to open.

    Returns:
        A string holding both sections under their headings (a section with no
        matching list is left blank), or ``None`` if anything fails while
        loading or reading the page.
    """
    # ChromeDriverManager supplies the path of the chromedriver binary to use.
    browser = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

    def section_text(heading):
        # The first <ul> following the <p><strong>heading</strong></p> holds the bullets.
        locator = f'//p[strong[contains(text(), "{heading}")]]/following-sibling::ul[1]'
        matches = browser.find_elements(By.XPATH, locator)
        if not matches:
            return ""
        return "\n".join(node.text for node in matches).strip()

    try:
        browser.get(url)

        # Block until the description container is present in the DOM.
        description_locator = (By.XPATH, '//div[@class="job_description"]')
        WebDriverWait(browser, 10).until(EC.presence_of_element_located(description_locator))

        education_text = section_text("Education and Experience:")
        preferred_experience_text = section_text("Preferred Experience:")

        report = (
            f"Education and Experience:\n{education_text}"
            f"\n\nPreferred Experience:\n{preferred_experience_text}"
        )
        return report.strip()

    except Exception as e:
        print(f"Error: {e}")
        return None

    finally:
        # Runs on success and on failure, so the browser never stays open.
        browser.quit()



In [110]:
# `res` is None when extraction failed. Check before opening the file so a
# failed scrape does not truncate an existing Job_desc.txt and then raise.
if res is None:
    print("No job description was extracted; Job_desc.txt was not written.")
else:
    # Explicit encoding: scraped text often contains non-ASCII characters.
    with open("Job_desc.txt", 'w', encoding="utf-8") as file:
        file.writelines(res)