In [1]:
pip install selenium

Note: you may need to restart the kernel to use updated packages.


In [10]:
import os
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

In [11]:
# chromedriver path: read from the CHROMEDRIVER_PATH environment variable when
# it is set, so the notebook can run on another machine without editing this
# cell; otherwise fall back to the original local install location.
CHROMEDRIVER_PATH = os.environ.get(
    "CHROMEDRIVER_PATH",
    r"C:\Users\ishra\Downloads\Data Analytics\chromedriver-win64\chromedriver.exe",
)

# Indeed URLs: search-result pages for q=data+science, l=California.
# The `start` query parameter advances in steps of 10 (the first URL has none).
urls = [
    'https://www.indeed.com/jobs?q=data+science&l=California&vjk=3a3da246454f8157',
    'https://www.indeed.com/jobs?q=data+science&l=California&start=10&vjk=03e45ff39cbde6f6',
    'https://www.indeed.com/jobs?q=data+science&l=California&start=20&vjk=ec21d7d37d7fe709',
    'https://www.indeed.com/jobs?q=data+science&l=California&start=30&vjk=e572b54cc84a2486',
    'https://www.indeed.com/jobs?q=data+science&l=California&start=40&vjk=26506c4137a97e7f',
    'https://www.indeed.com/jobs?q=data+science&l=California&start=50&vjk=bf1988ef4a9631f9',
    'https://www.indeed.com/jobs?q=data+science&l=California&start=60&vjk=dd23491cc0f0bd29',
    'https://www.indeed.com/jobs?q=data+science&l=California&start=70&vjk=5f744dd7e6cf71fe',
    'https://www.indeed.com/jobs?q=data+science&l=California&start=80&vjk=74d9f0b189c3499d',
    'https://www.indeed.com/jobs?q=data+science&l=California&start=90&vjk=d94bad2d8da8ae6f',
    'https://www.indeed.com/jobs?q=data+science&l=California&start=100&vjk=fc81fd13b05976cb',
    'https://www.indeed.com/jobs?q=data+science&l=California&start=110&vjk=602c28691cf6ac82',
]

# start Chrome WebDriver using the chromedriver binary configured above
service = Service(CHROMEDRIVER_PATH)
driver = webdriver.Chrome(service=service)

# create extracted data list (one dict per job card is appended to it)
jobs_data = []

# helper: best-effort text lookup inside one job card
def _find_text(job_card, by, selector):
    """Return the text of the first element matching ``selector``, or None.

    Any ordinary lookup failure is treated as "field not present" so that one
    missing field does not abort the scrape. Only ``Exception`` is caught, so
    ``KeyboardInterrupt`` and ``SystemExit`` still propagate and the run can
    be interrupted.
    """
    try:
        return job_card.find_element(by, selector).text
    except Exception:
        return None


# create function to extract data from each job card
def extract_job_data(job_card):
    """Extract the displayed fields of a single job card.

    Parameters
    ----------
    job_card
        Object exposing ``find_element(by, selector)`` whose result has a
        ``.text`` attribute (a Selenium element in this notebook).

    Returns
    -------
    dict
        Keys ``'Job Title'``, ``'Company'``, ``'Location'``, ``'Date Posted'``,
        ``'Salary'`` and ``'Job Mode'``. A field whose element cannot be found
        is ``None``; ``'Job Mode'`` is always one of ``"Hybrid"``,
        ``"Remote"`` or ``"On-site"``.
    """
    job_title = _find_text(job_card, By.CLASS_NAME, 'jobTitle')
    company = _find_text(job_card, By.XPATH, ".//span[@data-testid='company-name']")
    location = _find_text(job_card, By.XPATH, ".//div[@data-testid='text-location']")
    date_posted = _find_text(
        job_card, By.XPATH, ".//div[@class='metadata']//span[contains(text(), 'ago')]"
    )
    salary = _find_text(job_card, By.XPATH, ".//div[@data-testid='attribute_snippet_testid']")

    # The job mode is derived from the location text already fetched above
    # (no second element lookup); a missing location defaults to "On-site".
    if location is not None and "Hybrid" in location:
        job_mode = "Hybrid"
    elif location is not None and "Remote" in location:
        job_mode = "Remote"
    else:
        job_mode = "On-site"

    return {
        'Job Title': job_title,
        'Company': company,
        'Location': location,
        'Date Posted': date_posted,
        'Salary': salary,
        'Job Mode': job_mode
    }

# create loop
# The driver is released in the `finally` clause so that a failure on any page
# (navigation error, manual interrupt, ...) does not leave the browser and
# chromedriver processes running.
try:
    for url in urls:
        driver.get(url)
        # fixed wait after navigation (presumably to let the results render)
        time.sleep(3)

        job_cards = driver.find_elements(By.CLASS_NAME, 'resultContent')

        for job_card in job_cards:
            job_info = extract_job_data(job_card)
            jobs_data.append(job_info)

        # pause before requesting the next page
        time.sleep(2)
finally:
    # close the driver
    driver.quit()

# convert data: one row per scraped job card
jobs_df = pd.DataFrame(jobs_data)

# print data
print(jobs_df)

                                             Job Title  \
0                             AI Operations Technician   
1                                         Data Analyst   
2                              Data Scientist, Product   
3    Data Science Graduate (TikTok-Product-Data Sci...   
4                              Data Research Associate   
..                                                 ...   
175  Data Scientist (L5) - Live Discovery and Engag...   
176        Data Scientist 4 - Principal Data Scientist   
177                       AI Engineer, Computer Vision   
178                           Senior Applied Scientist   
179  Machine Learning Research Engineer, Apple Inte...   

                            Company                           Location  \
0                         ALERTWest                          Chico, CA   
1                      Leftbank Art                La Mirada, CA 90638   
2    An Autonomous Mobility Company                    Foster City, CA   
3      

In [12]:
import pandas as pd
import re

# function to clean location column
def clean_location_info(location):
    """Split a raw location string into city, state and ZIP code.

    Parameters
    ----------
    location
        Location text such as ``"La Mirada, CA 90638"`` or
        ``"Remote in San Francisco, CA"``. May be ``None``/NaN when the
        scraper found no location for a job card.

    Returns
    -------
    pandas.Series
        Three values ``[city, state, zip_code]``. Parts that are absent are
        missing values; a missing ``location`` yields three missing values.
    """
    # A missing location would make re.sub raise TypeError; return an
    # all-missing row instead, as clean_salary_info does for a missing salary.
    if pd.isna(location):
        return pd.Series([None, None, None])

    # remove job mode information
    location = re.sub(r'(Remote in|Hybrid work in)\s*', '', location).strip()
    # split on commas or newlines: first part is the city, second is "STATE ZIP"
    location_parts = re.split(r',\s*|\n', location)

    # re.split always returns at least one element, so the city is always set
    city = location_parts[0].strip()
    state_zip = location_parts[1].strip() if len(location_parts) > 1 else None
    state_zip_tokens = state_zip.split() if state_zip else []
    state = state_zip_tokens[0] if state_zip_tokens else None
    zip_code = state_zip_tokens[1] if len(state_zip_tokens) > 1 else None

    # a state-wide posting (text contains "California" and no "CA" state part
    # was parsed) is normalized to a fixed city label with state 'CA'
    if state != 'CA' and 'California' in city:
        city = 'State of California'
        state = 'CA'

    return pd.Series([city, state, zip_code])

# derive the City / State / Zip_Code columns from the raw Location text
location_columns_df = jobs_df['Location'].apply(clean_location_info)
jobs_df[['City', 'State', 'Zip_Code']] = location_columns_df

# show the frame with the new columns
print(jobs_df)

                                             Job Title  \
0                             AI Operations Technician   
1                                         Data Analyst   
2                              Data Scientist, Product   
3    Data Science Graduate (TikTok-Product-Data Sci...   
4                              Data Research Associate   
..                                                 ...   
175  Data Scientist (L5) - Live Discovery and Engag...   
176        Data Scientist 4 - Principal Data Scientist   
177                       AI Engineer, Computer Vision   
178                           Senior Applied Scientist   
179  Machine Learning Research Engineer, Apple Inte...   

                            Company                           Location  \
0                         ALERTWest                          Chico, CA   
1                      Leftbank Art                La Mirada, CA 90638   
2    An Autonomous Mobility Company                    Foster City, CA   
3      

In [13]:
# function to clean salary column
def clean_salary_info(salary):
    """Split a raw salary snippet into range, pay schedule and employment type.

    Parameters
    ----------
    salary
        Snippet text such as ``"$100,000 - $150,000 a year"`` or
        ``"$18.50 - $22.00 an hour"``, possibly also containing an employment
        keyword. May be ``None``/NaN when no snippet was scraped.

    Returns
    -------
    pandas.Series
        ``[salary_range, salary_schedule, employment_type]``; each entry is
        missing when the corresponding information is not in the text.
    """
    # if there is no salary mentioned
    if pd.isna(salary):
        return pd.Series([None, None, None])

    # employment type: the first keyword found wins, in this order
    # ('Permanent' is reported as 'Full-time')
    employment_keywords = (
        ('Part-time', 'Part-time'),
        ('Full-time', 'Full-time'),
        ('Internship', 'Internship'),
        ('Permanent', 'Full-time'),
        ('Contract', 'Contract'),
    )
    employment_type = next(
        (label for keyword, label in employment_keywords if keyword in salary),
        None,
    )

    # salary range "$low - $high"; each amount may carry a decimal part so
    # values like "$18.50 - $22.00" are captured instead of dropped or truncated
    amount = r'\$([\d,]+(?:\.\d+)?)'
    salary_match = re.search(amount + r' - ' + amount, salary)
    salary_range = None
    if salary_match:
        # format the salary range
        salary_range = f"${salary_match.group(1)} - ${salary_match.group(2)}"

    # salary schedule: checked in the order month, year, hour
    salary_schedule = None
    if 'month' in salary:
        salary_schedule = 'monthly'
    elif 'year' in salary:
        salary_schedule = 'yearly'
    elif 'hour' in salary:
        salary_schedule = 'hourly'

    return pd.Series([salary_range, salary_schedule, employment_type])

# split the raw Salary text into range / schedule / employment-type columns
salary_columns_df = jobs_df['Salary'].apply(clean_salary_info)
jobs_df[['Salary range', 'Salary schedule', 'Employment type']] = salary_columns_df

# display the enriched DataFrame
print(jobs_df)

# write the result to CSV, leaving out the index column
jobs_df.to_csv('indeed_job_data_Ishraque.csv', index=False)

                                             Job Title  \
0                             AI Operations Technician   
1                                         Data Analyst   
2                              Data Scientist, Product   
3    Data Science Graduate (TikTok-Product-Data Sci...   
4                              Data Research Associate   
..                                                 ...   
175  Data Scientist (L5) - Live Discovery and Engag...   
176        Data Scientist 4 - Principal Data Scientist   
177                       AI Engineer, Computer Vision   
178                           Senior Applied Scientist   
179  Machine Learning Research Engineer, Apple Inte...   

                            Company                           Location  \
0                         ALERTWest                          Chico, CA   
1                      Leftbank Art                La Mirada, CA 90638   
2    An Autonomous Mobility Company                    Foster City, CA   
3      