# Indeed Webscraper

## Imports

In [4]:
# This module provides various time-related functions.
import time
# The datetime module supplies classes for manipulating dates and times.
import datetime
# API to access Selenium WebDrivers like Firefox, Ie, Chrome
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys

# os gives access to the operating system
import os
# CSV File Reading and Writing¶
import csv
# standard python library for logging
import logging
# Pandas is an open source data analysis and manipulation tool
import pandas as pd

## Indeed Job Search

In [23]:
# time until next command is executed after a new window loaded
loadWindow_Time = 70
# time between clicking different jobs
sleepTime_betweenJobs = 100
# ammount of attempts the software needs to execute before going to next step
attempts = 3

# function that starts Google Chrome
def startChromeEngine():
    # Website that will get scraped
    Indeed_Url = "https://de.indeed.com/?r=us"
    # Path to webdriver
    ser = Service("/Applications/chromedriver")

    # start chrome driver
    driver = webdriver.Chrome(service=ser)
    # load Indeed URL
    driver.get(Indeed_Url)

    # wait for cookies window
    time.sleep(loadWindow_Time)
    
    #return driver when window is loaded 
    return driver

# function that starts the search engine on Indeed.com
def startSearchEngine(driver, job_title, location):
    # accept cookies
    driver.find_element(By.XPATH, "//*[@id=\"onetrust-accept-btn-handler\"]").click()

    # enter job title
    driver.find_element(By.XPATH, "//*[@id=\"text-input-what\"]").send_keys(job_title)
    # enter location
    driver.find_element(By.XPATH, "//*[@id=\"text-input-where\"]").send_keys(location)
    # start search
    driver.find_element(By.XPATH, "//*[@id=\"text-input-where\"]").send_keys(Keys.RETURN)

    # wait for page to load 
    time.sleep(loadWindow_Time)
    
    # sort result by publish data
    driver.find_element(By.XPATH, "//*[@id=\"resultsCol\"]/div[3]/div[4]/div[1]/span[2]/a").click()
    
    # wait for random pop up window
    time.sleep(loadWindow_Time)    
    # close random pop up window
    driver.find_element(By.XPATH,"//*[@id=\"popover-x\"]/button").click()
    
    # wait for page to load
    time.sleep(loadWindow_Time)

    return driver

# function that scrapes every job page 
def startScrapingPage(driver, all_jobs_lst):
    # iterating over attempts because sometimes the container doesnt get found immediately
    for attempt in range(attempts):
        # try to find the container where all jobs are presented
        try:
            job_list_container = driver.find_element(By.XPATH, "//*[@id=\"mosaic-provider-jobcards\"]")
        #try again after 10 seconds...
        except:
            time.sleep(10)

            
    # get all jobs in container
    job_list = job_list_container.find_elements(By.TAG_NAME, "a")

    # get all job ids in job_list
    job_ids = []
    try:
        # iterate over every job in job_list
        for job in job_list:
            # get the job id to later click on every job...
            id = job.get_attribute("id")
            # if job id is not empty add do job_ids list
            if id != "":
                job_ids.append(id)
    except:
        # sleep time at this point is maybe overkill but i keep it in to help against reCaptcha 
        time.sleep(loadWindow_Time)

    # iterate over every job in job_ids
    for job_id in job_ids:
        try:
            # try to click on current job_id in order to see full job description
            driver.find_element(By.XPATH, f"//*[@id=\"{job_id}\"]").click()
            # wait until description is loaded
            time.sleep(loadWindow_Time)
        except:
            continue

        # attempt multiple times because its not always working immediately
        for attempt in range (attempts):
            # try to switch to Job Description iframe in order to be able to interact with it
            try:
                # find iframe
                iframe = driver.find_element(By.XPATH,"//*[@id=\"vjs-container-iframe\"]")
                # switch to iframe
                driver.switch_to.frame(iframe)
                break
            except:
                # try again 
                time.sleep(loadWindow_Time)
                
        # attempt multiple times because its not always working immediately
        for attempt in range(attempts):
            # get job top card
            try:
                topCard = driver.find_element(By.XPATH, "//*[@id=\"viewJobSSRRoot\"]/div/div[1]/div/div/div/div[1]/div/div[1]")
            except:
                # if we cant find the jobcard we wont find the below informations...
                job_title = ""
                company_name = ""
                company_location = ""
                job_type = ""
                time.sleep(loadWindow_Time)
                continue

            # get job title
            try:
                job_title = topCard.find_element(By.CSS_SELECTOR,".icl-u-xs-mb--xs.icl-u-xs-mt--none.jobsearch-JobInfoHeader-title.is-embedded").text
            except:
                job_title = ""

            # get company name
            try:
                company_name = driver.find_element(By.CSS_SELECTOR, ".icl-u-lg-mr--sm.icl-u-xs-mr--xs").text
            except:
                company_name = ""

            # get company location
            try:
                company_location = driver.find_element(By.CSS_SELECTOR,"div[class='jobsearch-CompanyInfoWithoutHeaderImage'] div:nth-child(2)").text
            except:
                company_location = ""
            # get type of job
            try:
                job_type = driver.find_element(By.CSS_SELECTOR, "body > div:nth-child(4) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(1) > div:nth-child(2) > span:nth-child(1)").text
            except:
                job_type = ""



        # attempt multiple times because its not always working immediately
        for attempt in range(attempts):
            # try to get full job description
            try:
                job_Description = driver.find_element(By.XPATH, "//*[@id=\"jobDescriptionText\"]").text
                break
            except:
                job_Description = ""
                time.sleep(loadWindow_Time)

        # get current datetime
        date_time = datetime.datetime.now().strftime("%d%b%Y-%H:%M:%S")

        # add all_jobs to all_jobs_lst
        job = [job_id, job_title, company_name, company_location, job_type, job_Description, date_time]
        all_jobs_lst.append(job)

        # switch back to default frame to click on next job
        driver.switch_to.default_content()

    return all_jobs_lst

# function that clicks on next page
def getNextPage(driver):
    # click on next Page button
    driver.find_element(By.XPATH,"//*[@id=\"resultsCol\"]/nav/div/ul/li[6]/a").click()
    #wait until page is loaded
    time.sleep(loadWindow_Time)

    return driver

# function that saves jobs as csv 
def saveAsCSV(all_jobs_lst, page_nr):
    dateTime=datetime.datetime.now()    
    filename_dateTime = dateTime.strftime("%d%m%Y_%H_%M_%S")
    df = pd.DataFrame(all_jobs_lst)
    df.to_csv("/Users/jan/Documents/7.Semester/Datenanalyse in der Praxis/SeminarArbeit/Data/Indeed_Jobs" + str(page_nr) + str(filename_dateTime)  + ".csv", index=False)
    

# MAIN

In [24]:
# max number of pages we try to scrape, usually ReCaptcha catches us before we get there...
ammount_Of_Pages=35
# start chrome engine
driver = startChromeEngine()
# start the search
driver = startSearchEngine(driver, "Data Analyst", "Deutschland")
# iterate over pages
for page_Number in range(ammount_Of_Pages):
    # initialize job list
    all_jobs_lst=[]
    # start scraping current page
    all_jobs_lst = startScrapingPage(driver,all_jobs_lst)
    #save as csv
    saveAsCSV(all_jobs_lst, page_Number)
    #get next page
    driver = getNextPage(driver)



NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//*[@id="resultsCol"]/nav/div/ul/li[6]/a"}
  (Session info: chrome=96.0.4664.110)
Stacktrace:
0   chromedriver                        0x0000000100d7389c __gxx_personality_v0 + 538736
1   chromedriver                        0x0000000100d09b64 __gxx_personality_v0 + 105272
2   chromedriver                        0x0000000100912ad0 chromedriver + 158416
3   chromedriver                        0x00000001009420b8 chromedriver + 352440
4   chromedriver                        0x000000010096b38c chromedriver + 521100
5   chromedriver                        0x00000001009374d8 chromedriver + 308440
6   chromedriver                        0x0000000100d36ca0 __gxx_personality_v0 + 289908
7   chromedriver                        0x0000000100d4adc8 __gxx_personality_v0 + 372124
8   chromedriver                        0x0000000100d4f94c __gxx_personality_v0 + 391456
9   chromedriver                        0x0000000100d4bd04 __gxx_personality_v0 + 376024
10  chromedriver                        0x0000000100d2cc88 __gxx_personality_v0 + 248924
11  chromedriver                        0x0000000100d64bd0 __gxx_personality_v0 + 478116
12  chromedriver                        0x0000000100d64d44 __gxx_personality_v0 + 478488
13  chromedriver                        0x0000000100d7a458 __gxx_personality_v0 + 566316
14  libsystem_pthread.dylib             0x000000018d3414ec _pthread_start + 148
15  libsystem_pthread.dylib             0x000000018d33c2d0 thread_start + 8
