In [1]:
import requests
import re
from bs4 import BeautifulSoup as bs
import json
import random
import pandas as pd
from selenium import webdriver as wb
import datetime

In [35]:
class JobFinder:
    def __init__(self, search, place):
        self.search = search
        self.place = place
        self.search_components = self.search.split()
        # empty list for the job info
        self.joblist = []
        
        # get the user agent list so that I dont get blocked
        with open("user_agent_list.json", "r") as f:
            self.user_agent_list = json.load(f)
        
    def search_stepstone(self):
        
        print("searching on stepstone...")
        # modify stepstone url
        stepstone_search = self.search.replace(" ", "%20")
        stepstone_place = self.place.replace(" ", "%20")
        
        # steptstone url
        stepstone_url = "https://www.stepstone.de/5/ergebnisliste.html?stf=freeText&ns=1&companyid=0&sourceofthesearchfield=resultlistpage%3Ageneral&qs=[]&cityid=0&ke={}&ws={}&radius=30&suid=e4f10731-b7c4-4e30-a419-08dcd96f8eed&ob=date&of={}"
        # list for the pages
        n = 25
        m = n * 5
        # how many pages do we want to scrape?
        indeed_page_list = range(0, m, n) # scrape first 10 pages
        
        stepstone_page_list = range(0, m, n)

        for page in stepstone_page_list:
            
            # get random user agent
            headers = {"User-Agent" :random.choice(self.user_agent_list)}
            try:
                r = requests.get(stepstone_url.format(stepstone_search, stepstone_place, page), headers = headers)
                soup = bs(r.content, "lxml")
                
                body = soup.find("div", class_ = "ResultsSectionContainer-gdhf14-0 gvBCse")
                divs = body.find_all("div", class_ = "sc-fzXfOu")
                
                for div in divs:
                    try:
                        word_not_found = False
                        title = div.find("div", class_ = "sc-fzXfOw").text
                        for word in self.search_components:
                            if word.lower() not in title.lower():
                                word_not_found = True
                        if word_not_found == True:
                            continue
                        
                    except:
                        title = ""
                    try:
                        company = div.find("div", {"data-at" : "job-item-company-name"}).text
                    except:
                        company = ""
                    try:
                        city = div.find("li", {"data-at" : "job-item-location"}).text
                    except:
                        city = ""
                    try: 
                        summary = div.find("a", {"data-offer-meta-text-snippet-link" : "true"}).text
                    except:
                        summary = ""
                    try:
                        job_link = div.find("a")
                        link = "https://www.stepstone.de" + job_link.attrs["href"]
                    except:
                        link = stepstone_url.format(stepstone_search, stepstone_place, page)
                    try:
                        time = div.find("time").text
                        try:
                            number, unit = div.find("time").text.split()[1:3]
                            number = int(number)
                            if unit == "Stunde" or unit == "Stunden":
                                time = round(number/24, 2)
                            elif unit == "Tag" or unit == "Tagen":
                                time = number
                            elif unit == "Woche" or unit == "Wochen":
                                time = number * 7    
                        except:
                            pass
                    except:
                        time = None
                    
                    source = "stepstone"

                    job = {
                                "title" : title,
                                "company" : company,
                                "city" : city,
                                "time" : time,
                                "summary" : summary,
                                "source" : source,
                                "link" : link
                            }

                    self.joblist.append(job)
                    
            except:
                print("no data found on page " + str(int(page / n) + 1))
            
        try:
            df = pd.DataFrame(self.joblist)
            return df
        except:
            return None
    
    def search_indeed(self):
        
        print("searching on indeed...")
        n = 10
        m = n * 5
        # how many pages do we want to scrape?
        indeed_page_list = range(0, m, n) # scrape first 10 pages
        
        # modify indeed url
        indeed_search = self.search.replace(" ", "+")
        indeed_place = self.place.replace(" ", "+")
        
        # indeed url
        indeed_url = "https://de.indeed.com/jobs?q={}&l={}&sort=date&start={}"

        for page in indeed_page_list:

            # get random user agent
            headers = {"User-Agent" :random.choice(self.user_agent_list)}
            try:
                
                r = requests.get(indeed_url.format(indeed_search, indeed_place, page), headers = headers)
                # check if we can access the website

                soup = bs(r.content, "lxml")
                divs = soup.find_all("div", class_ = "jobsearch-SerpJobCard")
            

                for div in divs:
                    try:
                        word_not_found = False
                        title = div.find("a").text.strip()
                        for word in self.search_components:
                            if word.lower() not in title.lower():
                                word_not_found = True
                        if word_not_found == True:
                            continue
                    except:
                        title = ""
                    try:
                        company = div.find("span", class_ = "company").text.strip()
                    except:
                        company = ""
                    try:
                        city = div.find("span", class_ = "location accessible-contrast-color-location").text.strip()
                    except:
                        city = ""
                    try:
                        summary = div.find("div", {"class" : "summary"}).text.strip().replace("\n","")
                    except:
                        summary = ""
                    try:
                        job_link = div.find("h2", {"class" : "title"}).find("a")["href"]
                        link = "https://www.indeed.com" + job_link
                    except:
                        link = indeed_url.format(indeed_search, indeed_place, page)
                    try:
                        time = div.find("span", {"class" : "date"}).text
                        try:
                            unit = time.split()[0]
                            if unit == "Gerade":
                                time = 0
                            elif unit == "Heute":
                                time = 0.5
                            else:
                                number = int(time.split()[1])
                                time = number
                        except:
                            pass
                           
                    except:
                        time = None
                    source = "indeed"

                    job = {
                                "title" : title,
                                "company" : company,
                                "city" : city,
                                "time" : time,
                                "summary" : summary,
                                "source" : source,
                                "link" : link
                            }

                    self.joblist.append(job)
            except:
                print("no data found on page " + str(int(page / n) + 1))
            
        try:
            df = pd.DataFrame(self.joblist)
            return df
        except:
            return None
        
    def search_stack_overflow(self):
        
        print("searching on stack overflow...")
        
        # list for the pages
        stack_overflow_page_list = range(1, 5)
        # stack overflow url
        stack_overflow_url = "https://stackoverflow.com/jobs?d=20&l={}&q={}&u=Km&pg={}"
        
        # modify stack_overflow url
        stack_overflow_search = self.search.replace(" ", "+")
        stack_overflow_place = self.place.replace(" ", "+")
        
        for page in stack_overflow_page_list:
            # get random user agent
            headers = {"User-Agent" :random.choice(self.user_agent_list)}
            try:
                r = requests.get(stack_overflow_url.format(stack_overflow_place, stack_overflow_search, page), headers = headers)
                soup = bs(r.content, "lxml")

                # list of the jobs
                body = soup.find("div", class_ = "listResults")
                jobs = body.find_all("div", {"class" : "-job"})
           

                for job in jobs:
                    try:
                        word_not_found = False
                        title = job.find("a", {"class" : "s-link stretched-link"}).text
                        for word in self.search_components:
                            if word.lower() not in title.lower():
                                word_not_found = True
                        if word_not_found == True:
                            continue
                    except:
                        title = ""
                    try:    
                        company = job.find("h3", {"class" : "fc-black-700 fs-body1 mb4"}).find("span").text.strip()
                    except:
                        company = ""
                    try:
                        city = job.find("h3", {"class" : "fc-black-700 fs-body1 mb4"}).find("span", {"class" : "fc-black-500"}).text.strip()
                    except:
                        city = ""
                    try:
                        time = int(job.find("ul", {"class" : "mt4"}).li.span.text.split("d")[0])
                    except:
                        time = None
                    summary = ""
                    source = "stack overflow"

                    job = {
                                "title" : title,
                                "company" : company,
                                "city" : city,
                                "time" : time,
                                "summary" : summary,
                                "source" : source,
                                "link" : stack_overflow_url.format(stack_overflow_place, stack_overflow_search, page)
                            }

                    self.joblist.append(job)
            
            except:
                print("no data found on page " + str(page))
        try:
            df = pd.DataFrame(self.joblist)
            return df
        except:
            return None
        
    def search_xing(self):
        
        print("searching on XING...")
        
        xing_page_list = range(1, 5)

        # xing url
        xing_url = "https://www.xing.com/jobs/search?page={}&utf8=%E2%9C%93&nrs=1&keywords={}&location={}&radius=&sort=date"
        chromedriver = r"C:/Users/Leonhard/Downloads/chromedriver_win32/chromedriver.exe"
        
        # for xing url
        xing_search = self.search.replace(" ", "%20")
        xing_place = self.place.replace(" ", "%20")
        
        # I dont want chrome to open
        options = wb.ChromeOptions()
        options.add_argument("headless")
        # add user agent
        headers = random.choice(self.user_agent_list)
        options.add_argument(f"user-agent:{headers}")
        
        for page in xing_page_list:
            try:
                # need to use selenium
                webD = wb.Chrome(chromedriver, options=options)
                webD.get(xing_url.format(page, xing_search, xing_place))
                soup = bs(webD.page_source, "lxml")
                body = soup.body
                job_list = body.find("div", {"class" : "result-list-result-list-container-8d38ca5b"})
                jobs = job_list.find_all("div", {"class" : "result-result-container-6e907078"})
                
                infos = [job.a for job in jobs]
                for info in infos:
                    try:
                        word_not_found = False
                        title = info.find("h2").text.strip()
                        for word in self.search_components:
                            if word.lower() not in title.lower():
                                word_not_found = True
                        if word_not_found == True:
                            continue
                    except:
                        title = ""
                    try:
                        company = info.find("div", {"class" : "result-result-subtitle-99125938"}).text.strip().split(",")[0]
                    except:
                        company = ""
                    try:
                        city = info.find("div", {"class" : "result-result-subtitle-99125938"}).text.strip().split(",")[1]
                    except:
                        city = ""
                    try:
                        time = info.time.text.strip()
                        try:
                            time_components = time.split()
                            value = int(time_components[0])
                            unit = time_components[1]
                            if unit == "minutes" or unit == "minute":
                                time = round(value / 1440, 2)
                            elif unit == "hour" or unit == "hours":
                                time = round(value / 24, 2)
                            elif unit == "day" or unit == "days":
                                time = value
                        except:
                            pass
                    except:
                        time = None
                    try:
                        summary = info.find("div", {"class" : "result-result-description-c7581001"}).text.strip()
                    except:
                        summary = ""
                        
                    source = "XING"

                    job = {
                                "title" : title,
                                "company" : company,
                                "city" : city,
                                "time" : time,
                                "summary" : summary,
                                "source" : source,
                                "link" : xing_url.format(page, xing_search, xing_place)
                            }
                    self.joblist.append(job)
            
            except:
                print("no data found on page " + str(page))
        try:
            df = pd.DataFrame(self.joblist)
            return df
        except:
            return None
        
    def search_linkedin(self):
        
        print("Searching on LinkedIn...")
        n = 25
        m = n * 5
        # how many pages do we want to scrape?
        linkedin_page_list = range(0, m, n) # scrape first 5 pages
        
        linkedin_url = "https://www.linkedin.com/jobs/search/?geoId=103035651&keywords={}&location={}&sortBy=DD&start={}"
        
        # to modify url
        linkedin_search = self.search.replace(" ", "%20")
        linkedin_place = self.place.replace(" ", "%20")
        
        for page in linkedin_page_list:
            try:
                headers = {"User-Agent" : random.choice(self.user_agent_list)}
                r = requests.get(linkedin_url.format(linkedin_search, linkedin_place, page), headers = headers)
                soup = bs(r.content, "lxml")
                job_list = soup.find("section", {"class" : "results__list"}).find("ul")

                for job in job_list:
                    try:
                        word_not_found = False
                        title = job.a.span.text.strip()
                        for word in self.search_components:
                            if word.lower() not in title.lower():
                                word_not_found = True
                        if word_not_found == True:
                            continue
                    except:
                        title = ""
                    try:
                        company = job.find("h4").text.strip()
                    except:
                        company = ""
                    try:
                        city = job.find("span", {"class" : "job-result-card__location"}).text.strip()
                    except:
                        city = ""
                    try:
                        time = job.find("time").text
                        try:
                            time_components = time.split()
                            value = int(time_components[0])
                            unit = time_components[1]
                            if unit == "minutes" or unit == "minute":
                                time = round(value / 1440, 2)
                            elif unit == "hour" or unit == "hours":
                                time = round(value / 24, 2)
                            elif unit == "day" or unit == "days":
                                time = value
                        except:
                            pass
                    except:
                        time = None
                        
                    summary = ""
                    source = "LinkedIn"
                 
                    job = {
                                "title" : title,
                                "company" : company,
                                "city" : city,
                                "time" : time,
                                "summary" : summary,
                                "source" : source,
                                "link" : linkedin_url.format(linkedin_search, linkedin_place, page)
                            }

                    self.joblist.append(job)
                    
            except:
                print("no data found on page " + str(int(page / n) + 1))
        try:
            df = pd.DataFrame(self.joblist)
            return df
        except:
            return None
                
                
                
    def search_all(self):
        
        self.search_indeed()
        self.search_stepstone()
        self.search_stack_overflow()
        self.search_xing()
        self.search_linkedin()
        
        df = pd.DataFrame(self.joblist)
        
        try:
            df = df.sort_values(by = "time")
            df.reset_index(drop = True, inplace = True)
        except:
            pass
        
        print("Joblist is ready!")

        return df
    
    
# Ask for the query interactively. Surrounding whitespace is stripped so that
# it does not end up in the search URLs.
search = input("What are you looking for?").strip()
place = input("Where are you searching?").strip()

# fixed values for a non-interactive run:
#search = "Data Analyst"
#place = "Berlin"

# timestamp for the file name of the optional Excel export below
now = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
jobs = JobFinder(search, place)
df = jobs.search_stepstone()

# save as excel file
#df.to_excel(f"{now}_job_{search}_in_{place}.xlsx", index = False)


What are you looking for? data
Where are you searching? berlin


searching on stepstone...


In [36]:
# link of the first collected job (row label 0)
df.loc[0, "link"]

'https://www.stepstone.de/stellenangebote--Data-Scientist-w-m-d-Berlin-Rhenus-SN-digital-GmbH-Co-KG--6980135-inline.html'