In [1]:
import requests
import re
from bs4 import BeautifulSoup as bs
import json
import random
import pandas as pd
from selenium import webdriver as wb

In [7]:
search = input("What are you looking for")
place = input("Where are you searching?")

#search = "Data Analyst"
#place = "Berlin"

class JobFinder:
    def __init__(self, search, place):
        self.search = search
        self.place = place
        
        # empty list for the job info
        self.joblist = []
        
        # get the user agent list so that I dont get blocked
        with open("../Data/user_agent_list.json", "r") as f:
            self.user_agent_list = json.load(f)
        
    def search_steptstone(self):
        
        print("searching on stepstone...")
        # modify stepstone url
        stepstone_search = self.search.replace(" ", "%20")
        stepstone_place = self.place.replace(" ", "%20")
        
        # steptstone url
        stepstone_url = "https://www.stepstone.de/5/ergebnisliste.html?stf=freeText&ns=1&companyid=0&sourceofthesearchfield=resultlistpage%3Ageneral&qs=[]&cityid=0&ke={}&ws={}&radius=30&suid=e4f10731-b7c4-4e30-a419-08dcd96f8eed&ob=date&of={}"

        # list for the pages
        stepstone_page_list = range(0, 200, 25)

        for page in stepstone_page_list:
            
            # get random user agent
            headers = {"User-Agent" :random.choice(self.user_agent_list)}
            try:
                r = requests.get(stepstone_url.format(stepstone_search, stepstone_place, page), headers = headers)
                soup = bs(r.content, "lxml")

                body = soup.find("div", class_ = "ResultsSectionContainer-gdhf14-0 gvBCse")
                divs = body.find_all("div", class_ = "sc-fzXfOu")
                
                for div in divs:
                    try:
                        title = div.find("div", class_ = "sc-fzXfOw").text
                    except:
                        title = ""
                    try:
                        company = div.find("div", {"data-at" : "job-item-company-name"}).text
                    except:
                        company = ""
                    try:
                        city = div.find("li", {"data-at" : "job-item-location"}).text
                    except:
                        city = ""
                    try: 
                        summary = div.find("a", {"data-offer-meta-text-snippet-link" : "true"}).text
                    except:
                        summary = ""
                    source = "stepstone"

                    job = {
                        "title" : title,
                        "company" : company,
                        "city" : city,
                        "summary" : summary,
                        "source" : source
                    }

                    self.joblist.append(job)
                    
            except:
                print("no data found on page " + str(page))
            
        try:
            df = pd.DataFrame(self.joblist)
            return df
        except:
            return None
    
    def search_indeed(self):
        
        print("searching on indeed...")
        # how many pages do we want to scrape?
        indeed_page_list = range(0, 100, 10) # scrape first 10 pages
        
        # modify indeed url
        indeed_search = self.search.replace(" ", "+")
        indeed_place = self.place.replace(" ", "+")
        
        # indeed url
        indeed_url = "https://de.indeed.com/jobs?q={}&l={}&sort=date&start={}"

        for page in indeed_page_list:

            # get random user agent
            headers = {"User-Agent" :random.choice(self.user_agent_list)}
            try:
                
                r = requests.get(indeed_url.format(indeed_search, indeed_place, page), headers = headers)
                # check if we can access the website

                soup = bs(r.content, "lxml")
                divs = soup.find_all("div", class_ = "jobsearch-SerpJobCard")
            

                for div in divs:
                    try:
                        title = div.find("a").text.strip()
                    except:
                        title = ""
                    try:
                        company = div.find("span", class_ = "company").text.strip()
                    except:
                        company = ""
                    try:
                        city = div.find("span", class_ = "location accessible-contrast-color-location").text.strip()
                    except:
                        city = ""
                    try:
                        summary = div.find("div", {"class" : "summary"}).text.strip().replace("\n","")
                    except:
                        summary = ""
                    source = "indeed"

                    job = {
                        "title" : title,
                        "company" : company,
                        "city" : city,
                        "summary" : summary,
                        "source" : source
                    }

                    self.joblist.append(job)
            except:
                print("no data found on page " + str(page))
            
        try:
            df = pd.DataFrame(self.joblist)
            return df
        except:
            return None
        
    def search_stack_overflow(self):
        
        print("searching on stack overflow...")
        
        # list for the pages
        stack_overflow_page_list = range(0, 5)
        # stack overflow url
        stack_overflow_url = "https://stackoverflow.com/jobs?d=20&l={}&q={}&u=Km&pg={}"
        
        # modify stack_overflow url
        stack_overflow_search = self.search.replace(" ", "+")
        stack_overflow_place = self.place.replace(" ", "+")
        
        for page in stack_overflow_page_list:
            # get random user agent
            headers = {"User-Agent" :random.choice(self.user_agent_list)}
            try:
                r = requests.get(stack_overflow_url.format(stack_overflow_place, stack_overflow_search, page), headers = headers)
                soup = bs(r.content, "lxml")

                # list of the jobs
                body = soup.find("div", class_ = "listResults")
                jobs = body.find_all("div", {"class" : "-job"})
           

                for job in jobs:
                    try:
                        title = job.find("a", {"class" : "s-link stretched-link"}).text
                    except:
                        title = ""
                    try:    
                        company = job.find("h3", {"class" : "fc-black-700 fs-body1 mb4"}).find("span").text.strip()
                    except:
                        company = ""
                    try:
                        city = job.find("h3", {"class" : "fc-black-700 fs-body1 mb4"}).find("span", {"class" : "fc-black-500"}).text.strip()
                    except:
                        city = ""
                    summary = ""
                    source = "stack overflow"

                    job = {
                        "title" : title,
                        "company" : company,
                        "city" : city,
                        "summary" : summary,
                        "source" : source
                    }

                    self.joblist.append(job)
            
            except:
                print("no data found on page " + str(page))
        try:
            df = pd.DataFrame(self.joblist)
            return df
        except:
            return None
        
    def search_xing(self):
        
        print("searching on XING...")
        
        xing_page_list = range(1, 5)

        # xing url
        xing_url = "https://www.xing.com/jobs/search?page={}&utf8=%E2%9C%93&nrs=1&keywords={}&location={}&radius=&sort=date"
        chromedriver = r"C:/Users/Leonhard/Downloads/chromedriver_win32/chromedriver.exe"
        
        # for xing url
        xing_search = self.search.replace(" ", "%20")
        xing_place = self.place.replace(" ", "%20")
        
        # I dont want chrome to open
        options = wb.ChromeOptions()
        options.add_argument("headless")
        # add user agent
        headers = random.choice(self.user_agent_list)
        options.add_argument(f"user-agent={headers}")
        
        for page in xing_page_list:
            try:
                # need to use selenium
                webD = wb.Chrome(chromedriver, options=options)
                webD.get(xing_url.format(page, xing_search, xing_place))
                soup = bs(webD.page_source, "lxml")
                body = soup.body
                job_list = body.find("div", {"class" : "result-list-result-list-container-8d38ca5b"})
                jobs = job_list.find_all("div", {"class" : "result-result-container-6e907078"})

                infos = [job.a for job in jobs]
                for info in infos:
                    try:
                        title = info.find("h2").text.strip()
                    except:
                        title = ""
                    try:
                        company = info.find("div", {"class" : "result-result-subtitle-99125938"}).text.strip().split(",")[0]
                    except:
                        company = ""
                    try:
                        city = info.find("div", {"class" : "result-result-subtitle-99125938"}).text.strip().split(",")[1]
                    except:
                        city = ""
                    try:
                        time = info.time.text.strip()
                    except:
                        time = ""
                    try:
                        summary = info.find("div", {"class" : "result-result-description-c7581001"}).text.strip()
                    except:
                        summary = ""
                        
                    source = "XING"

                    job = {
                                "title" : title,
                                "company" : company,
                                "city" : city,
                                "summary" : summary,
                                "source" : source
                            }

                    self.joblist.append(job)
            
            except:
                print("no data found on page " + str(page))
        try:
            df = pd.DataFrame(self.joblist)
            return df
        except:
            return None
        
    def search_linkedin(self):
        
        print("Searching on LinkedIn...")
        
        # list for the pages
        linkedin_page_list = range(0, 100, 25)

        linkedin_url = "https://www.linkedin.com/jobs/search/?geoId=103035651&keywords={}&location={}&sortBy=DD&start={}"
        
        # to modify url
        linkedin_search = self.search.replace(" ", "%20")
        linkedin_place = self.place.replace(" ", "%20")
        
        for page in linkedin_page_list:
            try:
                print(page)
                headers = {"User-Agent" : random.choice(self.user_agent_list)}
                r = requests.get(linkedin_url.format(linkedin_search, linkedin_place, page), headers = headers)
                soup = bs(r.content, "lxml")
                job_list = soup.find("section", {"class" : "results__list"}).find("ul")
                print(len(job_list))

                for job in job_list:
                    try:
                        title = job.a.span.text.strip()
                    except:
                        title = ""
                    try:
                        company = job.find("h4").text.strip()
                    except:
                        company = ""
                    try:
                        city = job.find("span", {"class" : "job-result-card__location"}).text.strip()
                    except:
                        city = ""

                    summary = ""
                    source = "LinkedIn"
                    try:
                        time = job.find("time", {"class" : "job-result-card__listdate--new"}).text.strip()
                    except:
                        time = ""
                    
                    job = {
                                "title" : title,
                                "company" : company,
                                "city" : city,
                                "summary" : summary,
                                "source" : source
                            }

                    self.joblist.append(job)
                    
            except:
                print("no data found on page " + str(page))
        try:
            df = pd.DataFrame(self.joblist)
            return df
        except:
            return None
                
                
                
    def search_all(self):
        
        self.search_indeed()
        self.search_steptstone()
        self.search_stack_overflow()
        self.search_xing()
        self.search_linkedin()
        
        df = pd.DataFrame(self.joblist)
        
        return df
    

jobs = JobFinder(search, place)
df = jobs.search_all()
#df.to_excel(f"job_{search}_in_{place}.xlsx")


what are you looking for data scientist
Where are you searching? deutschland


searching on indeed...
searching on stepstone...
searching on stack overflow...
searching on XING...
no data found on page 1
no data found on page 2
no data found on page 3
no data found on page 4
Searching on LinkedIn...
0
25
25
25
50
25
75
25
