In [2]:
import requests
from bs4 import BeautifulSoup
import datetime
import time
import pandas as pd
import numpy as np


In [3]:
# example url
# https://www.indeed.com/jobs?q=swe+intern&l=Seattle%2C+WA&fromage=3

#### Fields ####
# query : q
# salary estimate(included in query params): $35,000
# date posted: fromage : int (day)
# jt: job type (jt=fulltime, parttime, contract, temporary, internship)
# location : l (l=Seattle,+WA)
# distance/ within _ miles: radius=(5,10,15,25,50,100)
# experience level: explvl=entry_level, senior_level, mid_level



In [4]:

'''
A Python crawler that uses BeautifulSoup4 and the requests module to fetch
Indeed job posts matching a set of query parameters. It returns a list of job
posts, each with the following format:
{
    "job_url": job_url,
    "job_title": job_title,
    "company_url": company_url,
    "company_name": company_name,
    "job description": job_description,
    "date": date,
    'new': new
}
'''
class IndeedAPI:
    """Scrape Indeed job-search result pages with requests and BeautifulSoup.

    Typical use: create an instance, register search parameters with
    ``addQueries``, build the search URL with ``constructSearchJobUrl`` and
    fetch the posts with ``makeRequest``.

    Every job post is returned as a dict with the keys ``job_url``,
    ``job_title``, ``company_url``, ``company_name``, ``job description``,
    ``date`` and ``new``. A field that cannot be extracted is ``None``
    (``new`` falls back to ``False``).
    """

    # Reference "now" used by cleanDate. It is evaluated once, when the class
    # body runs, so it goes stale in a long-lived kernel; assign a fresh value
    # on the instance or class if that matters.
    currentTime = datetime.datetime.now()
    source_url = "https://www.indeed.com"
    # Indeed website base url to build on
    base_url = "https://www.indeed.com/"

    # Accepted query keys. A list restricts the value to those options;
    # None means the value is free-form.
    query_params = {
        'q': None,
        'fromage': None,
        'jt': ['fulltime', 'parttime', 'contract', 'temporary', 'internship'],
        'radius': None,
        'salary_estimate': None,
        'l': None,
        'explvl': ['entry_level', 'senior_level', 'mid_level'],
    }

    # Step of the "start" offset between consecutive result pages.
    RESULTS_PER_PAGE = 10
    # Seconds to wait for Indeed before giving up on a request.
    REQUEST_TIMEOUT = 10
    # Errors raised when an expected tag/attribute is missing from a job card
    # or a date label cannot be parsed.
    _PARSE_ERRORS = (AttributeError, TypeError, KeyError, ValueError)

    def __init__(self):
        """Start with an empty set of search parameters."""
        self.queries = {}

    # Lets client know the query parameter meaning and options
    def listAllQueryOptions(self):
        """Print a short description of every supported query parameter."""
        print(
        '\tq -- representing \"job query parameters\" \n\
        fromage -- \"Days since job posted\" \n\
        jt -- \"job type\" (ex: fulltime, parttime, contract, temporary, internship) \n\
        radius -- \"distance within miles\" \n\
        salary_estimate --  \"Estimated salary\" (included in query parameters)\n\
        l -- \"location\" (ex: Seattle,+ WA or Seattle+WA) \n\
        explvl: \"experience level\" (ex: entry_level, senior_level, mid_level)'
        )

    def keyValueValidation(self, key, value):
        """Validate one query parameter.

        Returns ``(key, value)`` when the pair is acceptable, with string
        values of free-form parameters stripped of surrounding whitespace.
        Returns ``(None, None)`` and prints an explanation when the key is
        unknown or the value is not one of the allowed options.
        """
        if key not in self.query_params:
            print(f"{key} is not in the mandated query params, which are {self.query_params.keys()}")
            return None, None

        allowed = self.query_params[key]
        if allowed is not None:
            # Restricted parameter: the value must be one of the listed options.
            if value not in allowed:
                print(f"{value} is not a valid option for {key}. The valid options are {allowed}")
                return None, None
            return key, value

        # Free-form parameter: only strings can be stripped; numbers such as
        # fromage=3 or radius=25 are passed through unchanged.
        if isinstance(value, str):
            return key, value.strip()
        return key, value

    def cleanDate(self, date):
        """Convert Indeed's relative date label into a ``YYYY-M-D`` string.

        Understands ``'Today'``, ``'Just posted'``, ``'1 day ago'``,
        ``'11 days ago'``, ``'30+ days ago'`` and the same labels prefixed
        with ``'Active'``. The result is relative to ``self.currentTime``.

        Raises:
            ValueError: if the label is not in one of the known formats.
        """
        label = date.replace('Active', '').strip()

        if label.lower() in ('today', 'just posted'):
            # Posted on the reference day itself.
            days = 0
        else:
            for suffix in ('days ago', 'day ago'):
                if label.endswith(suffix):
                    label = label[:-len(suffix)]
                    break
            # '30+' means "at least 30 days"; use the lower bound.
            days = int(label.strip().replace('+', ''))

        modified_date = self.currentTime - datetime.timedelta(days=days)
        return f"{modified_date.year}-{modified_date.month}-{modified_date.day}"

    def addQueries(self, key, value):
        """Store a validated query parameter in ``self.queries``.

        Invalid pairs are reported by ``keyValueValidation`` and ignored.
        """
        key, value = self.keyValueValidation(key, value)
        if key is not None and value is not None:
            self.queries[key] = value

    def constructSearchJobUrl(self):
        """Build the search URL from ``self.queries`` and store it in ``self.base_url``.

        EX: https://www.indeed.com/jobs?q=Software+Engineer+Intern&l=Bellevue+WA

        The URL is rebuilt from the class-level ``base_url`` on every call, so
        calling this method again (for example after adding a query) does not
        stack a second query string onto the previous result.
        """
        QUERY_ENDING = "&"
        url = type(self).base_url + "jobs?"

        for k, v in self.queries.items():
            if k == "q":
                # Words of the search phrase are joined with '+'.
                url += f"{k}=" + "+".join(str(v).split()) + QUERY_ENDING
            elif k != "l":
                url += f"{k}={v}" + QUERY_ENDING

        # The location always goes last. Commas, '+' and whitespace are all
        # treated as separators so that "seattle, wa" -> "seattle+wa" and
        # multi-word cities keep their words: "new york, ny" -> "new+york+ny".
        location = str(self.queries.get("l", ""))
        url += "l=" + "+".join(location.replace(",", " ").replace("+", " ").split())

        self.base_url = url

    def getJobPost(self, foundSoup):
        """Extract the fields of one job card.

        ``foundSoup`` is one element of the ``jobsearch-SerpJobCard`` result
        list. Each field is looked up independently; a field whose tag or
        attribute is missing becomes ``None`` instead of aborting the card.
        """
        NEW_FLAG = 'new'
        # Default so the key is always defined, even when the title is missing.
        new = False

        try:
            job_url = self.source_url + foundSoup.find("h2", "title").a['href']
        except self._PARSE_ERRORS:
            job_url = None

        try:
            job_title = foundSoup.find("h2", "title").get_text().strip()
            # The "new" badge is assumed to trail the title text inside the
            # heading; only a trailing flag is removed so that titles merely
            # containing the letters "new" (e.g. "Renewable ...") stay intact.
            if job_title.endswith(NEW_FLAG):
                new = True
                job_title = job_title[:-len(NEW_FLAG)].strip()
        except self._PARSE_ERRORS:
            job_title = None

        try:
            company_url = self.source_url + foundSoup.find("span", "company").a['href']
        except self._PARSE_ERRORS:
            company_url = None

        try:
            company_name = foundSoup.find("span", "company").get_text().strip()
        except self._PARSE_ERRORS:
            company_name = None

        try:
            job_description = foundSoup.find("div", "summary").li.get_text()
        except self._PARSE_ERRORS:
            job_description = None

        try:
            date = self.cleanDate(foundSoup.find("span", "date-a11y").get_text())
        except self._PARSE_ERRORS:
            date = None

        return {
            "job_url": job_url,
            "job_title": job_title,
            "company_url": company_url,
            "company_name": company_name,
            "job description": job_description,
            "date": date,
            'new': new
        }

    def _fetchJobPosts(self, url):
        """GET ``url`` and parse every job card on the page.

        Returns a list of job-post dicts, or ``None`` (after printing a
        message) when the request fails or does not answer with HTTP 200.
        """
        try:
            res = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as err:
            print(f'Http request has error... abort ({err})')
            return None

        if res.status_code != 200:
            print('Http request has error... abort')
            return None

        res_soup = BeautifulSoup(res.text, "html.parser")
        allJobPosts = res_soup.find_all("div", "jobsearch-SerpJobCard")
        return [self.getJobPost(card) for card in allJobPosts]

    def makeRequestToFirstPage(self):
        """Fetch and parse the first result page of ``self.base_url``.

        Returns the list of job posts, or ``None`` if the request failed.
        """
        posts = self._fetchJobPosts(self.base_url)
        if posts is None:
            return None
        print(f"First page got {len(posts)} job posts")
        return posts

    def makeRequestToNextPage(self, numPages=6):
        """Fetch the result pages that follow the first one.

        Args:
            numPages: how many follow-up pages to request (default 6, i.e.
                ``start=10`` through ``start=60``).

        Returns the job posts gathered from those pages. If a page fails, the
        traversal stops and the posts collected so far are returned.
        """
        restJobs = []
        TRAVERSE_ADDON = "&start="

        for page in range(1, numPages + 1):
            url = self.base_url + TRAVERSE_ADDON + str(page * self.RESULTS_PER_PAGE)
            print(f"Trying to access {url}")
            posts = self._fetchJobPosts(url)
            if posts is None:
                # Keep what was already scraped instead of discarding it.
                break
            restJobs += posts
            # Pause between requests to avoid hammering the site.
            time.sleep(1)

        print("Page traversal got : ", len(restJobs), "jobs")
        return restJobs

    def makeRequest(self, numPages=6):
        """Fetch the first page plus ``numPages`` follow-up pages.

        Returns one combined list of job posts; a failed first page or
        traversal contributes no posts instead of raising.
        """
        firstPage = self.makeRequestToFirstPage() or []
        restPages = self.makeRequestToNextPage(numPages) or []
        total = firstPage + restPages
        print(f"Got total of {len(total)} of job posts")
        return total


In [154]:
# Example: search Indeed for software engineering internships near Seattle, WA
indeed = IndeedAPI()
for query_key, query_value in (('q', 'software engineer intern'), ('l', 'seattle, wa')):
    indeed.addQueries(query_key, query_value)
indeed.constructSearchJobUrl()

# Scrape the result pages; the bare name below shows the posts as the cell output
jobs = indeed.makeRequest()
jobs


First page got 15 job posts
Trying to access https://www.indeed.com/jobs?q=software+engineer+intern&l=seattle+wa&start=10
Trying to access https://www.indeed.com/jobs?q=software+engineer+intern&l=seattle+wa&start=20
Trying to access https://www.indeed.com/jobs?q=software+engineer+intern&l=seattle+wa&start=30
Trying to access https://www.indeed.com/jobs?q=software+engineer+intern&l=seattle+wa&start=40
Trying to access https://www.indeed.com/jobs?q=software+engineer+intern&l=seattle+wa&start=50
Trying to access https://www.indeed.com/jobs?q=software+engineer+intern&l=seattle+wa&start=60
Page traversal got :  80 jobs
Got total of 95 of job posts


clk?jk=740fcfa18e65e970&fccid=2ac0dbed95f0e3bf&vjs=3',
  'job_title': 'Platform Cloud (Media/Infrastructure) Services Engineering I...',
  'company_url': None,
  'company_name': 'VIZIO, Inc.',
  'job description': 'Put comments in code as appropriate, and produces external documentation for more complex software components.',
  'date': '2021-3-31',
  'new': False},
 {'job_url': 'https://www.indeed.com/rc/clk?jk=aca2e6c965808bd7&fccid=aef928e89977f7f0&vjs=3',
  'job_title': 'Site Reliability Engineering Intern (Remote - Summer 2021)',
  'company_url': 'https://www.indeed.com/cmp/Splunk',
  'company_name': 'Splunk',
  'job description': 'You will design, develop, and test software systems.',
  'date': '2021-3-31',
  'new': False},
 {'job_url': 'https://www.indeed.com/rc/clk?jk=90b6c1c0c40dac70&fccid=da3c7fed78dd1607&vjs=3',
  'job_title': 'Back End, Full Stack Engineering Intern',
  'company_url': 'https://www.indeed.com/cmp/Samsung-Electronics-9',
  'company_name': 'Samsung Electronics'

In [5]:
# Example: search Indeed for project management jobs in Phoenix
indeed = IndeedAPI()
for query_key, query_value in (('q', 'project management'), ('l', 'phoenix')):
    indeed.addQueries(query_key, query_value)
indeed.constructSearchJobUrl()

# Scrape the result pages; the bare name below shows the posts as the cell output
jobs = indeed.makeRequest()
jobs


First page got 15 job posts
Trying to access https://www.indeed.com/jobs?q=project+management&l=phoenix&start=10
Trying to access https://www.indeed.com/jobs?q=project+management&l=phoenix&start=20
Trying to access https://www.indeed.com/jobs?q=project+management&l=phoenix&start=30
Trying to access https://www.indeed.com/jobs?q=project+management&l=phoenix&start=40
Trying to access https://www.indeed.com/jobs?q=project+management&l=phoenix&start=50
Trying to access https://www.indeed.com/jobs?q=project+management&l=phoenix&start=60
Page traversal got :  94 jobs
Got total of 109 of job posts


  'job description': 'This job requires extensive technical and project management experience.',
  'date': '2021-4-24',
  'new': True},
 {'job_url': 'https://www.indeed.com/rc/clk?jk=92c2570b40a6f29a&fccid=86f58d29912aa934&vjs=3',
  'job_title': 'Project Manager',
  'company_url': 'https://www.indeed.com/cmp/Southwest-Metalsmiths',
  'company_name': 'Southwest Metalsmiths',
  'job description': 'Management responsibility generally limited to smaller less critical projects.',
  'date': '2021-4-1',
  'new': False},
 {'job_url': 'https://www.indeed.com/rc/clk?jk=614a8efeacea1d77&fccid=7a7b422618eedf69&vjs=3',
  'job_title': 'Project Coordinator / Accounts Payable Assistant',
  'company_url': 'https://www.indeed.com/cmp/Toll-Brothers',
  'company_name': 'Toll Brothers',
  'job description': 'This position requires accounts payable experience, proficiency in Microsoft Word and Excel, and good phone skills.',
  'date': '2021-4-1',
  'new': False},
 {'job_url': 'https://www.indeed.com/rc/clk?