In [16]:
import pandas as pd
import requests
import numpy as np
from bs4 import BeautifulSoup

# these libraries are used for controlling and monitoring the request frequency
from time import sleep
from random import randint
from warnings import warn
from IPython.core.display import clear_output
from time import time


In [2]:
# job postings are located in 'div' tags with several different class values - extract every job in a page
def get_job(soup):
    """Collect every job posting container found in a parsed results page.

    The page is queried once per known ``class`` value of the posting
    ``div`` tags, and the matches are concatenated in that query order
    (so the result is grouped by class value, not by page position).

    Parameters
    ----------
    soup : parsed page exposing ``find_all(name, attrs)``

    Returns
    -------
    list
        All matching ``div`` elements; empty when nothing matches.
    """
    # NOTE(review): the value with a leading space is kept exactly as written
    # in the original list - confirm whether it can ever match.
    result_classes = ('row result', ' row result', 'lastRow row result', 'row sjlast result')
    return [
        posting
        for css_class in result_classes
        for posting in soup.find_all('div', {'class': css_class})
    ]

In [3]:
# extract job title - the job title could be in 'a' tag or 'h2' tag
def get_job_title(job):
    """Return the raw title text of a job posting.

    The title element carries ``data-tn-element="jobTitle"`` and may be an
    ``a`` tag or an ``h2`` tag; ``a`` is tried first and ``h2`` is only
    looked up when no ``a`` match exists.

    Parameters
    ----------
    job : element exposing ``find(name, attrs)``

    Returns
    -------
    str or None
        The unstripped text of the first match, or ``None`` when neither
        tag is present.
    """
    # generator keeps the lookups lazy: 'h2' is not searched if 'a' matched
    candidates = (
        job.find(tag_name, {'data-tn-element': 'jobTitle'})
        for tag_name in ('a', 'h2')
    )
    title_tag = next((tag for tag in candidates if tag is not None), None)
    if title_tag is None:
        return None
    return title_tag.text

In [22]:
# extract company name - some in 'span' tag, some in 'a' tag nested by 'span' tag
def get_company_name(job):
    """Return the company name of a job posting, or NaN when it is absent.

    The name lives in a ``span`` with class ``company``; in some postings it
    is wrapped in an ``a`` tag nested inside that span, in which case the
    link text is preferred.

    Parameters
    ----------
    job : element exposing ``find(name, attrs)``

    Returns
    -------
    str or float
        The stripped company name, or ``np.nan`` when the posting has no
        company span.
    """
    company_tag = job.find('span', {'class': 'company'})
    if company_tag is None:
        return np.nan

    # explicit None checks instead of a bare ``except:`` so that unrelated
    # errors (and interrupts) are no longer silently turned into NaN
    company_link = company_tag.a
    if company_link is not None:
        return company_link.text.strip()
    return company_tag.text.strip()

In [5]:
# extract location - in 'div' tag/ 'span' tag or nested in 'a' tag inside 'span' tag

def get_job_location(job):
    """Return the location text of a job posting, or NaN when it is absent.

    The location is held in an element with class ``location`` that is either
    a ``div`` or a ``span``; ``div`` is tried first. The ``text`` of the
    matched element is returned unstripped.

    Previously a posting with neither element raised ``AttributeError``: the
    last fallback repeated the same ``span`` lookup that had just failed.
    Such postings now yield ``np.nan``, like a missing company name.

    Parameters
    ----------
    job : element exposing ``find(name, attrs)``

    Returns
    -------
    str or float
        The location text, or ``np.nan`` when no location element exists.
    """
    for tag_name in ('div', 'span'):
        location_tag = job.find(tag_name, {'class': 'location'})
        if location_tag is not None:
            return location_tag.text
    return np.nan

In [6]:
# extract job summary
def get_job_summary(job):
    """Return the stripped summary text of a job posting, or NaN when absent.

    The summary is read from the ``span`` with class ``summary``. A posting
    without that span used to raise ``AttributeError``; it now yields
    ``np.nan`` so one incomplete posting does not stop the whole scrape.

    Parameters
    ----------
    job : element exposing ``find(name, attrs)``

    Returns
    -------
    str or float
        The stripped summary, or ``np.nan`` when there is no summary span.
    """
    summary_tag = job.find('span', {'class': 'summary'})
    if summary_tag is None:
        return np.nan
    return summary_tag.text.strip()

In [7]:
# extract salary
def get_job_salary(job):
    """Return the advertised salary text of a job posting.

    The salary is read from the ``span`` with class ``no-wrap``.

    Parameters
    ----------
    job : element exposing ``find(name, attrs)``

    Returns
    -------
    str
        The stripped salary text, or the string ``'0'`` when the posting
        has no such span.
    """
    salary_tag = job.find('span', {'class': 'no-wrap'})
    return '0' if salary_tag is None else salary_tag.text.strip()

In [8]:
# add jobs to the pre-created lists
def combine_jobs(all_jobs_in_page, est_salary):
    """Append the fields of every job on a page to the module-level lists.

    Writes to ``job_titles``, ``companies_name``, ``locations``,
    ``job_summaries``, ``salaries`` and ``est_salaries``, which must already
    exist; one entry is added to each list per job.

    Parameters
    ----------
    all_jobs_in_page : iterable of job elements
    est_salary : str
        Salary bracket in thousands as a string (e.g. ``'80'``); stored as
        the integer formed by appending ``'000'`` (``80000``).

    Returns
    -------
    None
    """
    for job in all_jobs_in_page:
        # extract every field BEFORE storing any of them: if one extractor
        # raises, no list has been touched for this job, so the parallel
        # lists can never end up with different lengths
        title = get_job_title(job)
        company = get_company_name(job)
        location = get_job_location(job)
        summary = get_job_summary(job)
        salary = get_job_salary(job)
        est_salary_value = int(est_salary+'000')

        job_titles.append(title)
        companies_name.append(company)
        locations.append(location)
        job_summaries.append(summary)
        salaries.append(salary)
        est_salaries.append(est_salary_value)


In [23]:
# create the lists that collect the scraped data (one entry per job posting);
# re-creating them here means re-running this cell starts from empty lists
job_titles = []
companies_name = []
locations = []
job_summaries = []
salaries = []
est_salaries = []

# variables for monitoring the scraping process
request = 0
start_time = time()

# set up maximum number of results, keywords to search and salary range
maximum_results = 500
occupation_list = ['data+scientist', 'data+analyst', 'machine+learning+engineer',
                   'business+intelligence']
salary_range = ['80', '100', '120', '140', '160']

# seconds to wait for the server before giving up on a single page;
# without a timeout requests.get can block indefinitely
request_timeout = 30


for keyword in occupation_list:
    for est_salary in salary_range:
        # the 'start' query parameter advances in steps of 10
        for i in range(0, maximum_results, 10):
            request += 1

            # make a request
            url = 'https://au.indeed.com/jobs?q='+keyword+'+%24'+est_salary+'%2C000&l=australia&start='+str(i)
            try:
                response = requests.get(url, timeout=request_timeout)
            except requests.exceptions.RequestException as error:
                # one failed page (timeout, connection error, ...) should not
                # abort the whole run; report it and move on after the pause
                warn('Request: {}; Error: {}'.format(request, error))
                response = None

            # pause the loop for a while to prevent getting banned
            sleep(randint(1,2))

            # monitor the request
            elapsed_time = time() - start_time
            print('Request:{}; Frequency:{}'.format(request, request/elapsed_time))

            if response is None:
                continue

            # warning
            if response.status_code != 200:
                warn('Request: {}; Status: {}'.format(request, response.status_code))
                # skip parsing: a non-200 body is not expected to hold job results
                continue

            # parse the content
            soup = BeautifulSoup(response.text, 'html.parser')
            # get all jobs in each page
            all_jobs_in_page = get_job(soup)
            # add the information of all jobs in this page to the lists above
            combine_jobs(all_jobs_in_page, est_salary)


print('Scraping finished!')

Request:1; Frequency:0.43572870135559566
Request:2; Frequency:0.5433804181394029
Request:3; Frequency:0.49346755838913464
Request:4; Frequency:0.46731376315599993
Request:5; Frequency:0.4556724049778417
Request:6; Frequency:0.48374730514094194
Request:7; Frequency:0.5073572003073438
Request:8; Frequency:0.494399051279973
Request:9; Frequency:0.484439749064384
Request:10; Frequency:0.5009502973767741
Request:11; Frequency:0.4917284624843034
Request:12; Frequency:0.4848865951165243
Request:13; Frequency:0.49764286593514395
Request:14; Frequency:0.49115893925154075
Request:15; Frequency:0.5019191571432794
Request:16; Frequency:0.5112368306579816
Request:17; Frequency:0.5193074860936836
Request:18; Frequency:0.5127884240256743
Request:19; Frequency:0.5206847212808838
Request:20; Frequency:0.5273260870812895
Request:21; Frequency:0.5341866438541015
Request:22; Frequency:0.5277411180826618
Request:23; Frequency:0.5217431947321332
Request:24; Frequency:0.5277621593395272
Request:25; Frequency

Request:199; Frequency:0.52617269075589
Request:200; Frequency:0.5255152021499867
Request:201; Frequency:0.526127911186247
Request:202; Frequency:0.5253205419057101
Request:203; Frequency:0.5243987317649588
Request:204; Frequency:0.5250969478842807
Request:205; Frequency:0.5244135912993573
Request:206; Frequency:0.5237688989371179
Request:207; Frequency:0.5245163327920669
Request:208; Frequency:0.5251846799337386
Request:209; Frequency:0.5244096802961471
Request:210; Frequency:0.5250827730990946
Request:211; Frequency:0.5255810348979644
Request:212; Frequency:0.5249846422385372
Request:213; Frequency:0.5242790952549443
Request:214; Frequency:0.5248257830326655
Request:215; Frequency:0.524143093632875
Request:216; Frequency:0.5246398505658929
Request:217; Frequency:0.5252249847325704
Request:218; Frequency:0.5258863767618231
Request:219; Frequency:0.525282985814879
Request:220; Frequency:0.5242368743945484
Request:221; Frequency:0.5247132985290373
Request:222; Frequency:0.52363277235621

Request:395; Frequency:0.5139550796655789
Request:396; Frequency:0.5136638606973265
Request:397; Frequency:0.513968555049126
Request:398; Frequency:0.5141649846206985
Request:399; Frequency:0.5144974617808543
Request:400; Frequency:0.5141920221826123
Request:401; Frequency:0.5144804667212295
Request:402; Frequency:0.5148338937368068
Request:403; Frequency:0.5145278276372723
Request:404; Frequency:0.5148554531944248
Request:405; Frequency:0.5152066398778308
Request:406; Frequency:0.515554885009346
Request:407; Frequency:0.5152665892770888
Request:408; Frequency:0.5156346539915013
Request:409; Frequency:0.515310985292939
Request:410; Frequency:0.5148269776999345
Request:411; Frequency:0.5151469123177378
Request:412; Frequency:0.5155042206708291
Request:413; Frequency:0.5149326789322088
Request:414; Frequency:0.5146574063105781
Request:415; Frequency:0.51431339106702
Request:416; Frequency:0.5140116084006924
Request:417; Frequency:0.5143075903347237
Request:418; Frequency:0.51463562865596

Request:591; Frequency:0.5127081514263722
Request:592; Frequency:0.5125071904846772
Request:593; Frequency:0.5123127048861837
Request:594; Frequency:0.5125435879815667
Request:595; Frequency:0.5123505406737201
Request:596; Frequency:0.5125962139206647
Request:597; Frequency:0.5124048156292446
Request:598; Frequency:0.5126240155695087
Request:599; Frequency:0.5128360177051232
Request:600; Frequency:0.5126339494953693
Request:601; Frequency:0.5124520844563456
Request:602; Frequency:0.512710556800023
Request:603; Frequency:0.5125372892938629
Request:604; Frequency:0.5123415603844659
Request:605; Frequency:0.5121437889237382
Request:606; Frequency:0.5123618028251415
Request:607; Frequency:0.5121730279764718
Request:608; Frequency:0.5123780549528909
Request:609; Frequency:0.5121838030287893
Request:610; Frequency:0.5124355265128667
Request:611; Frequency:0.5122621969077678
Request:612; Frequency:0.5119461372029519
Request:613; Frequency:0.5120835679810255
Request:614; Frequency:0.5123075835

Request:787; Frequency:0.5099002389048629
Request:788; Frequency:0.5097469048866741
Request:789; Frequency:0.5096057415524197
Request:790; Frequency:0.5097791709031997
Request:791; Frequency:0.5099711150000118
Request:792; Frequency:0.5101302595785449
Request:793; Frequency:0.5099871592994457
Request:794; Frequency:0.510145401662647
Request:795; Frequency:0.51000020692006
Request:796; Frequency:0.5098569762773839
Request:797; Frequency:0.5097007392053851
Request:798; Frequency:0.5095532206253385
Request:799; Frequency:0.5094118826029009
Request:800; Frequency:0.5092115246377114
Request:801; Frequency:0.5090589582826409
Request:802; Frequency:0.5092375979581442
Request:803; Frequency:0.5090973059022808
Request:804; Frequency:0.5089180339266435
Request:805; Frequency:0.5087361043666634
Request:806; Frequency:0.5089250450934824
Request:807; Frequency:0.5087820032229075
Request:808; Frequency:0.5086497156549521
Request:809; Frequency:0.5088256737386961
Request:810; Frequency:0.509014583807

Request:983; Frequency:0.5118134742593909
Request:984; Frequency:0.5119569482976596
Request:985; Frequency:0.511831437936605
Request:986; Frequency:0.5117263699461493
Request:987; Frequency:0.5116094489104174
Request:988; Frequency:0.5114951951908002
Request:989; Frequency:0.5113691676124338
Request:990; Frequency:0.5112588931643566
Request:991; Frequency:0.5114134891090668
Request:992; Frequency:0.5115552251069725
Request:993; Frequency:0.5114153375845412
Request:994; Frequency:0.5115685279002952
Request:995; Frequency:0.5114627698233791
Request:996; Frequency:0.5116184760886634
Request:997; Frequency:0.5117703031320106
Request:998; Frequency:0.5116579997031152
Request:999; Frequency:0.511529534986855
Request:1000; Frequency:0.5114111807564219
Scraping finished!


In [129]:
# gather the scraped lists into a single dataframe, one column per list
jobs_data = pd.DataFrame(dict(
    job_title=job_titles,
    location=locations,
    company=companies_name,
    salary=salaries,
    est_salary=est_salaries,
    summary=job_summaries,
))

# the estimated salary is left out while looking for duplicates, so rows that
# differ only in that column count as the same job
jobs_data_no_est = jobs_data.drop(columns=['est_salary'])
# keep the first occurrence of every job
jobs_data_unique = jobs_data_no_est.drop_duplicates()
# the surviving rows still carry their original index, so joining on it puts
# each row's own estimated salary back (as the last column)
final_jobs_data = (
    jobs_data_unique
    .join(jobs_data[['est_salary']])
    .reset_index(drop=True)
)

In [137]:
# write the final jobs table to a csv file in the working directory
# (the row index is written too, as the first column)
final_jobs_data.to_csv(path_or_buf='indeed_jobs_data.csv', encoding='utf-8')