In [1]:
import pandas as pd

from bs4 import BeautifulSoup
import urllib.request, urllib.parse, urllib.error
import requests
import re 

from scrapy.selector import Selector

from selenium import webdriver
from time import sleep

# Strategy for collection

1. Pull job links from a search site
2. Iterate through collected links to gather data on specific jobs from indivual pages:
    * job title
    * company 
    * company rating
    * location
    * salary (if provided)
    * brief description

Query url follow a simple format, so we can easily generate search queries. For example a search for "data scientist" in Melbourne, Vic is: https://au.indeed.com/jobs?q=data+scientist&l=Melbourne+VIC <br>


### These functions allow us to do the basic data collection

In [13]:
#storing both selenium and request version to allow switching

def make_fresh_soup(url):
    '''Accepts a url and returns a BeautifulSoup object'''
    
    #testing just using request
    response = requests.get(url)
    html = response.text
    
#     #use driver and wait one second for javascript to run before capturing html
#     driver = webdriver.Chrome(executable_path="./chromedriver/chromedriver.exe")
#     driver.get(url)
#     sleep(1)
#     html = driver.page_source     
    
    #convert to soup object
    soup = BeautifulSoup(html)
    
#     #close the driver to keep things clean
#     driver.close()

    return soup

In [46]:


def indeed_search(search_terms, location=['Melbourne','VIC']):
    
    '''Function accepts lists of job search terms and optional location strings 
    Returns the url for and indeeed search
    Search_terms should be a list of terms e.g.['data','scientist']
    Location should be a list of city, state; default is ['Melbourne','VIC']'''
    
    #setting and formatting terms for search
    search_string = search_terms[0]
    for term in search_terms[1:]:
        search_string = search_string + '+' + term

    #adding location if provided
    location = ['Melbourne', 'VIC']

    if location != []:
        location_string = location[0] + '+' + location[1]
        search_string = search_string + '&l=' + location_string
        
    #setting url for scraping
    search_url = 'https://au.indeed.com/jobs?q=' + search_string



    return(search_url)



In [4]:


def get_job_links(soup_search):
    
    '''Function to collect the links from an indeed search page
    Accepts a BeautifulSoup object of an indeed search page as input
    Returns a list of all the links to jobs on the page'''
    
    #adds website root to collected page elements
    links = [('https://au.indeed.com' + x.get('href')) 
                 for x in soup_search.find_all('a', attrs={'data-tn-element':'jobTitle'})]
    return links

In [5]:


def get_next_link(soup_search):
    
    '''Function to pull out the link to the next page of jobs for the search
    Accepts a BeautifulSoup object of the search page as input
    Returns the link to the next results page, or None on the last page'''
    
    #find the last of the links to new pages
    last_page_link = soup_search.find('div',{'class':'pagination'}).find_all('a')[-1]

    #if the text for that link is Next, grab the link
    if last_page_link.text.strip().startswith('Next') == True: 
        next_link = 'https://au.indeed.com' + last_page_link.get('href')
    else:
        next_link = 'end'


    return next_link

In [52]:
def do_search(url, links_list, count=1):
    
    #including error handling to return result even if next page is not found
    try:
        original_links = links_list

        #make soup object for url
        soup_search = make_fresh_soup(url)
        count += 1

        #save the links from the soup object
        page_links = get_job_links(soup_search)
        all_links = original_links + page_links

        #find the link to the next page
        next_url = get_next_link(soup_search)

        #repeat with next link until last page located
        if next_url == 'end':
            print(str(count) + 'pages of jobs searched')
            return all_links

        else:    
            return do_search(next_url, all_links, count)
    
    except:
        return all_links

In [7]:
def get_job_details(job_url):
    
    '''Extracts the job title, company, company rating, 
    job description, and salary from a job listing'''
    
    #use selenium driver to freeze javascript and capture html
    soup_job = make_fresh_soup(job_url)
    
    job_details = {}
    
    #these elements are almost always present, but adding exception management just in case
    try:
        job_details['job_title'] = soup_job.find('h3').text.strip()
    except: 
        job_details['job_title'] = None

    try:
        job_details['company'] = soup_job.find('div', {'class':"icl-u-lg-mr--sm icl-u-xs-mr--xs"}).text.strip()
    except:
        job_details['company'] = None
        
    try:
        job_details['job_description_all_text'] = soup_job.find('div',{'id':'jobDescriptionText'}).text.strip()
    except:
        job_details['job_description_all_text'] = None
        
    #formatting is slightly inconsistent on ratings
    try:
        job_details['company_rating'] = float(soup_job.find('div', {'class':"icl-Ratings-starsCountWrapper"})
                                              .get('aria-label')[0:3])
    except:
        try:
            job_details['company_rating'] = (soup_job.find('div', {'class':"icl-Ratings-starsCountWrapper"})
                                         .get('aria-label')[0:3])
        except:
            job_details['company_rating'] = None
            
    #salary data is often missing
    try:
        job_details['salary_data_text'] = soup_job.find('span', {'class':"icl-u-xs-mr--xs"}).text.strip()
    except:
        job_details['salary_data_text'] = None

    return(job_details)


In [114]:
def collect_and_save_job_details(job_links, filename):
    
    """Iterates through a list of links for individual jobs
    Collects the details of each job
    Converts to dataframe and saves to csv"""
    
    file_path = './data/' + filename
    jobs = []

    for job in job_links:
        new_job = get_job_details(job)
        jobs.append(new_job)

    jobs_df = pd.DataFrame(jobs)
    jobs_df.to_csv(file_path, index=False)
    
#     return jobs_df

### Options for improvement: 
* Use the suggested searches at the bottom of the final page
    * Continue until?? Some arbitrary level of data reached
* Improve the dataframe created to include the industry from the original search
    * Currently need to add manually, using something like the two cells below



In [115]:
job_links_df = pd.read_csv('./data/finance_job_links.csv')
finance_links = list(job_links_df['links'])    
new_filename = 'finance_job_details.csv'

collect_and_save_job_details(finance_links, new_filename)

In [116]:
finance_df = pd.read_csv('./data/finance_job_details.csv')
finance_df['industry'] = 'finance'
finance_df.to_csv('./data/finance_job_details.csv', index=False)
finance_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1493 entries, 0 to 1492
Data columns (total 6 columns):
company                     1488 non-null object
company_rating              671 non-null object
job_description_all_text    1492 non-null object
job_title                   1492 non-null object
salary_data_text            680 non-null object
industry                    1493 non-null object
dtypes: object(6)
memory usage: 70.1+ KB


### Code below saves additional code used to adjust and store data

In [90]:
for industry in [['construction'], ['hospitality'], ['nursing'], ['manufacturing'], ['retail'], ['finance']]:
    url = indeed_search(industry)
    links_list = []

    resulting_links = do_search(url, links_list, count=1)

    links_df = pd.DataFrame(resulting_links)
    links_df['industry'] = industry[0]
    links_df = links_df.rename({0:'links'},axis='columns')

    file_path = './data/' + industry[0] + '_job_links.csv'
    links_df.to_csv(file_path)

56pages of jobs searched
99pages of jobs searched
101pages of jobs searched
101pages of jobs searched
101pages of jobs searched
101pages of jobs searched


In [91]:
links_df.head()

Unnamed: 0,links,industry
0,https://au.indeed.com/pagead/clk?mo=r&ad=-6NYl...,finance
1,https://au.indeed.com/pagead/clk?mo=r&ad=-6NYl...,finance
2,https://au.indeed.com/pagead/clk?mo=r&ad=-6NYl...,finance
3,https://au.indeed.com/pagead/clk?mo=r&ad=-6NYl...,finance
4,https://au.indeed.com/company/Lucky-Ent-Pty-Lt...,finance


In [95]:
job_links_df = pd.read_csv('./data/construction_job_links.csv')
construction_links = list(job_links_df['links'])    
new_filename = 'construction_job_details.csv'

collect_and_save_job_details(construction_links, new_filename)

In [106]:
construction_df = pd.read_csv('./data/construction_job_details.csv')
construction_df = construction_df.drop('Unnamed: 0', axis=1)
construction_df['industry'] = 'construction'
construction_df.to_csv('./data/construction_job_details.csv', index=False)

In [108]:
construction_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 881 entries, 0 to 880
Data columns (total 6 columns):
company                     873 non-null object
company_rating              355 non-null object
job_description_all_text    881 non-null object
job_title                   881 non-null object
salary_data_text            417 non-null object
industry                    881 non-null object
dtypes: object(6)
memory usage: 41.4+ KB


In [99]:
job_links_df = pd.read_csv('./data/hospitality_job_links.csv')
hospitality_links = list(job_links_df['links'])    
new_filename = 'hospitality_job_details.csv'

collect_and_save_job_details(hospitality_links, new_filename)

In [109]:
hospitality_df = pd.read_csv('./data/hospitality_job_details.csv')
hospitality_df = hospitality_df.drop('Unnamed: 0', axis=1)
hospitality_df['industry'] = 'hospitality'
hospitality_df.to_csv('./data/hospitality_job_details.csv', index=False)
hospitality_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1408 entries, 0 to 1407
Data columns (total 6 columns):
company                     1383 non-null object
company_rating              439 non-null object
job_description_all_text    1383 non-null object
job_title                   1383 non-null object
salary_data_text            554 non-null object
industry                    1408 non-null object
dtypes: object(6)
memory usage: 66.1+ KB


In [100]:
job_links_df = pd.read_csv('./data/nursing_job_links.csv')
nursing_links = list(job_links_df['links'])    
new_filename = 'nursing_job_details.csv'

collect_and_save_job_details(nursing_links, new_filename)

In [110]:
nursing_df = pd.read_csv('./data/nursing_job_details.csv')
nursing_df = nursing_df.drop('Unnamed: 0', axis=1)
nursing_df['industry'] = 'nursing'
nursing_df.to_csv('./data/nursing_job_details.csv', index=False)
nursing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1466 entries, 0 to 1465
Data columns (total 6 columns):
company                     1465 non-null object
company_rating              770 non-null object
job_description_all_text    1465 non-null object
job_title                   1465 non-null object
salary_data_text            310 non-null object
industry                    1466 non-null object
dtypes: object(6)
memory usage: 68.8+ KB


In [101]:
job_links_df = pd.read_csv('./data/manufacturing_job_links.csv')
manufacturing_links = list(job_links_df['links'])    
new_filename = 'manufacturing_job_details.csv'

collect_and_save_job_details(manufacturing_links, new_filename)

In [111]:
manufacturing_df = pd.read_csv('./data/manufacturing_job_details.csv')
manufacturing_df = manufacturing_df.drop('Unnamed: 0', axis=1)
manufacturing_df['industry'] = 'manufacturing'
manufacturing_df.to_csv('./data/manufacturing_job_details.csv', index=False)
manufacturing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 6 columns):
company                     1422 non-null object
company_rating              724 non-null object
job_description_all_text    1469 non-null object
job_title                   1469 non-null object
salary_data_text            580 non-null object
industry                    1470 non-null object
dtypes: object(6)
memory usage: 69.0+ KB


In [102]:
job_links_df = pd.read_csv('./data/retail_job_links.csv')
retail_links = list(job_links_df['links'])    
new_filename = 'retail_job_details.csv'

collect_and_save_job_details(retail_links, new_filename)

In [112]:
retail_df = pd.read_csv('./data/retail_job_details.csv')
retail_df = retail_df.drop('Unnamed: 0', axis=1)
retail_df['industry'] = 'retail'
retail_df.to_csv('./data/retail_job_details.csv', index=False)
retail_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1409 entries, 0 to 1408
Data columns (total 6 columns):
company                     1346 non-null object
company_rating              851 non-null object
job_description_all_text    1346 non-null object
job_title                   1346 non-null object
salary_data_text            209 non-null object
industry                    1409 non-null object
dtypes: object(6)
memory usage: 66.1+ KB


In [103]:
job_links_df = pd.read_csv('./data/early_learning_job_links.csv')
early_learning_links = list(job_links_df['links'])    
new_filename = 'early_learning_job_details.csv'

collect_and_save_job_details(early_learning_links, new_filename)

In [113]:
early_learning_df = pd.read_csv('./data/early_learning_job_details.csv')
early_learning_df = early_learning_df.drop('Unnamed: 0', axis=1)
early_learning_df['industry'] = 'early_learning'
early_learning_df.to_csv('./data/early_learning_job_details.csv', index=False)
early_learning_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 879 entries, 0 to 878
Data columns (total 6 columns):
company                     879 non-null object
company_rating              436 non-null object
job_description_all_text    879 non-null object
job_title                   879 non-null object
salary_data_text            142 non-null object
industry                    879 non-null object
dtypes: object(6)
memory usage: 41.3+ KB


### Combining all datasets into one 

In [126]:
data_df = pd.read_csv('./data/final_data_job_details.csv')
data_df = data_df.drop('Unnamed: 0', axis=1)
data_df.head()

Unnamed: 0,company,company_rating,industry,job_description_all_text,job_title,salary_data_text
0,ANZ Banking Group,4.1,data,"Consultant Data Scientist – Data Activation, M...",Data Scientist Consultant,
1,NAB - National Australia Bank,3.8,data,Work type: Permanent Full time\nLocation: Aust...,Data Scientist,
2,ANZ Banking Group,4.1,data,The Team\n\nThe Customer Service Operations Au...,Senior Data Scientist,
3,NAB - National Australia Bank,3.8,data,Work type: Permanent Full time\nLocation: Aust...,Data Analytics Engineer,
4,Coles,3.8,data,About Us\n\nThe Digital team is focused on rei...,Data Scientist,


In [127]:
frames = [data_df, construction_df, manufacturing_df, retail_df, 
          hospitality_df, finance_df, nursing_df, early_learning_df]
full_dataset_1 = pd.concat(frames)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


In [128]:
full_dataset_1.head()

Unnamed: 0,company,company_rating,industry,job_description_all_text,job_title,salary_data_text
0,ANZ Banking Group,4.1,data,"Consultant Data Scientist – Data Activation, M...",Data Scientist Consultant,
1,NAB - National Australia Bank,3.8,data,Work type: Permanent Full time\nLocation: Aust...,Data Scientist,
2,ANZ Banking Group,4.1,data,The Team\n\nThe Customer Service Operations Au...,Senior Data Scientist,
3,NAB - National Australia Bank,3.8,data,Work type: Permanent Full time\nLocation: Aust...,Data Analytics Engineer,
4,Coles,3.8,data,About Us\n\nThe Digital team is focused on rei...,Data Scientist,


In [129]:
full_dataset_1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12957 entries, 0 to 878
Data columns (total 6 columns):
company                     12796 non-null object
company_rating              6509 non-null object
industry                    12957 non-null object
job_description_all_text    12866 non-null object
job_title                   12866 non-null object
salary_data_text            4066 non-null object
dtypes: object(6)
memory usage: 708.6+ KB


In [130]:
full_dataset_1.to_csv('./data/full_dataset_1.csv', index=False)

In [131]:
test_data = pd.read_csv('./data/full_dataset_1.csv')
print(test_data.info())
test_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12957 entries, 0 to 12956
Data columns (total 6 columns):
company                     12796 non-null object
company_rating              6509 non-null object
industry                    12957 non-null object
job_description_all_text    12866 non-null object
job_title                   12866 non-null object
salary_data_text            4066 non-null object
dtypes: object(6)
memory usage: 607.4+ KB
None


Unnamed: 0,company,company_rating,industry,job_description_all_text,job_title,salary_data_text
0,ANZ Banking Group,4.1,data,"Consultant Data Scientist – Data Activation, M...",Data Scientist Consultant,
1,NAB - National Australia Bank,3.8,data,Work type: Permanent Full time\nLocation: Aust...,Data Scientist,
2,ANZ Banking Group,4.1,data,The Team\n\nThe Customer Service Operations Au...,Senior Data Scientist,
3,NAB - National Australia Bank,3.8,data,Work type: Permanent Full time\nLocation: Aust...,Data Analytics Engineer,
4,Coles,3.8,data,About Us\n\nThe Digital team is focused on rei...,Data Scientist,


### Initial testing of process

#setting starting values for search
search_url = indeed_search(['data','scientist'],['Melbourne','VIC'])
initial_list = []

job_links = do_search(search_url, initial_list)

#more links than expected; not sure why; investigated a little with selenium & manually, but unclear
len(job_links)

#pulling job details out of data science search links
jobs = [get_job_details(job) for job in job_links]

#checking that it worked as expected
len(jobs)

jobs_df = pd.DataFrame(jobs)

jobs_df.columns

column_order = ['job_title', 'company', 'company_rating', 'job_description_all_text', 'salary_data_text']
jobs_df = jobs_df[column_order]
jobs_df.head(10)

jobs_df.info()

jobs_df.to_csv('./data/data_scientist_search_results.csv')

### Mostly looks as expected
Very few salaries, some missing company ratings. Will convert salaries and company ratings to numericals once full dataset is collected.

### Collecting additional jobs via alternate search terms

#setting up search urls
search_options = [['data', 'analyst'],
                  ['machine', 'learning'],
                  ['business', 'analyst'],
                  ['business', 'intelligence'],
                  ['researcher'],
                  ['junior', 'data', 'scientist'],
                  ['data', 'engineer'],
                  ['python']]

search_urls = [indeed_search(option,['Melbourne','VIC']) for option in search_options]  

search_urls

job_links = []
for url in search_urls:
    initial_list = []
    job_links = job_links + do_search(url, initial_list)
    print(len(job_links))


len(job_links)

job_links_df = pd.DataFrame(job_links)
job_links_df.to_csv('./data/data_job_links.csv')

job_links_df = pd.read_csv('./data/data_job_links.csv')


job_links_df.columns = ['number','link']
job_links_df.head()

job_links = list(job_links_df.link)

collect_and_save_job_details(job_links,'other_data_job_details.csv')

test_df = pd.read_csv('./data/other_data_job_details.csv')
test_df.head()

test_df.info()

test_df['industry'] = 'data'
test_df.head()

test_df.to_csv('./data/other_data_job_details.csv')
data_df = pd.read_csv('./data/other_data_job_details.csv')
data_df.head()

data_df = data_df.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)

data_df.info()

datascience_df = pd.read_csv('./data/data_scientist_search_results.csv')
datascience_df.head()

datascience_df['industry'] = 'data'
datascience_df = datascience_df.drop('Unnamed: 0', axis=1)
datascience_df.head()



data_jobs_df = pd.concat([datascience_df, data_df], axis=0)
data_jobs_df.head()

data_jobs_df.info()

data_jobs_df.to_csv('./data/final_data_job_details.csv')


all_data_jobs = pd.read_csv('./data/final_data_job_details.csv')
all_data_jobs.info()

### Notes from writing function to collect job details

In [None]:
# Job Title location in individual page
job_title = job_soup.find('h3')
job_title.text.strip()

In [None]:
#Company in individual page
company = job_soup.find('div', {'class':"icl-u-lg-mr--sm icl-u-xs-mr--xs"})
company.text.strip()

In [None]:
#Company rating in individual page
company_rating = float(job_soup.find('div', {'class':"icl-Ratings-starsCountWrapper"}).get('aria-label')[0:3])
company_rating
# company_rating.get('aria-label')
# <div tabindex="0" class="icl-Ratings-starsCountWrapper" aria-label="4.1 out of 5"><div class="icl-Ratings-starsWrapper"><div class="icl-Ratings-starsUnfilled"><div class="icl-Ratings-starsFilled" style="width: 76.19999885559082px;"></div></div></div><div class="icl-Ratings-count" aria-hidden="true">983 reviews</div></div>

In [None]:
#Full job description in individual page
description = job_soup.find('div',{'id':'jobDescriptionText'}).text.strip()
description

In [None]:
#Job description in individual page - switched to above
description= job_soup.find('span', {'class':"jobsearch-JobMetadataHeader-iconLabel"}).text.strip()
description


In [None]:
#Salary in individual page
try:
    salary_data = soup_job2.find('span', {'class':"icl-u-xs-mr--xs"}).text.strip()
except:
    salary_data = None
salary_data