In [54]:
import pandas as pd

from bs4 import BeautifulSoup
import urllib.request, urllib.parse, urllib.error
import requests
import re 

from scrapy.selector import Selector

from selenium import webdriver
from time import sleep

# Strategy for collection

1. Pull job links from a search site
2. Iterate through collected links to gather data on specific jobs from individual pages:
    * job title
    * company 
    * company rating
    * location
    * salary (if provided)
    * brief description

Query URLs follow a simple format, so we can easily generate search queries. For example, a search for "data scientist" in Melbourne, VIC is: https://au.indeed.com/jobs?q=data+scientist&l=Melbourne+VIC <br>


### These functions allow us to do the basic data collection

In [2]:
#storing both selenium and request version to allow switching

def make_fresh_soup(url, timeout=30):
    '''Download a page and parse it into a BeautifulSoup object.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout a stalled connection would block the scrape forever.

    Returns
    -------
    BeautifulSoup
        The parsed body of the response.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or times out.
    '''

    #plain http request: javascript on the page is NOT executed
    response = requests.get(url, timeout=timeout)
    html = response.text

    #selenium alternative, kept to allow switching back:
    #use driver and wait one second for javascript to run before capturing html
#     driver = webdriver.Chrome(executable_path="./chromedriver/chromedriver.exe")
#     driver.get(url)
#     sleep(1)
#     html = driver.page_source
#     #close the driver to keep things clean
#     driver.close()

    #name the parser explicitly so the parse tree does not depend on which
    #optional parsers (lxml, html5lib) happen to be installed
    soup = BeautifulSoup(html, 'html.parser')

    return soup

In [3]:
def indeed_search(search_terms, location=['Melbourne','VIC']):

    '''Build the url of an Indeed (au.indeed.com) job search.

    Parameters
    ----------
    search_terms : list of str
        Words of the query, e.g. ['data', 'scientist'].
    location : list of str, optional
        [city, state], default ['Melbourne', 'VIC']. Pass None or an empty
        list to search without a location filter.

    Returns
    -------
    str
        The search url, e.g.
        'https://au.indeed.com/jobs?q=data+scientist&l=Melbourne+VIC'
    '''

    #the default list is only read, never mutated, so sharing it between calls is safe
    params = [('q', ' '.join(search_terms))]

    #adding location if provided
    if location:
        params.append(('l', ' '.join(location)))

    #urlencode turns spaces into '+' and percent-escapes characters such as
    #'+' or '&' that would otherwise corrupt the query string
    search_url = 'https://au.indeed.com/jobs?' + urllib.parse.urlencode(params)

    return search_url



In [4]:
def get_job_links(soup_search):

    '''Collect the links to individual job pages from an Indeed search page.

    Parameters
    ----------
    soup_search : BeautifulSoup
        Parsed Indeed search results page.

    Returns
    -------
    list of str
        Absolute urls of every job-title anchor on the page, in page order.
        Anchors that carry no href are skipped.
    '''

    anchors = soup_search.find_all('a', attrs={'data-tn-element':'jobTitle'})

    links = []
    for anchor in anchors:
        href = anchor.get('href')
        #an anchor without href would raise on concatenation and abort the whole page
        if not href:
            continue
        #adds website root to collected page elements
        links.append(urllib.parse.urljoin('https://au.indeed.com', href))

    return links

In [5]:
def get_next_link(soup_search):

    '''Find the link to the next page of results for a search.

    Parameters
    ----------
    soup_search : BeautifulSoup
        Parsed Indeed search results page.

    Returns
    -------
    str
        Absolute url of the next results page, or the string 'end' when there
        is no further page (last page, or a page with no pagination links).
    '''

    pagination = soup_search.find('div',{'class':'pagination'})

    #no pagination block at all: nothing further to visit
    if pagination is None:
        return 'end'

    page_links = pagination.find_all('a')
    if not page_links:
        return 'end'

    #the link to the next page, when present, is the last of the page links
    last_page_link = page_links[-1]
    href = last_page_link.get('href')

    #if the text for that link is Next, grab the link
    if href and last_page_link.text.strip().startswith('Next'):
        return urllib.parse.urljoin('https://au.indeed.com', href)

    return 'end'

In [6]:
def do_search(url, links_list, count=1):

    '''Walk through every results page of a search and collect the job links.

    Parameters
    ----------
    url : str
        Url of the first results page to visit.
    links_list : list of str
        Links gathered so far (normally an empty list). It is not modified.
    count : int, optional
        Page number of `url` (default 1); only used for the progress message.

    Returns
    -------
    list of str
        `links_list` followed by the job links found on every page visited.
        If a page cannot be fetched or parsed, the links collected up to
        that point are returned instead of raising.
    '''

    #copy so the caller's list is left untouched
    all_links = list(links_list)
    visited = set()
    next_url = url

    #loop instead of recursing so long searches cannot hit the recursion limit
    while True:
        visited.add(next_url)

        #including error handling to return result even if a page fails
        try:
            soup_search = make_fresh_soup(next_url)

            #save the links from the soup object
            all_links.extend(get_job_links(soup_search))

            #find the link to the next page
            next_url = get_next_link(soup_search)
        except Exception as err:
            print('search stopped early on page ' + str(count) + ': ' + repr(err))
            return all_links

        #stop on the last page, or if pagination loops back to a page already seen
        if next_url == 'end' or next_url in visited:
            print(str(count) + ' pages of jobs searched')
            return all_links

        count += 1

In [7]:
def _first_text(soup, tag, attrs=None):
    '''Return the stripped text of the first matching element, or None if there is none.'''
    element = soup.find(tag, attrs or {})
    if element is None:
        return None
    return element.text.strip()


def _company_rating(soup):
    '''Return the company rating as a float, the raw 3-character label prefix
    when it is not numeric, or None when the rating element is absent.'''
    wrapper = soup.find('div', {'class':"icl-Ratings-starsCountWrapper"})
    if wrapper is None:
        return None

    label = wrapper.get('aria-label')
    if label is None:
        return None

    #formatting is slightly inconsistent on ratings: keep the raw prefix
    #whenever it cannot be read as a number
    prefix = label[0:3]
    try:
        return float(prefix)
    except (TypeError, ValueError):
        return prefix


def get_job_details(job_url):

    '''Extract the details of one job listing.

    Parameters
    ----------
    job_url : str
        Url of an individual job page.

    Returns
    -------
    dict
        Keys 'job_title', 'company', 'location', 'job_description_all_text',
        'company_rating' and 'salary_data_text'. A value is None whenever the
        corresponding element is not found on the page.
    '''

    #fetch and parse the job page
    soup_job = make_fresh_soup(job_url)

    job_details = {}

    #these elements are almost always present, but a missing one yields None
    job_details['job_title'] = _first_text(soup_job, 'h3')
    job_details['company'] = _first_text(soup_job, 'div', {'class':"icl-u-lg-mr--sm icl-u-xs-mr--xs"})
    job_details['location'] = _first_text(soup_job, 'div', {'class':"location"})
    job_details['job_description_all_text'] = _first_text(soup_job, 'div', {'id':'jobDescriptionText'})

    job_details['company_rating'] = _company_rating(soup_job)

    #salary data is often missing
    job_details['salary_data_text'] = _first_text(soup_job, 'span', {'class':"icl-u-xs-mr--xs"})

    return job_details


In [8]:
def collect_and_save_job_details(job_links, industry, location, filename):

    """Scrape every job in a list of links and save the results as a csv.

    Parameters
    ----------
    job_links : list of str
        Urls of the individual job pages.
    industry : str
        Value written to the 'industry' column of every row.
    location : str
        Value written to the 'location' column of every row.
    filename : str
        Name of the csv file, created inside ./data/.

    Returns
    -------
    None
        The data is written to ./data/<filename>.
    """

    #local import keeps this cell self-contained
    import os

    file_path = './data/' + filename

    #create the output folder before scraping, so a missing ./data directory
    #cannot discard the collected results when the csv is written at the end
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    jobs = [get_job_details(job) for job in job_links]

    jobs_df = pd.DataFrame(jobs)
    jobs_df['industry'] = industry
    #NOTE: this replaces the per-listing 'location' value returned by
    #get_job_details with the search location
    jobs_df['location'] = location
    jobs_df.to_csv(file_path, index=False)
    

In [11]:
def industry_location_search(industries, locations):

    '''Run an Indeed search for every industry/location pair and save each result.

    industries: list of lists of search terms (a single term still needs its own list)
    locations: list of [city, state] pairs, e.g. [['Melbourne', 'VIC']]

    One csv per pair is written through collect_and_save_job_details, named
    <terms joined together>_<lower-cased city>.csv
    '''

    for terms in industries:
        for place in locations:

            #start from an empty list and gather the links from every results page
            found_links = do_search(indeed_search(terms, place), [])

            city = place[0].lower()
            #search terms run together without a separator, e.g. 'datascience'
            label = terms[0] + ''.join(terms[1:])

            collect_and_save_job_details(found_links, label, city, label + '_' + city + '.csv')
        

### Functions to retrieve and combine data from specific industries & locations

In [4]:
def get_file_names(industries, locations):

    '''Build the paths of the csv files saved for each industry/location pair.

    industries and locations need to be the lists of lists used for searching.
    Returns a list of './data/<terms joined>_<lower-cased city>.csv' strings,
    ordered by industry first and location second. No file is opened.
    '''

    paths = []

    for terms in industries:
        for place in locations:
            city = place[0].lower()
            #search terms run together without a separator, e.g. 'datascience'
            label = terms[0] + ''.join(terms[1:])
            paths.append('./data/' + label + '_' + city + '.csv')

    return paths

In [28]:
def combine_files(industries, locations, final_file_path = './data/new_combined_data.csv'):

    '''Combine the csv files saved for each industry/location pair into one file.

    Parameters
    ----------
    industries : list of list of str
        Search-term lists, as used for searching.
    locations : list of list of str
        [city, state] pairs, as used for searching.
    final_file_path : str, optional
        Where the combined csv is written
        (default './data/new_combined_data.csv').

    Returns
    -------
    pandas.DataFrame
        All rows of the individual files, in industry-then-location order,
        with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If industries or locations is empty, so there is nothing to combine.
    FileNotFoundError
        If one of the expected csv files does not exist.
    '''

    frames = []

    for industry in industries:

        #search terms run together without a separator, e.g. 'datascience'
        industry_string = ''.join(industry)

        for location in locations:

            location_string = location[0].lower()
            filename = './data/' + industry_string + '_' + location_string + '.csv'
            frames.append(pd.read_csv(filename))

    if not frames:
        raise ValueError('no industry/location pairs given, nothing to combine')

    #ignore_index gives the result unique row labels; otherwise every file's
    #0..k index is repeated and label-based selection returns several rows
    full_dataset = pd.concat(frames, ignore_index=True)
    full_dataset.to_csv(final_file_path, index=False)

    return full_dataset

### Collecting some data

In [None]:
#Melbourne data was collected in an earlier run
#this run collects data for further locations
industries = [phrase.split() for phrase in (
    'data science', 'data analyst', 'construction', 'hospitality',
    'nursing', 'manufacturing', 'retail', 'finance', 'early learning',
)]
locations = [place.split() for place in (
    'Sydney NSW', 'Perth WA', 'Adelaide SA',
    'Canberra ACT', 'Brisbane QLD', 'Darwin NT',
)]

#runs one search per industry/location pair and saves a csv for each
industry_location_search(industries,locations)

In [29]:
#combine the saved csv files of every industry and location into one dataframe
industries = [phrase.split() for phrase in (
    'data science', 'data analyst', 'construction', 'hospitality',
    'nursing', 'manufacturing', 'retail', 'finance', 'early learning',
)]
locations = [place.split() for place in (
    'Sydney NSW', 'Perth WA', 'Adelaide SA',
    'Canberra ACT', 'Brisbane QLD', 'Darwin NT',
)]

df = combine_files(industries, locations)
df.shape

(28156, 7)

In [31]:
#keep only the data-related searches for project 4
#NOTE: combine_files is called with its default final_file_path, so this
#overwrites ./data/new_combined_data.csv with the data-jobs subset
industries = [phrase.split() for phrase in ('data science', 'data analyst')]
locations = [place.split() for place in (
    'Sydney NSW', 'Perth WA', 'Adelaide SA',
    'Canberra ACT', 'Brisbane QLD', 'Darwin NT',
)]

df = combine_files(industries, locations)
df.shape

(4643, 7)

### Options for improvement: 
* Use the suggested searches at the bottom of the final page
    * Continue until when? Perhaps until some arbitrary amount of data has been collected
