In [None]:
#Import necessary modules
import time # to be able to run the sleep function
import csv
import re

In [None]:
#import this to access shared functions allowing us to share code across notebooks
import sharedFunctions

In [None]:
#This function receives a parsed page and extracts the total jobs count.
def getIndeedSearchCountPages(parsed_html):
    """Return the total job count announced on a search results page.

    The count is read from the ``<div id="searchCountPages">`` element: its
    text is split on whitespace and the fourth token is reduced to its digits
    (the text is assumed to look like "Page 1 of 1,234 jobs" -- confirm
    against the live page if the layout changes).

    Args:
        parsed_html: parsed page exposing ``body.find(...)``.

    Returns:
        int: the total job count, or 0 when the page has no counter element,
        so the caller's "nothing to process" path is taken instead of a crash.

    Raises:
        ValueError: if the counter element exists but its text does not carry
            a number in the fourth token.
    """
    counter = parsed_html.body.find('div', attrs={'id':'searchCountPages'})
    if counter is None:
        # No counter on the page: report zero results rather than failing on
        # ``None.text``.
        return 0

    # split() with no argument splits on any run of whitespace, so tokens
    # separated by newlines or tabs are still counted correctly.
    tokens = counter.text.split()
    digits = re.sub("[^0-9]", "", tokens[3]) if len(tokens) > 3 else "" #remove all non numeric chars from this string
    if not digits:
        raise ValueError("Unexpected search count text: {0!r}".format(counter.text))
    return int(digits)

In [None]:
#Given a single parsed page, collect all of its job items.
def parseIndeedPage(parsed_html):
    """Return every ``<div class="title">`` element found under the page body.

    Args:
        parsed_html: parsed page exposing ``body.find_all(...)``.

    Returns:
        Whatever ``find_all`` yields for the ``div`` / ``class="title"`` query.
    """
    titleFilter = {'class':'title'}
    pageBody = parsed_html.body
    jobItems = pageBody.find_all('div', attrs=titleFilter)
    return jobItems

In [None]:
# Create a function to receive a listing page, then download all job links.
def scrapeSearchPage(queryLink, baseUrl, pageSize=10, pageDelay=2, errorDelay=10, maxRetries=3):
    """Walk every result page of one search query and collect its job links.

    New links are appended to the notebook-level ``allJobLinks`` list, which
    must exist before this function is called; links already in that list are
    skipped.

    Args:
        queryLink: search URL; "&start=<offset>" is appended for every page
            after the first one.
        baseUrl: prefix prepended to each job's ``href``.
        pageSize: offset step between result pages (each page has 10 links).
            Must be positive.
        pageDelay: seconds to wait after a successfully processed page.
        errorDelay: seconds to wait after a failed attempt.
        maxRetries: consecutive failures tolerated on one page before that
            page is skipped, so a permanently broken page cannot stall the
            loop forever.

    Returns:
        None. Results are accumulated in ``allJobLinks``.
    """
    print("Starting with " + queryLink)
    mainPage = sharedFunctions.downloadAndParseLink(queryLink)
    totalItemsCount = getIndeedSearchCountPages(mainPage)

    print("Processing for {0} items".format(totalItemsCount))

    # Set mirror of allJobLinks: constant-time duplicate checks while the list
    # keeps the insertion order.
    knownLinks = set(allJobLinks)
    totalProcessed = 0
    failedAttempts = 0
    while totalProcessed < totalItemsCount:
        try:
            if totalProcessed == 0:
                # The first page was already downloaded to read the count.
                print("Processing main page")
                sections = parseIndeedPage(mainPage)
            else:
                link = queryLink + "&start=" + str(totalProcessed)
                print("Downloading {0}".format(link))
                sections = parseIndeedPage(sharedFunctions.downloadAndParseLink(link))

            for section in sections:
                lnk = baseUrl + sharedFunctions.getAttrFromTag(section.a, 'href')
                if lnk not in knownLinks:
                    knownLinks.add(lnk)
                    allJobLinks.append(lnk)
        except Exception as error:
            # ``Exception`` (not a bare except) keeps Ctrl-C / kernel interrupt
            # working while still surviving download and parsing errors.
            failedAttempts += 1
            print("Exception happened: {0!r}".format(error))
            if failedAttempts >= maxRetries:
                print("Skipping this page after {0} failed attempts".format(failedAttempts))
                totalProcessed += pageSize
                failedAttempts = 0
            #sometimes I noticed some internal errors like error 500, so lets wait a little bit more
            print("Waiting {0} seconds".format(errorDelay))
            time.sleep(errorDelay)
        else:
            totalProcessed += pageSize #move on to the next page of results
            failedAttempts = 0
            #be polite to the server between pages
            time.sleep(pageDelay)

In [None]:
#Looks up one element by tag name and class and returns its text, UTF-8 encoded.
def _requiredText(soup, tagName, cssClass):
    element = soup.find(tagName, attrs={'class': cssClass})
    if element is None:
        # Name the missing element instead of failing later on ``None.text``.
        raise ValueError("<{0} class={1!r}> not found".format(tagName, cssClass))
    return element.text.encode("utf-8")


#This method receives a link and extracts the needed info, returning a tuple.
def downloadAndCreateObject(link):
    """Download one job page and extract its title, company name and location.

    Args:
        link: URL of the job page.

    Returns:
        tuple: ``(title, companyName, location, link)`` where the first three
        items are the UTF-8 encoded texts of the matching elements, or
        ``None`` when the page could not be downloaded or one of the elements
        is missing.
    """
    try:
        print("Downloading {0}".format(link))
        soup = sharedFunctions.downloadAndParseLink(link)

        companyName = _requiredText(soup, 'div', 'icl-u-lg-mr--sm')
        title = _requiredText(soup, 'h3', 'jobsearch-JobInfoHeader-title')
        location = _requiredText(soup, 'span', 'jobsearch-JobMetadataHeader-iconLabel')

        return (title, companyName, location, link)
    except Exception as error:
        # ``Exception`` rather than a bare except: kernel interrupts still get
        # through, and the actual cause is reported instead of hidden.
        print("Could not process {0}: {1!r}".format(link, error))
        print("Waiting 10 seconds cause of an exception")
        time.sleep(10)
        return None

In [None]:
#accumulator for every job link gathered by the search scraping
allJobLinks = list()

In [None]:
#Site to scrape and the search terms to run against it.
#Terms are already URL-encoded (spaces written as '+'); add or remove entries here.
INDEED_BASE_URL = "https://ca.indeed.com"
SEARCH_QUERIES = [
    "remote+developer",
    "java+developer",
    ".net+developer",
    "ruby+on+rails",
    "machine+learning",
    "react+developer",
    "mobile+developer",
    "salesforce",
    "database+administrator",
    "game+developer",
    "python+developer",
    "data+scientist",
    "devops",
]

#Run the searches one after another, in the order listed above.
for searchQuery in SEARCH_QUERIES:
    scrapeSearchPage(INDEED_BASE_URL + "/jobs?q=" + searchQuery, INDEED_BASE_URL)

In [None]:
#report how many job links were collected
jobLinksFound = len(allJobLinks)
print("{0} jobs found!".format(jobLinksFound))

In [None]:
#holds every job info tuple retrieved, until it is saved to a csv file
parsedInfo = list()

In [None]:
#Links that already have a row in parsedInfo are skipped (item 3 of each row is
#the job link), so re-running this cell only retries what is still missing
#instead of appending duplicate rows.
alreadyParsed = set(row[3] for row in parsedInfo)
downloaded = len(parsedInfo)
for job in allJobLinks:
    if job in alreadyParsed:
        continue
    #scrape this link info
    item = downloadAndCreateObject(job)
    if item is not None:
        parsedInfo.append(item)
        alreadyParsed.add(job)
        downloaded += 1
        print("{0} downloaded of {1}".format(downloaded, len(allJobLinks)))

In [None]:
#newline='' lets the csv module control the line endings (otherwise blank rows
#can appear on Windows); the explicit encoding keeps non-ASCII text portable.
with open('JobInfo.csv', 'w', newline='', encoding='utf-8') as out:
    csv_out=csv.writer(out)
    csv_out.writerow(['PositionTitle','CompanyName', 'Location', 'JobLink'])
    for row in parsedInfo:
        #text fields arrive UTF-8 encoded; decode them so the file holds the
        #actual text rather than the b'...' representation of the bytes
        csv_out.writerow([field.decode('utf-8') if isinstance(field, bytes) else field
                          for field in row])