In [30]:
#Import necessary modules
import time # to be able to run the sleep function
import csv
import re
import datetime

In [2]:
#import this to access shared functions allowing us to share code across notebooks
import sharedFunctions

In [3]:
#This function receives a page and extracts the total jobs count.
def getIndeedSearchCountPages(parsed_html):
    """Extract the total number of jobs reported by a search results page.

    Looks up the ``<div id="searchCountPages">`` element inside the page body
    and reads its fourth whitespace-separated word as the total (for a label
    shaped like "Page 1 of 492 jobs"). Only the digits of that word are kept,
    so thousands separators are ignored.

    Args:
        parsed_html: parsed page exposing ``.body.find(name, attrs=...)``
            (presumably a BeautifulSoup document -- confirm against
            sharedFunctions.downloadAndParseLink).

    Returns:
        int: the total job count, or 0 when the page has no body or no
        search-count element.

    Raises:
        IndexError: if the label has fewer than four words.
        ValueError: if the fourth word contains no digits.
    """
    body = parsed_html.body
    counter = None
    if body is not None:
        counter = body.find('div', attrs={'id': 'searchCountPages'})
    if counter is None:
        # No count element on the page: report zero items instead of failing
        # with an AttributeError on ``None.text``.
        return 0

    # split() without an argument collapses any run of spaces/newlines, so
    # leading blank lines in the element cannot shift the word positions.
    words = counter.text.split()
    return int(re.sub("[^0-9]", "", words[3]))  # remove all non numeric chars

In [4]:
#This function will receive a single page then extract all job items. 
def parseIndeedPage(parsed_html):
    """Return every ``<div class="title">`` element found in the page body.

    Each returned element is one job item of the listing page; the result is
    whatever ``parsed_html.body.find_all`` yields for that filter.
    """
    titleFilter = {'class': 'title'}
    pageBody = parsed_html.body
    jobItems = pageBody.find_all('div', attrs=titleFilter)
    return jobItems

In [5]:
# Create a function to receive a listing page, then download all job links.
def scrapeSearchPage(queryLink, baseUrl, maxRetries=3):
    """Walk every result page of a search and collect the job links it lists.

    The first page is downloaded once and used both to read the total item
    count and as the first batch of results. Further pages are requested by
    appending ``&start=<offset>`` to ``queryLink``, stepping the offset by 10.
    Every link that is not already known is appended to the module-level
    ``allJobLinks`` list, which must exist before this function is called.

    Args:
        queryLink: full URL of the search (first results page).
        baseUrl: prefix prepended to each job's ``href`` value.
        maxRetries: how many consecutive failures are tolerated for one page
            before it is skipped. Without this bound, a page that always
            fails would be retried forever.

    Returns:
        None. Results are accumulated in ``allJobLinks``.
    """
    print("Starting with " + queryLink)
    mainPage = sharedFunctions.downloadAndParseLink(queryLink)
    totalItemsCount = getIndeedSearchCountPages(mainPage)

    print("Processing for {0} items".format(totalItemsCount))
    totalProcessed = 0
    failedAttempts = 0
    # The loop condition already covers the "no items" case.
    while totalProcessed < totalItemsCount:
        try:
            if totalProcessed == 0:
                print("Processing main page")
                sections = parseIndeedPage(mainPage)
            else:
                link = queryLink + "&start=" + str(totalProcessed)
                print("Downloading {0}".format(link))
                sections = parseIndeedPage(sharedFunctions.downloadAndParseLink(link))

            for section in sections:
                lnk = baseUrl + sharedFunctions.getAttrFromTag(section.a, 'href')
                if lnk not in allJobLinks:
                    allJobLinks.append(lnk)
        except Exception as ex:
            # Catch Exception (not a bare except) so Ctrl-C / kernel interrupt
            # still stops the scrape.
            failedAttempts += 1
            print("Exception happened:", ex)
            if failedAttempts >= maxRetries:
                print("Skipping this page after {0} failed attempts".format(failedAttempts))
                totalProcessed += 10
                failedAttempts = 0
            # sometimes there are internal errors like error 500, so wait a little bit more
            print("Waiting 10 seconds")
            time.sleep(10)
        else:
            failedAttempts = 0
            totalProcessed += 10  # each page has 10 links.
            # wait for 2 seconds between pages
            time.sleep(2)

In [76]:
#This method receives a link and extracts the needed info, returning a tuple (or None when an exception occurs).
def _parsePostedOn(footerText):
    """Work out the posting date from a job page's metadata footer text.

    The footer is read as '-' separated parts, the first of which is skipped
    (the original code treated part 1 as the posting age, so part 0 is
    presumably the company name).
    """
    today = datetime.date.today()
    parts = [part.strip() for part in footerText.split('-')]

    if any(part in ('Just posted', 'Today') for part in parts[1:]):
        return today

    # Search the whole footer so a hyphen inside the first part (e.g. a
    # hyphenated company name) does not make us read the wrong field.
    aged = re.search(r'(\d+)\s*\+?\s*days?\s+ago', footerText)
    if aged:
        return today - datetime.timedelta(days=int(aged.group(1)))

    # Fallback kept for compatibility: digits of the first word of part 1.
    # Raises IndexError/ValueError when the footer has no usable age.
    days = re.sub("[^0-9]", "", parts[1].replace('+', '').split(' ')[0])
    return today - datetime.timedelta(days=int(days))


def downloadAndCreateObject(link):
    """Download one job page and extract its main fields.

    Args:
        link: URL of the job page.

    Returns:
        tuple: ``(title, companyName, location, postedOn, link)`` where the
        first three are text and ``postedOn`` is a ``YYYY-MM-DD`` string.
        Returns None when the download or the extraction raises; in that case
        the error is printed and the function sleeps 10 seconds first.
    """
    try:
        print("Downloading {0}".format(link))
        soup = sharedFunctions.downloadAndParseLink(link)

        # Keep the values as text. Encoding them to bytes made the csv module
        # write their repr (b'...') into the output file on Python 3.
        companyName = soup.find('div', attrs={'class': 'icl-u-lg-mr--sm'}).text
        title = soup.find('h3', attrs={'class': 'jobsearch-JobInfoHeader-title'}).text
        location = soup.find('span', attrs={'class': 'jobsearch-JobMetadataHeader-iconLabel'}).text

        footerText = soup.find('div', attrs={'class': 'jobsearch-JobMetadataFooter'}).text
        postedOn = _parsePostedOn(footerText)

        return (title, companyName, location, postedOn.strftime('%Y-%m-%d'), link)
    except Exception as ex:
        print("Waiting 10 seconds cause of an exception", ex)
        time.sleep(10)
        return None

In [71]:
#this list will store all the job links scraped
allJobLinks = list()  # appended to by scrapeSearchPage, iterated by the download loop

In [72]:
# Site to scrape; also used as the prefix for the job links found on each page.
indeedBaseUrl = "https://ca.indeed.com"

# Search terms, written the way they appear in the query string ('+' for spaces).
# Uncomment an entry to include it in the run.
searchTerms = [
    "remote+developer",
    #"java+developer",
    #".net+developer",
    #"ruby+on+rails",
    #"machine+learning",
    #"react+developer",
    #"mobile+developer",
    #"salesforce",
    #"database+administrator",
    #"game+developer",
    #"python+developer",
    #"data+scientist",
    #"devops",
]

for searchTerm in searchTerms:
    scrapeSearchPage(indeedBaseUrl + "/jobs?q=" + searchTerm, indeedBaseUrl)

Starting with https://ca.indeed.com/jobs?q=remote+developer
Processing for 492 items
Processing main page
Downloading https://ca.indeed.com/jobs?q=remote+developer&start=10
Downloading https://ca.indeed.com/jobs?q=remote+developer&start=20
Downloading https://ca.indeed.com/jobs?q=remote+developer&start=30
Downloading https://ca.indeed.com/jobs?q=remote+developer&start=40
Downloading https://ca.indeed.com/jobs?q=remote+developer&start=50
Downloading https://ca.indeed.com/jobs?q=remote+developer&start=60
Downloading https://ca.indeed.com/jobs?q=remote+developer&start=70
Downloading https://ca.indeed.com/jobs?q=remote+developer&start=80
Downloading https://ca.indeed.com/jobs?q=remote+developer&start=90
Downloading https://ca.indeed.com/jobs?q=remote+developer&start=100
Downloading https://ca.indeed.com/jobs?q=remote+developer&start=110
Downloading https://ca.indeed.com/jobs?q=remote+developer&start=120
Downloading https://ca.indeed.com/jobs?q=remote+developer&start=130
Downloading https:/

In [73]:
# Report how many distinct job links were collected.
print(str(len(allJobLinks)) + " jobs found!")

442 jobs found!


In [77]:
#will store all the job info retrieved until we save it to a csv file
parsedInfo = list()  # one tuple per job page that was parsed successfully

In [78]:
# Fetch every collected job link and keep the results that were produced.
downloaded = 0
for job in allJobLinks:
    item = downloadAndCreateObject(job)
    if item is None:
        # downloadAndCreateObject returned nothing for this link; skip it
        continue
    parsedInfo.append(item)
    downloaded = downloaded + 1
    print("{0} downloaded of {1}".format(downloaded, len(allJobLinks)))

Downloading https://ca.indeed.com/rc/clk?jk=3d79048cbb1efb2e&fccid=70f409019b3ef74c&vjs=3
1 downloaded of 442
Downloading https://ca.indeed.com/company/Kemkar-Projects/jobs/Data-Analytic-Developer-8e7e71f3710dcae7?fccid=46efa5dc1ef56ad9&vjs=3
2 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=cdf848812a5e205b&fccid=41765cf8195baf38&vjs=3
3 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=bbe2c232712d841d&fccid=caf3a6ce25406d04&vjs=3
4 downloaded of 442
Downloading https://ca.indeed.com/company/Accuhealth-Technologies-LLC/jobs/Splunk-Developer-5926261c26b75fe0?fccid=395d5b8ffc304e7f&vjs=3
5 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=e507b42500e8c85a&fccid=30646f984e9fca70&vjs=3
6 downloaded of 442
Downloading https://ca.indeed.com/company/South-Vancouver-Medical-Clinic/jobs/Full-Stack-Developer-9c901ec743b71343?fccid=0c2992a0a1cc5cd8&vjs=3
7 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=f5374c75e9b96fe9&fccid=3ff8c203713b7f5f&

58 downloaded of 442
Downloading https://ca.indeed.com/company/Metronome-Growth-Systems/jobs/Based-Startup-Angularj-Scala-Developer-5e5c55a4114d99d3?fccid=6c02ae7b4436e4ea&vjs=3
59 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=f87c99e9f063d4d1&fccid=9b9c2d27939c9760&vjs=3
60 downloaded of 442
Downloading https://ca.indeed.com/company/J--Squared-Technology-Inc./jobs/Embedded-Firmware-Developer-aa79e7919498bdc6?fccid=79536e36e52b60ce&vjs=3
61 downloaded of 442
Downloading https://ca.indeed.com/company/baver-IQ/jobs/Full-Stack-Developer-6f27d58bf827374a?fccid=b1d5532d18c7bd3f&vjs=3
62 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=5ea5f63abbc01437&fccid=77b7f45f470063b1&vjs=3
63 downloaded of 442
Downloading https://ca.indeed.com/company/RCI-technology-inc./jobs/Senior-Salesforce-Developer-f73a7f63e5883532?fccid=4fdf69d8c03b45c3&vjs=3
64 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=950dded353858da7&fccid=8a83c3cbe98b2895&vjs=3
65 downloaded 

125 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=464b910cbbf266dd&fccid=aa53b551f9df0210&vjs=3
126 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=c15a10c4ec0b8baa&fccid=30646f984e9fca70&vjs=3
127 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=cd58ed2c0029f1b6&fccid=9f1dba2b821d74ba&vjs=3
128 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=1b09e530f7b6aef8&fccid=382d52309d5d13f2&vjs=3
129 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=a82a444b3b9c236e&fccid=3d3faf08f32f5d69&vjs=3
130 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=f7ded6aa23e44fc9&fccid=6e69bf2577dd069d&vjs=3
131 downloaded of 442
Downloading https://ca.indeed.com/company/Enrich-Software-Corp./jobs/Quality-Assurance-Specialist-4cf8c78292a3aaf3?fccid=fbe0bb43bbbb6cc8&vjs=3
132 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=69123b44a86d020b&fccid=71a0e1993e45dfae&vjs=3
133 downloaded of 442
Downloading https://ca.indee

193 downloaded of 442
Downloading https://ca.indeed.com/company/Miipe-Inc/jobs/Technical-Business-Analyst-fed10ed233907cc9?fccid=a5a11f2a23363fc9&vjs=3
194 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=b53fee5482990021&fccid=20bb86b33a3c7256&vjs=3
195 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=0eb2ca02a8b43dec&fccid=6e7306d355e1d04b&vjs=3
196 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=e7e765b56dd678b9&fccid=8d08a26504cbffaa&vjs=3
197 downloaded of 442
Downloading https://ca.indeed.com/company/Evolve-PR-Inc./jobs/PR-Manager-1c5f6690d96d7405?fccid=1f4e5442d9a92382&vjs=3
198 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=486475e4fd55963b&fccid=411c66d8003204e5&vjs=3
199 downloaded of 442
Downloading https://ca.indeed.com/company/INOVA-Systems-Corporation/jobs/Software-Validation-Developer-c79fe488d9da5b0a?fccid=a4c9947ae16387d0&vjs=3
200 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=22c9c5eb75f5c26e&fccid

262 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=df01c851a4a2a96b&fccid=d46039b952140fd4&vjs=3
263 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=900fe9ae111f6f14&fccid=6c6be4ac01722b08&vjs=3
264 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=6540fec6900a1f97&fccid=9222b0c7c41358ed&vjs=3
265 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=e79bff3ed48e92c5&fccid=2525ee368b671cd8&vjs=3
266 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=8d35cbedfc5ffa16&fccid=4b42cbedd59e022d&vjs=3
267 downloaded of 442
Downloading https://ca.indeed.com/company/Ensemble-Systems/jobs/Software-Developer-7d47f53a58002ffc?fccid=13097cbe33b43a23&vjs=3
268 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=3808be6d6fec9030&fccid=df59c447f86e5860&vjs=3
269 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=fddcf7861c45e712&fccid=2d81393e324e67df&vjs=3
270 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk

333 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=524ce7a2edfc7baf&fccid=d8c987644cc029b0&vjs=3
334 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=dc15c29429921319&fccid=5fd06414b67bfa92&vjs=3
Waiting 10 seconds cause of an exception HTTP Error 512: 512
Downloading https://ca.indeed.com/rc/clk?jk=417a8146ded6b2a7&fccid=faa9084487ae22cc&vjs=3
335 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=b701bef84b6d0a29&fccid=e808e30f36e8f9bd&vjs=3
336 downloaded of 442
Downloading https://ca.indeed.com/company/Carleton-Technologies/jobs/Linux-System-Administrator-3b282c20bf00d548?fccid=f814cd077a429235&vjs=3
337 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=9baf52d225b3d0c8&fccid=f0d13dad68530177&vjs=3
338 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=2eb8c64e412cf862&fccid=f10ae92e96dcbaa2&vjs=3
339 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=df85092cfb52b079&fccid=344a005e52614669&vjs=3
340 downloade

404 downloaded of 442
Downloading https://ca.indeed.com/company/Video-Experts-Group/jobs/Technical-Sales-Engineer-acf0fbbf1ed2ee53?fccid=8baadeaac3350b1a&vjs=3
405 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=0737fe8634f95747&fccid=f0d13dad68530177&vjs=3
406 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=a2091d8281d89310&fccid=effce5e423b80707&vjs=3
407 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=ed50448bcbee2709&fccid=532610c5841edaef&vjs=3
408 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=c429cc1d46f81eba&fccid=5c98f9ffc20e640f&vjs=3
409 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=4319f7c3a3013bde&fccid=e808e30f36e8f9bd&vjs=3
410 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=fa22cb08d6d9e187&fccid=a7bc1f8aa3f52a78&vjs=3
411 downloaded of 442
Downloading https://ca.indeed.com/rc/clk?jk=65a99bd59fba3f03&fccid=a54980a110e1e240&vjs=3
412 downloaded of 442
Downloading https://ca.indeed.com/

In [79]:
# Save every parsed job as one CSV row under a header line.
# newline='' lets the csv module control line endings (otherwise extra blank
# rows can appear on Windows); utf-8 keeps non-ASCII titles and company names
# writable regardless of the platform's default encoding.
with open('JobInfoWithPostedDate.csv', 'w', newline='', encoding='utf-8') as out:
    csv_out = csv.writer(out)
    csv_out.writerow(['PositionTitle', 'CompanyName', 'Location', 'PostedOn', 'JobLink'])
    csv_out.writerows(parsedInfo)