In [1]:
import os
import re
import time
from functools import reduce
# NOTE(review): the next import only resolves on Python 2; on Python 3 `urlopen` lives in `urllib.request`.
from urllib import urlopen

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

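Unikey and password: we recommend supplying these through environment variables rather than typing them into the notebook.

On UNIX-like systems, run `export UNIKEY=<your unikey>` and `export PASS=<your password>` in the shell, in this folder, before starting Jupyter.
On Windows, use `set UNIKEY=<your unikey>` (Command Prompt) or `$env:UNIKEY = "<your unikey>"` (PowerShell) instead, and likewise for `PASS`.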

In [2]:
def credential_from_env(name, placeholder):
    """Return the credential held in environment variable `name`.

    A value that is already present in the environment is returned untouched.
    Only when the variable is missing is it set to `placeholder` (and that
    placeholder returned), so the placeholder must be replaced by hand if the
    variable was not exported before starting the notebook.
    """
    # setdefault never overwrites an existing value, so a prior
    # `export UNIKEY=...` is honoured instead of being clobbered.
    return os.environ.setdefault(name, placeholder)


user = credential_from_env('UNIKEY', "ENTERUNIKEY")
password = credential_from_env('PASS', "ENTERPASS")

In [3]:
# Start a Chrome session driven by Selenium.
driver = webdriver.Chrome()

# Let every element lookup poll for up to 10 seconds before giving up, so that
# running the cells back-to-back does not fail while a page is still loading.
driver.implicitly_wait(10)

# Open the CareerHub student login page.
driver.get("https://careerhub.sydney.edu.au/students/login?ReturnUrl=%2f")

In [4]:
# Click current student
# `find_element(By.XPATH, ...)` is used instead of `find_element_by_xpath`,
# which newer Selenium releases no longer provide.
driver.find_element(By.XPATH, "/html/body/div[1]/div/div/div/div[2]/div/div[1]/div[1]/a").click()

# Alternatively, if you prefer a CSS selector
#driver.find_element(By.CSS_SELECTOR, 'div.login-service > a').click()

In [5]:
# Identify the login text boxes, then type the username and password.
# `find_element(By.ID, ...)` replaces the `find_element_by_id` helper, which
# newer Selenium releases no longer provide.
elem1 = driver.find_element(By.ID, "userNameInput")
elem1.send_keys(user)

elem2 = driver.find_element(By.ID, "passwordInput")
elem2.send_keys(password)

# Click submit, then pause briefly to give the post-login page time to load.
driver.find_element(By.XPATH, '//*[@id="submitButton"]').click()
time.sleep(2)

In [6]:
# Click the search button on the home page to list jobs
# (extra search parameters could be filled in before this click).
# `find_element(By.XPATH, ...)` replaces `find_element_by_xpath`, which newer
# Selenium releases no longer provide.
driver.find_element(By.XPATH, '//*[@id="home-feature"]/div/form/div[1]/div/div[2]/button').click()

In [7]:
# Show all jobs (dropdown list) -- comment this out for testing since you want to limit jobs.
# First click opens the dropdown, second click picks its sixth entry.
# `find_element(By.XPATH, ...)` replaces `find_element_by_xpath`, which newer
# Selenium releases no longer provide.
driver.find_element(By.XPATH, '//*[@id="main"]/div[2]/div/div[3]/div[1]/div[2]/div/div/button').click()
driver.find_element(By.XPATH, '//*[@id="main"]/div[2]/div/div[3]/div[1]/div[2]/div/div/ul/li[6]/a').click()

First, collect the link for each job. Then visit each job page individually and scrape it.

In [8]:
# Parse the results page the browser is currently showing.
soup = BeautifulSoup(driver.page_source, 'html.parser')
base_url = 'https://careerhub.sydney.edu.au'

# Matches hrefs containing a slash followed by a letter and word characters;
# compiled once instead of on every loop iteration.
job_link_pattern = re.compile(r'[/]([a-z]|[A-Z])\w+')

job_pages = []
# Each job entry on the results page is a `list-group-item` div.
for tag in soup.find_all("div", {"class": "list-group-item"}):
    link = tag.find('a', href=job_link_pattern)
    if link is None:
        # Entry without a matching link: skip it rather than crash on `None`.
        continue
    # The href is site-relative, so prefix it with the site root.
    job_pages.append(base_url + link['href'])

# Alternative way
# job_pages = [base_url + (job.find('a')['href']) for job
#              in soup.find_all('div', class_='list-group-item')]

Columns: Title, Company, Location, Description, Type of Work, Contract Type, Contract Hours, Remuneration, etc.

Let's walk through scraping one page.

In [9]:
# A single job posting used to walk through the scraping steps; load it in the browser.
sample_page = 'https://careerhub.sydney.edu.au/students/jobs/detail/3382205/graduate-trader'
driver.get(sample_page)

In [10]:
# Parse the HTML of the page currently loaded in the browser.
job_soup = BeautifulSoup(driver.page_source, 'html.parser')

In [11]:
# Title, Company, Location(s): They are in `under-nav` class
# Title is h3, Company is h4
# Location(s) inside `p`
container = job_soup.find(class_='under-nav')
title = container.h3.text
company = container.h4.span.text
locations = '#'.join([loc.text.strip() for loc in container.p.find_all('span', class_=None)])

print(title)
print(company)
print(locations)

Graduate Trader
Maven
London, United Kingdom#Hong Kong


In [12]:
# Now, we want the details section and other information.

panels = job_soup.find_all('div', class_='panel-heading')
panels = list(filter(lambda s: s.text in ['Details' , 'Other information'], panels))

DETAILS = 0
INFO = 1

# Panel Content is the next sibling of the header:
# So we do panels.find_next_sibling()

# Our details
description = panels[DETAILS].find_next_sibling().text.strip()

# Other information: This one is a bit tricky since all text blocks are siblings.
# I'm making a dictionary. Key: The header (e.g. Type of Work), Value: The stuff
# The headers are `strong` tags. So we keep getting its siblings until we hit another strong tag.
other_info_dict = {}

for info in panels[INFO].find_next_sibling().find_all('strong'):
    text = []
    # Go to each sibling until next strong
    s = info.find_next_sibling()
    while s != None and s.name != 'strong' and s.name != 'small':
        text.append(s.text)
        s = s.find_next_sibling()
    other_info_dict[info.text] = '#'.join(list(map(lambda s: s.strip(), text)))



In [13]:
# Display the text extracted from the Details panel.
description

u"Maven is looking for outstanding recent graduates or students in their final year of study, to commence as traders in Autumn 2019. You will complete an intensive training program run by experienced traders that will include both classroom sessions and time on the trading floor. You will be executing live trades within three months of commencement, or as soon as you demonstrate the necessary ability in training. You will be responsible for the profit and loss of your book and for managing risk, both individually and as part of a team. You will be encouraged to develop new tools to further the company\u2019s existing trading strategies and to develop new strategies. This is a full-time permanent role and we are recruiting for our London and Hong Kong offices.CANDIDATE SPECIFICATIONS:We are looking for applicants from any quantitative background with a competitive nature, a strong understanding of probability and statistics, and an interest in financial markets. Requirements for eligibl

In [14]:
# Display the header -> value dictionary built from the "Other information" panel.
other_info_dict

{u'Commences': u'September 2019',
 u'Contract Hours': u'Full Time',
 u'Contract Type': u'Permanent',
 u'Remuneration': u'\xa350,000',
 u'Residency Requirements': u'All candidates considered including international students',
 u'Type Of Work': u'Graduate Programs#Full Time',
 u'Website': u'Go to website\n/students/jobs/UrlRedirect/3382205'}

Note that for the `Website` entry, the actual `href` is only a redirect URL, so it is not useful on its own.

## Let's try to get 10 jobs

In [15]:
# Number of job pages to scrape in this trial run; set to None to scrape every collected link.
N_SAMPLES = 10
# Slicing with None as the bound keeps the whole list.
samples = job_pages[:N_SAMPLES]

In [16]:
# Initial Columns for our dataframe
columns = ['Title', 'Company', 'Location(s)', 'Details']
jobs_df = pd.DataFrame([], columns=columns)
# We don't know how many columns of `other info` so we will wait until we have all of them first.

In [17]:
# Positions of the 'Details' and 'Other information' headings after filtering.
DETAILS = 0
INFO = 1


def _parse_other_info(info_body):
    """Turn the body of the 'Other information' panel into a dictionary.

    Each `strong` tag inside `info_body` is a header. Its value is the stripped
    text of every following sibling up to the next `strong` or `small` tag,
    joined with '#'.
    """
    other_info = {}
    for header in info_body.find_all('strong'):
        values = []
        sibling = header.find_next_sibling()
        while sibling is not None and sibling.name not in ('strong', 'small'):
            values.append(sibling.text.strip())
            sibling = sibling.find_next_sibling()
        other_info[header.text] = '#'.join(values)
    return other_info


def _scrape_job(job_soup):
    """Extract one job from its parsed page.

    Returns a pair: the list [title, company, locations, description] and the
    dictionary of 'Other information' headers. Raises if any expected element
    is missing from the page.
    """
    # Job basic info lives in the `under-nav` element.
    container = job_soup.find(class_='under-nav')
    title = container.h3.text
    company = container.h4.span.text
    locations = '#'.join([loc.text.strip()
                          for loc in container.p.find_all('span', class_=None)])

    # The content of a panel is the next sibling of its heading.
    panels = [heading for heading in job_soup.find_all('div', class_='panel-heading')
              if heading.text in ['Details', 'Other information']]
    description = panels[DETAILS].find_next_sibling().text.strip()
    other_info = _parse_other_info(panels[INFO].find_next_sibling())

    return [title, company, locations, description], other_info


error_page = []
# Basic fields and other-info dictionaries are collected in parallel lists so
# that row i of one always belongs to the same job as row i of the other.
job_rows = []
# Keep the other info dictionaries in here to merge later
job_other_info = []
for page in samples:
    driver.get(page)
    time.sleep(2)
    job_soup = BeautifulSoup(driver.page_source, 'html.parser')

    try:
        row, other_info_dict = _scrape_job(job_soup)
    except Exception as exc:
        # Record the failing page and move on. Nothing is stored for it, so
        # the two lists stay aligned.
        print('Could not scrape %s: %r' % (page, exc))
        error_page.append(page)
        continue

    job_rows.append(row)
    job_other_info.append(other_info_dict)

# Build the frame once from all rows; re-running this cell starts from scratch
# instead of adding duplicate rows.
jobs_df = pd.DataFrame(job_rows, columns=columns)

In [18]:
# Get all unique headers from `other information`
other_columns = np.unique(reduce(lambda x, y: x + y, map(lambda l: list(l.keys()), job_other_info)))

other_info_df = pd.DataFrame([], columns=other_columns)
for info in job_other_info:
    other_info_df = other_info_df.append(pd.Series(info), ignore_index=True)

In [19]:
# Put the basic job fields and the other-info columns side by side (axis=1 aligns them on the row index).
career_df = pd.concat([jobs_df, other_info_df], axis=1)

In [20]:
# Display the combined table.
career_df

Unnamed: 0,Title,Company,Location(s),Details,Agent,Commences,Contract Hours,Contract Type,Reference Code,Remuneration,Residency Requirements,Type Of Work,Website
0,IT/Enterprise: Systems Administrator,DE Shaw & CO.,United States of America,The D. E. Shaw group is seeking motivated indi...,,.,Full Time,Permanent,,.,,Full Time,
1,Secondary Teacher - Relocate to London,Sugarman International,"Brisbane, QLD, Australia#Sydney, NSW, Australi...",Sugarman Education is a Leading Education recr...,Sugarman Group International - International +...,January 2019,Full Time,Temporary,BRISSEPTDP02,£150 a day,,Overseas Opportunities#Full Time,Go to website\n/students/jobs/UrlRedirect/3382968
2,Primary Teacher - Relocate to London,Sugarman International,"Brisbane, QLD, Australia#Melbourne, VIC, Austr...",Sugarman Education is a Leading Education recr...,Sugarman Group International - International +...,January 2019,Full Time,Temporary,BRISSEPTDP01,£150 a day,,Overseas Opportunities#Full Time,
3,Practice Area Lead - Legal,LexisNexis HongKong,"Hong Kong, China",Who are we?LexisNexis gives legal professional...,,as soon as possible,Full Time,Permanent,,negotiable,,Overseas Opportunities,
4,IT Graduate Trainee Programme 2018 - Winter In...,Cathay Pacific Airways Limited (CX),Hong Kong,Cathay Pacific IT Graduate Trainee Programme A...,,Late-Feb/ early-Mar 2019,Full Time,Permanent,,TBA,All candidates considered including internatio...,Graduate Programs#Overseas Opportunities#Full ...,Go to website\n/students/jobs/UrlRedirect/3382533
5,PERMANENT OUTBACK VET EXPERIENCE,Redgum Vet & Pet Boarding,Australia,WANTED!!!!! Unique Country Veterinarian Are...,,12-11-2018,Full Time,Permanent,,"start @ $27.50 hr, incremental inc. over 6 mnths",All candidates considered including internatio...,Full Time,Go to website\n/students/jobs/UrlRedirect/3382954
6,Clubs & Societies Coordinator,University of Sydney Union,"Camperdown, NSW, Australia",Work a 35 hour weekJoin a fun environment with...,,ASAP,Full Time,Temporary,USU4918,"$55,000",All candidates considered including internatio...,Jobs on Campus#Contract#Full Time#Work Experience,Go to website\n/students/jobs/UrlRedirect/3383001
7,Marketing Officer/Manager Full Time in Sydney ...,Advisory Centre for Australian Education (ACAE),"City of Sydney, New South Wales, Australia",We are an experienced and reputable education ...,,Immiediatelly,Full Time,Permanent,,"$42,000 Plus depending on experience",All candidates considered including internatio...,Full Time,Go to website\n/students/jobs/UrlRedirect/3382972
8,Business Development Representative,Insider,"Sydney, NSW, Australia","Meet Insider, the first integrated Growth Mana...",,Flexible,Full Time,Permanent,,60-80K AUD,Australian Citizens and Permanent Residents on...,Alumni (up to five years)#Full Time,Go to website\n/students/jobs/UrlRedirect/3382535
9,Casual Writer,Urbanite Technologies,"Chippendale, NSW, Australia",The Urbanite Journal is an upcoming magazine t...,,As soon as possible,Casual,Permanent,,$25 per hour,All candidates considered including internatio...,Casual/Part-time#Contract,Go to website\n/students/jobs/UrlRedirect/3382523


In [21]:
# Write the combined table to sample.csv as UTF-8, without the row index.
career_df.to_csv('sample.csv', index=False,encoding="utf-8" )

In [22]:
# Load the saved CSV back into a dataframe.
read_df = pd.read_csv('sample.csv')

In [23]:
# Display the dataframe loaded from sample.csv.
read_df

Unnamed: 0,Title,Company,Location(s),Details,Agent,Commences,Contract Hours,Contract Type,Reference Code,Remuneration,Residency Requirements,Type Of Work,Website
0,IT/Enterprise: Systems Administrator,DE Shaw & CO.,United States of America,The D. E. Shaw group is seeking motivated indi...,,.,Full Time,Permanent,,.,,Full Time,
1,Secondary Teacher - Relocate to London,Sugarman International,"Brisbane, QLD, Australia#Sydney, NSW, Australi...",Sugarman Education is a Leading Education recr...,Sugarman Group International - International +...,January 2019,Full Time,Temporary,BRISSEPTDP02,£150 a day,,Overseas Opportunities#Full Time,Go to website\n/students/jobs/UrlRedirect/3382968
2,Primary Teacher - Relocate to London,Sugarman International,"Brisbane, QLD, Australia#Melbourne, VIC, Austr...",Sugarman Education is a Leading Education recr...,Sugarman Group International - International +...,January 2019,Full Time,Temporary,BRISSEPTDP01,£150 a day,,Overseas Opportunities#Full Time,
3,Practice Area Lead - Legal,LexisNexis HongKong,"Hong Kong, China",Who are we?LexisNexis gives legal professional...,,as soon as possible,Full Time,Permanent,,negotiable,,Overseas Opportunities,
4,IT Graduate Trainee Programme 2018 - Winter In...,Cathay Pacific Airways Limited (CX),Hong Kong,Cathay Pacific IT Graduate Trainee Programme A...,,Late-Feb/ early-Mar 2019,Full Time,Permanent,,TBA,All candidates considered including internatio...,Graduate Programs#Overseas Opportunities#Full ...,Go to website\n/students/jobs/UrlRedirect/3382533
5,PERMANENT OUTBACK VET EXPERIENCE,Redgum Vet & Pet Boarding,Australia,WANTED!!!!! Unique Country Veterinarian Are...,,12-11-2018,Full Time,Permanent,,"start @ $27.50 hr, incremental inc. over 6 mnths",All candidates considered including internatio...,Full Time,Go to website\n/students/jobs/UrlRedirect/3382954
6,Clubs & Societies Coordinator,University of Sydney Union,"Camperdown, NSW, Australia",Work a 35 hour weekJoin a fun environment with...,,ASAP,Full Time,Temporary,USU4918,"$55,000",All candidates considered including internatio...,Jobs on Campus#Contract#Full Time#Work Experience,Go to website\n/students/jobs/UrlRedirect/3383001
7,Marketing Officer/Manager Full Time in Sydney ...,Advisory Centre for Australian Education (ACAE),"City of Sydney, New South Wales, Australia",We are an experienced and reputable education ...,,Immiediatelly,Full Time,Permanent,,"$42,000 Plus depending on experience",All candidates considered including internatio...,Full Time,Go to website\n/students/jobs/UrlRedirect/3382972
8,Business Development Representative,Insider,"Sydney, NSW, Australia","Meet Insider, the first integrated Growth Mana...",,Flexible,Full Time,Permanent,,60-80K AUD,Australian Citizens and Permanent Residents on...,Alumni (up to five years)#Full Time,Go to website\n/students/jobs/UrlRedirect/3382535
9,Casual Writer,Urbanite Technologies,"Chippendale, NSW, Australia",The Urbanite Journal is an upcoming magazine t...,,As soon as possible,Casual,Permanent,,$25 per hour,All candidates considered including internatio...,Casual/Part-time#Contract,Go to website\n/students/jobs/UrlRedirect/3382523
