In [1]:
import sys
import re
import requests
import urllib.parse
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

from geopy.geocoders import Nominatim
from geopy.distance import geodesic as GD

import time
from selenium import webdriver 
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [2]:
chrome_options = Options()
chrome_options.add_argument("--start-maximized")  # Start maximized
#chrome_options.add_argument("--disable-popup-blocking")  # Disable pop-ups

service = Service('C:\\Users\\Morgana\\Downloads\\chromedriver-win64\\chromedriver-win64\\chromedriver.exe')  # Update with your path to chromedriver
browser = webdriver.Chrome(service=service, options=chrome_options)

browser.get('https://www.jobbank.gc.ca/jobsearch/jobsearch?flg=E&fcan=1&fss=1&fss=C&fwcl=B&fwcl=C&fwcl=D&fwcl=E&fglo=1&sort=M&fwht=D&fwht=M&fsrc=16&fjnc=1&fexp=0&fexp=1&fprov=AB&fprov=ON&fskl=101020&fskl=101010')

In [3]:
def click_checkbox(label_id):
        try:
            wait = WebDriverWait(browser, 30)
            # Scroll the specific div to make the label visible
            province_list = browser.find_element(By.ID, 'province-type')
            browser.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', province_list)
            time.sleep(1)  # Wait for scroll (Ontario and other provinces)
            
            # Wait until the label is clickable
            label = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, f'label[for="{label_id}"]')))
            label.click()
            print(f"{label_id} label clicked successfully.")
            
        except Exception as e:
            print(f"An error occurred with label {label_id}: {e}")

def click_employment_groups():
    try:
        employment_groups_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'h3.legend')))
        browser.execute_script("arguments[0].click();", employment_groups_button)
        print("Clicked on Employment groups button.")

        # Wait for the "Temporary foreign workers" checkbox to be visible
        temporary_foreign_workers_checkbox = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'input#emplgroupitem_13')))
        print("Temporary foreign workers checkbox is visible.")
        temporary_foreign_workers_checkbox.click()

        time.sleep(3)
        
    except Exception as e:
        print(f"An error occurred: {e}")
        
def load_more():
    wait = WebDriverWait(browser, 10)
    more_button = wait.until(EC.element_to_be_clickable((By.ID, 'moreresultbutton')))
    more_button.click()

In [4]:
wait = WebDriverWait(browser, 10)

try:
    # Wait until pop-up appears and then close it.
    popup = wait.until(EC.presence_of_element_located((By.ID, 'outOfCountry-popup')))
    close_button = wait.until(EC.element_to_be_clickable((By.ID, 'j_id_39:outOfCanadaCloseBtn')))
    close_button.click()
    
    #Enable province parameters.  
    click_checkbox('provitem_AB')
    click_checkbox('provitem_ON')

    #Only works allowed for TFW (Temporary Foreign Workers)
    click_employment_groups()
except:
    pass


# Wait until "Show More Results" button shows up and click it.
for repeat in range(1):
    load_more()

# Wait until "Show More Results" button appears again, so we know it's fully loaded.
wait.until(EC.element_to_be_clickable((By.ID, 'moreresultbutton')))
wait = WebDriverWait(browser, 150)

# Extract the new HTML content
new_html = browser.page_source
print(new_html)

    

provitem_AB label clicked successfully.
provitem_ON label clicked successfully.
Clicked on Employment groups button.
An error occurred: Message: 

<html class="js backgroundsize borderimage csstransitions fontface svg details progressbar meter mathml cors xlargeview wb-enable" lang="en" dir="ltr"><head id="j_id_5">
		<meta charset="utf-8">
		
		<title>Available jobs - Search - Job Bank
			      
		</title>
		<meta content="width=device-width,initial-scale=1" name="viewport">
		
		<meta name="dcterms.language" content="eng">
		<meta name="dcterms.creator" content="Employment and Social Development Canada">
		<meta name="dcterms.service" content="ESDC-EDSC_JobBank-GuichetEmplois">
		<meta name="dcterms.accessRights" content="2"><meta name="dcterms.title" content="Search results - Search - Job Bank">
			<meta property="og:title" content="Available jobs - Search">
			<meta property="og:description" name="description" content="View 141,618 job postings near various occupations on Job Bank, 

In [5]:
#url='https://www.jobbank.gc.ca/jobsearch/jobsearch?flg=E&fcan=1&fss=1&fss=C&fwcl=B&fwcl=C&fwcl=D&fwcl=E&fglo=1&sort=M&fwht=D&fwht=M&fsrc=16&fjnc=1&fexp=0&fexp=1&fprov=AB&fprov=ON&fskl=101020&fskl=101010'
#page=requests.get(url)
#soup=BeautifulSoup(page.content, 'html.parser')

soup=BeautifulSoup(new_html, 'html.parser')
geolocator = Nominatim(user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 OPR/109.0.0.0')

job_elements=soup.find_all('a', class_='resultJobItem')
id_search=soup.find_all('a')

**Sending requests and retrieving information from job listing summaries**

To scrape the main webpage for job listings, I accessed the job search URL to retrieve the page content, used an HTML parser to process the content, and extracted all job listing elements by identifying specific HTML tags and classes associated with the job postings, allowing me to gather summary information about various job listings from the main page.

In [7]:
role_list=[]
dates_list=[]
business_list=[]
locations_list=[]
salaries_list=[]
id_list=[]
noccode_list=[]
employment_type_list=[]
job_duration_list=[]
education_list=[]
experience_list=[]
address_list=[]
weekly_hours_list=[]
vacancies_list=[]
work_permit_list=[]

jobs={
    
    'ID':id_list,
    'ROLES':role_list,
    'SALARY':salaries_list,
    'EDUCATION':education_list,
    'EXPERIENCE':experience_list,
    'NOC CODE':noccode_list,
    'BUSINESS':business_list,
    'LOCATIONS':locations_list,
    'PUBLISHED DATES':dates_list,
    'EMPLOYMENT TYPE':employment_type_list,
    'JOB DURATION':job_duration_list,
    'WEEKLY HOURS':weekly_hours_list,
    'VACANCIES':vacancies_list,
    'WORK PERMIT':work_permit_list,
    
}


def role_finder(job_list):
    role=job_list.find('span', class_='noctitle')
    if role:
        fixed_role=role.find(string=True, recursive=False).strip() #Cuando usas Recursive, no pone los datos de los demás "span".
        role_list.append(fixed_role)
    else:
        role_list.append('Null')

def published_date_finder(job_list):
    published_date=job_list.find('li', class_='date')
    if published_date:
        fixed_date=published_date.find(string=True, recursive=False).strip() 
        dates_list.append(fixed_date)
    else:
        dates_list.append('Null')

def business_name_finder(job_list):
    business_name=job_list.find(class_='business')
    if business_name:
        fixed_business=business_name.find(string=True, recursive=False).strip()
        business_list.append(fixed_business)
    else:
        business_list.append('Null')

def location_finder(job_list):
    location=job_list.find('li', class_='location').get_text()
    if location:
        fixed_location=location[10:].strip()
        locations_list.append(fixed_location)
    else:
        locations_list.append('Null')
    
def salary_finder(job_list):
    salary=job_list.find(class_='salary').get_text()
    if salary != None:
        try:
            fixed_salary=salary[30:].strip()
            if 'hourly' in fixed_salary:
                fix_sal=fixed_salary[:5]
            if 'anually' in fixed_salary:
                fix_sal=(fixed_salary[:9].replace(",",""))/2000
            salaries_list.append(f"$"+fix_sal)
        except:
            salaries_list.append('0.00')
    else:
        salaries_list.append('0.00')

def id_number_finder(every_id):
        id=every_id.get('href')
        try:
            id.split('/')[-1:]
            id_number=id[22:30]
            try:
                id_number=int(id_number)
                id_list.append(id_number)
            except:
                pass
        except:
            pass


In [8]:
for job_list in job_elements:
    role_finder(job_list)
    published_date_finder(job_list)
    business_name_finder(job_list)
    location_finder(job_list)
    salary_finder(job_list)

for every_id in id_search:
    id_number_finder(every_id)

print('Published date:', dates_list, '\n')
print('Role:', role_list, '\n')
print('Business:', business_list, '\n')
print('Location:', locations_list, '\n')
print('Salary:', salaries_list, '\n')
print('ID:', id_list)

Published date: ['July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024', 'July 25, 2024'] 

Role: ['bookkeeper', 'automotive mechanic', 'cook', 'reservations supervisor - travel agency', 'cleaner', 'bookkeeper', 'caseworker

**Extract further information**. 

To gather detailed information for each job listing, such as weekly hours, we need to access the specific page associated with each job ID.

While the main page provides a summary of job information, additional details are located on individual job pages. Each job can be accessed via a unique URL in the format: /jobsearch/jobposting/ID_NUMBER

In [10]:
def education_finder():
    education=re.search(r'Education: ([^.]+)', description)
    if education:
        fixed_education=education.group(1)
        education_list.append(fixed_education)
    else:
        education_list.append('Null')

def experience_finder():
    experience=re.search(r'Experience: ([^.]+)', description)
    if experience:
        fixed_experience=experience.group(1)
        experience_list.append(fixed_experience)
    else:
        experience_list.append('Null')
        
def noccode_finder():
    noccode=job_details.find('span', class_='aa_jobbank_job_noccode').get_text()
    if noccode:
        noccode_list.append(noccode)
    else:
        noccode_list.append('Null')
    
def employment_finder():
    employment_type=job_details.find('span', property='employmentType')
    if employment_type:
    #SHIFT TYPE
        fixed_employment_type=employment_type.find(string=True, Recursive=False).get_text()
        employment_type_list.append(fixed_employment_type)

    #JOB DURATION
        for duration in employment_type:
            job_duration=duration.get_text(strip=True)
            job_duration_list.append(job_duration)
            break
    else:
        employment_type_list.append('Null')
        job_duration_list.append('Null')
    
def weekly_hours_finder():
    weekly_hours=job_details.find('span', property='workHours')
    if weekly_hours:
        fixed_weekly_hours=weekly_hours.get_text(strip=True)
        weekly_hours_list.append(fixed_weekly_hours)
    else:
        weekly_hours_list.append('Null')

def address_finder():
    address=job_details.find('span', property='addressLocality').get_text()
    if address:
        address_list.append(address)
    else:
        address_list.append('Null')
        
def vacancies_finder():
    vacancies=job_details.find('span', string=re.compile(r'\d+ vacancies')) #Various vacancies
    if vacancies:
        vacancy_number=re.search(r'\d+', vacancies.get_text(strip=True)).group()
        vacancies_list.append(vacancy_number)
    else:
        vacancies_list.append('1')

def work_permit_finder():
    work_permit=job_details.find(class_='job-posting-detail-apply').get_text()
    if "with or without a valid Canadian work permit" in work_permit:
        work_permit_list.append('No')
    else:
        work_permit_list.append('Yes')

In [None]:
for ids in id_list:
    try:
        id_url='https://www.jobbank.gc.ca/jobsearch/jobposting/'+str(ids)
        id_page=requests.get(id_url)
        id_soup=BeautifulSoup(id_page.content, 'html.parser')
        job_details=id_soup.find('main', class_='container')
    
        description = job_details.find('span', property='description').get_text()

        education_finder()
        experience_finder()
        noccode_finder()
        employment_finder()
        weekly_hours_finder()
        address_finder()
        vacancies_finder()
        work_permit_finder()
    
    except Exception as e:
        print(f'An error occurred: {e}')
        

print('NOC Code:', noccode_list, '\n')
print('Employment Type:', employment_type_list, '\n')
print('Job Duration:', job_duration_list, '\n')
print('Weekly Hours:', weekly_hours_list, '\n')
print('Address:', address_list, '\n')
print('Vacancies:', vacancies_list, '\n')
print('Education:', education_list, '\n')
print('Experience:', experience_list, '\n')
print('Work permit:', work_permit_list, '\n')

**Find nearest major city**

For each job location, I removed any province-specific suffixes to standardize the city name. I then used a geolocation service to get the latitude and longitude of the job's city. I calculated the distance from this location to several predefined major cities (Edmonton, Red Deer, Calgary, Toronto, Ottawa) using geographic coordinates, improving the code's efficiency. The nearest major city and its distance were determined by finding the minimum distance among these calculated values, which were then added to the respective lists for further use.


In [None]:
#ON_AB_localization = {
#    'State': [],
#    'City': [],
#    'Lat': [],
#    'Long': [],
#    }

#ON_AB_localization_ONLY = pd.DataFrame.from_dict(ON_AB_localization)
#ON_AB_localization_ONLY.to_csv("ON_AB_localization.csv", header=True, index=False)

ON_AB_localization = pd.read_csv("ON_AB_localization.csv")
#ON_AB_localization

In [None]:
def search_city(fixed_location):
    location= re.sub(r'\s*\(ON\)|\s*\(AB\)', '', fixed_location)
    global ON_AB_localization
    
    city = ON_AB_localization[ON_AB_localization['City'] == location]
    if not city.empty:
        lat = city['Lat'].values[0]
        long = city['Long'].values[0]
        
        return lat, long
    else:            
        if 'ON' in fixed_location:
            State = 'ON'
            locationX = location + ", Ontario"     
        elif 'AB' in fixed_location:
            State = 'AB'
            locationX = location + ", Alberta"
        else:
            return None, None

        location_city=geolocator.geocode(locationX)

        if location_city is not None:
            city_lat = location_city.latitude
            city_long = location_city.longitude
            
            new_city = pd.DataFrame([{'State': State, 'City': location, 'Lat': city_lat, 'Long': city_long}])
            ON_AB_localization = pd.concat([ON_AB_localization, new_city], ignore_index=True)
    
        return city_lat, city_long

In [None]:
search_city('Edmonton (AB)')

In [None]:
'''for cities in locations_list:
    search_city(cities)
ON_AB_localization_DICT = pd.DataFrame.from_dict(ON_AB_localization)
ON_AB = ON_AB_localization_DICT.drop_duplicates(subset=['City'], ignore_index=True)
ON_AB = ON_AB.dropna()
ON_AB.to_csv("ON_AB_localization.csv", header=True, index=False)

ON_AB_localization_EXCEL=pd.read_csv("ON_AB_localization.csv")
print(ON_AB_localization_EXCEL)'''

In [None]:
print(len(ON_AB_localization['State']))
print(len(ON_AB_localization['City']))
print(len(ON_AB_localization['Lat']))
print(len(ON_AB_localization['Long']))

In [None]:
ON_AB_localization

In [None]:
nearest_city_list=[]
distance_nearest_city_list=[]

In [None]:
def calc_distance(fixed_location):
    location= re.sub(r'\s*\(ON\)|\s*\(AB\)', '', fixed_location)

    if 'ON' in fixed_location:
        city_territory = location + ", Ontario"     
    elif 'AB' in fixed_location:
        city_territory = location + ", Alberta"
    else:
        return None, None

    #big_cities=['Edmonton','Calgary','Red Deer','Toronto','Ottawa']
    edmonton_coords=(53.5462055, -113.491241)
    red_deer_coords=(52.2690628, -113.8141464)
    calgary_coords=(51.0456064, -114.057541)
    toronto_coords=(43.6534817, -79.3839347)
    ottawa_coords=(45.4208777, -75.6901106)
    
    geolocator = Nominatim(user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 OPR/109.0.0.0')

    city=geolocator.geocode(city_territory)
    print(city)
    city_coords=(city.latitude, city.longitude)
    print(city_coords)

    if location!='Various locations':
        
        distance_journal={}
    
        distance=round((GD(city_coords, edmonton_coords).km),2)
        distance_journal['Edmonton']=distance
        
        distance=round((GD(city_coords, red_deer_coords).km),2)
        distance_journal['Red_deer']=distance
        
        distance=round((GD(city_coords, calgary_coords).km),2)
        distance_journal['Calgary']=distance
    
        distance=round((GD(city_coords, ottawa_coords).km),2)
        distance_journal['Ottawa']=distance
    
        distance=round((GD(city_coords, toronto_coords).km),2)
        distance_journal['Toronto']=distance
    
        distance_nearest_city=min(distance_journal.values())
        nearest_city= [key for key, value in distance_journal.items() if value==distance_nearest_city] 
        #print(f'Nearest big city: ',nearest_city,'-',distance_nearest_city,' kms')
    
        nearest_city_list.append(nearest_city)
        distance_nearest_city_list.append(distance_nearest_city)

    else:
        nearest_city_list.append('Null')
        distance_nearest_city_list.append('Null')
        

In [None]:
distances_dict={
    'JOB LOCATION':locations_list,
    'NEAREST CITY':nearest_city_list,
    'DISTANCE':distance_nearest_city_list
    }

for distances in locations_list:
    try:
        calc_distance(distances)
    except:
        nearest_city_list.append('Null')
        distance_nearest_city_list.append('Null')

#print('NEAREST CITY:', nearest_city_list, '\n')
#print(f'DISTANCE:', distance_nearest_city_list, '\n')

print(len(locations_list))
print(len(nearest_city_list))
print(len(distance_nearest_city_list))

print(locations_list)
print(nearest_city_list)
print(distance_nearest_city_list)

try:
    distances_dataframe=pd.DataFrame.from_dict(distances_dict)

    distances_dataframe.to_csv('distances_dataframe.csv', header=True, index=False)
    read_distances_dataframe=pd.read_csv('distances_dataframe.csv')
    
    #print(read_distances_dataframe)
except:
    print('Errorcito, lol.')

In [None]:
distances_dataframe=pd.DataFrame.from_dict(distances_dict)
#print(jobs_dataframe)

distances_dataframe.to_csv('distances_dataframe.csv', header=True, index=False)
improved_distances_dataframe=pd.read_csv('distances_dataframe.csv')
print(improved_distances_dataframe)

**Inserting data into a Dataframe**

To store the job data, the job information dictionary is first converted into a DataFrame using Pandas. This DataFrame is then saved to a CSV file named 'jobs_dataframe.csv' with headers and without row indices. Finally, the CSV file is read back into a new DataFrame named 'improved_jobs_dataframe' and printed to verify the data.

In [None]:
jobs_dataframe=pd.DataFrame.from_dict(jobs)
#print(jobs_dataframe)

jobs_dataframe.to_csv('jobs_dataframe.csv', header=True, index=False)
improved_jobs_dataframe=pd.read_csv('jobs_dataframe.csv')
print(improved_jobs_dataframe)


In [None]:
print(f'ID',len(jobs['ID']))
print(f'ROLES',len(jobs['ROLES']))
print(f'SALARY',len(jobs['SALARY']))
print(f'EDUCATION',len(jobs['EDUCATION']))
print(f'EXPERIENCE',len(jobs['EXPERIENCE']))
print(f'NOC CODE',len(jobs['NOC CODE']))
print(f'BUSINESS',len(jobs['BUSINESS']))
print(f'LOCATIONS',len(jobs['LOCATIONS']))
print(f'PUBLISHED DATES',len(jobs['PUBLISHED DATES']))
print(f'EMPLOYMENT TYPE',len(jobs['EMPLOYMENT TYPE']))
print(f'JOB DURATION',len(jobs['JOB DURATION']))
print(f'WEEKLY HOURS',len(jobs['WEEKLY HOURS']))
print(f'VACANCIES',len(jobs['VACANCIES']))