In [18]:
import sys
import re
import requests
import urllib.parse
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from geopy.distance import geodesic as GD
from geopy.geocoders import Nominatim

In [2]:
url='https://www.jobbank.gc.ca/jobsearch/jobsearch?flg=E&fcan=1&fss=1&fss=C&fwcl=B&fwcl=C&fwcl=D&fwcl=E&fglo=1&sort=M&fwht=D&fwht=M&fsrc=16&fjnc=1&fexp=0&fexp=1&fprov=AB&fprov=ON&fskl=101020&fskl=101010'
page=requests.get(url)

soup=BeautifulSoup(page.content, 'html.parser')

job_elements=soup.find_all('a', class_='resultJobItem')
id_search=soup.find_all('a')

**Sending requests and retrieving information from job listing summaries**

To scrape the main webpage for job listings, I accessed the job search URL to retrieve the page content, used an HTML parser to process the content, and extracted all job listing elements by identifying specific HTML tags and classes associated with the job postings, allowing me to gather summary information about various job listings from the main page.

In [4]:
role_list=[]
dates_list=[]
business_list=[]
locations_list=[]
salaries_list=[]
id_list=[]
noccode_list=[]
employment_type_list=[]
job_duration_list=[]
education_list=[]
experience_list=[]
address_list=[]
weekly_hours_list=[]
vacancies_list=[]
work_permit_list=[]

jobs={
    
    'ID':id_list,
    'ROLES':role_list,
    'SALARY':salaries_list,
    'EDUCATION':education_list,
    'EXPERIENCE':experience_list,
    'NOC CODE':noccode_list,
    'BUSINESS':business_list,
    'LOCATIONS':locations_list,
    'PUBLISHED DATES':dates_list,
    'EMPLOYMENT TYPE':employment_type_list,
    'JOB DURATION':job_duration_list,
    'WEEKLY HOURS':weekly_hours_list,
    'VACANCIES':vacancies_list,
    
}



def role_finder(job_list):
    role=job_list.find('span', class_='noctitle')
    if role:
        fixed_role=role.find(string=True, recursive=False).strip() #Cuando usas Recursive, no pone los datos de los demás "span".
        role_list.append(fixed_role)
    else:
        role_list.append('Null')

def published_date_finder(job_list):
    published_date=job_list.find('li', class_='date')
    if published_date:
        fixed_date=published_date.find(string=True, recursive=False).strip() 
        dates_list.append(fixed_date)
    else:
        dates_list.append('Null')

def business_name_finder(job_list):
    business_name=job_list.find(class_='business')
    if business_name:
        fixed_business=business_name.find(string=True, recursive=False).strip()
        business_list.append(fixed_business)
    else:
        business_list.append('Null')

def location_finder(job_list):
    location=job_list.find('li', class_='location').get_text()
    if location:
        fixed_location=location[10:].strip()
        locations_list.append(fixed_location)
    else:
        locations_list.append('Null')
    
def salary_finder(job_list):
    salary=job_list.find(class_='salary').get_text()
    if salary:
        fixed_salary=salary[30:].strip()
        if 'hourly' in fixed_salary:
            fix_sal=fixed_salary[:5]
        if 'anually' in fixed_salary:
            fix_sal=(fixed_salary[:9].replace(",",""))/2000
        salaries_list.append(f"$"+fix_sal)
    else:
        salaries_list.append('Null')

def id_number_finder(every_id):
        id=every_id.get('href')
        try:
            id.split('/')[-1:]
            id_number=id[22:30]
            try:
                id_number=int(id_number)
                id_list.append(id_number)
            except:
                pass
        except:
            pass


In [5]:
for job_list in job_elements:
    role_finder(job_list)
    published_date_finder(job_list)
    business_name_finder(job_list)
    location_finder(job_list)
    salary_finder(job_list)

for every_id in id_search:
    id_number_finder(every_id)

print('Published date:', dates_list, '\n')
print('Role:', role_list, '\n')
print('Business:', business_list, '\n')
print('Location:', locations_list, '\n')
print('Salary:', salaries_list, '\n')
print('ID:', id_list)

Published date: ['July 22, 2024', 'July 19, 2024', 'July 17, 2024', 'July 17, 2024', 'July 15, 2024', 'July 12, 2024', 'July 10, 2024', 'July 10, 2024', 'July 09, 2024', 'July 08, 2024', 'July 08, 2024', 'July 08, 2024', 'July 08, 2024', 'July 08, 2024', 'July 08, 2024', 'July 07, 2024', 'July 05, 2024', 'July 04, 2024', 'July 04, 2024', 'July 04, 2024', 'July 04, 2024', 'July 04, 2024', 'July 04, 2024', 'July 03, 2024', 'July 03, 2024'] 

Role: ['assembler, plastic products', 'driller - oil and gas drilling', 'insurance agent', 'medical administrative assistant', 'guard, security', 'farm worker, general', 'food service supervisor', 'cook', 'maintenance person - building', 'kitchen helper', 'supervisor, food services', 'cleaner', "cook's helper", 'shift manager - fast food restaurant', 'farm supervisor', 'truck dispatcher', 'delivery drivers supervisor', 'fruit farm labourer', 'long haul truck driver', 'long haul truck driver', 'cook', 'car mechanic', 'landscape worker', 'nursery worke

**Extract further information**. 

To gather detailed information for each job listing, such as weekly hours, we need to access the specific page associated with each job ID.

While the main page provides a summary of job information, additional details are located on individual job pages. Each job can be accessed via a unique URL in the format: /jobsearch/jobposting/ID_NUMBER

In [7]:
def education_finder():
    education=re.search(r'Education: ([^.]+)', description)
    if education:
        fixed_education=education.group(1)
        education_list.append(fixed_education)
    else:
        education_list.append('Null')

def experience_finder():
    experience=re.search(r'Experience: ([^.]+)', description)
    if experience:
        fixed_experience=experience.group(1)
        experience_list.append(fixed_experience)
    else:
        experience_list.append('Null')
        
def noccode_finder():
    noccode=job_details.find('span', class_='aa_jobbank_job_noccode').get_text()
    if noccode:
        noccode_list.append(noccode)
    else:
        noccode_list.append('Null')
    
def employment_finder():
    employment_type=job_details.find('span', property='employmentType')
    if employment_type:
    #SHIFT TYPE
        fixed_employment_type=employment_type.find(string=True, Recursive=False).get_text()
        employment_type_list.append(fixed_employment_type)

    #JOB DURATION
        for duration in employment_type:
            job_duration=duration.get_text(strip=True)
            job_duration_list.append(job_duration)
            break
    else:
        employment_type_list.append('Null')
        job_duration_list.append('Null')
    
def weekly_hours_finder():
    weekly_hours=job_details.find('span', property='workHours')
    if weekly_hours:
        fixed_weekly_hours=weekly_hours.get_text(strip=True)
        weekly_hours_list.append(fixed_weekly_hours)
    else:
        weekly_hours_list.append('Null')

def address_finder():
    address=job_details.find('span', property='addressLocality').get_text()
    if address:
        address_list.append(address)
    else:
        address_list.append('Null')
        
def vacancies_finder():
    vacancies=job_details.find('span', string=re.compile(r'\d+ vacancies')) #Various vacancies
    if vacancies:
        vacancy_number=re.search(r'\d+', vacancies.get_text(strip=True)).group()
        vacancies_list.append(vacancy_number)
    else:
        vacancies_list.append('1')

def work_permit_finder():
    work_permit=job_details.find(class_='job-posting-detail-apply').get_text()
    if "with or without a valid Canadian work permit" in work_permit:
        work_permit_list.append('No')
    else:
        work_permit_list.append('Yes')

In [8]:
for ids in id_list:
    try:
        id_url='https://www.jobbank.gc.ca/jobsearch/jobposting/'+str(ids)
        id_page=requests.get(id_url)
        id_soup=BeautifulSoup(id_page.content, 'html.parser')
        job_details=id_soup.find('main', class_='container')
    
        description = job_details.find('span', property='description').get_text()

        education_finder()
        experience_finder()
        noccode_finder()
        employment_finder()
        weekly_hours_finder()
        address_finder()
        vacancies_finder()
        work_permit_finder()
    
    except Exception as e:
        print(f'An error occurred: {e}')
        

print('NOC Code:', noccode_list, '\n')
print('Employment Type:', employment_type_list, '\n')
print('Job Duration:', job_duration_list, '\n')
print('Weekly Hours:', weekly_hours_list, '\n')
print('Address:', address_list, '\n')
print('Vacancies:', vacancies_list, '\n')
print('Education:', education_list, '\n')
print('Experience:', experience_list, '\n')
print('Work permit:', work_permit_list, '\n')

An error occurred: 'NoneType' object has no attribute 'find'
NOC Code: ['94212', '83101', '63100', '13112', '64410', '85100', '62020', '63200', '73201', '65201', '62020', '65310', '65201', '62020', '82030', '14404', '72024', '85101', '73300', '73300', '63200', '72410', '85121', '85103', '73300'] 

Employment Type: ['Full time', 'Full time', 'Full time', 'Full time', 'Full time', 'Full time', 'Full time', 'Full time', 'Full time', 'Full time', 'Full time', 'Full time', 'Full time', 'Full time', 'Full time', 'Full time', 'Full time', 'Full time', 'Full time', 'Full time', 'Full time', 'Full time', 'Full time', 'Full time', 'Full time'] 

Job Duration: ['Permanent employment', 'Permanent employment', 'Permanent employment', 'Permanent employment', 'Permanent employment', 'Permanent employment', 'Permanent employment', 'Permanent employment', 'Permanent employment', 'Permanent employment', 'Permanent employment', 'Permanent employment', 'Permanent employment', 'Permanent employment', 'Perm

**Find nearest major city**

For each job location, I removed any province-specific suffixes to standardize the city name. I then used a geolocation service to get the latitude and longitude of the job's city. I calculated the distance from this location to several predefined major cities (Edmonton, Red Deer, Calgary, Toronto, Ottawa) using geographic coordinates, improving the code's efficiency. The nearest major city and its distance were determined by finding the minimum distance among these calculated values, which were then added to the respective lists for further use.


In [10]:
nearest_city_list=[]
distance_nearest_city_list=[]

def calc_distance(fixed_location):
    try:
        city=re.sub(r'\s*\(AB\)', '', fixed_location)
    except:
        city=re.sub(r'\s*\(ON\)', '', fixed_location)

    edmonton_coords=(53.5462055, -113.491241)
    red_deer_coords=(52.2690628, -113.8141464)
    calgary_coords=(51.0456064, -114.057541)
    toronto_coords=(43.6534817, -79.3839347)
    ottawa_coords=(45.4208777, -75.6901106)
    
    geolocator = Nominatim(user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 OPR/109.0.0.0')
    #big_cities=['Edmonton','Calgary','Red Deer','Toronto','Ottawa']
    
    location_city=geolocator.geocode(city)
    city_coords=(location_city.latitude, location_city.longitude)

    distance_journal={}

    distance=round((GD(city_coords, edmonton_coords).km),2)
    distance_journal['Edmonton']=distance
    
    distance=round((GD(city_coords, red_deer_coords).km),2)
    distance_journal['Red_deer']=distance
    
    distance=round((GD(city_coords, calgary_coords).km),2)
    distance_journal['Calgary']=distance

    distance=round((GD(city_coords, ottawa_coords).km),2)
    distance_journal['Ottawa']=distance

    distance=round((GD(city_coords, toronto_coords).km),2)
    distance_journal['Toronto']=distance

    distance_nearest_city=min(distance_journal.values())
    nearest_city= [key for key, value in distance_journal.items() if value==distance_nearest_city] 
    #print(f'Nearest big city: ',nearest_city,'-',distance_nearest_city,' kms')

    nearest_city_list.append(nearest_city)
    distance_nearest_city_list.append(distance_nearest_city)

In [11]:
for distances in locations_list:
    try:
        calc_distance(distances)
    except:
        nearest_city_list.append('Null')
        distance_nearest_city_list.append('Null')

print('NEAREST CITY:', nearest_city_list, '\n')
print(f'DISTANCE:', distance_nearest_city_list, '\n')

NEAREST CITY: [['Ottawa'], ['Edmonton'], ['Calgary'], ['Toronto'], ['Edmonton'], ['Ottawa'], ['Toronto'], ['Ottawa'], ['Toronto'], ['Calgary'], ['Toronto'], ['Calgary'], ['Toronto'], ['Toronto'], ['Toronto'], ['Toronto'], 'Null', ['Toronto'], ['Calgary'], ['Calgary'], ['Toronto'], ['Toronto'], ['Edmonton'], ['Toronto'], ['Edmonton']] 

DISTANCE: [83.42, 397.56, 0.0, 17.69, 0.0, 27.12, 16.73, 11.65, 45.19, 0.0, 70.89, 0.0, 332.89, 30.54, 125.01, 168.21, 'Null', 258.56, 173.7, 0.0, 153.65, 57.2, 0.0, 232.01, 0.0] 



**Inserting data into a Dataframe**

To store the job data, the job information dictionary is first converted into a DataFrame using Pandas. This DataFrame is then saved to a CSV file named 'jobs_dataframe.csv' with headers and without row indices. Finally, the CSV file is read back into a new DataFrame named 'improved_jobs_dataframe' and printed to verify the data.

In [28]:
print(len(jobs['ID']))
print(len(jobs['ROLES']))
print(len(jobs['SALARY']))
print(len(jobs['EDUCATION']))
print(len(jobs['EXPERIENCE']))
print(len(jobs['NOC CODE']))
print(len(jobs['BUSINESS']))
print(len(jobs['LOCATIONS']))
print(len(jobs['PUBLISHED DATES']))
print(len(jobs['EMPLOYMENT TYPE']))
print(len(jobs['JOB DURATION']))
print(len(jobs['WEEKLY HOURS']))
print(len(jobs['VACANCIES']))

26
25
25
25
25
25
25
25
25
25
25
25
25


In [13]:
jobs_dataframe=pd.DataFrame.from_dict(jobs)
#print(jobs_dataframe)

jobs_dataframe.to_csv('jobs_dataframe.csv', header=True, index=False)
improved_jobs_dataframe=pd.read_csv('jobs_dataframe.csv')
print(improved_jobs_dataframe)


ValueError: All arrays must be of the same length

In [None]:
ON_AB_localization = {
    'State': [],
    'City': [],
    'Lat': [],
    'Long': [],
    }
try:
    pd.read_csv("ON_AB_localization.csv")
    ON_AB_localization['State'] = df['State'].tolist()
    ON_AB_localization['City'] = df['City'].tolist()
    ON_AB_localization['Lat'] = df['Lat'].tolist()
    ON_AB_localization['Long'] = df['Long'].tolist()
except:
    ON_AB_localization_ONLY = pd.DataFrame.from_dict(ON_AB_localization)
    ON_AB_localization_ONLY.to_csv("ON_AB_localization.csv", header=True, index=False)

In [None]:
def search_city(fixed_location):
    
    ON_AB_localization_EXCEL = pd.read_csv("ON_AB_localization.csv")
    location= re.sub(r'\s*\(ON\)|\s*\(AB\)', '', fixed_location)
    
    found = ON_AB_localization_EXCEL[ON_AB_localization_EXCEL['City'] == location]
    if not found.empty:
        lat = found['Lat'].values[0]
        long = found['Long'].values[0]
        return lat, long
    else:
        geolocator = Nominatim(user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 OPR/109.0.0.0')
        
        if 'ON' in fixed_location:
            ON_AB_localization['State'].append('ON')
            locationX = location+", Ontario"
        
        elif 'AB' in fixed_location:
            ON_AB_localization['State'].append('AB')
            locationX = location + ", Alberta"
        
        else:
            ON_AB_localization['State'].append('??')
            locationx=location
        
    location_city=geolocator.geocode(locationX)
    
    if location_city:
        city_lat = location_city.latitude
        city_long = location_city.longitude
        
        ON_AB_localization['City'].append(location)
        ON_AB_localization['Lat'].append(city_lat)
        ON_AB_localization['Long'].append(city_long)
    
        ON_AB_localization_DICT = pd.DataFrame.from_dict(ON_AB_localization)
        ON_AB = pd.concat([ON_AB_localization_EXCEL, ON_AB_localization_DICT], ignore_index=True)
        ON_AB = ON_AB.drop_duplicates(subset=['City'], ignore_index=True)
        ON_AB.to_csv("ON_AB_localization.csv", header=True, index=False)
    
        return city_lat, city_long
        
    else:
        print(f"Location couldn't be retrieved: {locationX}")
        return None, None
            

In [None]:
for cities in locations_list:
    search_city(cities)
ON_AB_localization_EXCEL=pd.read_csv("ON_AB_localization.csv")
print(ON_AB_localization_EXCEL)
