In [None]:
#!pip install webdriver-manager

In [3]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

import time
import numpy as np
import pandas as pd
import multiprocessing

# Scraping LinkedIn

In [26]:
# Search slug interpolated into the LinkedIn jobs URL by the scraping cell below.
job_query = "python-developer"

In [None]:
# Scrape the public LinkedIn job search for `job_query` in two passes:
#   1. the result cards on the search page (title, company, location, link);
#   2. each posting's own page (seniority, employment type, job function,
#      description HTML, posted-ago text).
# On success the two tables are joined on `link` into `linkedin_df`.
# `jobs_scraped` and `job_descs` are plain lists of rows (one list per job).
start = time.time()
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
url = f"https://ph.linkedin.com/jobs/{job_query}-jobs"
try:
    driver.get(url)
    # ESC is sent to the page right after load -- presumably to dismiss a
    # pop-up that covers the results; TODO confirm.
    ActionChains(driver).send_keys(Keys.ESCAPE).perform()
    job_list = driver.find_elements(By.XPATH, "//ul[@class='jobs-search__results-list']/li")

    # job catalog scraping
    jobs_scraped = []
    for job in job_list:
        card = job.find_element(By.XPATH, ".//a[@data-tracking-control-name='public_jobs_jserp-result_search-card']")
        job_url = card.get_attribute('href')
        location = job.find_element(By.XPATH, ".//span[@class='job-search-card__location']").get_attribute('innerHTML')
        location = location.replace('\n', '').strip()
        job_title = job.find_element(By.XPATH, ".//h3[@class='base-search-card__title']").get_attribute('innerHTML').strip()
        try:
            company = job.find_element(By.XPATH, ".//h4[@class='base-search-card__subtitle']/a").get_attribute('innerHTML').strip()
        except Exception:
            # No <a> inside the subtitle: fall back to the <h4> content itself.
            company = job.find_element(By.XPATH, ".//h4[@class='base-search-card__subtitle']").get_attribute('innerHTML').strip()
        jobs_scraped.append([job_title, company, location, job_url])

    # individual job scraping
    job_descs = []
    for job in jobs_scraped:
        job_url = job[3]
        try:
            # scraping seniority, emp_type, job_func, job_desc, posted ago
            driver.get(job_url)
            wait = WebDriverWait(driver, timeout=2)
            ul = wait.until(EC.presence_of_element_located((By.XPATH, ".//ul[@class='description__job-criteria-list']")))
            criteria = BeautifulSoup(ul.get_attribute('outerHTML'), 'html.parser').find_all('span')
            descs = [job_url]
            # The first three <span>s of the criteria list fill the
            # seniority / emp_type / job_function columns, in that order.
            for i in range(3):
                try:
                    descs.append(criteria[i].contents[0].replace('\n', '').strip())
                except Exception:
                    descs.append('')
            try:
                desc_gen = driver.find_element(By.XPATH, "//div[@class='description__text description__text--rich']/section/div")
                descs.append(desc_gen.get_attribute('innerHTML'))
            except Exception:
                descs.append('')
            try:
                posted_ago = driver.find_element(By.XPATH, "//span[@class='posted-time-ago__text topcard__flavor--metadata']")
                descs.append(posted_ago.get_attribute('innerHTML').replace('\n', '').strip())
            except Exception:
                descs.append('')
        except Exception:
            # Keep the link in the placeholder row: the merge below joins on
            # `link`, so a row without it would silently drop this job.
            descs = [job_url, '', '', '', '', '']

        job_descs.append(descs)
        # Pause between postings to avoid hammering the site.
        time.sleep(2)

    # merging
    # Passing `columns=` up front also works for zero rows, so an empty search
    # yields an empty frame instead of being reported as an access failure.
    jobs_df = pd.DataFrame(jobs_scraped, columns=['title', 'company', 'location', 'link'])
    job_descs_df = pd.DataFrame(
        job_descs,
        columns=['link', 'seniority', 'emp_type', 'job_function', 'job_desc', 'posted_ago'],
    )
    linkedin_df = jobs_df.merge(job_descs_df, on='link')
except Exception as exc:
    # `Exception` (not a bare except) so that interrupting the kernel still
    # stops the cell; the cause is printed to make failures diagnosable.
    print("LinkedIn not accessible")
    print(f"{type(exc).__name__}: {exc}")
finally:
    # close driver -- quit() ends the whole WebDriver session, whereas close()
    # only closes the current window and leaves the driver process running.
    driver.quit()

end = time.time()
print("TOTAL TIME", end-start)

# Scraping FoundIt

In [1]:
# Search slug interpolated into the FoundIt search URL by the scraping cell below.
job_query = "python-developer"

In [2]:
# Scrape the FoundIt search results for `job_query`. Each result card is
# clicked to open its detail pane, and one 9-field row is built per job:
# job_title, company, link, location, seniority, posted, emp_type, job_func,
# job_desc. On success the rows become `foundit_df`.
# `jobs_scraped` is a plain list of rows (one list per job).
start = time.time()
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
url = f"https://www.foundit.com.ph/search/{job_query}-jobs"

try:
    driver.get(url)
    job_list = driver.find_elements(By.XPATH, "//div[@class='srpResultCard']/div")
    job_list = job_list[1:] # remove the header

    # One wait object is enough; it is reused for every card.
    wait = WebDriverWait(driver, 5)

    jobs_scraped = []
    for job in job_list:
        # scraping title, company, url
        title_link = job.find_element(By.XPATH, ".//a[@title]")
        job_url = title_link.get_attribute('href')
        job_title = title_link.get_attribute('innerHTML').replace('\n', '').strip()
        company = job.find_element(By.XPATH, ".//div[@class='companyName']/span")
        company = company.get_attribute('innerHTML').replace('\n', '').strip()
        descs = [job_title, company, job_url]

        # scraping location, seniority, posted ago
        job.find_element(By.XPATH, './/div[@onclick]/div').click()
        try:
            desc_job = wait.until(EC.visibility_of_all_elements_located((By.XPATH, ".//div[@id='jobHighlight']/div/div/div")))
            # Collected separately so a failure part-way through cannot leave
            # a row with the wrong number of fields.
            highlights = []
            for i in [0, 1, 2]:
                if i != 2:
                    detail = desc_job[i].find_element(By.XPATH, ".//div[@class='details']")
                    detail = detail.get_attribute('innerHTML').replace('\n', '').strip()
                else:
                    # The third highlight's text follows an <i> element; keep
                    # only what comes after the closing tag.
                    detail = desc_job[i].find_element(By.XPATH, ".//span[@class='btnHighighlights']")
                    detail = detail.get_attribute('innerHTML').replace('\n', '').split('</i>')
                    detail = detail[1].strip()
                highlights.append(detail)
        except Exception:
            # Three blanks keep the row aligned with the column list.
            highlights = ['', '', '']
        descs.extend(highlights)

        # scraping emp_type, job function, general job_desc
        try:
            desc_job_2 = wait.until(EC.visibility_of_all_elements_located((By.XPATH, ".//div[@id='jobDetail']/div/div")))
            for i in [0, 2]:
                try:
                    detail = desc_job_2[i].find_element(By.XPATH, ".//div[@class='jobDesc']")
                    detail = detail.get_attribute('innerHTML').replace('\n', '').strip()
                    descs.append(detail)
                except Exception:
                    descs.append('')
        except Exception:
            descs.extend(['', ''])
        try:
            desc_gen = wait.until(EC.visibility_of_element_located((By.XPATH, ".//p[@class='jobDescInfo']")))
            descs.append(desc_gen.get_attribute('innerHTML'))
        except Exception:
            descs.append('')

        jobs_scraped.append(descs)

    # Passing `columns=` up front also works for zero rows, so an empty search
    # yields an empty frame instead of being reported as an access failure.
    foundit_df = pd.DataFrame(
        jobs_scraped,
        columns=['job_title', 'company', 'link', 'location', 'seniority', 'posted', 'emp_type', 'job_func', 'job_desc'],
    )
except Exception as exc:
    # `Exception` (not a bare except) so that interrupting the kernel still
    # stops the cell; the cause is printed to make failures diagnosable.
    print("FoundIt not accessible")
    print(f"{type(exc).__name__}: {exc}")
finally:
    # close driver -- quit() ends the whole WebDriver session, whereas close()
    # only closes the current window and leaves the driver process running.
    driver.quit()

end = time.time()
print("TOTAL TIME", end-start)

NameError: name 'time' is not defined

# Scraping Kalibrr

In [36]:
# Search slug interpolated into the Kalibrr search URL by the scraping cell below.
job_query = "python-developer"

In [37]:
# Scrape the Kalibrr search results for `job_query` in two passes:
#   1. the result cards (title, link, company, emp_type, location), clicking
#      the load-more button until at least `min_jobs` cards are present;
#   2. each posting's own page (job_func, seniority, combined description HTML).
# On success the two tables are left-joined on `link` into `kalibrr_df`, so a
# job whose page could not be opened keeps its card data with empty details.
# `jobs_scraped` and `job_descs` are plain lists of rows (one list per job).
start = time.time()
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
min_jobs = 50
url = f"https://www.kalibrr.com/home/te/{job_query}"
# Placeholder value; replaced by a DataFrame only when the scrape succeeds.
kalibrr_df = ''

try:
    driver.get(url)
    job_list = []
    wait = WebDriverWait(driver, 5)
    # load job list
    while len(job_list) < min_jobs:
        # click button to load more jobs until job_list > min_jobs
        try:
            # Re-evaluate job_list after each load
            job_list = driver.find_elements(By.XPATH, "//div[@itemscope]/div")

            load_more = wait.until(EC.presence_of_element_located((By.XPATH, "//button[@class='k-btn-primary']")))
            load_more.click()
        except Exception:
            # No button (or it cannot be clicked): work with what is loaded.
            break

    # job catalog scraping
    jobs_scraped = []
    for job in job_list:
        descs = []
        # scraping job title, company, emp_type, location
        try:
            title_link = job.find_element(By.XPATH, ".//h2[@data-tooltip-id='job-title-tooltip-[object Object]']/a")
            job_url = title_link.get_attribute('href')
            job_title = title_link.get_attribute('innerHTML').replace('\n', '').strip()
            descs.append(job_title)
            descs.append(job_url)
        except Exception:
            # Not a job card (no title link) -- skip it.
            continue

        try:
            company = job.find_element(By.XPATH, ".//span[@class='k-inline-flex k-items-center k-mb-1']/a")
            descs.append(company.text.replace('\n', '').strip())
        except Exception:
            descs.append('')

        try:
            emp_type = job.find_element(By.XPATH, "./div[@class='k-relative']/div/span/span[@class='k-text-gray-500']")
            descs.append(emp_type.get_attribute('innerHTML').replace('\n', '').strip())
        except Exception:
            descs.append('')

        try:
            location = job.find_element(By.XPATH, "./div[@class='k-relative']/div/span/span[@class='k-text-gray-500 k-block k-pointer-events-none']")
            descs.append(location.get_attribute('innerHTML').replace('\n', '').strip())
        except Exception:
            descs.append('')

        jobs_scraped.append(descs)

    # individual job scraping
    # (xpath, flatten) pairs concatenated into gen_desc in this order;
    # `flatten` strips newlines and surrounding whitespace from that section.
    desc_sections = [
        (".//div[@itemprop='description']", False),
        (".//div[@itemprop='qualifications']", False),
        (".//div[@itemprop='jobBenefits']", True),
        (".//ul", False),
    ]
    job_descs = []
    for job in jobs_scraped:
        job_url = job[1]
        descs = [job_url]
        try:
            driver.get(job_url)
        except Exception:
            # Page could not be opened: no detail row; the left merge below
            # leaves this job's detail columns empty.
            continue

        # scrape desc, job categ, job level
        try:
            detail_wait = WebDriverWait(driver, timeout=2)
            job_func = detail_wait.until(EC.presence_of_element_located((By.XPATH, ".//div[@class='md:k-flex']//dt[contains(text(),'Job Category')]/following-sibling::dd/a")))
            descs.append(job_func.get_attribute('innerHTML').replace('\n', '').strip())
        except Exception:
            descs.append('')

        try:
            seniority = driver.find_element(By.XPATH, ".//div[@class='md:k-flex']//dt[contains(text(),'Job Level')]/following-sibling::dd/a")
            descs.append(seniority.get_attribute('innerHTML').replace('\n', '').strip())
        except Exception:
            descs.append('')

        gen_desc = ''
        for section_xpath, flatten in desc_sections:
            try:
                section_html = driver.find_element(By.XPATH, section_xpath).get_attribute('innerHTML')
                if flatten:
                    section_html = section_html.replace('\n', '').strip()
                # Each section appends its OWN markup, so the benefits
                # section contributes the benefits HTML.
                gen_desc += section_html
            except Exception:
                # Section absent on this posting: leave gen_desc unchanged.
                pass
        descs.append(gen_desc)

        job_descs.append(descs)

    # Passing `columns=` up front also works for zero rows.
    jobs_df = pd.DataFrame(jobs_scraped, columns=['title', 'link', 'company', 'emp_type', 'location'])
    job_descs_df = pd.DataFrame(job_descs, columns=['link', 'job_func', 'seniority', 'gen_desc'])
    kalibrr_df = jobs_df.merge(job_descs_df, on='link', how='left')

except Exception as exc:
    # `Exception` (not a bare except) so that interrupting the kernel still
    # stops the cell; the cause is printed to make failures diagnosable.
    print("Kalibrr not accessible")
    print(f"{type(exc).__name__}: {exc}")
finally:
    # close driver -- quit() ends the WebDriver session and its browser.
    driver.quit()
end = time.time()
print("TOTAL TIME", end-start)


TOTAL TIME 201.09589552879333


In [40]:
# Sanity check: number of result cards currently held in `job_list`
# (as left by the most recently run scraping cell).
len(job_list)

60

In [41]:
# Sanity check: size of `job_descs`, the per-posting detail rows
# (as left by the most recently run scraping cell).
len(job_descs)

29

# Scraping Jobstreet

In [72]:
# insert scraping code here