In [8]:
!pip install webdriver-manager

Defaulting to user installation because normal site-packages is not writeable
Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Installing collected packages: webdriver-manager
Successfully installed webdriver-manager-4.0.2


In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

import time
import numpy as np
import pandas as pd
import multiprocessing

# Scraping LinkedIn

In [2]:
# Hyphen-separated search term; the next cell interpolates it into the
# LinkedIn search URL as "<job_query>-jobs".
job_query = "python-developer"

In [17]:
# Scrape the public LinkedIn job search for `job_query`.
#   1. Read every result card on the search page (title, company, location, link).
#   2. Open each posting and read its criteria list, description and posting age.
#   3. Join both tables on the posting link into `linkedin_df`.
# Names left in the notebook namespace: jobs_scraped, job_descs (ndarrays),
# jobs_df, job_descs_df, linkedin_df (DataFrames).
start = time.time()
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
url = f"https://ph.linkedin.com/jobs/{job_query}-jobs"

catalog_columns = ['title', 'company', 'location', 'link']
detail_columns = ['link', 'seniority', 'emp_type', 'job_function', 'job_desc', 'posted_ago']

try:
    driver.get(url)
    # ESC is sent right after the page loads -- presumably to dismiss an
    # overlay on the results page; confirm against the live site.
    ActionChains(driver).send_keys(Keys.ESCAPE).perform()
    job_list = driver.find_elements(By.XPATH, "//ul[@class='jobs-search__results-list']/li")

    # job catalog scraping: one [title, company, location, link] row per card.
    # Rows are gathered in a list and converted once, instead of re-allocating
    # an array with np.vstack on every iteration.
    catalog_rows = []
    for job in job_list:
        card = job.find_element(By.XPATH, ".//a[@data-tracking-control-name='public_jobs_jserp-result_search-card']")
        job_url = card.get_attribute('href')
        location = job.find_element(By.XPATH, ".//span[@class='job-search-card__location']").get_attribute('innerHTML')
        location = location.replace('\n', '').strip()
        job_title = job.find_element(By.XPATH, ".//h3[@class='base-search-card__title']").get_attribute('innerHTML').strip()
        try:
            # Company name wrapped in a link ...
            company = job.find_element(By.XPATH, ".//h4[@class='base-search-card__subtitle']/a").get_attribute('innerHTML').strip()
        except Exception:
            # ... or, when there is no <a>, the subtitle's own content.
            company = job.find_element(By.XPATH, ".//h4[@class='base-search-card__subtitle']").get_attribute('innerHTML').strip()
        catalog_rows.append([job_title, company, location, job_url])

    if not catalog_rows:
        print(f"No LinkedIn results found for '{job_query}'")

    # individual job scraping: one [link, seniority, emp_type, job_function,
    # job_desc, posted_ago] row per posting; a field that cannot be read is ''.
    detail_rows = []
    for catalog_row in catalog_rows:
        job_url = catalog_row[3]
        descs = [job_url]
        try:
            driver.get(job_url)
            wait = WebDriverWait(driver, timeout=2)
            desc_job = wait.until(EC.presence_of_all_elements_located((By.XPATH, ".//ul[@class='description__job-criteria-list']/li")))
        except Exception:
            # Page did not load or the criteria list never appeared: keep the
            # link and leave all five detail fields blank.
            descs.extend([''] * 5)
        else:
            # First three criteria entries map to seniority, emp_type, job_function.
            for i in [0, 1, 2]:
                try:
                    detail = desc_job[i].find_element(By.XPATH, ".//span")
                    detail = detail.get_attribute('innerHTML').replace('\n', '').strip()
                    descs.append(detail)
                except Exception:
                    descs.append('')
            try:
                desc_gen = driver.find_element(By.XPATH, "//div[@class='description__text description__text--rich']/section/div")
                desc_gen = desc_gen.get_attribute('innerHTML')
                descs.append(desc_gen)
            except Exception:
                descs.append('')
            try:
                posted_ago = driver.find_element(By.XPATH, "//span[@class='posted-time-ago__text topcard__flavor--metadata']")
                posted_ago = posted_ago.get_attribute('innerHTML').replace('\n', '').strip()
                descs.append(posted_ago)
            except Exception:
                descs.append('')

        detail_rows.append(descs)
        time.sleep(2)  # pause between postings

    # Array views kept under their original names in case other cells read them.
    jobs_scraped = np.array(catalog_rows)
    job_descs = np.array(detail_rows)

    # merging -- naming the columns in the constructor also yields a correctly
    # labelled (empty) frame when nothing was scraped.
    jobs_df = pd.DataFrame(catalog_rows, columns=catalog_columns)
    job_descs_df = pd.DataFrame(detail_rows, columns=detail_columns)
    linkedin_df = jobs_df.merge(job_descs_df, on='link')
except Exception as exc:
    # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still stops the cell.
    print("LinkedIn not accessible")
    print("Reason:", repr(exc))
finally:
    # quit() ends the whole WebDriver session (browser and driver process);
    # close() would only close the current window.
    driver.quit()

end = time.time()
print("TOTAL TIME", end-start)

don
TOTAL TIME 10.93996548652649


# Scraping FoundIt

In [8]:
# Hyphen-separated search term; the next cell interpolates it into the
# FoundIt search URL as "<job_query>-jobs".
job_query = "python-developer"

In [9]:
# Scrape the FoundIt search results for `job_query`.
# Each result card is clicked and the detail panels that appear are read; the
# resulting nine-field rows become `foundit_df`.
# Names left in the notebook namespace: jobs_scraped (ndarray), foundit_df.
start = time.time()
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
url = f"https://www.foundit.com.ph/search/{job_query}-jobs"

foundit_columns = ['job_title','company','link','location','seniority','posted','emp_type','job_func','job_desc']

try:
    driver.get(url)
    job_list = driver.find_elements(By.XPATH, "//div[@class='srpResultCard']/div")
    job_list = job_list[1:] # remove the header

    # Rows are gathered in a list and converted once, instead of re-allocating
    # an array with np.vstack on every iteration.
    foundit_rows = []
    for job in job_list:
        # scraping title, company, url
        job_title = job.find_element(By.XPATH, ".//a[@title]")
        job_url = job_title.get_attribute('href')
        job_title = job_title.get_attribute('innerHTML').replace('\n', '').strip()
        company = job.find_element(By.XPATH, ".//div[@class='companyName']/span")
        company = company.get_attribute('innerHTML').replace('\n', '').strip()
        descs = [job_title, company, job_url]

        # Clicking the card is what makes the detail panels below available.
        job.find_element(By.XPATH, './/div[@onclick]/div').click()
        wait = WebDriverWait(driver, 5)

        # scraping location, seniority, posted ago
        # Collected separately so a failure part-way through cannot leave a
        # row with fewer (or more) than three highlight fields.
        highlights = []
        try:
            desc_job = wait.until(EC.presence_of_all_elements_located((By.XPATH, ".//div[@id='jobHighlight']/div/div/div")))
            for i in [0,1,2]:
                if i != 2:
                    detail = desc_job[i].find_element(By.XPATH, ".//div[@class='details']")
                    detail = detail.get_attribute('innerHTML').replace('\n', '').strip()
                else:
                    # The third entry's text follows an <i> element: keep what comes after '</i>'.
                    detail = desc_job[i].find_element(By.XPATH, ".//span[@class='btnHighighlights']")
                    detail = detail.get_attribute('innerHTML').replace('\n', '').split('</i>')
                    detail = detail[1].strip()
                highlights.append(detail)
        except Exception:
            # Was `descs.append('','','')`, which raises TypeError (list.append
            # takes one argument) and aborted the whole scrape.
            highlights = ['', '', '']
        descs.extend(highlights)

        # scraping emp_type, job function, general job_desc
        try:
            desc_job_2 = wait.until(EC.visibility_of_all_elements_located((By.XPATH, ".//div[@id='jobDetail']/div/div")))
        except Exception:
            descs.extend(['',''])
        else:
            # Entries 0 and 2 of the detail panel map to emp_type and job_func.
            for i in [0,2]:
                try:
                    detail = desc_job_2[i].find_element(By.XPATH, ".//div[@class='jobDesc']")
                    detail = detail.get_attribute('innerHTML').replace('\n', '').strip()
                    descs.append(detail)
                except Exception:
                    descs.append('')
        try:
            desc_gen = wait.until(EC.visibility_of_element_located((By.XPATH, ".//p[@class='jobDescInfo']")))
            desc_gen = desc_gen.get_attribute('innerHTML')
            descs.append(desc_gen)
        except Exception:
            descs.append('')

        foundit_rows.append(descs)

    if not foundit_rows:
        print(f"No FoundIt results found for '{job_query}'")

    # Array view kept under its original name in case other cells read it.
    jobs_scraped = np.array(foundit_rows)

    # Naming the columns in the constructor also yields a correctly labelled
    # (empty) frame when nothing was scraped.
    foundit_df = pd.DataFrame(foundit_rows, columns=foundit_columns)
except Exception as exc:
    # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still stops the cell.
    print("FoundIt not accessible")
    print("Reason:", repr(exc))
finally:
    # quit() ends the whole WebDriver session (browser and driver process);
    # close() would only close the current window.
    driver.quit()

end = time.time()
print("TOTAL TIME", end-start)

TOTAL TIME 68.81081748008728


# Scraping Jobstreet

In [78]:
# Hyphen-separated search term; the next cell interpolates it into the
# Jobstreet search URL as "<job_query>-jobs".
job_query = "web-designer"

In [79]:
# Scrape the Jobstreet search results for `job_query`.
#   1. Read title, company and link from every result card.
#   2. Open each posting and read location, classification, work type,
#      posting age and description.
#   3. Join both tables on the posting link into `jobstreet_df`.
# Names left in the notebook namespace: jobs_scraped, job_descs (ndarrays),
# jobs_df, job_descs_df, jobstreet_df (DataFrames).
start = time.time()
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
url = f"https://ph.jobstreet.com/{job_query}-jobs"

catalog_columns = ['title', 'company', 'link']
detail_columns = ['link','location','job_function', 'emp_type', 'posted_ago', 'job_desc']

# One XPath per detail field, in the same order as detail_columns[1:].
detail_xpaths = [
    "//span[@data-automation='job-detail-location']/a",
    "//span[@data-automation='job-detail-classifications']/a",
    "//span[@data-automation='job-detail-work-type']/a",
    # NOTE(review): these look like build-generated class names and may change
    # whenever the site is redeployed -- confirm and prefer a stable attribute.
    "//span[@class='gepq850 eihuid4z i7p5ej0 i7p5ej1 i7p5ej22 _18ybopc4 i7p5ej7']",
    "//div[@data-automation='jobAdDetails']/div",
]

try:
    driver.get(url)
    job_list = driver.find_elements(By.XPATH, "//article[@data-automation='normalJob']")

    # Catalog rows are gathered in a list and converted once, instead of
    # re-allocating an array with np.vstack on every iteration.
    catalog_rows = []
    for job in job_list:
        try:
            job_title = job.find_element(By.XPATH, ".//a[@data-automation='jobTitle']")
            job_url = job_title.get_attribute('href')
            job_title = job_title.get_attribute('innerHTML').replace('\n','').strip()
            company = job.find_element(By.XPATH, ".//a[@data-type='company']")
            company = company.get_attribute('innerHTML').replace('\n', '').strip()
        except Exception:
            # Cards missing a title or company link are skipped.
            continue
        catalog_rows.append([job_title, company, job_url])

    if not catalog_rows:
        print(f"No Jobstreet results found for '{job_query}'")

    # One [link, location, job_function, emp_type, posted_ago, job_desc] row
    # per posting; a field that cannot be read is ''.
    detail_rows = []
    for catalog_row in catalog_rows:
        job_url = catalog_row[2]
        descs = [job_url]
        try:
            driver.get(job_url)
        except Exception:
            # Posting could not be opened: keep the link, blank the five fields.
            descs.extend([''] * len(detail_xpaths))
        else:
            for xpath in detail_xpaths:
                try:
                    value = driver.find_element(By.XPATH, xpath)
                    value = value.get_attribute('innerHTML').replace('\n','').strip()
                except Exception:
                    value = ''
                descs.append(value)
        detail_rows.append(descs)

    # Array views kept under their original names in case other cells read them.
    jobs_scraped = np.array(catalog_rows)
    job_descs = np.array(detail_rows)

    # merging -- naming the columns in the constructor also yields a correctly
    # labelled (empty) frame when nothing was scraped.
    jobs_df = pd.DataFrame(catalog_rows, columns=catalog_columns)
    job_descs_df = pd.DataFrame(detail_rows, columns=detail_columns)
    jobstreet_df = jobs_df.merge(job_descs_df, on='link')
except Exception as exc:
    # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still stops the cell.
    print("Jobstreet not accessible")
    print("Reason:", repr(exc))
finally:
    # quit() ends the whole WebDriver session (browser and driver process);
    # close() would only close the current window.
    driver.quit()
end = time.time()
print("TOTAL TIME", end-start)

TOTAL TIME 57.15643930435181
