In [None]:
# One-time setup: uncomment and run once to install the driver manager.
# %pip install webdriver-manager

In [2]:
import multiprocessing
import time

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Scraping LinkedIn

In [26]:
# Search slug for the LinkedIn scrape; the next cell interpolates it into
# the search URL as ".../jobs/{job_query}-jobs".
job_query = "python-developer"

In [None]:
# Scrape LinkedIn's public job search for `job_query` and build `linkedin_df`.
#   Pass 1 reads title / company / location / link from every search-result card.
#   Pass 2 opens each posting for seniority, employment type, job function,
#   description HTML and the "posted ago" text.
# The two tables are joined on `link`; `linkedin_df` stays empty if the scrape fails.
start = time.time()
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
url = f"https://ph.linkedin.com/jobs/{job_query}-jobs"

linkedin_df = pd.DataFrame()
try:
    driver.get(url)
    # NOTE(review): presumably dismisses an overlay shown on page load -- confirm.
    ActionChains(driver).send_keys(Keys.ESCAPE).perform()
    job_list = driver.find_elements(By.XPATH, "//ul[@class='jobs-search__results-list']/li")

    # job catalog scraping: collect plain rows, convert to an array once at the end
    job_rows = []
    for job in job_list:
        card = job.find_element(By.XPATH, ".//a[@data-tracking-control-name='public_jobs_jserp-result_search-card']")
        job_url = card.get_attribute('href')
        location = job.find_element(By.XPATH, ".//span[@class='job-search-card__location']").get_attribute('innerHTML')
        location = location.replace('\n', '').strip()
        job_title = job.find_element(By.XPATH, ".//h3[@class='base-search-card__title']").get_attribute('innerHTML').strip()
        try:
            company = job.find_element(By.XPATH, ".//h4[@class='base-search-card__subtitle']/a").get_attribute('innerHTML').strip()
        except WebDriverException:
            # subtitle is not wrapped in a link: read the <h4> itself
            company = job.find_element(By.XPATH, ".//h4[@class='base-search-card__subtitle']").get_attribute('innerHTML').strip()
        job_rows.append([job_title, company, location, job_url])

    # reshape(-1, 4) keeps the array 2-D for zero or one scraped job as well
    jobs_scraped = np.array(job_rows, dtype=object).reshape(-1, 4)

    # individual job scraping
    detail_rows = []
    for job_url in jobs_scraped[:, 3]:
        # Pre-filled fallback row. It keeps the link so that a posting whose
        # detail page cannot be read still joins to its card data below.
        # Layout: [link, seniority, emp_type, job_function, job_desc, posted_ago]
        descs = [job_url, '', '', '', '', '']
        try:
            driver.get(job_url)
            wait = WebDriverWait(driver, timeout=2)
            ul = wait.until(EC.presence_of_element_located((By.XPATH, ".//ul[@class='description__job-criteria-list']")))
            desc_job = BeautifulSoup(ul.get_attribute('outerHTML'), 'html.parser').find_all('span')
            for i in range(3):
                try:
                    descs[i + 1] = desc_job[i].contents[0].replace('\n', '').strip()
                except (IndexError, AttributeError, TypeError):
                    pass  # criterion missing or not plain text: leave it blank
            try:
                desc_gen = driver.find_element(By.XPATH, "//div[@class='description__text description__text--rich']/section/div")
                descs[4] = desc_gen.get_attribute('innerHTML')
            except WebDriverException:
                pass  # no description block on this posting
            try:
                posted_ago = driver.find_element(By.XPATH, "//span[@class='posted-time-ago__text topcard__flavor--metadata']")
                descs[5] = posted_ago.get_attribute('innerHTML').replace('\n', '').strip()
            except WebDriverException:
                pass  # no "posted ago" label on this posting
        except WebDriverException:
            # page failed to load or the criteria list never appeared
            descs = [job_url, '', '', '', '', '']

        detail_rows.append(descs)
        time.sleep(2)

    job_descs = np.array(detail_rows, dtype=object).reshape(-1, 6)

    # merging (passing `columns=` also works when no rows were scraped)
    jobs_df = pd.DataFrame(jobs_scraped, columns=['title', 'company', 'location', 'link'])
    job_descs_df = pd.DataFrame(
        job_descs,
        columns=['link', 'seniority', 'emp_type', 'job_function', 'job_desc', 'posted_ago'],
    )
    linkedin_df = jobs_df.merge(job_descs_df, on='link')
except WebDriverException as exc:
    print("LinkedIn not accessible:", type(exc).__name__)
finally:
    # quit() ends the whole WebDriver session; close() only closes the window
    driver.quit()

end = time.time()
print("TOTAL TIME", end-start)

# Scraping FoundIt

In [24]:
# Search slug for the FoundIt scrape; the next cell interpolates it into
# the search URL as ".../search/{job_query}-jobs".
job_query = "python-developer"

In [25]:
# Scrape FoundIt search results for `job_query` and build `foundit_df`.
# Each result card is clicked, then location, seniority, posting age,
# employment type, job function and the description are read from the page.
# Every row has exactly nine fields; anything that cannot be read is left as ''.
# `foundit_df` stays empty if the scrape fails.
start = time.time()
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
url = f"https://www.foundit.com.ph/search/{job_query}-jobs"

foundit_df = pd.DataFrame()
try:
    driver.get(url)
    job_list = driver.find_elements(By.XPATH, "//div[@class='srpResultCard']/div")
    job_list = job_list[1:] # remove the header

    job_rows = []
    for job in job_list:
        # scraping title, company, url
        title_link = job.find_element(By.XPATH, ".//a[@title]")
        job_url = title_link.get_attribute('href')
        job_title = title_link.get_attribute('innerHTML').replace('\n', '').strip()
        company = job.find_element(By.XPATH, ".//div[@class='companyName']/span")
        company = company.get_attribute('innerHTML').replace('\n', '').strip()
        descs = [job_title, company, job_url]

        # scraping location, seniority, posted ago
        # NOTE(review): the waits below succeed as soon as matching elements are
        # visible; confirm they cannot match the previously clicked job's details.
        job.find_element(By.XPATH, './/div[@onclick]/div').click()
        wait = WebDriverWait(driver, 5)
        # Filled in place so the row keeps its length even on a partial failure.
        highlights = ['', '', '']
        try:
            desc_job = wait.until(EC.visibility_of_all_elements_located((By.XPATH, ".//div[@id='jobHighlight']/div/div/div")))
            for i in (0, 1):
                detail = desc_job[i].find_element(By.XPATH, ".//div[@class='details']")
                highlights[i] = detail.get_attribute('innerHTML').replace('\n', '').strip()
            # the third highlight is markup ending in "</i>" followed by the text
            posted = desc_job[2].find_element(By.XPATH, ".//span[@class='btnHighighlights']")
            posted = posted.get_attribute('innerHTML').replace('\n', '').split('</i>')
            highlights[2] = posted[1].strip()
        except (WebDriverException, IndexError):
            pass  # keep whatever was read; the remaining highlights stay blank
        descs.extend(highlights)

        # scraping emp_type, job function, general job_desc
        try:
            desc_job_2 = wait.until(EC.visibility_of_all_elements_located((By.XPATH, ".//div[@id='jobDetail']/div/div")))
        except WebDriverException:
            desc_job_2 = []  # detail pane never appeared: both fields become ''
        for i in (0, 2):
            try:
                detail = desc_job_2[i].find_element(By.XPATH, ".//div[@class='jobDesc']")
                descs.append(detail.get_attribute('innerHTML').replace('\n', '').strip())
            except (WebDriverException, IndexError):
                descs.append('')
        try:
            desc_gen = wait.until(EC.visibility_of_element_located((By.XPATH, ".//p[@class='jobDescInfo']")))
            descs.append(desc_gen.get_attribute('innerHTML'))
        except WebDriverException:
            descs.append('')

        job_rows.append(descs)

    # reshape(-1, 9) keeps the array 2-D for zero or one scraped job as well
    jobs_scraped = np.array(job_rows, dtype=object).reshape(-1, 9)

    foundit_df = pd.DataFrame(
        jobs_scraped,
        columns=['job_title','company','link','location','seniority','posted','emp_type','job_func','job_desc'],
    )
except WebDriverException as exc:
    print("FoundIt not accessible:", type(exc).__name__)
finally:
    # quit() ends the whole WebDriver session; close() only closes the window
    driver.quit()

end = time.time()
print("TOTAL TIME", end-start)

TOTAL TIME 29.352275133132935


# Scraping Kalibrr

In [34]:
# Search slug for the Kalibrr scrape; the next cell interpolates it into
# the search URL as ".../home/te/{job_query}".
job_query = "web-developer"

In [None]:
# Scrape Kalibrr search results for `job_query` and build `kalibrr_df`.
#   Pass 1 reads title / type / company / location / url from the result cards,
#   clicking the page's primary button to load more until the limit is reached.
#   Pass 2 opens each posting for job category, job level and description text.
# The two tables are left-joined on `job_url`, so a job whose detail page could
# not be read is kept. `kalibrr_df` stays empty if the scrape fails.
start = time.time()
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
url = f"https://www.kalibrr.com/home/te/{job_query}"

# maximum number of jobs to collect; raise it (e.g. to 100) to scrape more
KALIBRR_MAX_JOBS = 27

kalibrr_df = pd.DataFrame()
try:
    driver.get(url)
    card_xpath = "//div[@class='k-container k-grid k-grid-cols-1 md:k-grid-cols-2 xl:k-grid-cols-3 k-gap-4 k-mt-8 k-mb-10']/div"

    # scrape title, type, company, location, url
    job_rows = []
    seen_urls = set()
    while len(job_rows) < KALIBRR_MAX_JOBS:
        # Re-evaluate the card list after each load; cards already scraped in an
        # earlier pass are recognised by URL and skipped.
        job_list = driver.find_elements(By.XPATH, card_xpath)
        rows_before = len(job_rows)

        for job in job_list:
            if len(job_rows) >= KALIBRR_MAX_JOBS:
                break  # limit reached

            job_title_element = job.find_element(By.XPATH, ".//h2[@data-tooltip-id='job-title-tooltip-[object Object]']/a")
            job_url = job_title_element.get_attribute('href')
            if job_url in seen_urls:
                continue
            job_title = job_title_element.get_attribute('innerHTML').replace('\n', '').strip()

            job_type = job.find_element(By.XPATH, "./div[@class='k-relative']/div/span/span[@class='k-text-gray-500']")
            job_type = job_type.get_attribute('innerHTML').replace('\n', '').strip()

            job_company = job.find_element(By.XPATH, ".//span[@class='k-inline-flex k-items-center k-mb-1']/a")
            job_company = job_company.text.strip()

            job_location = job.find_element(By.XPATH, "./div[@class='k-relative']/div/span/span[@class='k-text-gray-500 k-block k-pointer-events-none']")
            job_location = job_location.get_attribute('innerHTML').replace('\n', '').strip()

            seen_urls.add(job_url)
            job_rows.append([job_title, job_type, job_company, job_location, job_url])

        # Stop when the limit is reached, or when this pass found nothing new
        # (otherwise the loop could never terminate).
        if len(job_rows) >= KALIBRR_MAX_JOBS or len(job_rows) == rows_before:
            break

        # click the button once per pass to load more jobs
        try:
            load_more = driver.find_element(By.XPATH, "//button[@class='k-btn-primary']")
            load_more.click()
            time.sleep(2)
        except WebDriverException:
            break  # no button (or not clickable): nothing more to load

    # reshape(-1, 5) keeps the array 2-D for zero or one scraped job as well
    jobs_scraped = np.array(job_rows, dtype=object).reshape(-1, 5)

    # print jobs scraped overview
    print(jobs_scraped.shape)
    if len(jobs_scraped):
        print(jobs_scraped[0])

    # scrape desc, job categ, job level
    detail_rows = []
    for job_url in jobs_scraped[:, 4]:
        try:
            driver.get(job_url)
            wait = WebDriverWait(driver, timeout=2)
            new_page = wait.until(EC.presence_of_element_located((By.XPATH, ".//div[@class='md:k-w-full md:k-pr-4 css-11e7ttb']")))
            job_all_desc = new_page.find_element(By.XPATH, "./div[@itemprop='description']")
            all_texts = job_all_desc.find_elements(By.XPATH, ".//*[self::p or self::li or self::span or self::div or self::a]")
            # read each element's text once; every `.text` access is a driver call
            texts = [element.text for element in all_texts]
            job_desc = " ".join([text for text in texts if text.strip() != ""])

            job_categ = new_page.find_element(By.XPATH, ".//div[@class='md:k-flex']//dt[contains(text(),'Job Category')]/following-sibling::dd/a")
            job_categ = job_categ.get_attribute('innerHTML').replace('\n', '').strip()

            job_level = new_page.find_element(By.XPATH, ".//div[@class='md:k-flex']//dt[contains(text(),'Job Level')]/following-sibling::dd/a")
            job_level = job_level.get_attribute('innerHTML').replace('\n', '').strip()

            descs = [job_categ, job_level, job_desc, job_url]
        except WebDriverException:
            # Blank row with no URL: it matches nothing in the left merge, so
            # the job is kept with missing detail columns.
            descs = ['', '', '', '']

        detail_rows.append(descs)
        time.sleep(2)

    # check descs scraped overview
    job_descs = np.array(detail_rows, dtype=object).reshape(-1, 4)
    print(job_descs.shape)
    if len(job_descs):
        print(job_descs[0])

    jobs_df = pd.DataFrame(jobs_scraped, columns=['job_title', 'job_type' ,'job_company', 'job_location', 'job_url'])
    job_descs_df = pd.DataFrame(job_descs, columns=['job_categ','job_level','job_desc','job_url'])
    kalibrr_df = jobs_df.merge(job_descs_df, on='job_url',how='left')

except WebDriverException as exc:
    print("Kalibrr not accessible:", type(exc).__name__)
finally:
    # quit() ends the whole WebDriver session; close() only closes the window
    driver.quit()
end = time.time()
print("TOTAL TIME", end-start)
print(kalibrr_df.head(1))
    

(27, 5)
['Web Developer' 'Full time' 'SEO Hacker' 'Parañaque, Philippines'
 'https://www.kalibrr.com/c/seo-marketing-services-inc/jobs/240698/web-developer']
(27, 4)
['IT and Software' 'Associate / Supervisor'
 'Develop custom WordPress themes from scratch, adhering to best practices and design standards. Utilize Custom Post Types and Advanced Custom Fields (ACF) plugins to enhance the functionality and flexibility of WordPress websites. Customize WooCommerce templates to meet specific project requirements. Collaborate with designers and other team members to ensure seamless integration of design and functionality. Troubleshoot and resolve technical issues related to WordPress development. Stay updated on the latest trends and technologies in WordPress development and implement them effectively. Maintain existing WordPress websites ensuring all functions are working and security measures are up to date. Writing well designed, testable, efficient code by using best software development 

# Scraping Jobstreet

In [72]:
# insert scraping code here