In [1]:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import time
import csv
import pandas as pd

In [None]:
# Search terms to scrape, one per job category. Each string is inserted as-is
# into the `q=` query parameter of the Indeed search URL, which is why
# multi-word terms are written with "+" between the words.
job_categories = [
    "software+engineer", "arts", "hr", "sales",
    "ui+ux", "data+analyst", "cyber+security",
]

In [None]:
# Create the single Firefox WebDriver instance used by the scraping functions
# below (they read the module-level name `driver`). The "--headless" argument
# is passed so the browser is started without a visible window.
options = webdriver.FirefoxOptions()
options.add_argument("--headless")
driver = webdriver.Firefox(options=options)

In [None]:
def get_category_urls(category, max_start=990, page_size=10):
    """Build the paginated Indeed search URLs for one job category.

    Args:
        category: Search term placed verbatim into the ``q`` query parameter.
            No URL-encoding is applied here, so the value must already be
            query-safe (e.g. ``"data+analyst"``).
        max_start: Largest ``start`` offset to generate, inclusive. The
            default of 990 reproduces the original fixed range of 100 pages.
        page_size: Distance between consecutive ``start`` offsets.

    Returns:
        list[str]: One URL for each offset ``0, page_size, ..., max_start``.

    Raises:
        ValueError: If ``page_size`` is 0 (raised by ``range``).
    """
    return [
        f"https://sg.indeed.com/jobs?q={category}&start={offset}"
        for offset in range(0, max_start + 1, page_size)
    ]

def get_job_urls(category_url):
    """Collect the job-detail URLs listed on one search-results page.

    Loads ``category_url`` in the shared ``driver``, parses the rendered page
    and looks at the first 10 ``div.cardOutline`` elements. For each card the
    ``href`` of its ``a.jcs-JobTitle`` link is appended to the site root.

    Cards that have no such link, or whose link has no ``href``, are skipped
    instead of raising, so one malformed card does not abort the whole scrape.

    Args:
        category_url: URL of a search-results page.

    Returns:
        list[str]: Absolute job URLs, at most 10, possibly empty.
    """
    driver.get(category_url)
    # Fixed 3 s pause before reading page_source (presumably to let the page
    # finish loading and to space out requests).
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, "html.parser")

    job_urls = []
    for job_listing in soup.find_all("div", class_="cardOutline")[:10]:
        title_link = job_listing.find("a", {"class": "jcs-JobTitle"})
        # find() returns None when the card has no title link, and get()
        # returns None when the link carries no href attribute.
        href = title_link.get("href") if title_link is not None else None
        if not href:
            continue
        job_urls.append("https://sg.indeed.com" + href)
    return job_urls

def get_full_job_description(job_url):
    """Fetch a job page and return its description element.

    Navigates the shared ``driver`` to ``job_url``, waits 3 seconds, parses
    the page source and looks up the first ``div`` with class
    ``jobsearch-jobDescriptionText``.

    Args:
        job_url: URL of a single job posting.

    Returns:
        The matching parsed element (not plain text), or ``None`` when the
        page contains no such ``div``.
    """
    driver.get(job_url)
    time.sleep(3)
    page = BeautifulSoup(driver.page_source, "html.parser")
    return page.find("div", class_="jobsearch-jobDescriptionText")

def scrape_data(verbose=False, output_path="jd.csv"):
    """Scrape job descriptions for every category and write them to a CSV file.

    For each entry of ``job_categories`` this walks the URLs returned by
    ``get_category_urls``, gathers job links with ``get_job_urls`` and writes
    one ``(category, description)`` row per job. No header row is written.
    The description is whatever ``get_full_job_description`` returns; the csv
    writer stores its string form, or an empty field for ``None``.

    A ``TimeoutException`` while loading a results page skips that page, and
    one while loading a job page skips that job. Any other exception
    propagates to the caller.

    Args:
        verbose: When true, print a running count after each written row.
        output_path: Destination CSV file; it is overwritten if it exists.

    Returns:
        None
    """
    page_counter = 0
    # newline="" is what the csv module requires of files handed to
    # csv.writer; without it extra blank rows appear on some platforms.
    # The explicit UTF-8 encoding keeps non-ASCII text from depending on the
    # platform default. The `with` block closes the file on exit.
    with open(output_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        for category in job_categories:
            for category_url in get_category_urls(category):
                try:
                    job_urls = get_job_urls(category_url)
                except TimeoutException:
                    continue  # results page did not load; move to the next one
                for job_url in job_urls:
                    try:
                        full_job_description = get_full_job_description(job_url)
                    except TimeoutException:
                        continue  # job page did not load; move to the next job
                    writer.writerow((category, full_job_description))
                    page_counter += 1
                    if verbose:
                        print(f"{page_counter} page(s) scraped")

In [None]:
try:
    scrape_data()
finally:
    # Shut the browser down even when scraping raises part-way through, so no
    # headless Firefox process is left running.
    driver.quit()

In [None]:
# Category groups for the two derived datasets.
# NOTE(review): "software+engineer" is named in both groups, so it is written
# to both output files here; confirm that overlap is intended.
EASY_CATEGORIES = ["software+engineer", "arts", "hr", "sales"]
DIFFICULT_CATEGORIES = ["software+engineer", "ui+ux", "data+analyst", "cyber+security"]

# jd.csv is written without a header row, so header=None stops pandas from
# consuming the first record as column names.
all_job_descriptions = pd.read_csv(
    "jd.csv", header=None, names=["category", "description"]
)

# Iterating a DataFrame directly yields column labels rather than rows, so the
# rows are selected with boolean masks on the category column. The two masks
# are independent, which lets a category belong to both groups.
is_easy = all_job_descriptions["category"].isin(EASY_CATEGORIES)
is_difficult = all_job_descriptions["category"].isin(DIFFICULT_CATEGORIES)

# Kept as lists of [category, description] rows, as before.
jd_easy = all_job_descriptions[is_easy].values.tolist()
jd_difficult = all_job_descriptions[is_difficult].values.tolist()

pd.DataFrame(jd_easy).to_csv("jd_easy.csv", index=False, header=False)
pd.DataFrame(jd_difficult).to_csv("jd_difficult.csv", index=False, header=False)