In [None]:
import csv
import re
import shutil,string,time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import clear_output


## Web Scraping Indeed

In [1]:
class IndeedScrapper():
    """Collect job ids and job details from sg.indeed.com for one search query.

    Parameters
    ----------
    q : str
        Search query as typed by a human (e.g. ``'data analyst'``).
    max_iter : int, default 10
        Maximum number of result pages to request; page ``i`` is fetched with
        ``start=i*10``.
    timeout : int or float, default 5
        Seconds to sleep after each request. Despite the name this is a
        politeness delay, not a network timeout.
    """
    _BASE_URL = 'https://sg.indeed.com'
    # Seconds to wait for an HTTP response; without it a stalled connection
    # would block the notebook indefinitely.
    _REQUEST_TIMEOUT = 30

    def __init__ (self, q, max_iter=10, timeout=5):
        self.query_formatted = q
        # Raw string: '\s' in a normal string literal is an invalid escape sequence.
        self.query = re.sub(r'\s', '%20', q)
        self.max_iter = max_iter
        self.timeout = timeout
        # Per-instance list. A class-level list would be shared by every
        # scraper, so ids found for one query would leak into the next one.
        self._job_ids = []

    def scrape (self):
        """Page through the search results and return the unique job ids found.

        Paging stops early when a page yields no ids or only ids that were
        already collected by this instance.

        Returns
        -------
        list of str
            De-duplicated 16-character job ids (order is not preserved).
        """
        print(f'Scrapping Indeed using query {self.query_formatted}')
        for i in range(self.max_iter):
            r = requests.get(
                f'{self._BASE_URL}/jobs?q={self.query}&filter=0&start={i*10}',
                timeout=self._REQUEST_TIMEOUT,
            )
            # Job ids are embedded in the page source as  jk:'<16 chars>'.
            found_ids = re.findall(r"jk:'(.{16})'", r.text)

            seen = set(self._job_ids)
            if not found_ids or all(job_id in seen for job_id in found_ids):
                break

            self._job_ids.extend(found_ids)
            time.sleep(self.timeout)

        return list(set(self._job_ids))

    def scrape_job_desc(self, job_ids):
        """Fetch the detail page of every id in ``job_ids``.

        Parameters
        ----------
        job_ids : iterable of str
            Job ids as returned by :meth:`scrape`.

        Returns
        -------
        pandas.DataFrame
            One row per job with columns JOB_ID, JOB_URL, JOB_TITLE, COMPANY,
            SALARY, JOB_DESC and QUERY. SALARY is NaN when the page has no
            salary element.

        Raises
        ------
        AttributeError
            If a page lacks the title, company or description element.
        """
        print('Scrapping job descriptions.')
        columns = ['JOB_ID', 'JOB_URL','JOB_TITLE','COMPANY','SALARY','JOB_DESC','QUERY']
        rows = []

        for job_id in job_ids:
            job_url = f'{self._BASE_URL}/m/basecamp/viewjob?viewtype=embedded&jk={job_id}'
            r = requests.get(job_url, timeout=self._REQUEST_TIMEOUT)

            soup = BeautifulSoup(r.text, 'html.parser')
            job_title = soup.find('h1').text
            company = soup.find('div', attrs={'class': 'jobsearch-CompanyInfoContainer'}).text
            job_description = soup.find('div', attrs={'id': 'jobDescriptionText'}).text

            # Salary is optional: test for the element instead of using a bare except.
            salary_node = soup.find('div', attrs={'id': 'salaryInfoAndJobType'})
            salary = salary_node.text if salary_node is not None else np.nan

            rows.append([job_id, job_url, job_title, company, salary, job_description, self.query_formatted])

            time.sleep(self.timeout)

        # Build the frame once instead of enlarging it row by row with .loc.
        return pd.DataFrame(rows, columns=columns, dtype=object)

In [None]:
# Load the state saved by earlier runs of the Indeed loop: the ids already
# scraped, the job titles still to query, and the rows collected so far.
# Blank entries are dropped because ''.split(',') yields [''], which would
# otherwise put an empty title/id into the lists when a file is empty or
# ends with a newline.
with open('./job_ids.csv', 'r') as f:
    job_ids = [entry.strip() for entry in f.read().split(',') if entry.strip()]
with open('./job_titles.csv', 'r') as f:
    job_titles = [entry.strip() for entry in f.read().split(',') if entry.strip()]
with open('./job_descriptions.csv', 'r', encoding='utf-8') as f:
    df = pd.read_csv(f, index_col=0)

In [None]:
# Consume job_titles from the end, one query per pass. After each pass the
# id list, the remaining titles and the accumulated rows are written back to
# disk. The loop stops with an exception when a query yields no new ids.
while job_titles:
    print('Waiting...')
    time.sleep(5)
    clear_output(wait=True)

    next_title = job_titles.pop()
    scraper = IndeedScrapper(next_title, timeout=2)
    job_id_list = scraper.scrape()
    next_job_ids = [candidate for candidate in job_id_list if candidate not in job_ids]

    if not next_job_ids:
        raise Exception("next_job_ids is empty.")

    job_ids = list(set(job_ids + job_id_list))
    job_descriptions = scraper.scrape_job_desc(next_job_ids)
    df = pd.concat([df, job_descriptions]).reset_index(drop=True)

    # Persist progress for this title.
    with open('./job_ids.csv', 'w') as f:
        f.write(','.join(job_ids))
    with open('./job_titles.csv', 'w') as f:
        f.write(','.join(job_titles))

    df.to_csv('./job_descriptions.csv', encoding='utf-8')
else:
    # Reached only when job_titles is exhausted (the loop has no break).
    print('Reach end of job_titles.')

## Scraping Jobstreet

In [None]:
def getJobDescription(job_url, timeout=30, delay=3):
    """Download one Jobstreet job page and extract its description text.

    Parameters
    ----------
    job_url : str
        Absolute URL of the job detail page.
    timeout : int or float, default 30
        Seconds to wait for the HTTP response before giving up.
    delay : int or float, default 3
        Seconds to sleep after a successful extraction.

    Returns
    -------
    tuple of (str, str) or None
        ``(job_description, job_specialization)`` with every comma replaced
        by ``---``. ``job_specialization`` is ``""`` when the page has no
        'Job Specializations' label. ``None`` is returned, after printing
        the error, when the download or the extraction fails.
    """
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.81 Safari/537.36"}
    label_class = "sx2jih0 zcydq84u _18qlyvc0 _18qlyvc1x _18qlyvc3 _1d0g9qk4 _18qlyvcb"
    value_class = "sx2jih0 zcydq84u _18qlyvc0 _18qlyvc1x _18qlyvc1 _1d0g9qk4 _18qlyvcb"
    try:
        # The request is inside the try so network failures are reported the
        # same way as parsing failures.
        response = requests.get(job_url, headers=headers, timeout=timeout)
        # Report 4xx/5xx responses explicitly rather than failing later with
        # an unrelated "list index out of range" while parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        job_specialization = ""
        for element in soup.find_all("span", {"class": label_class}):
            if element.text == 'Job Specializations':
                # The value span is looked up two levels above the label.
                job_specialization = element.parent.parent.find("span", {"class": value_class}).text
                break
        job_description = soup.find_all("div", {"class": "YCeva_0"})[0].text
        time.sleep(delay)
        return job_description.replace(",", "---"),job_specialization.replace(",", "---")
    except Exception as e:
        print("%s error %s" % (job_url, str(e)))
        return None

In [None]:
# Search-results URL template: the first %s takes the hyphenated keyword,
# the second %s takes the page number.
url = "https://www.jobstreet.com.sg/en/job-search/%s-jobs-in-singapore/%s/"

# Rows already saved by earlier Jobstreet runs.
# NOTE: this rebinds `df`, which the Indeed cells above also use.
df = pd.read_csv(
    "job_descriptions-jobstreet-new.csv",
    encoding="utf-8",
)
# Search keywords queried on Jobstreet, in the order they are requested.
# Contents and order are unchanged; the list is only re-laid-out so that no
# string literal is split across a physical line ('English Teacher' was).
JOB_KEYWORDS = [
    'General Ledger Accountant', 'Accountancy Practice', 'Senior Accountant', 'Accounts Administrator', 'Auditor',
    'Company Accountant', 'Assistant Accountant', 'Accountant', 'Accounting Clerk', 'Junior Accountant',
    'Housekeeping Supervisor', 'Executive Level', 'District Manager', 'Medical Secretary', 'Inventory Management Associate',
    'Junior Secretary', 'Admin', 'Executive Secretary', 'Project Manager', 'Teller',
    'Copywriter', 'Director of Communication', 'Brand Manager', 'Fashion Design', 'Radio',
    'Animation', 'Games', 'Creative Director', 'Campaign Manager', 'Events Design',
    'Agribusiness', 'Agriculture', 'Animal Care', 'Horticulture', 'Property',
    'Architect', 'Architectural Technician', 'Architecture', 'GIS', 'Landscape Architect', 'Planner',
    'Graphic Artist', 'Art', 'Designer', 'Music Industry', 'User Interface Designer',
    'Product Design', 'Graphic Designer', 'Graphics Designer', 'Music', 'Arts',
    'Back Office', 'Settlements', 'Securities Lending', 'Reconciliations', 'Offshore Banking',
    'Analyst', 'Corp Actions', 'Banking Lawyer', 'Underwriter', 'Corporate Business Sales',
    'Oracle Developer', 'Webmaster', 'Network Support Engineer', 'Development Engineer', 'Database Administrator',
    'Lead Developer', 'Website Manager', 'Visual Information Specialist', 'XML Development Specialist', 'Computer Scientist',
    'Planning Manager', 'Electrician', 'Interior Designer', 'Trades', 'Quantity Surveyor',
    'Rail', 'Construction', 'Joiner', 'Facilities Management', 'Construction Worker',
    'Change Management', 'Management Consultant', 'Consultant', 'Business Advisor', 'Business Analyst',
    'Programme Management', 'Statistician', 'Junior Business Analyst', 'Consulting', 'Outsourcing',
    'Customer Service Officer', 'Customer Service Advisor', 'Help Desk', 'Sales Engineer', 'Support Assistant',
    'Call Centre', 'Retail Customer Support Representative', 'Contact Centre Manager', 'Helpdesk Administrator', 'General Service Technician',
    'Primary School', 'Assistant Teacher', 'English Teacher', 'Lecturer', 'Higher Education',
    'Secondary School', 'Supply Teacher', 'Further Education', 'Instructor', 'Teaching',
    'Nuclear', 'Energy', 'Alternative Energy', 'Gas', 'Power Supply',
    'Oil', 'Exploration', 'Resource Trading', 'Drilling', 'Utilities',
    'CNC Programming', 'Field Engineer', 'Engineer', 'Environmental Engineer', 'Test Engineer',
    'Plastics', 'Operations Research Analyst', 'Field', 'Structural', 'Maintenance Engineer',
    'Installation Engineer', 'Concierge', 'Handyman', 'Welder', 'HVAC Engineer',
    'Maintenance Worker', 'Facilities Coordinator', 'Maintenance', 'General Foreman', 'Welding',
    'Economist', 'Tax Manager', 'Consultancy', 'Budget Analyst', 'VP Director of Finance',
    'Payment', 'Controller', 'Finance', 'Finance Director', 'Financial Consultant',
    'Catering Assistant', 'Grill Cook', 'Food Service', 'Catering', 'Cook',
    'Baker', 'Bakery', 'Line Cook', 'Food Service Worker', 'Contract Catering Management',
    'Personal Training', 'Medical Receptionist', 'Associate Specialist', 'Healthcare Assistant', 'Occupational Therapy',
    'General Healthcare Assistant', 'Optical Assistant', 'Phlebotomy', 'Unit Manager', 'Health Care',
    'Hotel', 'Banquet Manager', 'Cleaning', 'Porter', 'Cleaner',
    'Events Management', 'Hotel Management', 'Guest Service Agent', 'Event Manager', 'Hotel Receptionist',
    'Director of Human Resources', 'HR Specialist', 'HR', 'HR Generalist', 'HR Assistant',
    'Job Coach', 'Recruiter', 'Human Resources Manager', 'Head of HR', 'IT Recruitment Consultant',
    'Benefits Specialist', 'Benefits', 'Insurance Sales Representative', 'Personal Lines', 'Field Investigator',
    'Actuarial', 'Insurance', 'Benefits Administrator', 'Fraud', 'Insurance Sales Advisor',
    'Digital', 'Technical Architect', 'IT Support Engineer', 'IT', 'IT System Manager',
    'Project Office Support', 'Web Design', 'IT Director', 'Desktop Support Engineer', 'Middleware',
    'Law Enforcement', 'Law', 'Police Officer', 'Loss Prevention Manager', 'Police', 'Policy', 'Probation',
    'Litigation', 'Procurement', 'Legal Counsel', 'Legal Administrator', 'General Counsel',
    'Legal Executive', 'Legal', 'Trading Standards',
    'Loan Officer', 'Loans Advisor', 'Loan', 'Mortgages', 'Loans',
    'Mortgage', 'Loan Consultant', 'Loans Administration',
    'Warehouse Assistant', 'Warehouse Worker', 'Logistics', 'Warehouse Person', 'Warehouse Coordinator',
    'Logistics Analyst', 'Logistics Manager', 'Warehouse Specialist', 'Warehouse Driver', 'Warehouse Associate',
    'Bid Manager', 'Team Manager', 'Site Management', 'Safety Manager', 'Business Manager',
    'Business Office Manager', 'Team Leader', 'Senior Project Manager', 'Management Accountant', 'Practice Manager',
    'Production', 'QA Engineer', 'Quality Technician', 'Vessel Manager', 'Quality',
    'Quality Inspector', 'Production Planning Manager', 'QC Analyst', 'Quality Assurance Manager', 'Quality Controller',
    'Marketing Analyst', 'Online Marketing Manager', 'Direct Marketing Executive', 'Market Executive', 'Marketing Analytics',
    'Online Marketing Executive', 'Marketing Manager', 'Director of Marketing', 'Direct Marketing Manager', 'Market Research',
    'Maintenance Mechanic', 'Validation Engineer', 'Procurement Manager', 'Material Handler', 'Mechanical Fitter',
    'CNC Machinist', 'Technician', 'Drafter', 'Mechanical', 'Machinist',
    'Chemical Analyst', 'Lab Assistant', 'Pharmaceutical', 'Pharmacy Technician', 'Lab Technician',
    'Chemical', 'Clinical Research Associate', 'Pharmacist', 'Analytical Chemist', 'Pharmacology',
    'PR Executive', 'PR Account Manager', 'Public Relations Manager', 'PR Manager', 'PR', 'Public Relations',
    'Editorial Assistant', 'Writer', 'Technical Writer', 'News', 'Editor',
    'Journalism', 'Writer Editor', 'Editorial', 'Desktop Publishing', 'Newspaper Delivery',
    'Lease', 'Real Estate Agent', 'Leasing Agent', 'Leasing Consultant', 'Property Manager',
    'Estate Agency', 'Real Estate', 'Actuarial', 'Assistant Property Manager', 'Land Surveyor',
    'Waiter', 'Waiting Staff', 'Chef De Partie', 'Restaurant Supervisor', 'Chef Manager',
    'Sous Chef', 'Dishwasher', 'Wait Staff', 'Commis Chef', 'Development Chef',
    'Store Promotions', 'Retail', 'Beauty', 'Store Manager', 'Visual Merchandiser',
    'Retail Sales Representative', 'Retail Assistant', 'Assistant Store Manager', 'Store Staff', 'Cashier',
    'Director of Business Development', 'National Sales Manager', 'Sales Specialist', 'Field Representative', 'Business Development Executive',
    'Commercial Sales Manager', 'Merchandiser', 'Sales Consultant', 'National Account Manager', 'Media Sales',
    'Clinical Data Management', 'Science', 'Polymer Chemistry', 'Life Science', 'Scientist',
    'Food Science', 'Scientific', 'Molecular Biology', 'Materials Science', 'Physics',
    'Safety Officer', 'Security Officer', 'Health and Safety Manager', 'Public Services', 'Security Contracts Manager',
    'Local Government', 'Military', 'Fire Safety', 'Government', 'Health and Safety Officer',
    'Youth Support Worker', 'Social Care', 'Aged Care', 'Social Care Assessment', 'Elderly Care',
    'Support Worker', 'Disability Support', 'Environmental', 'Family Support', 'Care Manager',
    'Telecommunications Specialist', 'Telecoms Engineer', 'Wireless', 'Broadband', 'Telecom',
    'Telecommunications', 'Communications Specialist', 'Telecoms Consultant', 'Wireless Consultant',
    'Traineeship', 'Finance Trainer', 'Training Manager', 'Apprenticeship', 'Training Assistant',
    'Trainer', 'Product Trainer', 'Technical Training', 'Training', 'Healthcare Trainer',
    'Fleet Owners', 'Van Driver', 'Truck Driver', 'Forklift Operator', 'Vehicle Mechanic',
    'Bus Driver', 'Parts Advisor', 'Shipping', 'Driver', 'Delivery Driver',
    'Business Travel Consultant', 'Travel Consultant', 'Childrens Activity', 'Business Travel', 'Airline',
    'Tourism', 'Travel Agent', 'Cruise', 'Travel', 'Events',
    'Charity', 'Volunteer Coordinator', 'Counselling', 'Red Cross', 'Corporate Partnerships',
    'Community Volunteer', 'Community Support', 'Recycling', 'Volunteers',
]
# Browser-like headers sent with every search-results request.
SEARCH_HEADERS = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.81 Safari/537.36",
}
# Seconds to wait for a search-results response before skipping that page.
SEARCH_TIMEOUT = 30
# File that new rows are appended to (the same file `df` was loaded from).
OUTPUT_CSV = "job_descriptions-jobstreet-new.csv"


def _clean_field(text):
    """Trim whitespace and replace commas with ';', as the existing rows do."""
    return text.strip().replace(",", ";")


# URLs already in the CSV. Every URL written during this run is added too, so
# a job listed under several keywords or pages is scraped and saved only once.
known_jobs = set(df["JOB_URL"].tolist())

# Result pages 1..30 are requested for every keyword, page by page.
for index in range(1, 31):
    for keyword in JOB_KEYWORDS:
        search_url = url % (keyword.lower().replace(" ", "-"), index)
        try:
            response = requests.get(search_url, headers=SEARCH_HEADERS, timeout=SEARCH_TIMEOUT)
        except requests.RequestException as e:
            # A failed search page is reported and skipped rather than
            # hanging or aborting the whole run.
            print("%s error %s" % (search_url, e))
            time.sleep(5)
            continue

        soup = BeautifulSoup(response.content, "html.parser")
        for job in soup.find_all("div", {"class": "sx2jih0 zcydq876 zcydq866 zcydq896 zcydq886 zcydq8n zcydq856 zcydq8f6 zcydq8eu"}):
            try:
                job_url = "https://www.jobstreet.com.sg" + job.find("a", {"class": "_1hr6tkx5 _1hr6tkx8 _1hr6tkxb sx2jih0 sx2jihf zcydq8h"}).attrs["href"]
                if job_url in known_jobs:
                    continue

                # Title and company are required: a card without them is skipped.
                job_title = _clean_field(job.find("div", {"class": "sx2jih0 l3gun70 l3gun74 l3gun72"}).text)
                company = _clean_field(job.find("span", {"class": "sx2jih0 zcydq84u _18qlyvc0 _18qlyvc1x _18qlyvc1 _18qlyvca"}).text)

                # Location, salary and post date are optional and default to "".
                try:
                    job_location = _clean_field(job.find("span", {"class": "sx2jih0 zcydq84u zcydq80 iwjz4h0"}).text)
                except AttributeError:
                    job_location = ""
                try:
                    salary = _clean_field(job.find_all("span", {"class": "sx2jih0 zcydq84u _18qlyvc0 _18qlyvc1x _18qlyvc3 _18qlyvc7"})[1].text)
                except (AttributeError, IndexError):
                    salary = ""

                job_snippet = ""
                for snippet in job.find("ul", {"class": "sx2jih0 sx2jih3 _17fduda0 _17fduda5"}).contents:
                    job_snippet += _clean_field(snippet.text) + "   "

                try:
                    post_date = _clean_field(job.find("span", {"class": "sx2jih0 zcydq84u _18qlyvc0 _18qlyvc1y _18qlyvc1 _18qlyvc7"}).text)
                except AttributeError:
                    post_date = ""

                details = getJobDescription(job_url)
                if details is None:
                    # Detail page failed; getJobDescription already printed why.
                    continue
                job_description, job_specialization = details

                # csv.writer quotes fields that contain newlines or quote
                # characters, which a hand-built "%s,%s" row would not.
                with open(OUTPUT_CSV, "a", encoding="utf-8", newline="") as f:
                    csv.writer(f, lineterminator="\n").writerow([
                        job_url, job_title, company, job_location, salary,
                        job_snippet, post_date, job_description,
                        job_specialization, keyword,
                    ])
                known_jobs.add(job_url)
            except Exception as e:
                # Best effort per job card: report the problem and move on.
                print("skipped a job card for %s (page %s): %s" % (keyword, index, e))
                continue
        time.sleep(5)
        print("finished %s for %s" % (index,keyword))