In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import csv

## Scraping functions

In [2]:
def scrape_article_ids(api_url, max_pages, timeout=30):
    """Page through a search API URL and collect one value per listing field.

    Pages ``1..max_pages`` are requested by appending ``&page=<n>`` to
    ``api_url``. Paging stops early when a request fails, when the server
    answers with a non-200 status, or when a page contains no listings.

    Args:
        api_url: Search endpoint URL that already carries a query string.
        max_pages: Highest page number to request (inclusive).
        timeout: Seconds to wait for each response. Without a timeout a
            stalled connection would block the notebook indefinitely.

    Returns:
        A tuple of eleven parallel lists, in this order: job ids, titles,
        teasers, companies, locations, categories, sub-categories, job types,
        salaries, listing dates and listing-date display strings. Optional
        fields that are absent from a listing are stored as ``''``.

    Raises:
        KeyError: If a listing lacks ``id``, ``title`` or ``teaser``.
    """
    field_order = ('job_id', 'titles', 'teasers', 'companies', 'locations',
                   'categorys', 'subCategories', 'job_types', 'salaries',
                   'dateposted', 'datedisplay')
    columns = {name: [] for name in field_order}

    def nested_description(item, key):
        # The nested objects may be missing or null; treat both as "no value"
        # instead of aborting the whole scrape.
        return (item.get(key) or {}).get('description', '')

    for page_number in range(1, max_pages + 1):
        page_url = f'{api_url}&page={page_number}'

        try:
            response = requests.get(page_url, timeout=timeout)
        except requests.RequestException as error:
            # Keep whatever was collected so far rather than losing it.
            print(f"Failed to retrieve data from the API. Error: {error}")
            break

        if response.status_code != 200:
            print(f"Failed to retrieve data from the API. Status Code: {response.status_code}")
            break

        listings = response.json().get('data') or []
        if not listings:
            # An empty page means the results are exhausted; later pages
            # would only cost extra requests.
            break

        for item in listings:
            columns['job_id'].append(item['id'])
            columns['titles'].append(item['title'])
            columns['teasers'].append(item['teaser'])
            columns['companies'].append(nested_description(item, 'advertiser'))
            columns['locations'].append(item.get('location', ''))
            columns['categorys'].append(nested_description(item, 'classification'))
            columns['subCategories'].append(nested_description(item, 'subClassification'))
            columns['job_types'].append(item.get('workType', ''))
            columns['salaries'].append(item.get('salary', ''))
            columns['dateposted'].append(item.get('listingDate', ''))
            columns['datedisplay'].append(item.get('listingDateDisplay', ''))

    return tuple(columns[name] for name in field_order)

def combine_results(api_urls, max_pages):
    """Scrape every URL in ``api_urls`` and concatenate the results column-wise.

    Args:
        api_urls: Iterable of search API URLs, each passed to
            ``scrape_article_ids``.
        max_pages: Page limit forwarded unchanged to ``scrape_article_ids``.

    Returns:
        A dict of lists keyed by column name. It holds the eleven columns
        returned by ``scrape_article_ids`` plus ``'url'``, the job page link
        built from each job id. Results appear in the order the URLs were
        given.
    """
    column_names = ('job_id', 'titles', 'teasers', 'companies', 'locations', 'categorys',
                    'subCategories', 'job_types', 'salaries', 'dateposted', 'datedisplay')
    combined_data = {column: [] for column in column_names}
    combined_data['url'] = []

    for api_url in api_urls:
        (ids, page_titles, page_teasers, page_companies, page_locations,
         page_categories, page_subcategories, page_job_types, page_salaries,
         page_dates, page_date_labels) = scrape_article_ids(api_url, max_pages)

        scraped = (ids, page_titles, page_teasers, page_companies, page_locations,
                   page_categories, page_subcategories, page_job_types, page_salaries,
                   page_dates, page_date_labels)
        for column, values in zip(column_names, scraped):
            combined_data[column].extend(values)

        # Each listing's public page lives under its job id.
        combined_data['url'].extend(
            [f'https://www.jobstreet.com.sg/job/{jid}' for jid in ids]
        )

    return combined_data

# Highest results page requested for each search query.
max_pages = 7

# It is unclear how long these captured search URLs will remain valid.
# NOTE(review): the userid / session / seekerId values look like identifiers
# copied from a browser session - confirm they are safe to share.
# The four URLs differ only in their `userqueryid` token and `keywords` term,
# so the shared parts are written once.
_SEARCH_SESSION_ID = '627c24a7-8b33-456a-9420-ed94dd9c1da6'

_SEARCH_URL_TEMPLATE = (
    'https://www.jobstreet.com.sg/api/chalice-search/v4/search'
    '?siteKey=SG-Main'
    '&sourcesystem=houston'
    '&userqueryid={query_id}'
    '&userid={session}'
    '&usersessionid={session}'
    '&eventCaptureSessionId={session}'
    '&where=singapore'
    '&seekSelectAllPages=true'
    '&keywords={keywords}'
    '&pageSize=30'
    '&include=seodata'
    '&locale=en-SG'
    '&seekerId=527177449'
    '&solId=82238ce8-3314-4947-a2bb-606e4dce4c48'
)

# (userqueryid, keywords) for each search, in the order they are scraped.
_SEARCH_QUERIES = (
    ('a807f6a20a0666f3b89aa9e3cb3bca23-4784291', 'data+analytics'),
    ('9fe593460e74b18f4ff43b70ea18f3b2-7061267', 'data+science'),
    ('7ad56fc8a31dcbb7878db4b353af1c3e-7122654', 'AI'),
    ('aca0e1278d6f7a2ee718037b5200293c-7195166', 'machine+learning'),
)

api_urls = [
    _SEARCH_URL_TEMPLATE.format(
        query_id=query_id, session=_SEARCH_SESSION_ID, keywords=keywords
    )
    for query_id, keywords in _SEARCH_QUERIES
]


## Fetch job descriptions from individual job links

In [3]:
def fetch_job_article(job_id, timeout=30):
    """Download the job page for ``job_id`` and return its HTML.

    Args:
        job_id: Identifier interpolated into the job page URL.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            connection would block the caller's loop indefinitely.

    Returns:
        The response body as text when the server answers with HTTP 200.
        ``None`` (after printing a message) when the status is anything else
        or the request itself fails, so callers can skip the job and continue.
    """
    article_url = f'https://www.jobstreet.com.sg/job/{job_id}'
    try:
        response = requests.get(article_url, timeout=timeout)
    except requests.RequestException as error:
        # Report transport failures the same way as bad status codes so one
        # unreachable page does not abort a long scraping run.
        print(f"Failed to retrieve job article. Error: {error}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve job article. Status Code: {response.status_code}")
        return None
    return response.text

def extract_text_from_ul(html_content):
    """Return the text of every outermost ``<ul>`` in ``html_content``.

    Args:
        html_content: HTML markup, parsed with the ``html.parser`` backend.

    Returns:
        The text of each outermost list, with text fragments separated by
        newlines and the lists themselves joined by newlines. An empty string
        is returned when the markup contains no ``<ul>`` element.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    # find_all() also yields <ul> elements nested inside another list. Their
    # text is already part of the ancestor's get_text(), so keeping them would
    # emit the nested items twice.
    outer_lists = [ul for ul in soup.find_all('ul') if ul.find_parent('ul') is None]
    return '\n'.join(ul.get_text(separator='\n') for ul in outer_lists)

def scrape_and_store_text(job_ids, listings=None):
    """Fetch each job page and pair its list text with the listing metadata.

    The metadata is now passed in explicitly and aligned per job id. Reading
    it from module-level names fails on a fresh kernel, and assigning whole
    columns misaligns rows whenever a page cannot be fetched.

    Args:
        job_ids: Iterable of job ids whose pages should be downloaded.
        listings: Optional dict of parallel lists in the layout returned by
            ``combine_results`` (keys ``job_id``, ``titles``, ``teasers``,
            ``companies``, ``locations``, ``categorys``, ``subCategories``,
            ``job_types``, ``salaries``, ``dateposted``, ``datedisplay``).
            Rows are matched by job id, so the ids are assumed to be of the
            same type as those in ``job_ids``. When an id occurs more than
            once, its first row is used.

    Returns:
        A dict of equally long lists with one entry per job whose page was
        fetched successfully. Metadata that is unavailable for a job is
        stored as ``None``.
    """
    # Output column -> key of the matching list in ``listings``.
    metadata_sources = {
        'job_title': 'titles',
        'job_teaser': 'teasers',
        'company': 'companies',
        'location': 'locations',
        'category': 'categorys',
        'subcategory': 'subCategories',
        'type': 'job_types',
        'salary': 'salaries',
        'dateposted': 'dateposted',
        'datedisplay': 'datedisplay',
    }
    data = {'job_id': [],'job_title':[], 'job_teaser':[], 'company':[], 'descriptions': [], 'location':[],'category':[],'subcategory':[],'type':[],'salary':[],'dateposted':[],'datedisplay':[]}

    listings = listings or {}
    # Position of each job id inside the listing columns (first hit wins).
    row_of = {}
    for position, listed_id in enumerate(listings.get('job_id', [])):
        row_of.setdefault(listed_id, position)

    for job_id in job_ids:
        job_article_content = fetch_job_article(job_id)
        if not job_article_content:
            # Skip the whole row so every column keeps the same length.
            continue

        data['job_id'].append(job_id)
        data['descriptions'].append(extract_text_from_ul(job_article_content))

        position = row_of.get(job_id)
        for column, source_key in metadata_sources.items():
            values = listings.get(source_key, [])
            if position is not None and position < len(values):
                data[column].append(values[position])
            else:
                data[column].append(None)

    return data


## Combine API queries and output to CSV file

In [4]:
# Run every configured search query and gather the listings column-wise.
combined_data = combine_results(api_urls, max_pages)

# Build the table and drop rows that are exact duplicates of an earlier row
# (the same listing can be returned by more than one query).
result_df = pd.DataFrame(combined_data).drop_duplicates()

# Persist the table without the index column.
result_df.to_csv('jobstreet_jobs_data.csv', index=False)

result_df

Unnamed: 0,job_id,titles,teasers,companies,locations,categorys,subCategories,job_types,salaries,dateposted,datedisplay,url
0,75375824,"Manager, SIT Teaching and Learning Academy (Da...",-,Singapore Institute of Technology,Central Region,Information & Communication Technology,Database Development & Administration,Full time,,2024-04-24T04:02:19Z,8h ago,https://www.jobstreet.com.sg/job/75375824
1,75386609,"Senior/ Analyst, Data Science, Group Finance A...",You will champion the development of new data ...,Singapore Health Services Pte Ltd (SingHealth HQ),Central Region,Science & Technology,"Mathematics, Statistics & Information Sciences",Contract/Temp,,2024-04-24T09:25:01Z,3h ago,https://www.jobstreet.com.sg/job/75386609
2,75367548,"Senior Data Analyst, O&T Quality Assurance - I...",The O&T Quality Assurance (QA) in O&T’s COO fu...,Citibank N.A.,Central Region,Information & Communication Technology,Testing & Quality Assurance,Full time,,2024-04-23T16:55:00Z,19h ago,https://www.jobstreet.com.sg/job/75367548
3,75321199,"Data Analyst - Marketing Analytics, Regional B...",-,SHOPEE SINGAPORE PRIVATE LIMITED,Central Region,Marketing & Communications,Marketing Communications,Full time,,2024-04-22T07:53:08Z,2d ago,https://www.jobstreet.com.sg/job/75321199
4,75339613,"Manager, Group Care Plan Operations, Data Anal...",-,NUHS Group Ops & IT,Central Region,Marketing & Communications,Market Research & Analysis,Full time,,2024-04-23T02:14:32Z,1d ago,https://www.jobstreet.com.sg/job/75339613
...,...,...,...,...,...,...,...,...,...,...,...,...
879,75296036,Data Product Manager,Responsible for understanding AEI’s business d...,AES Global Holdings Pte. Ltd.,Central Region,Information & Communication Technology,Product Management & Development,Full time,,2024-04-20T09:07:00Z,4d ago,https://www.jobstreet.com.sg/job/75296036
889,75071220,Video processing algorithm engineer,This inclusive employer is a member of myGwork...,"Zoom Video Communications, Inc.",Central Region,Design & Architecture,Web & Interaction Design,Full time,,2024-04-11T09:02:00Z,13d ago,https://www.jobstreet.com.sg/job/75071220
892,75226829,Fundamental Quantitative Researcher,The fundamental quantitative researchers devel...,DYNAMIC TECHNOLOGY LAB PRIVATE LIMITED,Central Region,Science & Technology,"Mathematics, Statistics & Information Sciences",Full time,,2024-04-18T02:39:00Z,6d ago,https://www.jobstreet.com.sg/job/75226829
893,75327990,Product Lifecycle Engineer (EE),Working at Illumina means being part of someth...,Illumina Singapore Pte Ltd,North Region,Engineering,Electrical/Electronic Engineering,Full time,,2024-04-22T10:29:00Z,2d ago,https://www.jobstreet.com.sg/job/75327990
