In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import csv

## Scraping functions

In [None]:
def scrape_article_ids(api_url, max_pages, timeout=30):
    """Collect listing fields from a paginated job-search API.

    Requests pages ``1..max_pages`` of ``api_url`` (adding a ``page`` query
    parameter) and reads every entry of the ``data`` list in each JSON
    response.

    Parameters
    ----------
    api_url : str
        Search endpoint URL, with or without an existing query string.
    max_pages : int
        Highest page number to request.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    tuple of list
        Ten parallel lists, in this order: job ids, titles, companies,
        locations, categories, sub-categories, job types, salaries,
        listing dates and listing-date display strings. Optional fields
        that are absent from a listing are recorded as ``''``.

    Raises
    ------
    KeyError
        If a listing has no ``id``.

    Notes
    -----
    Paging stops at the first request error, non-200 status or empty page;
    everything collected up to that point is still returned.
    """
    job_id = []
    titles = []
    companies = []
    locations = []
    categorys = []
    subCategories = []
    job_types = []
    salaries = []
    dateposted = []
    datedisplay = []

    def _description(item, key):
        # Nested objects may be absent or null; both mean "no description".
        return (item.get(key) or {}).get('description', '')

    # Pick the separator so the page parameter is valid whether or not the
    # URL already carries a query string.
    separator = '&' if '?' in api_url else '?'

    for page_number in range(1, max_pages + 1):
        page_url = f'{api_url}{separator}page={page_number}'

        # A timeout keeps one stalled connection from hanging the whole run.
        try:
            response = requests.get(page_url, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Failed to retrieve data from the API. Error: {exc}")
            break

        if response.status_code != 200:
            print(f"Failed to retrieve data from the API. Status Code: {response.status_code}")
            break

        items = response.json().get('data') or []
        if not items:
            # An empty page means the results are exhausted; requesting
            # further pages would only add needless load on the API.
            break

        for item in items:
            job_id.append(item['id'])
            titles.append(item.get('title', ''))
            companies.append(_description(item, 'advertiser'))
            locations.append(item.get('location', ''))
            categorys.append(_description(item, 'classification'))
            subCategories.append(_description(item, 'subClassification'))
            job_types.append(item.get('workType', ''))
            salaries.append(item.get('salary', ''))
            dateposted.append(item.get('listingDate', ''))
            datedisplay.append(item.get('listingDateDisplay', ''))

    return job_id, titles, companies, locations, categorys, subCategories,job_types,salaries, dateposted, datedisplay

def combine_results(api_urls, max_pages):
    """Scrape every URL in ``api_urls`` and merge the results column by column.

    Each URL is passed to ``scrape_article_ids`` together with ``max_pages``;
    the ten lists it returns are appended, in order, to the matching column
    of the result.

    Returns a dict mapping column name to a list, with the keys ``job_id``,
    ``titles``, ``companies``, ``locations``, ``categorys``,
    ``subCategories``, ``job_types``, ``salaries``, ``dateposted`` and
    ``datedisplay``.
    """
    column_names = (
        'job_id', 'titles', 'companies', 'locations', 'categorys',
        'subCategories', 'job_types', 'salaries', 'dateposted', 'datedisplay',
    )
    combined_data = {name: [] for name in column_names}

    for api_url in api_urls:
        # Unpack all ten columns explicitly so a result of the wrong length
        # fails before anything is merged.
        (ids, names, employers, places, cats, subcats,
         work_types, pay, posted, shown) = scrape_article_ids(api_url, max_pages)
        scraped_columns = (ids, names, employers, places, cats, subcats,
                           work_types, pay, posted, shown)
        for name, values in zip(column_names, scraped_columns):
            combined_data[name].extend(values)

    return combined_data

# Number of result pages requested for each search URL.
max_pages = 7

# The search URLs embed query- and session-specific identifiers
# (userqueryid, userid, usersessionid, ...); it is not known how long they
# stay valid.
# NOTE(review): userid / seekerId look like account identifiers - confirm they
# are safe to keep in a shared notebook.
_SEARCH_ENDPOINT = 'https://www.jobstreet.com.sg/api/chalice-search/v4/search'
_SESSION_ID = '627c24a7-8b33-456a-9420-ed94dd9c1da6'

# (userqueryid, keywords) for each search; every other parameter is shared.
_SEARCHES = (
    ('a807f6a20a0666f3b89aa9e3cb3bca23-4784291', 'data+analytics'),
    ('9fe593460e74b18f4ff43b70ea18f3b2-7061267', 'data+science'),
    ('7ad56fc8a31dcbb7878db4b353af1c3e-7122654', 'AI'),
    ('aca0e1278d6f7a2ee718037b5200293c-7195166', 'machine+learning'),
)

api_urls = [
    f'{_SEARCH_ENDPOINT}?siteKey=SG-Main&sourcesystem=houston&userqueryid={query_id}'
    f'&userid={_SESSION_ID}&usersessionid={_SESSION_ID}&eventCaptureSessionId={_SESSION_ID}'
    f'&where=singapore&seekSelectAllPages=true&keywords={keywords}'
    '&pageSize=30&include=seodata&locale=en-SG&seekerId=527177449'
    '&solId=82238ce8-3314-4947-a2bb-606e4dce4c48'
    for query_id, keywords in _SEARCHES
]


## Fetch job descriptions from each individual job link

In [None]:
def fetch_job_article(job_id, timeout=30):
    """Download the HTML of a single job posting page.

    Parameters
    ----------
    job_id : str or int
        Identifier appended to ``https://www.jobstreet.com.sg/job/``.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    str or None
        The response body on a 200 status; ``None`` (after printing a
        message) when the request fails or returns any other status.
    """
    article_url = f'https://www.jobstreet.com.sg/job/{job_id}'

    # Without a timeout a single stalled connection would block the whole
    # scrape; connection errors take the same "report and return None" path
    # as a bad status code so callers can simply skip the posting.
    try:
        response = requests.get(article_url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve job article. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve job article. Status Code: {response.status_code}")
        return None

    return response.text

def extract_text_from_ul(html_content):
    """Return the text of every ``<ul>`` element in ``html_content``.

    The markup is parsed with BeautifulSoup's ``html.parser``. Text pieces
    inside each list are separated by newlines, and the lists themselves are
    joined with newlines as well. Yields an empty string when the document
    has no ``<ul>`` elements.
    """
    parsed = BeautifulSoup(html_content, 'html.parser')
    return '\n'.join(
        list_tag.get_text(separator='\n') for list_tag in parsed.find_all('ul')
    )

def scrape_and_store_text(job_ids, listings=None):
    """Fetch each job page and build a table of descriptions plus metadata.

    For every id in ``job_ids`` the posting is downloaded with
    ``fetch_job_article`` and its ``<ul>`` text extracted with
    ``extract_text_from_ul``. Postings that cannot be downloaded are skipped
    entirely, so every column of the result has the same length.

    Parameters
    ----------
    job_ids : iterable
        Job identifiers to fetch.
    listings : dict of list, optional
        Listing metadata in the column layout produced by
        ``combine_results`` (keys ``job_id``, ``titles``, ``companies``,
        ``locations``, ``categorys``, ``subCategories``, ``job_types``,
        ``salaries``, ``dateposted``, ``datedisplay``). Metadata is looked up
        by job id; when ``listings`` is omitted, or an id or column is not
        present in it, the metadata value is ``''``.

    Returns
    -------
    dict of list
        Columns ``job_id``, ``job_title``, ``company``, ``descriptions``,
        ``location``, ``category``, ``subcategory``, ``type``, ``salary``,
        ``dateposted`` and ``datedisplay``, one entry per fetched posting.
    """
    # Output column -> column name used in `listings`.
    listing_columns = {
        'job_title': 'titles',
        'company': 'companies',
        'location': 'locations',
        'category': 'categorys',
        'subcategory': 'subCategories',
        'type': 'job_types',
        'salary': 'salaries',
        'dateposted': 'dateposted',
        'datedisplay': 'datedisplay',
    }
    data = {'job_id': [],'job_title':[],'company':[], 'descriptions': [], 'location':[],'category':[],'subcategory':[],'type':[],'salary':[],'dateposted':[],'datedisplay':[]}

    if listings is None:
        listings = {}

    # Row position of each job id within the listing columns; the first
    # occurrence wins when the same id appears more than once.
    row_of = {}
    for row, listed_id in enumerate(listings.get('job_id', [])):
        row_of.setdefault(listed_id, row)

    for job_id in job_ids:
        job_article_content = fetch_job_article(job_id)
        if not job_article_content:
            # Skip the posting completely so no column gets out of step.
            continue

        data['job_id'].append(job_id)
        data['descriptions'].append(extract_text_from_ul(job_article_content))

        # Metadata is appended per fetched job (never assigned wholesale) so
        # it stays aligned with job_id / descriptions.
        row = row_of.get(job_id)
        for out_column, source_column in listing_columns.items():
            values = listings.get(source_column) if row is not None else None
            data[out_column].append(values[row] if values is not None else '')

    return data


## Combine API queries and output to CSV file

In [None]:
# Run every configured search and merge the scraped columns.
combined_data = combine_results(api_urls, max_pages)

# The same posting can be returned by more than one keyword search. Keep one
# row per job id: comparing whole rows would let a repeat through whenever a
# volatile field (for example the relative "datedisplay" text) differs
# between requests. Chaining instead of inplace=True keeps this cell safe to
# re-run.
result_df = pd.DataFrame(combined_data).drop_duplicates(subset='job_id')

# Persist the listings; the DataFrame index is not written to the file.
result_df.to_csv('jobstreet_jobs_data.csv', index=False)

result_df

Unnamed: 0,job_id,titles,companies,locations,categorys,subCategories,job_types,salaries,dateposted,datedisplay
0,74570578,Intern - Data Analytics and Customer Services,Daimler South East Asia Pte Ltd,Central Region,Science & Technology,"Mathematics, Statistics & Information Sciences",Full time,,2024-03-19T17:14:00Z,1d ago
1,74568682,"Associate, FCC Data Analytics",Standard Chartered Bank,Central Region,Consulting & Strategy,Management & Change Consulting,Full time,,2024-03-19T12:36:00Z,1d ago
2,74526931,Manager / Assistant Senior Manager (Data Analy...,National University of Singapore,Central Region,"Manufacturing, Transport & Logistics",Import/Export & Customs,Full time,,2024-03-18T05:13:56Z,2d ago
3,74601534,18 Months Contract Data Analytics and Operatio...,RECRUIT EXPRESS PTE LTD,Central Region,Information & Communication Technology,Management,Contract/Temp,"$5,000 – $6,000 per month",2024-03-20T08:10:08Z,15h ago
4,74537971,Business Data Analyst (Global),Gradiant International Holdings Pte. Ltd.,West Region,Information & Communication Technology,Business/Systems Analysts,Full time,,2024-03-18T13:22:05Z,2d ago
...,...,...,...,...,...,...,...,...,...,...
875,74115949,Research Engineer (Algorithmic and Statistical...,National University of Singapore,Central Region,Education & Training,Other,Full time,,2024-02-29T06:46:06Z,20d ago
876,74495476,Senior Research Scientist (Mechanobiology Inst...,Yale-NUS College,Central Region,Science & Technology,Biological & Biomedical Sciences,Full time,,2024-03-15T10:36:00Z,5d ago
879,74422017,Date Engineer (ETL) - DSC/JS,ST Engineering Training & Simulation Systems P...,North-East Region,Information & Communication Technology,Engineering - Software,Full time,,2024-03-13T06:31:56Z,7d ago
884,74496936,Senior/Staff ISP RTL Design Engineer,OmniVision Technologies Singapore Pte Ltd,West Region,Engineering,Electrical/Electronic Engineering,Full time,,2024-03-15T10:34:00Z,5d ago
