In [1]:
# One-time setup (run in a terminal): pip install playwright && playwright install chromium

In [5]:
import asyncio
import pandas as pd
from datetime import datetime, timedelta
from playwright.async_api import async_playwright
import os
import re

# Data Collection

In [8]:
# CSS selectors and paging constants shared by the scraper and its helpers.
_JOB_CARD_SELECTOR = "article[data-testid='job-card']"
_JOB_DETAILS_SELECTOR = "div[data-automation='jobAdDetails']"
_JOBS_PER_PAGE = 32                  # a results page with fewer cards is treated as the last one
_MAX_CONSECUTIVE_PAGE_FAILURES = 3   # give up after this many unloadable pages in a row

# "<number><optional +><unit>" as found in relative-age labels such as "5m ago" or "30d+ ago".
_POSTED_AGE_RE = re.compile(r"(\d+)\s*\+?\s*(minutes?|mins?|m|hours?|hrs?|h|days?|d)\b")


def _normalise_job_id(value):
    """Return a job ID as a plain string, dropping the '.0' pandas adds when the column is read as float."""
    text = str(value).strip()
    return text[:-2] if text.endswith(".0") else text


def _parse_posted_time(posted_raw, reference_time):
    """Convert a relative-age label into an absolute datetime.

    Returns ``reference_time`` minus the stated number of minutes/hours/days.
    A non-empty label without any digits falls back to ``reference_time``
    (the original behaviour for unrecognised text). Returns ``None`` when the
    label is empty or carries a number with a unit that is not understood.
    """
    text = posted_raw.strip().lower()
    if not text:
        return None
    match = _POSTED_AGE_RE.search(text)
    if match is None:
        has_digits = any(ch.isdigit() for ch in text)
        return None if has_digits else reference_time
    amount = int(match.group(1))
    unit = match.group(2)[0]  # first letter is enough to tell m / h / d apart
    if unit == "m":
        return reference_time - timedelta(minutes=amount)
    if unit == "h":
        return reference_time - timedelta(hours=amount)
    return reference_time - timedelta(days=amount)


async def _inner_text_or(scope, selector, default="N/A"):
    """Return the inner text of the first element matching ``selector`` under ``scope``, or ``default``."""
    element = await scope.query_selector(selector)
    return await element.inner_text() if element else default


async def _open_listing_page(page, url, page_num):
    """Navigate to ``url`` and wait for job cards, retrying once; return True on success."""
    for attempt in (1, 2):
        try:
            await page.goto(url, timeout=30000)
            await page.wait_for_selector(_JOB_CARD_SELECTOR, timeout=5000)
            return True
        except Exception as e:  # not a bare except: cancellation / Ctrl-C must still propagate
            if attempt == 1:
                print(f"⚠️ Retry page {page_num} due to error: {e}")
    return False


async def _extract_job_id(job):
    """Read the numeric job ID from the card's link, or "N/A" when it cannot be found."""
    anchor = await job.query_selector("a[data-automation='job-list-view-job-link']")
    # get_attribute may return None; normalise so the regex never receives a non-string.
    href = (await anchor.get_attribute("href") if anchor else "") or ""
    match = re.search(r"/job/(\d+)", href)
    return match.group(1) if match else "N/A"


async def _read_job_record(page, job, job_id, scrape_start_time):
    """Collect every output column for the job whose details panel is currently open."""
    job_title = await _inner_text_or(page, "h1[data-automation='job-detail-title']")
    company = await _inner_text_or(page, "span[data-automation='advertiser-name']")
    location = await _inner_text_or(page, "span[data-automation='job-detail-location']")
    work_type = await _inner_text_or(page, "span[data-automation='job-detail-work-type']")
    classification = await _inner_text_or(page, "span[data-automation='job-detail-classifications']")
    salary = await _inner_text_or(page, "span[data-automation='job-detail-salary']", "")
    job_description = await _inner_text_or(page, _JOB_DETAILS_SELECTOR)

    # The posting age lives on the list card, not in the details panel.
    posted_raw = await _inner_text_or(job, "span[data-automation='jobListingDate']", "")
    posted_datetime = "N/A"
    try:
        posted_time = _parse_posted_time(posted_raw, scrape_start_time)
        if posted_time is not None:
            posted_datetime = posted_time.strftime("%d/%m/%y %H:%M")
        elif posted_raw.strip():
            print(f"⚠️ Failed to parse posted time '{posted_raw}'")
    except Exception as e:
        print(f"⚠️ Failed to parse posted time '{posted_raw}': {e}")

    # NOTE(review): clean_text is not defined in the cells shown here; it is assumed
    # to be defined in another cell. If it is missing, the NameError is caught by the
    # caller's per-job handler and every job is reported as an error.
    return {
        "Job Title": clean_text(job_title),
        "Company": clean_text(company),
        "Location": clean_text(location),
        "Work Type": clean_text(work_type),
        "Classification": clean_text(classification),
        "Salary": clean_text(salary),
        "Job Description": clean_text(job_description),
        "Job ID": clean_text(job_id),
        "Posted Time": clean_text(posted_datetime)
    }


async def scrape_jobstreet(file_path="../../Projects-Data/Job-Scraping/Jobstreet-Data.xlsx"):
    """Incrementally scrape JobStreet PH "Data Science" listings into an Excel workbook.

    Results pages are visited newest-first. Each job card is opened and its
    details are recorded. Scraping stops when a job ID already stored in the
    workbook is met, when a page holds fewer than ``_JOBS_PER_PAGE`` cards, or
    when ``_MAX_CONSECUTIVE_PAGE_FAILURES`` pages in a row cannot be loaded
    (previously such pages were skipped forever). New rows are appended to the
    existing workbook contents and written back.

    Args:
        file_path: Workbook to read previous results from and save to. The
            default is the "Jobstreet-Data.xlsx" file that the analysis cells
            load (the earlier hard-coded name was spelled "Jobsteet").

    Returns:
        None. Progress is printed and results are written to ``file_path``.
    """
    scrape_start_time = datetime.now()

    if os.path.exists(file_path):
        existing_df = pd.read_excel(file_path)
        existing_ids = {_normalise_job_id(v) for v in existing_df["Job ID"].dropna()}
        # A stored placeholder must never count as "already scraped".
        existing_ids.discard("N/A")
    else:
        existing_df = pd.DataFrame(columns=[
            "Job Title", "Company", "Location", "Work Type", "Classification",
            "Salary", "Job Description", "Job ID", "Posted Time"
        ])
        existing_ids = set()

    new_data = []
    seen_ids = set()      # IDs captured during this run (cards can repeat across pages)
    page_num = 1
    failed_pages = 0      # consecutive pages that could not be loaded
    stop_scraping = False

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        try:
            page = await browser.new_page()

            while not stop_scraping:
                url = f"https://ph.jobstreet.com/Data-Science-jobs?page={page_num}&sortmode=ListedDate"
                print(f"\n🌐 Navigating to Page {page_num} — {url}")

                if not await _open_listing_page(page, url, page_num):
                    print(f"⚠️ Skipping page {page_num} — no job listings or permanent failure.")
                    failed_pages += 1
                    if failed_pages >= _MAX_CONSECUTIVE_PAGE_FAILURES:
                        print(f"🛑 {failed_pages} consecutive pages failed to load — stopping.")
                        break
                    page_num += 1
                    continue
                failed_pages = 0

                await asyncio.sleep(2)
                jobs = await page.query_selector_all(_JOB_CARD_SELECTOR)
                print(f"📄 Found {len(jobs)} job listings on page {page_num}")

                if len(jobs) < _JOBS_PER_PAGE:
                    print(f"📉 Less than {_JOBS_PER_PAGE} jobs found — likely last page. Stopping after this.")
                    stop_scraping = True

                for i in range(len(jobs)):
                    try:
                        print(f"👉 Clicking job {i+1} on page {page_num}...")

                        # Re-query on every iteration: handles obtained earlier may be stale
                        # after the previous click re-rendered the list.
                        jobs = await page.query_selector_all(_JOB_CARD_SELECTOR)
                        job = jobs[i]

                        # Resolve the ID before clicking so known jobs cost no extra page work.
                        job_id = await _extract_job_id(job)
                        if job_id != "N/A":
                            if job_id in existing_ids:
                                print(f"🛑 Duplicate job ID '{job_id}' found — stopping.")
                                stop_scraping = True
                                break
                            if job_id in seen_ids:
                                print(f"⚠️ Skipping job {i+1}: job ID '{job_id}' already captured in this run.")
                                continue

                        await job.click()
                        await asyncio.sleep(2)

                        try:
                            await page.wait_for_selector(_JOB_DETAILS_SELECTOR, timeout=10000)
                        except Exception:
                            print(f"⚠️ Skipping job {i+1}: job details panel failed to load.")
                            continue

                        new_data.append(await _read_job_record(page, job, job_id, scrape_start_time))
                        seen_ids.add(job_id)

                    except Exception as e:
                        print(f"⚠️ Error on job {i+1} of page {page_num}: {e}")
                        continue

                page_num += 1
        finally:
            # Close the browser even if scraping is interrupted part-way through.
            await browser.close()

    if new_data:
        new_df = pd.DataFrame(new_data)
        full_df = pd.concat([existing_df, new_df], ignore_index=True)
        # Make sure the target folder exists so a first run does not fail on save.
        os.makedirs(os.path.dirname(file_path) or ".", exist_ok=True)
        full_df.to_excel(file_path, index=False)
        print(f"\n✅ Scraping finished. {len(new_data)} new job(s) saved to Excel.")
    else:
        print("\n📂 No new jobs found to save.")

In [7]:
# Run the scraper. Jupyter/IPython accepts `await` at the top level of a cell,
# so no explicit event loop is needed here.
# NOTE: scrape_jobstreet() has no return statement, so `run` is always None;
# the useful results are the printed log and the Excel workbook it writes.
run = await scrape_jobstreet()


🌐 Navigating to Page 1 — https://ph.jobstreet.com/Data-Science-jobs?page=1&sortmode=ListedDate
📄 Found 32 job listings on page 1
👉 Clicking job 1 on page 1...
🛑 Duplicate job ID '85881125' found — stopping.

📂 No new jobs found to save.


# Data Preprocessing

In [9]:
# Load the saved job-postings workbook into a DataFrame for analysis.
df = pd.read_excel("../../Projects-Data/Job-Scraping/Jobstreet-Data.xlsx")

In [10]:
# Preview the loaded postings (the notebook elides the middle rows of a long frame).
df

Unnamed: 0,Job Title,Company,Location,Work Type,Classification,Salary,Job Description,Job ID,Posted Time
0,AI Automation Engineer,van den Boom & Associates,"Manila City, Metro Manila (Remote)",Full time,Engineering - Software (Information & Communic...,,van den Boom & Associates is a US-based accoun...,85874883,19/07/25 05:20
1,Marketing Manager,IDC Market Research (M) Sdn Bhd,"Manila City, Metro Manila (Remote)",Full time,Marketing Communications (Marketing & Communic...,,IDC is seeking a data-driven and strategic Mar...,85874622,19/07/25 05:20
2,L1 Service Desk Analyst,HCL Technologies Philippines Inc,"Taguig City, Metro Manila",Full time,Help Desk & IT Support (Information & Communic...,,QUALIFICATIONS:\n\nMust possess at least a Voc...,85874567,19/07/25 05:20
3,Corporate Financial Data Analyst - Onsite/WFH,Centrefy Business Process Outsourcing Services,"Buhangin, Davao City, Davao del Sur (Hybrid)",Full time,Analysis & Reporting (Accounting),"₱30,000 – ₱35,000 per month",𝗞𝗲𝘆 𝗥𝗲𝘀𝗽𝗼𝗻𝘀𝗶𝗯𝗶𝗹𝗶𝘁𝗶𝗲𝘀\n\n- Develop and maintain...,85873843,19/07/25 05:20
4,Reporting Analyst (Advisor Reporting),"eClerx Philippines, Inc.","Muntinlupa City, Metro Manila",Full time,Analysis & Reporting (Banking & Financial Serv...,,Position Overview:\n\nWe are seeking a Reporti...,85874033,19/07/25 05:20
...,...,...,...,...,...,...,...,...,...
4110,Data Science - AI - Pharma,"WNS Global Services, Inc.",Philippines,Full time,"Mathematics, Statistics & Information Sciences...",,Company Description\n\n\nWNS (Holdings) Limite...,85881495,19/07/25 10:22
4111,Senior Director - RNA,"WNS Global Services, Inc.",Philippines,Full time,Product Management & Development (Information ...,,Company Description\n\n\nWNS (Holdings) Limite...,85878485,19/07/25 10:22
4112,REF41844Y_202463021 - Deputy Manager - RNA - E...,"WNS Global Services, Inc.",Philippines,Full time,Database Development & Administration (Informa...,,Company Description\n\n\nWNS (Holdings) Limite...,85877572,19/07/25 10:22
4113,AI - ML,"WNS Global Services, Inc.",Philippines,Full time,"Mathematics, Statistics & Information Sciences...",,Company Description\n\n\nWNS (Holdings) Limite...,85876538,19/07/25 10:22


In [24]:
# Frequency table of job titles: one row per distinct title with its number of postings.
title_frequencies = df['Job Title'].value_counts()
job_title_df = (
    title_frequencies
    .rename_axis('Job Title')      # label the index so it becomes the first column
    .reset_index(name='Count')     # counts become the second column
)

In [25]:
# Show the first 20 rows of the title-frequency table built above.
job_title_df.head(20)

Unnamed: 0,Job Title,Count
0,Data Analyst,75
1,Business Analyst,73
2,Data Engineer,45
3,Python Developer,16
4,Data Scientist,13
5,Senior Business Analyst,12
6,Senior Data Engineer,12
7,Medical Representative,9
8,Solutions Architect,9
9,Reports Analyst,9


In [26]:
# Save the job-title frequency table. DataFrame.to_excel() returns None, so its
# result is not bound to a name. index=False keeps the meaningless 0..n-1 row
# index out of the sheet, the same way the scraper saves its own workbook.
job_title_df.to_excel("../../Projects-Data/Job-Scraping/Job-Titles.xlsx", index=False)