In [1]:
# pip install playwright

In [2]:
import asyncio
import pandas as pd
from datetime import datetime, timedelta
from playwright.async_api import async_playwright
import os
import re
from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE

# Data Collection

In [27]:
def clean_text(text):
    if isinstance(text, str):
        return ILLEGAL_CHARACTERS_RE.sub("", text)
    return text

async def scrape_jobstreet():
    scrape_start_time = datetime.now()
    file_path = "../../Projects-Data/Job-Scraping/Jobstreet-Data.xlsx"

    if os.path.exists(file_path):
        existing_df = pd.read_excel(file_path)
        existing_ids = set(existing_df["Job ID"].dropna().astype(str).unique())
    else:
        existing_df = pd.DataFrame(columns=[
            "Job Title", "Company", "Location", "Work Type", "Classification",
            "Salary", "Job Description", "Job ID", "Posted Time"
        ])
        existing_ids = set()

    new_data = []
    page_num = 1
    consecutive_duplicates = 0
    max_consecutive_duplicates = 5

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        page = await browser.new_page()

        while True:
            url = f"https://ph.jobstreet.com/Data-Science-jobs?page={page_num}&sortmode=ListedDate"
            print(f"\n🌐 Navigating to Page {page_num} — {url}")

            try:
                await page.goto(url, timeout=30000)
                await page.wait_for_selector("article[data-testid='job-card']", timeout=5000)
            except Exception as e:
                print(f"⚠️ Retry page {page_num} due to error: {e}")
                try:
                    await page.goto(url, timeout=30000)
                    await page.wait_for_selector("article[data-testid='job-card']", timeout=5000)
                except:
                    print(f"⚠️ Skipping page {page_num} — no job listings or permanent failure.")
                    page_num += 1
                    continue

            await asyncio.sleep(2)
            jobs = await page.query_selector_all("article[data-testid='job-card']")
            print(f"📄 Found {len(jobs)} job listings on page {page_num}")

            if len(jobs) < 32:
                print(f"📉 Less than 32 jobs found — likely last page. Stopping after this.")
                break

            for i in range(len(jobs)):
                try:
                    print(f"👉 Clicking job {i+1} on page {page_num}...")

                    jobs = await page.query_selector_all("article[data-testid='job-card']")
                    job = jobs[i]

                    await job.click()
                    await asyncio.sleep(2)

                    try:
                        await page.wait_for_selector("div[data-automation='jobAdDetails']", timeout=10000)
                    except:
                        print(f"⚠️ Skipping job {i+1}: job details panel failed to load.")
                        continue

                    anchor = await job.query_selector("a[data-automation='job-list-view-job-link']")
                    url_suffix = await anchor.get_attribute("href") if anchor else ""
                    match = re.search(r"/job/(\d+)", url_suffix)
                    job_id = match.group(1) if match else "N/A"

                    if job_id in existing_ids:
                        print(f"⚠️ Duplicate job ID '{job_id}' — skipping.")
                        consecutive_duplicates += 1
                        if consecutive_duplicates >= max_consecutive_duplicates:
                            print(f"🛑 Reached {max_consecutive_duplicates} consecutive duplicates — stopping.")
                            await browser.close()
                            break
                        continue  # Skip saving this duplicate
                    else:
                        consecutive_duplicates = 0  # Reset if a new ID is found

                    job_title_el = await page.query_selector("h1[data-automation='job-detail-title']")
                    job_title = await job_title_el.inner_text() if job_title_el else "N/A"

                    company_el = await page.query_selector("span[data-automation='advertiser-name']")
                    company = await company_el.inner_text() if company_el else "N/A"

                    location_el = await page.query_selector("span[data-automation='job-detail-location']")
                    location = await location_el.inner_text() if location_el else "N/A"

                    work_type_el = await page.query_selector("span[data-automation='job-detail-work-type']")
                    work_type = await work_type_el.inner_text() if work_type_el else "N/A"

                    classification_el = await page.query_selector("span[data-automation='job-detail-classifications']")
                    classification = await classification_el.inner_text() if classification_el else "N/A"

                    salary_el = await page.query_selector("span[data-automation='job-detail-salary']")
                    salary = await salary_el.inner_text() if salary_el else ""

                    desc_el = await page.query_selector("div[data-automation='jobAdDetails']")
                    job_description = await desc_el.inner_text() if desc_el else "N/A"

                    posted_el = await job.query_selector("span[data-automation='jobListingDate']")
                    posted_raw = await posted_el.inner_text() if posted_el else ""
                    posted_datetime = "N/A"
                    try:
                        if "m" in posted_raw:
                            minutes = int(posted_raw.split("m")[0].strip())
                            posted_time = scrape_start_time - timedelta(minutes=minutes)
                        elif "hr" in posted_raw:
                            hours = int(posted_raw.split("hr")[0].strip())
                            posted_time = scrape_start_time - timedelta(hours=hours)
                        elif "d" in posted_raw:
                            days = int(posted_raw.split("d")[0].strip())
                            posted_time = scrape_start_time - timedelta(days=days)
                        else:
                            posted_time = scrape_start_time
                        posted_datetime = posted_time.strftime("%d/%m/%y %H:%M")
                    except Exception as e:
                        print(f"⚠️ Failed to parse posted time '{posted_raw}': {e}")

                    new_data.append({
                        "Job Title": clean_text(job_title),
                        "Company": clean_text(company),
                        "Location": clean_text(location),
                        "Work Type": clean_text(work_type),
                        "Classification": clean_text(classification),
                        "Salary": clean_text(salary),
                        "Job Description": clean_text(job_description),
                        "Job ID": clean_text(job_id),
                        "Posted Time": clean_text(posted_datetime)
                    })

                except Exception as e:
                    print(f"⚠️ Error on job {i+1} of page {page_num}: {e}")
                    continue

            if consecutive_duplicates >= max_consecutive_duplicates:
                break

            page_num += 1

        await browser.close()

    if new_data:
        new_df = pd.DataFrame(new_data)
        full_df = pd.concat([existing_df, new_df], ignore_index=True)
        full_df.to_excel(file_path, index=False)
        print(f"\n✅ Scraping finished. {len(new_data)} new job(s) saved to Excel.")
    else:
        print("\n📂 No new jobs found to save.")


In [28]:
# Run the async function inside Jupyter
run = await scrape_jobstreet()


🌐 Navigating to Page 1 — https://ph.jobstreet.com/Data-Science-jobs?page=1&sortmode=ListedDate
📄 Found 32 job listings on page 1
👉 Clicking job 1 on page 1...
👉 Clicking job 2 on page 1...
👉 Clicking job 3 on page 1...
👉 Clicking job 4 on page 1...
⚠️ Duplicate job ID '86034307' — skipping.
👉 Clicking job 5 on page 1...
⚠️ Duplicate job ID '86034136' — skipping.
👉 Clicking job 6 on page 1...
⚠️ Duplicate job ID '86034023' — skipping.
👉 Clicking job 7 on page 1...
⚠️ Duplicate job ID '86034094' — skipping.
👉 Clicking job 8 on page 1...
⚠️ Duplicate job ID '86034048' — skipping.
🛑 Reached 5 consecutive duplicates — stopping.

✅ Scraping finished. 3 new job(s) saved to Excel.


# Data Preprocessing

In [32]:
df = pd.read_excel("../../Projects-Data/Job-Scraping/Jobstreet-Data.xlsx")

In [33]:
df

Unnamed: 0,Job Title,Company,Location,Work Type,Classification,Salary,Job Description,Job ID,Posted Time
0,Business Analyst,De La Salle University,"Manila City, Metro Manila",Full time,Business/Systems Analysts (Information & Commu...,,Reference Number:\n\n2425A-78\n\nDate Posted:\...,80069938,2025-06-19 05:20:00
1,L3 Application Support,CGI (PHILIPPINES) INC.,"Alabang, Muntinlupa City, Metro Manila",Full time,Help Desk & IT Support (Information & Communic...,,Position Description:\n\n\nAs an L3 Applicatio...,75522103,2025-06-19 05:20:00
2,"Senior Engineer, Product Applications","Analog Devices Gen. Trias, Inc.","Cavite, Calabarzon",Full time,Networks & Systems Administration (Information...,,"Analog Devices, Inc. (NASDAQ: ADI) is a global...",76923346,2025-06-19 05:20:00
3,"Associate Analyst, Applications Programming","Analog Devices Gen. Trias, Inc.","Cavite, Calabarzon",Full time,Developers/Programmers (Information & Communic...,,Come join Analog Devices (ADI) – a place where...,76980991,2025-06-19 05:20:00
4,RPA Developer,"Bosch Service Solutions, Inc.","Santa Ana, Manila City, Metro Manila (Hybrid)",Full time,Developers/Programmers (Information & Communic...,,"Company Description\n\n\nAt Bosch, we shape th...",77159791,2025-06-19 05:20:00
...,...,...,...,...,...,...,...,...,...
4346,Linux System/ Server Specialist,ZAMA Precision Industry Manufacturing Philippi...,"Batangas, Calabarzon (Hybrid)",Full time,Networks & Systems Administration (Information...,,Responsible for the overall Unix/ Linux operat...,86033594,25/07/25 23:50
4347,Business Analyst,Xurpas Incorporated,"Ortigas, Pasig City, Metro Manila",Full time,Business/Systems Analysts (Information & Commu...,,Responsibilities\n\nAssist in gathering and do...,86032874,25/07/25 23:50
4348,Officer - Reference Data Services Analyst 2 - ...,Citigroup Business Process Solutions Pte. Ltd.,"Santa Ana, Manila City, Metro Manila (Hybrid)",Full time,Analysis & Reporting (Banking & Financial Serv...,,The Reference Data Services Analyst 2 is a dev...,86034795,26/07/25 00:01
4349,Officer - Reference Data Services Intmd Analys...,Citigroup Business Process Solutions Pte. Ltd.,"Santa Ana, Manila City, Metro Manila (Hybrid)",Full time,Analysis & Reporting (Banking & Financial Serv...,,The Reference Data Services Intmd Analyst is a...,86034794,26/07/25 00:01


In [34]:
df = df.dropna(subset=['Salary'])
df

Unnamed: 0,Job Title,Company,Location,Work Type,Classification,Salary,Job Description,Job ID,Posted Time
820,WEB DEVELOPER (WORDPRESS)- WORK ONSITE,REMCO Business Process Outsourcing,"Angeles City, Pampanga",Full time,Developers/Programmers (Information & Communic...,"₱32,000 – ₱35,000 per month",\nREMCO Business Process Outsourcing is seekin...,84797766,2025-06-19 05:20:00
858,ETL Developer (Extract Transform Load)_Contrac...,AVENSYS CONSULTING INC.,"Quezon City, Metro Manila (Hybrid)",Contract/Temp,Developers/Programmers (Information & Communic...,"₱100,000 – ₱130,000 per month",Avensys is a reputed global IT professional se...,84942591,2025-06-19 05:20:00
864,Company Data Analyst,CEIC Data Company Ltd.,"Ortigas, Pasig City, Metro Manila (Hybrid)",Full time,Analysis & Reporting (Banking & Financial Serv...,"₱24,000 – ₱27,000 per month",About the Role:\n\nWe are looking for a highly...,84954389,2025-06-19 05:20:00
872,Data Building Tool_Contract 6 months,AVENSYS CONSULTING INC.,"Quezon City, Metro Manila",Contract/Temp,Developers/Programmers (Information & Communic...,"₱180,000 – ₱200,000 per month",Avensys is a reputed global IT professional se...,84955635,2025-06-19 05:20:00
890,Data Sales Analyst,Middleby Philippines Corporation,"Laguna, Calabarzon",Full time,Analysis & Reporting (Sales),"₱20,000 – ₱28,000 per month",Must be Industrial Engineering graduate\n Will...,84984034,2025-06-19 05:20:00
...,...,...,...,...,...,...,...,...,...
4327,Credit Risk Business Analsyt,"John Clements Consultants, Inc.","Makati City, Metro Manila",Full time,Compliance & Risk (Banking & Financial Services),"₱100,000 – ₱150,000 per month",Job Summary\n\nCredit Risk Expertise (Non‑Nego...,85992610,2025-07-24 20:26:00
4329,IT Specialist,Sunpower Partners Corporation,"Paranaque City, Metro Manila",Full time,Networks & Systems Administration (Information...,"₱23,000 – ₱27,000 per month",JOB SUMMARY\n\nResponsible for ensuring our sy...,86032274,2025-07-25 18:51:00
4332,Policy Analyst,ATOS Information Technology Inc.,"Taguig City, Metro Manila",Full time,Policy (Consulting & Strategy),"₱20,000 – ₱30,000 per month",We are seeking a Policy Analyst to join our te...,86032554,2025-07-25 19:10:00
4334,SPARK ENGINEER -- With Experience in Apache Sp...,"Information Professionals, Inc.","Pasig City, Metro Manila",Full time,Developers/Programmers (Information & Communic...,"₱40,000 – ₱45,000 per month","Design, develop, and maintain big data solutio...",86032555,2025-07-25 19:11:00


In [35]:
df.to_excel("../../Projects-Data/Job-Scraping/Jobstreet-Data-w_SalaryOnly.xlsx", index=False)