In [17]:
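# One-time setup: run `%pip install playwright` in the notebook, then
# `playwright install chromium` in a terminal to download the browser binary.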

In [70]:
import asyncio
import os
import re
from datetime import datetime, timedelta
from urllib.parse import urljoin

import pandas as pd
from playwright.async_api import async_playwright

In [64]:
def _format_posted_time(posted_raw, reference_time):
    """Turn a relative age label from a job card into a ``dd/mm/yy HH:MM`` string.

    The exact label format the site renders is not visible from this notebook,
    so several spellings are accepted: ``17m ago``, ``5 min ago``, ``3h ago``,
    ``2hr ago``, ``4d ago``, ``30d+ ago``, ``1w ago``, ``2mo ago``.

    Args:
        posted_raw: Text of the listing-date element (may be empty or None).
        reference_time: ``datetime`` the age is measured back from.

    Returns:
        The formatted posting time; ``"N/A"`` when the label is empty or the
        age cannot be represented; the reference time itself when the label
        carries no numeric age (e.g. "Just now").
    """
    label = (posted_raw or "").strip()
    if not label:
        return "N/A"

    # One anchored pattern instead of ordered substring tests: "m" must not
    # swallow "mo"/"month", and a bare "h" must count as hours.
    match = re.search(
        r"(\d+)\s*(mo|months?|minutes?|mins?|m|hours?|hrs?|h|days?|d|weeks?|wks?|w)\b",
        label,
        re.IGNORECASE,
    )
    if match is None:
        return reference_time.strftime("%d/%m/%y %H:%M")

    amount = int(match.group(1))
    unit = match.group(2).lower()
    try:
        if unit.startswith("mo"):
            age = timedelta(days=30 * amount)  # months are approximated as 30 days
        elif unit.startswith("m"):
            age = timedelta(minutes=amount)
        elif unit.startswith("h"):
            age = timedelta(hours=amount)
        elif unit.startswith("d"):
            age = timedelta(days=amount)
        else:
            age = timedelta(weeks=amount)
        posted_time = reference_time - age
    except OverflowError as e:
        print(f"⚠️ Failed to parse posted time '{posted_raw}': {e}")
        return "N/A"
    return posted_time.strftime("%d/%m/%y %H:%M")


async def _inner_text_or(scope, selector, default="N/A"):
    """Return the inner text of the first match of ``selector`` under ``scope``, or ``default``."""
    element = await scope.query_selector(selector)
    return await element.inner_text() if element else default


async def scrape_jobstreet(
    search_url="https://ph.jobstreet.com/Data-Science-jobs?sortmode=ListedDate",
    headless=False,
):
    """Scrape the first results page of a JobStreet search into a DataFrame.

    Each job card is clicked so the detail panel populates, then title,
    company, location, work type, classification, salary and description are
    read from the panel, while URL and posting age are read from the card.

    Args:
        search_url: Results page to open. Defaults to the Data Science search
            sorted by listing date.
        headless: Passed to ``chromium.launch``; defaults to a visible browser.

    Returns:
        ``pandas.DataFrame`` with one row per successfully scraped card and the
        columns "Job Title", "Company", "Location", "Work Type",
        "Classification", "Salary", "Job Description", "Job URL", "Posted Time".
        Cards that raise while being scraped are reported and skipped.
    """
    scrape_start_time = datetime.now()
    card_selector = "article[data-testid='job-card']"
    details_selector = "div[data-automation='jobAdDetails']"
    data = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=headless)
        try:
            page = await browser.new_page()
            await page.goto(search_url)

            print("🔍 Waiting for job listings to load...")
            await page.wait_for_selector(card_selector)
            await asyncio.sleep(5)  # Ensure full DOM is loaded

            jobs = await page.query_selector_all(card_selector)
            print(f"📄 Found {len(jobs)} job listings")

            for i, job in enumerate(jobs):
                try:
                    print(f"👉 Clicking job {i+1}...")
                    await job.click()
                    # The previous job's panel may still be attached, so the
                    # selector wait alone cannot tell when the new one is in.
                    await asyncio.sleep(2)
                    await page.wait_for_selector(details_selector, timeout=5000)

                    # Card-level fields: link and relative posting age.
                    href = await _inner_attr(job, "a[data-automation='job-list-view-job-link']", "href")
                    posted_raw = await _inner_text_or(
                        job, "span[data-automation='jobListingDate']", default=""
                    )

                    data.append({
                        "Job Title": await _inner_text_or(page, "h1[data-automation='job-detail-title']"),
                        "Company": await _inner_text_or(page, "span[data-automation='advertiser-name']"),
                        "Location": await _inner_text_or(page, "span[data-automation='job-detail-location']"),
                        "Work Type": await _inner_text_or(page, "span[data-automation='job-detail-work-type']"),
                        "Classification": await _inner_text_or(
                            page, "span[data-automation='job-detail-classifications']"
                        ),
                        # Salary is optional on a listing; blank rather than "N/A".
                        "Salary": await _inner_text_or(
                            page, "span[data-automation='job-detail-salary']", default=""
                        ),
                        "Job Description": await _inner_text_or(page, details_selector),
                        # urljoin keeps an already-absolute href intact.
                        "Job URL": urljoin("https://ph.jobstreet.com", href) if href else "N/A",
                        "Posted Time": _format_posted_time(posted_raw, scrape_start_time),
                    })

                except Exception as e:
                    # Best effort: one broken card must not abort the run.
                    print(f"⚠️ Error on job {i+1}: {e}")
                    continue
        finally:
            await browser.close()

    return pd.DataFrame(data)


async def _inner_attr(scope, selector, attribute):
    """Return ``attribute`` of the first match of ``selector`` under ``scope``, or ``""``."""
    element = await scope.query_selector(selector)
    value = await element.get_attribute(attribute) if element else ""
    return value or ""

In [65]:
# Run the scraper with top-level `await` (the notebook cell awaits the coroutine
# directly), keep the resulting DataFrame in `df`, and preview its first rows.
df = await scrape_jobstreet()
df.head()

🔍 Waiting for job listings to load...
📄 Found 32 job listings
👉 Clicking job 1...
👉 Clicking job 2...
👉 Clicking job 3...
👉 Clicking job 4...
👉 Clicking job 5...
👉 Clicking job 6...
👉 Clicking job 7...
👉 Clicking job 8...
👉 Clicking job 9...
👉 Clicking job 10...
👉 Clicking job 11...
👉 Clicking job 12...
👉 Clicking job 13...
👉 Clicking job 14...
👉 Clicking job 15...
👉 Clicking job 16...
👉 Clicking job 17...
👉 Clicking job 18...
👉 Clicking job 19...
👉 Clicking job 20...
👉 Clicking job 21...
👉 Clicking job 22...
👉 Clicking job 23...
👉 Clicking job 24...
👉 Clicking job 25...
👉 Clicking job 26...
👉 Clicking job 27...
👉 Clicking job 28...
👉 Clicking job 29...
👉 Clicking job 30...
👉 Clicking job 31...
👉 Clicking job 32...


Unnamed: 0,Job Title,Company,Location,Work Type,Classification,Salary,Job Description,Job URL,Posted Time
0,Integration Specialist,MegaXcess IT Solutions Inc.,"Pasig City, Metro Manila (Hybrid)",Full time,Business/Systems Analysts (Information & Commu...,,THE OPPORTUNITY:\n\nThe Integration Management...,https://ph.jobstreet.com/job/85863081?type=sta...,18/07/25 14:43
1,Dev Engineer,OwnBank,"Bonifacio Global City, Taguig City, Metro Manila",Full time,Developers/Programmers (Information & Communic...,"₱80,000 – ₱120,000 per month",Duties and Responsibilities\n\nOversee end-to-...,https://ph.jobstreet.com/job/85863682?type=sta...,18/07/25 14:41
2,Network Engineer,Compass Offices,"Taguig City, Metro Manila",Full time,Networks & Systems Administration (Information...,,The Network Engineer will be responsible for i...,https://ph.jobstreet.com/job/85863305?type=sta...,18/07/25 14:34
3,Jr. SAP Manager,"Benby Enterprises, Inc.","Quezon City, Metro Manila",Full time,Management (Information & Communication Techno...,,SAP Jr. Manager is responsible in the followin...,https://ph.jobstreet.com/job/85862918?type=sta...,18/07/25 14:27
4,Sr. Software Development Lead,"Benby Enterprises, Inc.","Quezon City, Metro Manila",Full time,Team Leaders (Information & Communication Tech...,,Key Responsibilities:\n\nLead and manage all p...,https://ph.jobstreet.com/job/85862630?type=sta...,18/07/25 14:21


In [67]:
# Persist the scraped listings to the shared project workbook.
output_path = "../../Projects-Data/Job-Scraping/Data.xlsx"
# Create the target folder first so a finished scrape is not lost to a
# missing directory.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# index=False: the RangeIndex carries no information and would otherwise be
# read back from the workbook as a spurious "Unnamed: 0" column.
df.to_excel(output_path, index=False)

# Testing Space

In [76]:
import asyncio
import pandas as pd
from datetime import datetime, timedelta
from playwright.async_api import async_playwright
import os
import re

def _normalise_job_id(value):
    """Return a job ID as a plain digit string, or None when it is missing.

    Excel round-trips can turn IDs into floats (``85863081.0``) once the column
    contains blanks, and "N/A" cells are read back as NaN; both are handled here
    so stored IDs compare equal to the IDs scraped from hrefs.
    """
    if value is None or pd.isna(value):
        return None
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    text = str(value).strip()
    if text.endswith(".0") and text[:-2].isdigit():
        text = text[:-2]
    if not text or text.upper() == "N/A":
        return None
    return text


def _format_posted_time(posted_raw, reference_time):
    """Turn a relative age label from a job card into a ``dd/mm/yy HH:MM`` string.

    Accepts several spellings (``17m ago``, ``5 min ago``, ``3h ago``,
    ``2hr ago``, ``4d ago``, ``30d+ ago``, ``1w ago``, ``2mo ago``) because the
    exact label format the site renders is not visible from this notebook.

    Returns ``"N/A"`` for an empty label or an unrepresentable age, and the
    reference time itself when the label has no numeric age (e.g. "Just now").
    """
    label = (posted_raw or "").strip()
    if not label:
        return "N/A"

    # One anchored pattern instead of ordered substring tests: "m" must not
    # swallow "mo"/"month", and a bare "h" must count as hours.
    match = re.search(
        r"(\d+)\s*(mo|months?|minutes?|mins?|m|hours?|hrs?|h|days?|d|weeks?|wks?|w)\b",
        label,
        re.IGNORECASE,
    )
    if match is None:
        return reference_time.strftime("%d/%m/%y %H:%M")

    amount = int(match.group(1))
    unit = match.group(2).lower()
    try:
        if unit.startswith("mo"):
            age = timedelta(days=30 * amount)  # months are approximated as 30 days
        elif unit.startswith("m"):
            age = timedelta(minutes=amount)
        elif unit.startswith("h"):
            age = timedelta(hours=amount)
        elif unit.startswith("d"):
            age = timedelta(days=amount)
        else:
            age = timedelta(weeks=amount)
        posted_time = reference_time - age
    except OverflowError as e:
        print(f"⚠️ Failed to parse posted time '{posted_raw}': {e}")
        return "N/A"
    return posted_time.strftime("%d/%m/%y %H:%M")


async def _inner_text_or(scope, selector, default="N/A"):
    """Return the inner text of the first match of ``selector`` under ``scope``, or ``default``."""
    element = await scope.query_selector(selector)
    return await element.inner_text() if element else default


def _load_existing_jobs(file_path):
    """Load previously saved jobs and the set of job IDs already recorded.

    Returns:
        ``(existing_df, existing_ids)``. When the file does not exist the frame
        is empty with the expected columns. When the workbook has no "Job ID"
        column but does have "Job URL", IDs are recovered from the
        ``/job/<digits>`` part of each URL.
    """
    if not os.path.exists(file_path):
        empty = pd.DataFrame(columns=[
            "Job Title", "Company", "Location", "Work Type", "Classification",
            "Salary", "Job Description", "Job ID", "Posted Time"
        ])
        return empty, set()

    existing_df = pd.read_excel(file_path)
    if "Job ID" not in existing_df.columns and "Job URL" in existing_df.columns:
        existing_df["Job ID"] = (
            existing_df["Job URL"].astype(str).str.extract(r"/job/(\d+)", expand=False)
        )
    if "Job ID" not in existing_df.columns:
        return existing_df, set()

    # Store IDs as clean strings so they match scraped IDs and stay stable
    # across future save/load cycles.
    existing_df["Job ID"] = existing_df["Job ID"].map(_normalise_job_id)
    existing_ids = {job_id for job_id in existing_df["Job ID"] if job_id is not None}
    return existing_df, existing_ids


def _save_jobs(existing_df, new_data, file_path):
    """Append ``new_data`` to the existing rows and write the workbook (no-op when empty)."""
    if not new_data:
        print("\n📂 No new jobs found to save.")
        return
    new_df = pd.DataFrame(new_data)
    if existing_df.empty:
        full_df = new_df
    else:
        full_df = pd.concat([existing_df, new_df], ignore_index=True)
    out_dir = os.path.dirname(file_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    full_df.to_excel(file_path, index=False)
    print(f"\n✅ Scraping finished. {len(new_data)} new job(s) saved to Excel.")


async def scrape_jobstreet(
    file_path="../../Projects-Data/Job-Scraping/Data.xlsx",
    max_pages=None,
):
    """Incrementally scrape JobStreet Data Science listings into an Excel workbook.

    Result pages are walked newest-first. Scraping stops at the first listing
    whose job ID is already in the workbook, when a page shows no job cards,
    or after ``max_pages`` pages. Whatever was collected is written to
    ``file_path`` even if the run fails or is cancelled part-way.

    Args:
        file_path: Workbook to read existing jobs from and write results to.
        max_pages: Optional upper bound on result pages to visit; ``None``
            (default) means no bound.

    Returns:
        None. Results are persisted to ``file_path``.
    """
    # Local import: the notebook's import cell only brings in async_playwright.
    from playwright.async_api import TimeoutError as PlaywrightTimeoutError

    scrape_start_time = datetime.now()
    card_selector = "article[data-testid='job-card']"
    details_selector = "div[data-automation='jobAdDetails']"

    existing_df, existing_ids = _load_existing_jobs(file_path)

    new_data = []
    seen_ids = set()  # IDs collected during this run
    page_num = 1
    stop_scraping = False

    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=False)
            try:
                page = await browser.new_page()

                while not stop_scraping and (max_pages is None or page_num <= max_pages):
                    url = f"https://ph.jobstreet.com/Data-Science-jobs?page={page_num}&sortmode=ListedDate"
                    print(f"\n🌐 Navigating to Page {page_num} — {url}")
                    await page.goto(url)
                    try:
                        await page.wait_for_selector(card_selector)
                    except PlaywrightTimeoutError:
                        # Past the last results page no cards ever appear.
                        print(f"🛑 No job listings on page {page_num} — stopping.")
                        break
                    await asyncio.sleep(3)

                    jobs = await page.query_selector_all(card_selector)
                    print(f"📄 Found {len(jobs)} job listings on page {page_num}")

                    for i, job in enumerate(jobs):
                        try:
                            # The ID lives on the card, so check it before
                            # paying for a click and a detail-panel wait.
                            anchor = await job.query_selector("a[data-automation='job-list-view-job-link']")
                            url_suffix = (await anchor.get_attribute("href") if anchor else "") or ""
                            match = re.search(r"/job/(\d+)", url_suffix)
                            job_id = match.group(1) if match else "N/A"

                            if job_id != "N/A":
                                # 🚨 Stop if duplicate
                                if job_id in existing_ids:
                                    print(f"🛑 Duplicate job ID '{job_id}' found — stopping.")
                                    stop_scraping = True
                                    break
                                # New postings push older cards onto later
                                # pages mid-run; do not record them twice.
                                if job_id in seen_ids:
                                    continue

                            print(f"👉 Clicking job {i+1} on page {page_num}...")
                            await job.click()
                            await asyncio.sleep(2)  # Let right panel populate
                            await page.wait_for_selector(details_selector, timeout=10000)

                            posted_raw = await _inner_text_or(
                                job, "span[data-automation='jobListingDate']", default=""
                            )

                            new_data.append({
                                "Job Title": await _inner_text_or(page, "h1[data-automation='job-detail-title']"),
                                "Company": await _inner_text_or(page, "span[data-automation='advertiser-name']"),
                                "Location": await _inner_text_or(page, "span[data-automation='job-detail-location']"),
                                "Work Type": await _inner_text_or(page, "span[data-automation='job-detail-work-type']"),
                                "Classification": await _inner_text_or(
                                    page, "span[data-automation='job-detail-classifications']"
                                ),
                                "Salary": await _inner_text_or(
                                    page, "span[data-automation='job-detail-salary']", default=""
                                ),
                                "Job Description": await _inner_text_or(page, details_selector),
                                "Job ID": job_id,
                                "Posted Time": _format_posted_time(posted_raw, scrape_start_time),
                            })
                            if job_id != "N/A":
                                seen_ids.add(job_id)

                        except Exception as e:
                            # Best effort: one broken card must not abort the run.
                            print(f"⚠️ Error on job {i+1} of page {page_num}: {e}")
                            continue

                    page_num += 1
            finally:
                await browser.close()
    finally:
        # Runs on success, error and cancellation alike so a long scrape is
        # never discarded.
        _save_jobs(existing_df, new_data, file_path)

In [77]:
# Run the incremental scraper with top-level `await` inside the notebook.
# NOTE(review): this version of scrape_jobstreet() has no return value, so `run`
# is None — the results are written to the Excel workbook instead.
run = await scrape_jobstreet()


🌐 Navigating to Page 1 — https://ph.jobstreet.com/Data-Science-jobs?page=1&sortmode=ListedDate
📄 Found 32 job listings on page 1
👉 Clicking job 1 on page 1...
👉 Clicking job 2 on page 1...
👉 Clicking job 3 on page 1...
👉 Clicking job 4 on page 1...
👉 Clicking job 5 on page 1...
👉 Clicking job 6 on page 1...
👉 Clicking job 7 on page 1...
👉 Clicking job 8 on page 1...
👉 Clicking job 9 on page 1...
👉 Clicking job 10 on page 1...
👉 Clicking job 11 on page 1...
👉 Clicking job 12 on page 1...
👉 Clicking job 13 on page 1...
👉 Clicking job 14 on page 1...
👉 Clicking job 15 on page 1...
👉 Clicking job 16 on page 1...
👉 Clicking job 17 on page 1...
👉 Clicking job 18 on page 1...
👉 Clicking job 19 on page 1...
👉 Clicking job 20 on page 1...
👉 Clicking job 21 on page 1...
👉 Clicking job 22 on page 1...
👉 Clicking job 23 on page 1...
👉 Clicking job 24 on page 1...
👉 Clicking job 25 on page 1...
👉 Clicking job 26 on page 1...
👉 Clicking job 27 on page 1...
👉 Clicking job 28 on page 1...
👉 Clicking 

CancelledError: 