In [2]:
import re
import time
from datetime import datetime
from pathlib import Path
from urllib.parse import parse_qs, urlencode, urljoin, urlparse, urlunparse

import pandas as pd
import requests
from bs4 import BeautifulSoup

# ---- Search settings ----
# Free-text query; it is also turned into the URL slug further down.
keyword = "data engineer"
# Optional location filter, e.g. "bangkok".
location = ""

# Crawl limits: maximum number of result pages and pause between requests.
MAX_PAGES = 10
SLEEP_SECONDS = 1.0

# Lowercase the query and join its words with hyphens ("data engineer" -> "data-engineer").
keyword_slug = "-".join(keyword.lower().split())
base_url = "https://th.jobsdb.com/th/{}-jobs".format(keyword_slug)

# Browser-like request headers sent with every page request.
headers = {
    "User-Agent": " ".join(
        [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/121.0.0.0 Safari/537.36",
        ]
    ),
    "Accept-Language": "en-US,en;q=0.9,th;q=0.8",
}


def update_query_in_url(url: str, **params) -> str:
    """Return *url* with the given query parameters set or replaced.

    Existing parameters that are not named in ``params`` are preserved,
    including repeated keys and parameters with an empty value
    (``?flag=``). Each value in ``params`` is converted with ``str()`` and
    replaces every existing value stored under the same key.

    Args:
        url: Absolute or relative URL to rewrite.
        **params: Query parameters to set, e.g. ``page=2``.

    Returns:
        The rewritten URL; scheme, host, path and fragment are unchanged.
    """
    parsed = urlparse(url)
    # keep_blank_values=True: without it parse_qs silently drops "key="
    # entries, so they would vanish from the rewritten URL.
    query = parse_qs(parsed.query, keep_blank_values=True)
    query.update({key: [str(value)] for key, value in params.items()})
    return urlunparse(parsed._replace(query=urlencode(query, doseq=True)))


def clean_text(text: str) -> str:
    """Collapse every run of whitespace to one space and trim both ends.

    Falsy input (``None`` or ``""``) yields an empty string.
    """
    if not text:
        return ""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()


def normalize_for_match(text: str) -> str:
    """Lowercase *text* and reduce it to ASCII letters/digits separated by spaces.

    Every run of characters outside ``a-z0-9`` (punctuation, whitespace,
    non-ASCII text) becomes a single space; the result is trimmed. Falsy
    input yields an empty string.
    """
    lowered = text.lower() if text else ""
    spaced = re.sub(r"[^a-z0-9]+", " ", lowered)
    return spaced.strip()


def keyword_match_groups_from_query(search_keyword: str) -> list[list[str]]:
    """Turn a search query into one list of acceptable variants per word.

    The query is normalized with ``normalize_for_match`` and split on
    spaces. Words found in the synonym table map to their variant list;
    any other word maps to a single-item list containing itself.

    Returns:
        One variant list per query word, in query order. An empty list
        when the normalized query contains no words.
    """
    synonyms = {
        "data": ["data"],
        "scientist": ["scientist", "science", "scien", "scient"],
        "science": ["science", "scientist", "scien", "scient"],
        "engineer": ["engineer", "engineering", "eng"],
        "analyst": ["analyst", "analytics", "analysis"],
        "developer": ["developer", "development", "dev"],
    }

    # str.split() with no argument never yields empty strings.
    words = normalize_for_match(search_keyword).split()
    return [synonyms.get(word, [word]) for word in words]


def title_matches_keyword(title: str, keyword_groups: list[list[str]]) -> bool:
    """Check that *title* contains at least one variant from every group.

    Matching is a plain substring test against the normalized title, so a
    short variant also matches inside longer words. With no groups at all
    every title is accepted.
    """
    if not keyword_groups:
        return True

    haystack = normalize_for_match(title)
    for variants in keyword_groups:
        # A group with no variant present in the title rejects the title.
        if not any(variant in haystack for variant in variants):
            return False
    return True


def extract_salary(text: str) -> str:
    """Pull the first recognisable salary expression out of free text.

    Patterns are tried in priority order (not by position in the text):
    ``THB`` ranges and amounts, ``฿`` ranges and amounts, ``บาท`` ranges
    and amounts, and finally "negotiable"-style phrases in English or
    Thai. Matching is case-insensitive.

    Args:
        text: Free text to scan, e.g. the full text of a job card.

    Returns:
        The matched expression with whitespace normalized by
        ``clean_text``, or ``""`` when *text* is empty/``None`` or nothing
        matches.
    """
    if not text:
        return ""

    patterns = [
        r"THB\s*[\d,]+\s*[-–]\s*THB\s*[\d,]+",
        r"THB\s*[\d,]+",
        # Salaries are also written with the baht sign, e.g.
        # "฿50,000 – ฿70,000 per month"; the second sign is optional.
        r"฿\s*\d[\d,]*\s*[-–]\s*฿?\s*\d[\d,]*",
        r"฿\s*\d[\d,]*",
        r"[\d,]+\s*[-–]\s*[\d,]+\s*บาท",
        r"[\d,]+\s*บาท",
        r"Negotiable|ไม่ระบุเงินเดือน|ตามตกลง|ตามประสบการณ์",
    ]
    for pattern in patterns:
        match = re.search(pattern, text, flags=re.IGNORECASE)
        if match:
            return clean_text(match.group(0))
    return ""


def is_probable_salary(text: str) -> bool:
    """Heuristically decide whether *text* looks like a salary description.

    The text is accepted when it contains a salary keyword (currency
    marker or "negotiable"-style phrase, English or Thai), or when it
    contains both a digit and a hyphen/en dash, i.e. looks like a numeric
    range.

    Args:
        text: Candidate salary text; ``None`` or ``""`` is never a salary.

    Returns:
        ``True`` when the text looks like a salary, ``False`` otherwise.
    """
    if not text:
        return False

    text_norm = text.lower()
    salary_keywords = (
        "thb",
        "฿",  # baht sign, e.g. "฿50,000 per month" (no range dash present)
        "บาท",
        "salary",
        "negotiable",
        "ไม่ระบุเงินเดือน",  # "salary not specified"
        "ตามตกลง",
        "ตามประสบการณ์",
    )
    if any(key in text_norm for key in salary_keywords):
        return True

    # Fallback: a digit plus a dash is treated as a numeric range.
    has_digit = re.search(r"\d", text_norm) is not None
    has_dash = re.search(r"[-–]", text_norm) is not None
    return has_digit and has_dash


def guess_province_name(location_text: str) -> str:
    """Return the last comma- or pipe-separated segment of a location string.

    The input and each segment are whitespace-normalized with
    ``clean_text``; empty segments are ignored. Returns ``""`` when no
    non-empty segment exists.
    """
    normalized = clean_text(location_text)

    segments = []
    # Splitting an empty string yields one empty piece, which is skipped below.
    for piece in re.split(r",|\|", normalized):
        piece = clean_text(piece)
        if piece:
            segments.append(piece)

    return segments[-1] if segments else ""


def parse_card(card, page_num: int, search_keyword: str) -> dict:
    """Extract one job posting from a search-result card element.

    Args:
        card: Element exposing ``select_one`` and ``get_text`` (the
            ``article`` elements selected by ``scrape_jobsdb``).
        page_num: Result page the card was found on; stored as ``page``.
        search_keyword: Query that produced the card; stored as ``keyword``.

    Returns:
        A dict with the keys ``keyword``, ``province_code`` (always
        ``""`` here), ``province_name``, ``page``, ``job_title``,
        ``company``, ``location``, ``salary``, ``posted_date``,
        ``job_url`` and ``raw_text``. Fields whose element is missing are
        empty strings.
    """

    def text_of(node) -> str:
        # A missing element contributes an empty string instead of raising.
        return clean_text(node.get_text(" ", strip=True) if node else "")

    title_el = card.select_one("a[data-automation='jobTitle']")
    overlay_link_el = card.select_one("a[data-automation='job-list-item-link-overlay'][href]")

    # Prefer the overlay link; fall back to the title anchor's href.
    if overlay_link_el:
        link = overlay_link_el.get("href", "")
    elif title_el and title_el.get("href"):
        link = title_el.get("href", "")
    else:
        link = ""

    # Whole-card text, kept for the output and used as the salary fallback source.
    raw_text = clean_text(card.get_text("\n", strip=True))

    salary_text = text_of(card.select_one("[data-automation='jobSalary']"))
    # Trust the dedicated salary element only when it looks like a salary;
    # otherwise scan the full card text.
    salary = salary_text if is_probable_salary(salary_text) else ""
    if not salary:
        salary = extract_salary(raw_text)

    location_name = text_of(
        card.select_one("a[data-automation='jobLocation'], [data-automation='jobCardLocation']")
    )

    return {
        "keyword": search_keyword,
        "province_code": "",
        "province_name": guess_province_name(location_name),
        "page": page_num,
        "job_title": text_of(title_el),
        "company": text_of(
            card.select_one("a[data-automation='jobCompany'], [data-automation='jobCompany']")
        ),
        "location": location_name,
        "salary": salary,
        "posted_date": text_of(card.select_one("[data-automation='jobListingDate']")),
        "job_url": urljoin("https://th.jobsdb.com", link) if link else "",
        "raw_text": raw_text,
    }


def scrape_jobsdb(
    search_url: str,
    search_location: str,
    max_pages: int = 10,
    sleep_seconds: float = 1.0,
    search_keyword: str = "",
) -> pd.DataFrame:
    """Crawl JobsDB result pages and collect the cards matching the keyword.

    Pages are requested in order starting at 1. The crawl stops early when
    a page has no job cards or when none of its cards is kept (missing
    title/URL, title not matching the keyword, or URL already seen).

    Args:
        search_url: Base search URL; ``page`` (and ``where``) query
            parameters are added per request.
        search_location: Optional location filter sent as ``where``;
            blank or ``None`` means no filter.
        max_pages: Maximum number of result pages to request.
        sleep_seconds: Pause between consecutive page requests; values
            ``<= 0`` disable the pause.
        search_keyword: Query used for title filtering and stored in the
            ``keyword`` column. When empty, the module-level ``keyword``
            is used, which is what the function did before this
            parameter existed.

    Returns:
        A DataFrame with one row per unique ``job_url`` and a fixed column
        order. The columns are present even when no job was found.

    Raises:
        requests.RequestException: Propagated from the HTTP request or
            from ``raise_for_status()`` on an error response.
    """
    columns = [
        "keyword",
        "province_code",
        "province_name",
        "page",
        "job_title",
        "company",
        "location",
        "salary",
        "posted_date",
        "job_url",
        "raw_text",
    ]

    active_keyword = search_keyword or keyword
    keyword_groups = keyword_match_groups_from_query(active_keyword)
    where = (search_location or "").strip()

    all_rows = []
    seen_urls = set()

    print(f"[Search] Starting JobsDB crawl: max_pages={max_pages}")

    for page_num in range(1, max_pages + 1):
        # Build the page URL in one pass; "where" is only sent when set.
        query = {"page": page_num}
        if where:
            query["where"] = where
        page_url = update_query_in_url(search_url, **query)

        print(f"[Search] Page {page_num}/{max_pages} -> request")
        response = requests.get(page_url, headers=headers, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")
        cards = soup.select("article[data-testid='job-card'], article[data-automation='normalJob']")
        print(f"[Search] Page {page_num}/{max_pages} -> found cards: {len(cards)}")

        if not cards:
            print(f"[Search] Page {page_num}/{max_pages} -> no cards, stopping")
            break

        page_rows = []
        for card in cards:
            row = parse_card(card, page_num=page_num, search_keyword=active_keyword)
            if not row["job_title"] or not row["job_url"]:
                continue
            if not title_matches_keyword(row["job_title"], keyword_groups):
                continue
            if row["job_url"] in seen_urls:
                continue

            seen_urls.add(row["job_url"])
            page_rows.append(row)

        if not page_rows:
            print(f"[Search] Page {page_num}/{max_pages} -> no keyword matches, stopping")
            break

        all_rows.extend(page_rows)
        print(f"[Search] Page {page_num}/{max_pages} -> kept {len(page_rows)} | cumulative={len(all_rows)}")

        # No need to wait after the final page: nothing is requested next.
        if sleep_seconds > 0 and page_num < max_pages:
            time.sleep(sleep_seconds)

    # Passing the columns explicitly keeps the schema (and the CSV header)
    # stable even when the crawl found nothing.
    df = pd.DataFrame(all_rows, columns=columns)
    return df.drop_duplicates(subset=["job_url"])


# Run the crawl with the settings defined at the top of the cell.
jobs_df = scrape_jobsdb(base_url, search_location=location, max_pages=MAX_PAGES, sleep_seconds=SLEEP_SECONDS)
print(f"\nTotal unique jobs: {len(jobs_df)}")

# Write the results to a dated CSV in the current working directory.
# "utf-8-sig" adds a BOM so spreadsheet tools detect the encoding of Thai text.
date_stamp = datetime.now().strftime("%Y%m%d")
output_file = f"jobsdb_{keyword_slug}_jobsdb-like-jobthai_{date_stamp}.csv"
output_path = Path(output_file)
jobs_df.to_csv(output_path, index=False, encoding="utf-8-sig")
print(f"Saved to: {output_path.resolve()}")

# Last expression of the cell: shows the first rows as the cell output.
jobs_df.head(20)

[Search] Starting JobsDB crawl: max_pages=10
[Search] Page 1/10 -> request
[Search] Page 1/10 -> found cards: 32
[Search] Page 1/10 -> kept 25 | cumulative=25
[Search] Page 2/10 -> request
[Search] Page 2/10 -> found cards: 32
[Search] Page 2/10 -> kept 15 | cumulative=40
[Search] Page 3/10 -> request
[Search] Page 3/10 -> found cards: 32
[Search] Page 3/10 -> kept 3 | cumulative=43
[Search] Page 4/10 -> request
[Search] Page 4/10 -> found cards: 32
[Search] Page 4/10 -> kept 2 | cumulative=45
[Search] Page 5/10 -> request
[Search] Page 5/10 -> found cards: 32
[Search] Page 5/10 -> kept 4 | cumulative=49
[Search] Page 6/10 -> request
[Search] Page 6/10 -> found cards: 32
[Search] Page 6/10 -> kept 2 | cumulative=51
[Search] Page 7/10 -> request
[Search] Page 7/10 -> found cards: 32
[Search] Page 7/10 -> no keyword matches, stopping

Total unique jobs: 51
Saved to: G:\Users\Moss\Documents\PYTHON_PROJECT\Job_Market_Analyzer_Web_Scraping\Moss\jobsdb_data-engineer_jobsdb-like-jobthai_20260

Unnamed: 0,keyword,province_code,province_name,page,job_title,company,location,salary,posted_date,job_url,raw_text
0,data engineer,,กรุงเทพมหานคร,1,Data Engineer,SUNSUSOLUTION COMPANY LIMITED,กรุงเทพมหานคร,,4 นาทีที่ผ่านมา,https://th.jobsdb.com/th/job/90366546?type=sta...,Listed more than four days ago Data Engineer ท...
1,data engineer,,บางกะปิ กรุงเทพมหานคร,1,Data Engineer / ETL Developer - Bang Kapi,PERSOL Thailand,บางกะปิ กรุงเทพมหานคร,,38 นาทีที่ผ่านมา,https://th.jobsdb.com/th/job/90387660?type=sta...,Listed more than thirty eight days ago Data En...
2,data engineer,,กรุงเทพมหานคร,1,Data Engineer,CardX Company Limited,กรุงเทพมหานคร,,1 วันที่ผ่านมา,https://th.jobsdb.com/th/job/90361068?type=sta...,Listed more than one day ago Data Engineer ที่...
3,data engineer,,พระโขนง กรุงเทพมหานคร,1,[Bangchak] Senior Data Engineer,Bangchak Corporation Public Company Limited,พระโขนง กรุงเทพมหานคร,,1 ชั่วโมงที่ผ่านมา,https://th.jobsdb.com/th/job/90386156?type=sta...,Listed more than one day ago [Bangchak] Senior...
4,data engineer,,พระโขนง กรุงเทพมหานคร,1,Data Engineer,"Thai Insurance Research and Development Co., Ltd.",พระโขนง กรุงเทพมหานคร,,5 วันที่ผ่านมา,https://th.jobsdb.com/th/job/89773901?type=sta...,Listed more than five days ago Data Engineer ท...
5,data engineer,,ปทุมวัน กรุงเทพมหานคร,1,Data Platform Engineer (mid/senior),WARRIX SPORT PUBLIC COMPANY LIMITED,ปทุมวัน กรุงเทพมหานคร,"฿50,000 – ฿70,000 per month",5 วันที่ผ่านมา,https://th.jobsdb.com/th/job/90241704?type=sta...,Listed more than five days ago Data Platform E...
6,data engineer,,กรุงเทพมหานคร,1,Senior Data Platform Engineer (Contract),CGP Recruitment (Thailand) Company Limited,กรุงเทพมหานคร,"฿100,000 – ฿120,000 per month",44 นาทีที่ผ่านมา,https://th.jobsdb.com/th/job/90387501?type=sta...,Listed more than forty four days ago Senior Da...
7,data engineer,,ปทุมวัน กรุงเทพมหานคร,1,Data engineer,"Siam Paragon Development Co., Ltd.",ปทุมวัน กรุงเทพมหานคร,,4 วันที่ผ่านมา,https://th.jobsdb.com/th/job/90308122?type=sta...,Listed more than four days ago Data engineer ท...
8,data engineer,,บางรัก กรุงเทพมหานคร,1,Data Engineer,ExxonMobil Limited,บางรัก กรุงเทพมหานคร,,28 วันที่ผ่านมา,https://th.jobsdb.com/th/job/89775367?type=sta...,Listed more than twenty eight days ago Data En...
9,data engineer,,สาทร กรุงเทพมหานคร,1,"Data Analyst, Data Scientist, Data Engineer",Land and Houses Bank Public Company Limited,สาทร กรุงเทพมหานคร,,24 วันที่ผ่านมา,https://th.jobsdb.com/th/job/89901519?type=sta...,Listed more than twenty four days ago Data Ana...


In [None]:
# Hello world