# JobThai Web Scraper (Data Scientist)

This notebook scrapes JobThai search results for **Data Scientist**, extracts structured fields, and saves them to CSV.

In [14]:
import re
import time
from pathlib import Path
from urllib.parse import parse_qs, urlencode, urlparse, urlunparse

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Search results URL used as the template for every page request; the
# scraper rewrites the "page" query parameter for each page it fetches.
SEARCH_URL = "https://www.jobthai.com/th/jobs?keyword=Data%20Scientist&page=1&orderBy=RELEVANCE_SEARCH"
# File name (relative path) of the CSV written at the end of the run.
# NOTE(review): the name starts with "jobsdb_" although the data is scraped
# from JobThai — confirm this naming is intended.
OUTPUT_CSV = "jobsdb_data-scientist_jobthai_all-locations_20260217.csv"
# Upper bound on the number of search result pages requested.
MAX_PAGES = 10
# Pause, in seconds, after each search results page has been processed.
SLEEP_SECONDS = 1.0
# Pause, in seconds, after each job detail page lookup.
DETAIL_SLEEP_SECONDS = 0.5


def normalize_province_code(value) -> str:
    """Return a province code as a zero-padded string of at least two digits.

    Args:
        value: Any object; it is converted with ``str()`` and stripped of
            surrounding whitespace before validation.

    Returns:
        The numeric code formatted with ``02d`` (for example ``"1"`` becomes
        ``"01"``).

    Raises:
        ValueError: If the stripped text is not made of digits only, or if
            the resulting number is not positive.
    """
    digits = str(value).strip()

    # Reject anything that is not purely numeric (signs and decimals included).
    if not digits.isdigit():
        raise ValueError(f"Invalid province code: {value}")

    code = int(digits)
    if code <= 0:
        raise ValueError(f"Province must be positive, got: {value}")

    return f"{code:02d}"


def normalize_for_match(text: str) -> str:
    """Lower-case ``text`` and reduce it to space-separated ASCII alphanumerics.

    Every run of characters outside ``a-z`` / ``0-9`` (after lower-casing) is
    replaced by a single space, and leading/trailing spaces are removed.
    Non-ASCII text therefore normalizes to an empty string.
    """
    lowered = text.lower()
    spaced = re.sub(r"[^a-z0-9]+", " ", lowered)
    return spaced.strip()


def keyword_match_groups_from_query(keyword: str) -> list[list[str]]:
    """Expand a search keyword into one list of accepted variants per token.

    The keyword is normalized with ``normalize_for_match`` and split on
    whitespace. Tokens that have an entry in the variant table are replaced
    by that entry; any other token becomes a single-item list containing
    itself.

    Returns:
        One inner list per keyword token, in token order. An empty list when
        the keyword has no ASCII alphanumeric content.
    """
    variants_by_term = {
        "data": ["data"],
        "scientist": ["scientist", "science", "scien", "scient"],
        "science": ["science", "scientist", "scien", "scient"],
        "engineer": ["engineer", "engineering", "eng"],
        "analyst": ["analyst", "analytics", "analysis"],
        "developer": ["developer", "development", "dev"],
    }

    words = normalize_for_match(keyword).split()
    return [variants_by_term.get(word, [word]) for word in words]


def title_matches_keyword(title: str, keyword_groups: list[list[str]]) -> bool:
    """Tell whether a job title satisfies every keyword group.

    A group is satisfied when at least one of its variants occurs as a
    substring of the normalized title. With no groups at all, every title
    is accepted.
    """
    if not keyword_groups:
        return True

    normalized_title = normalize_for_match(title)
    for variants in keyword_groups:
        # One missing group is enough to reject the title.
        if not any(variant in normalized_title for variant in variants):
            return False
    return True


def update_query_in_url(url: str, **params) -> str:
    """Return ``url`` with the given query parameters set or replaced.

    Existing parameters that are not named in ``params`` are preserved,
    including repeated keys and parameters with an empty value. Values are
    re-encoded by ``urlencode``, so the exact escaping may differ from the
    input (for example ``%20`` becomes ``+``).

    Args:
        url: The URL to modify.
        **params: Query parameter names mapped to their new values; each
            value is converted with ``str()``.

    Returns:
        The rebuilt URL string.
    """
    parsed = urlparse(url)

    # keep_blank_values=True: without it parse_qs silently drops parameters
    # such as "keyword=", changing the meaning of the rebuilt URL.
    query = parse_qs(parsed.query, keep_blank_values=True)
    query.update({key: [str(value)] for key, value in params.items()})

    new_query = urlencode(query, doseq=True)
    return urlunparse(parsed._replace(query=new_query))


def clean_text(text: str) -> str:
    """Collapse each whitespace run in ``text`` to one space and trim the ends."""
    single_spaced = re.sub(r"\s+", " ", text)
    return single_spaced.strip()


def extract_salary(text: str) -> str:
    """Return the first salary expression found in ``text``, or ``""``.

    Patterns are tried in priority order: a numeric range ending in "บาท",
    a single amount ending in "บาท", then three fixed Thai phrases. The
    matched text is passed through ``clean_text`` before being returned.
    """
    salary_patterns = (
        r"\d[\d,\s]*\s*-\s*\d[\d,\s]*\s*บาท",
        r"\d[\d,\s]*\s*บาท",
        r"ตามโครงสร้างบริษัทฯ",
        r"ตามประสบการณ์",
        r"ตามตกลง",
    )

    # Lazy generator: later patterns are only searched if earlier ones miss.
    attempts = (re.search(pattern, text) for pattern in salary_patterns)
    found = next((hit for hit in attempts if hit), None)

    if found is None:
        return ""
    return clean_text(found.group(0))


def extract_posted_date(text: str) -> str:
    """Return the first "<day> <month> <2-digit year>" date in ``text``, or ``""``.

    The month part may consist of Thai characters, ASCII letters and dots
    (for example ``16 ก.พ. 69``). The match is normalized with ``clean_text``.
    """
    found = re.search(r"\b\d{1,2}\s+[ก-๙A-Za-z\.]+\s+\d{2}\b", text)
    if not found:
        return ""
    return clean_text(found.group(0))


def pick_text(parent, selectors: list[str]) -> str:
    """Return the cleaned text of the first selector that yields non-empty text.

    Args:
        parent: Object exposing ``select_one(css_selector)``; the returned
            element must expose ``get_text``.
        selectors: CSS selectors, tried in order.

    Returns:
        The first non-empty cleaned text, or ``""`` if no selector matches
        or every match is empty.
    """
    for css in selectors:
        node = parent.select_one(css)
        candidate = clean_text(node.get_text(" ", strip=True)) if node else ""
        if candidate:
            return candidate
    return ""


def guess_location(lines: list[str], title: str, company: str, salary: str) -> str:
    """Pick the text line most likely to describe the job location.

    Lines equal to the title, company or salary are never chosen. Selection
    order:

    1. The first line containing an administrative-area keyword.
    2. Otherwise, the first line containing a transit-system keyword.
    3. Otherwise, if the salary text is one of the lines, the nearest
       preceding line that is neither the title nor the company.

    Returns:
        The chosen line, or ``""`` when nothing qualifies.
    """
    area_markers = ("เขต", "กรุงเทพ", "จังหวัด", "อำเภอ", "อ.", "ต.")
    transit_markers = ("BTS", "MRT", "SRT", "BRT", "Airport Rail Link")

    known_fields = {title, company, salary}
    candidates = [line for line in lines if line not in known_fields]

    # Area keywords take priority over transit keywords across the whole card.
    for markers in (area_markers, transit_markers):
        for line in candidates:
            if any(marker in line for marker in markers):
                return line

    if salary and salary in lines:
        # Walk backwards from the line just above the salary.
        for earlier in reversed(lines[: lines.index(salary)]):
            if earlier not in {title, company}:
                return earlier

    return ""


def parse_card_from_title(title_node, page_num: int, keyword: str) -> dict:
    """Build one result row from a job-card title element.

    The enclosing ``<a href=...>`` of ``title_node`` is treated as the card.
    Company, location, salary and posted date are first read through CSS
    selectors; when a selector yields nothing, the value is derived from the
    card's plain text instead.

    Args:
        title_node: Parsed title element of a job card (a BeautifulSoup tag).
        page_num: Search results page the card was found on.
        keyword: Search keyword, copied verbatim into the row.

    Returns:
        A dict with the keys ``keyword``, ``page``, ``job_title``,
        ``company``, ``location``, ``salary``, ``posted_date``, ``job_url``
        and ``raw_text``. ``job_url`` is ``""`` when the title has no
        enclosing link, so callers can skip the row.
    """
    title = clean_text(title_node.get_text(" ", strip=True))

    anchor = title_node.find_parent("a", href=True)
    href = anchor.get("href", "") if anchor is not None else ""

    if not href:
        # No link available: leave the URL empty instead of producing the
        # bare site root, which would look like a valid (and shared) job URL.
        job_url = ""
    elif href.startswith("http"):
        job_url = href
    else:
        job_url = f"https://www.jobthai.com{href}"

    # Without an enclosing link, fall back to the title element itself.
    card = anchor if anchor is not None else title_node

    company = pick_text(card, [
        'span[id^="job-list-company-name-"]',
        'h2.ohgq7e-0.enAWkF',
    ])

    location = pick_text(card, [
        "h3#location-text",
        "h3.location-text",
    ])

    salary = pick_text(card, [
        "span.salary-text",
        "div.msklqa-20",
        "div.msklqa-17",
    ])

    posted_date = pick_text(card, [
        "span.msklqa-9",
    ])

    # Plain-text view of the card, one cleaned non-empty entry per text line.
    raw_lines = [clean_text(x) for x in card.get_text("\n", strip=True).splitlines() if clean_text(x)]
    raw_text = clean_text(" ".join(raw_lines))

    # Text-based fallbacks for fields the selectors did not provide.
    if not salary:
        salary = extract_salary(raw_text)
    if not posted_date:
        posted_date = extract_posted_date(raw_text)
    if not location:
        location = guess_location(raw_lines, title=title, company=company, salary=salary)

    return {
        "keyword": keyword,
        "page": page_num,
        "job_title": title,
        "company": company,
        "location": location,
        "salary": salary,
        "posted_date": posted_date,
        "job_url": job_url,
        "raw_text": raw_text,
    }


def extract_province_from_detail(job_url: str, headers: dict) -> tuple[str, str]:
    """Fetch a job detail page and return its ``(province_code, province_name)``.

    The page is searched for links whose ``id`` starts with
    ``company-1-tag-`` and whose ``href`` contains ``province=``. The first
    link with a non-empty label and a valid numeric ``province`` query value
    wins.

    Args:
        job_url: URL of the job detail page.
        headers: HTTP headers sent with the request.

    Returns:
        The normalized province code and the link text, or ``("", "")`` when
        the request fails for any reason or no usable link is found.
    """
    not_found = ("", "")

    # Best-effort lookup: any failure to fetch the page yields empty values.
    try:
        page = requests.get(job_url, headers=headers, timeout=30)
        page.raise_for_status()
    except Exception:
        return not_found

    document = BeautifulSoup(page.text, "html.parser")

    for tag in document.select('a[id^="company-1-tag-"][href*="province="]'):
        target = tag.get("href", "")
        label = clean_text(tag.get_text(" ", strip=True))
        if not (target and label):
            continue

        raw_code = parse_qs(urlparse(target).query).get("province", [""])[0]
        if not raw_code.isdigit():
            continue

        try:
            return normalize_province_code(raw_code), label
        except ValueError:
            # For example a zero code; try the next link.
            continue

    return not_found


def scrape_jobthai_all_locations(
    search_url: str,
    max_pages: int = 10,
    sleep_seconds: float = 1.0,
    detail_sleep_seconds: float = 0.5,
) -> pd.DataFrame:
    """Crawl JobThai search pages and return the matching jobs as a DataFrame.

    Two phases:

    1. Search pages ``1..max_pages`` are requested in order. Each job card is
       parsed, then kept only if it has a URL, its title matches the search
       keyword (taken from the ``keyword`` query parameter of ``search_url``)
       and its URL has not been seen before. Crawling stops early when the
       response URL contains ``nodata=true`` or when a page contributes no
       kept rows.
    2. Every kept job's detail page is fetched to fill in ``province_code``
       and ``province_name``; a failed lookup leaves both empty.

    Progress is reported with ``print``.

    Args:
        search_url: Search results URL; its ``page`` parameter is overwritten.
        max_pages: Maximum number of search pages to request.
        sleep_seconds: Pause after each processed search page (skipped if
            not positive).
        detail_sleep_seconds: Pause after each detail lookup (skipped if not
            positive).

    Returns:
        A DataFrame with a fixed set of columns, de-duplicated on
        ``job_url``. The columns are present even when no job was found.

    Raises:
        requests.RequestException: If a search results page request fails or
            returns an HTTP error status.
    """
    # Fixed output schema, also applied when the crawl yields no rows so that
    # the result (and any CSV written from it) always has its header.
    output_columns = [
        "keyword",
        "province_code",
        "province_name",
        "page",
        "job_title",
        "company",
        "location",
        "salary",
        "posted_date",
        "job_url",
        "raw_text",
    ]

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        "Accept-Language": "th-TH,th;q=0.9,en-US;q=0.8,en;q=0.7",
    }

    keyword = parse_qs(urlparse(search_url).query).get("keyword", [""])[0]
    keyword_groups = keyword_match_groups_from_query(keyword)

    all_rows = []
    seen_urls = set()
    start_time = time.time()

    print(f"[Search] Starting page crawl: max_pages={max_pages}")

    for page_num in range(1, max_pages + 1):
        page_url = update_query_in_url(search_url, page=page_num)
        print(f"[Search] Page {page_num}/{max_pages} -> request")

        response = requests.get(page_url, headers=headers, timeout=30)
        response.raise_for_status()

        # The final URL (after redirects) carries nodata=true when there are
        # no more results.
        if "nodata=true" in response.url.lower():
            print(f"[Search] Page {page_num}/{max_pages} -> nodata=true, stopping")
            break

        soup = BeautifulSoup(response.text, "html.parser")
        title_nodes = soup.select('h2[id^="job-card-item-"]')
        print(f"[Search] Page {page_num}/{max_pages} -> found cards: {len(title_nodes)}")

        page_rows = []
        for title_node in title_nodes:
            row = parse_card_from_title(
                title_node,
                page_num=page_num,
                keyword=keyword,
            )
            if not row["job_url"]:
                continue
            if not title_matches_keyword(row["job_title"], keyword_groups):
                continue
            if row["job_url"] in seen_urls:
                continue

            seen_urls.add(row["job_url"])
            page_rows.append(row)

        # A page that adds nothing ends the crawl.
        if not page_rows:
            print(f"[Search] Page {page_num}/{max_pages} -> no keyword matches, stopping")
            break

        all_rows.extend(page_rows)
        print(
            f"[Search] Page {page_num}/{max_pages} -> kept {len(page_rows)} | cumulative={len(all_rows)}"
        )

        if sleep_seconds > 0:
            time.sleep(sleep_seconds)

    total_details = len(all_rows)
    print(f"[Detail] Starting province lookup for {total_details} jobs")

    for index, row in enumerate(all_rows, start=1):
        province_code, province_name = extract_province_from_detail(row["job_url"], headers=headers)
        row["province_code"] = province_code
        row["province_name"] = province_name

        # Report every job for small runs, otherwise every 10th and the last.
        if total_details <= 50 or index % 10 == 0 or index == total_details:
            percent = (index / total_details) * 100
            print(f"[Detail] {index}/{total_details} ({percent:.1f}%)")

        if detail_sleep_seconds > 0:
            time.sleep(detail_sleep_seconds)

    elapsed = time.time() - start_time
    print(f"[Done] Completed in {elapsed:.1f}s")

    df = pd.DataFrame(all_rows, columns=output_columns)
    return df.drop_duplicates(subset=["job_url"])


# Run the scraper with the settings defined at the top of the cell.
jobs_df = scrape_jobthai_all_locations(
    SEARCH_URL,
    max_pages=MAX_PAGES,
    sleep_seconds=SLEEP_SECONDS,
    detail_sleep_seconds=DETAIL_SLEEP_SECONDS,
)
print(f"\nTotal unique jobs: {len(jobs_df)}")

# Write the result to CSV, creating the destination directory if needed.
output_path = Path(OUTPUT_CSV)
output_path.parent.mkdir(parents=True, exist_ok=True)
# NOTE(review): "utf-8-sig" is presumably chosen so spreadsheet tools detect
# the encoding of the Thai text — confirm.
jobs_df.to_csv(output_path, index=False, encoding="utf-8-sig")
print(f"Saved CSV: {output_path.resolve()}")

# Last expression of the notebook cell: shows the first 10 rows as output.
jobs_df.head(10)

[Search] Starting page crawl: max_pages=10
[Search] Page 1/10 -> request
[Search] Page 1/10 -> found cards: 20
[Search] Page 1/10 -> kept 7 | cumulative=7
[Search] Page 2/10 -> request
[Search] Page 2/10 -> found cards: 20
[Search] Page 2/10 -> no keyword matches, stopping
[Detail] Starting province lookup for 7 jobs
[Detail] 1/7 (14.3%)
[Detail] 2/7 (28.6%)
[Detail] 3/7 (42.9%)
[Detail] 4/7 (57.1%)
[Detail] 5/7 (71.4%)
[Detail] 6/7 (85.7%)
[Detail] 7/7 (100.0%)
[Done] Completed in 13.9s

Total unique jobs: 7
Saved CSV: G:\Users\Moss\Documents\PYTHON_PROJECT\Job_Market_Analyzer_Web_Scraping\Moss\jobsdb_data-scientist_jobthai_all-locations_20260217.csv


Unnamed: 0,keyword,province_code,province_name,page,job_title,company,location,salary,posted_date,job_url,raw_text
0,Data Scientist,1,กรุงเทพมหานคร,1,Data Scientist,"Digital Dialogue Co., Ltd.","MRT พระราม 9, ศูนย์วัฒนธรรมแห่งประเทศไทย",ตามโครงสร้างบริษัทฯ,16 ก.พ. 69,https://www.jobthai.com/th/company/job/1685397,"Data Scientist Digital Dialogue Co., Ltd. Digi..."
1,Data Scientist,1,กรุงเทพมหานคร,1,Data Scientist,บริษัท จี.เอ็ม.เอส. คอร์เปอเรชั่น จำกัด,เขตยานนาวา กรุงเทพมหานคร,"25,000 - 30,000 บาท",16 ก.พ. 69,https://www.jobthai.com/th/company/job/1822474,Data Scientist บริษัท จี.เอ็ม.เอส. คอร์เปอเรชั...
2,Data Scientist,28,จ.ปทุมธานี,1,Data Scientist,บริษัท ดูโฮม จำกัด (มหาชน),อ.เมืองปทุมธานี จ.ปทุมธานี,ตามโครงสร้างบริษัท/ประสบการณ์,16 ก.พ. 69,https://www.jobthai.com/th/company/job/1533922,Data Scientist บริษัท ดูโฮม จำกัด (มหาชน) บริษ...
3,Data Scientist,1,กรุงเทพมหานคร,1,Specialist - DATA Scientist,บริษัท เคซีจี คอร์ปอเรชั่น จำกัด (มหาชน) / KCG...,"BTS บางจาก, ปุณณวิถี",ตามโครงสร้างบริษัทฯ,16 ก.พ. 69,https://www.jobthai.com/th/company/job/1848401,Specialist - DATA Scientist บริษัท เคซีจี คอร์...
4,Data Scientist,1,กรุงเทพมหานคร,1,Data Scientist – AI & Research,"Lief Capital Asset Management Co., Ltd.",เขตคลองสาน กรุงเทพมหานคร,ตามโครงสร้างบริษัทฯ,16 ก.พ. 69,https://www.jobthai.com/th/company/job/1584351,Data Scientist – AI & Research Lief Capital As...
5,Data Scientist,1,กรุงเทพมหานคร,1,Data Scientist - WFH 95% + Office 5%,"THiNKNET Co., Ltd.","BTS ช่องนนทรี, ศาลาแดง","25,000 - 70,000 บาท",16 ก.พ. 69,https://www.jobthai.com/th/company/job/1439476,Data Scientist - WFH 95% + Office 5% THiNKNET ...
6,Data Scientist,1,กรุงเทพมหานคร,1,AI & Data Science Specialist,บริษัท ทีคิวเอ็ม อินชัวร์รันส์ โบรคเกอร์ จำกัด...,เขตลาดพร้าว กรุงเทพมหานคร,ตามประสบการณ์,16 ก.พ. 69,https://www.jobthai.com/th/company/job/1680356,AI & Data Science Specialist บริษัท ทีคิวเอ็ม ...
