# JobThai Web Scraper (Data Scientist)

This notebook scrapes JobThai search results for **Data Scientist**, extracts structured fields, and saves them to CSV.

In [None]:
import re
import time
from pathlib import Path
from urllib.parse import parse_qs, urlencode, urlparse, urlunparse

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Search-results URL passed to scrape_jobthai(). The scraper rewrites its
# `page` query parameter for each request and copies its `keyword` query
# parameter into every output row.
SEARCH_URL = "https://www.jobthai.com/th/jobs?province=01&event-page=home&event-section=search-keyword-history&keyword=Data%20Scientist&page=1&orderBy=RELEVANCE_SEARCH"
# Path of the CSV file the scraped rows are written to.
# NOTE(review): the name starts with "jobsdb" although the data comes from
# JobThai - confirm this naming is intentional.
OUTPUT_CSV = "jobsdb_data-scientist_jobthai_20260217.csv"
# Highest results-page number the scraper is allowed to request.
MAX_PAGES = 10
# Pause, in seconds, that the scraper inserts between page requests.
SLEEP_SECONDS = 1.2


def update_page_in_url(url: str, page: int) -> str:
    """Return ``url`` with its ``page`` query parameter set to ``page``.

    The scheme, host, path, params and fragment are carried over unchanged.
    The query string is parsed and re-encoded, so its escaping may be
    normalized (``%20`` becomes ``+``), and parameters with blank values
    are dropped because ``parse_qs`` is called with its defaults.

    Args:
        url: Any URL, with or without an existing ``page`` parameter.
        page: Page number to place in the query string.

    Returns:
        The rebuilt URL.
    """
    parts = urlparse(url)
    # A key that already exists keeps its position in the query string;
    # a missing ``page`` key is appended at the end.
    params = {**parse_qs(parts.query), "page": [str(page)]}
    # doseq=True expands the value lists produced by parse_qs back into
    # repeated key=value pairs.
    return urlunparse(parts._replace(query=urlencode(params, doseq=True)))


def clean_text(text: str) -> str:
    """Collapse each whitespace run in ``text`` to one space and trim both ends."""
    # str.split() with no separator splits on whitespace runs and discards
    # leading/trailing whitespace, so re-joining with a single space gives
    # the normalized form.
    return " ".join(text.split())


def extract_salary(text: str) -> str:
    """Return the first salary expression found in ``text``, or ``""``.

    Patterns are tried in order and the first one that matches anywhere in
    ``text`` wins, so a numeric range is preferred over a single amount,
    which is preferred over the fixed Thai phrases.

    Args:
        text: Free text of a job card.

    Returns:
        The matched text, whitespace-normalized, or an empty string when
        no pattern matches.
    """
    salary_patterns = (
        # Numeric range followed by "baht", e.g. "25,000 - 30,000 บาท".
        r"\d[\d,\s]*\s*-\s*\d[\d,\s]*\s*บาท",
        # Single amount followed by "baht".
        r"\d[\d,\s]*\s*บาท",
        # Fixed phrases: "per company structure", "per experience",
        # "as agreed".
        r"ตามโครงสร้างบริษัทฯ",
        r"ตามประสบการณ์",
        r"ตามตกลง",
    )
    # The generator is lazy, so searching stops at the first pattern that hits.
    attempts = (re.search(candidate, text) for candidate in salary_patterns)
    first_hit = next((found for found in attempts if found), None)
    if first_hit is None:
        return ""
    return clean_text(first_hit.group(0))


def extract_posted_date(text: str) -> str:
    """Return the first ``<day> <month> <2-digit year>`` date in ``text``.

    The day is one or two digits, the month is a run of Thai characters,
    ASCII letters or dots, and the year is exactly two digits, as in
    "16 ก.พ. 69".

    Returns:
        The matched text, whitespace-normalized, or ``""`` when nothing
        matches.
    """
    found = re.search(r"\b\d{1,2}\s+[ก-๙A-Za-z\.]+\s+\d{2}\b", text)
    if not found:
        return ""
    return clean_text(found.group(0))


def pick_text(parent, selectors: list[str]) -> str:
    """Return the text of the first selector that yields non-empty text.

    Args:
        parent: Node to search in; it must provide ``select_one``.
        selectors: CSS selectors, tried in the order given.

    Returns:
        The whitespace-normalized text of the first matching element whose
        text is not empty, or ``""`` when every selector misses or matches
        only empty elements.
    """
    for css in selectors:
        node = parent.select_one(css)
        if not node:
            continue
        content = clean_text(node.get_text(" ", strip=True))
        if not content:
            # Matched an element with no text: fall through to the next
            # selector instead of returning an empty value.
            continue
        return content
    return ""


def guess_location(lines: list[str], title: str, company: str, salary: str) -> str:
    """Pick the line of a job card that most likely holds its location.

    Heuristics, applied in order:

    1. The first line containing an administrative keyword (district,
       Bangkok, province, ...).
    2. The first line containing a transit keyword (BTS, MRT, ...).
    3. The closest line above the salary line that is neither the title
       nor the company.

    In steps 1 and 2, lines equal to ``title``, ``company`` or ``salary``
    are never considered.

    Args:
        lines: Text lines of the card, in document order.
        title: Job title already extracted from the card.
        company: Company name already extracted from the card.
        salary: Salary text already extracted from the card (may be "").

    Returns:
        The chosen line, or ``""`` when no heuristic applies.
    """
    priority_keywords = ("เขต", "กรุงเทพ", "จังหวัด", "อำเภอ", "อ.", "ต.")
    transit_keywords = ("BTS", "MRT", "SRT", "BRT", "Airport Rail Link")

    known_fields = {title, company, salary}
    candidates = [entry for entry in lines if entry not in known_fields]

    # Exhaust the administrative keywords over all lines before falling
    # back to the transit keywords.
    for keyword_group in (priority_keywords, transit_keywords):
        for entry in candidates:
            if any(word in entry for word in keyword_group):
                return entry

    if not salary or salary not in lines:
        return ""

    # Walk upwards from the line just above the first salary line.
    salary_pos = lines.index(salary)
    for entry in reversed(lines[:salary_pos]):
        if entry != title and entry != company:
            return entry

    return ""


def parse_card_from_title(title_node, page_num: int, keyword: str) -> dict:
    """Build one output row from a job-title node of a results page.

    The closest ``<a href=...>`` ancestor of ``title_node`` is treated as
    the job card. Each field is first read through CSS selectors; when a
    selector-based lookup yields nothing, the field is recovered from the
    card's plain text with the regex/keyword heuristics.

    Args:
        title_node: Element holding the job title; it must provide
            ``get_text`` and ``find_parent`` (a BeautifulSoup tag).
        page_num: Results-page number, copied into the row.
        keyword: Search keyword, copied into the row.

    Returns:
        A dict with the keys ``keyword``, ``page``, ``job_title``,
        ``company``, ``location``, ``salary``, ``posted_date``,
        ``job_url`` and ``raw_text``. ``job_url`` is ``""`` when the title
        has no enclosing link, so callers can skip the row.
    """
    # Imported here because the file-level imports bring in other
    # urllib.parse names but not urljoin.
    from urllib.parse import urljoin

    title = clean_text(title_node.get_text(" ", strip=True))

    anchor = title_node.find_parent("a", href=True)
    href = (anchor.get("href") or "").strip() if anchor is not None else ""
    # Leave the URL empty when there is no link. Prefixing the site root
    # onto an empty href would produce a truthy, bogus URL that defeats
    # the caller's "skip rows without a URL" check. urljoin keeps absolute
    # hrefs as they are and resolves relative ones against the site root.
    job_url = urljoin("https://www.jobthai.com", href) if href else ""

    # Without an enclosing link, the title node itself is the only context.
    card = anchor if anchor is not None else title_node

    # NOTE(review): the "ohgq7e-0"/"msklqa-*" class names look
    # auto-generated and may change between site builds - confirm they
    # are still valid.
    company = pick_text(card, [
        'span[id^="job-list-company-name-"]',
        'h2.ohgq7e-0.enAWkF',
    ])

    location = pick_text(card, [
        "h3#location-text",
        "h3.location-text",
    ])

    salary = pick_text(card, [
        "span.salary-text",
        "div.msklqa-20",
        "div.msklqa-17",
    ])

    posted_date = pick_text(card, [
        "span.msklqa-9",
    ])

    # One normalized entry per non-empty text line of the card.
    cleaned_lines = map(clean_text, card.get_text("\n", strip=True).splitlines())
    raw_lines = [line for line in cleaned_lines if line]
    raw_text = clean_text(" ".join(raw_lines))

    # Text-based fallbacks. Salary must be resolved before the location
    # guess because that guess uses the salary line as an anchor.
    if not salary:
        salary = extract_salary(raw_text)
    if not posted_date:
        posted_date = extract_posted_date(raw_text)
    if not location:
        location = guess_location(raw_lines, title=title, company=company, salary=salary)

    return {
        "keyword": keyword,
        "page": page_num,
        "job_title": title,
        "company": company,
        "location": location,
        "salary": salary,
        "posted_date": posted_date,
        "job_url": job_url,
        "raw_text": raw_text,
    }


def scrape_jobthai(search_url: str, max_pages: int = 10, sleep_seconds: float = 1.0) -> pd.DataFrame:
    """Scrape consecutive JobThai result pages into a DataFrame.

    Pages ``1..max_pages`` are requested in order. Scraping stops early at
    the first page that contributes no new job, meaning no title nodes or
    only URLs that were already collected.

    Args:
        search_url: Search-results URL; its ``page`` parameter is rewritten
            for each request and its ``keyword`` parameter is copied into
            every row.
        max_pages: Highest page number to request.
        sleep_seconds: Pause between page requests; values ``<= 0``
            disable it.

    Returns:
        One row per unique ``job_url`` with the columns ``keyword``,
        ``page``, ``job_title``, ``company``, ``location``, ``salary``,
        ``posted_date``, ``job_url`` and ``raw_text``. The columns are
        present even when no job was found, so a CSV written from the
        result always has a header.

    Raises:
        requests.HTTPError: A page answered with an error status.
        requests.RequestException: The request itself failed or timed out.
    """
    columns = [
        "keyword",
        "page",
        "job_title",
        "company",
        "location",
        "salary",
        "posted_date",
        "job_url",
        "raw_text",
    ]
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        "Accept-Language": "th-TH,th;q=0.9,en-US;q=0.8,en;q=0.7",
    }

    keyword = parse_qs(urlparse(search_url).query).get("keyword", [""])[0]
    all_rows = []
    seen_urls = set()

    # One session for all pages, so the connection is reused instead of
    # being re-established for every request.
    with requests.Session() as session:
        session.headers.update(headers)

        for page_num in range(1, max_pages + 1):
            page_url = update_page_in_url(search_url, page_num)
            response = session.get(page_url, timeout=30)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, "html.parser")
            title_nodes = soup.select('h2[id^="job-card-item-"]')

            page_rows = []
            for title_node in title_nodes:
                row = parse_card_from_title(title_node, page_num=page_num, keyword=keyword)
                job_url = row["job_url"]
                # Skip cards without a link and jobs already collected
                # from an earlier card or page.
                if not job_url or job_url in seen_urls:
                    continue
                seen_urls.add(job_url)
                page_rows.append(row)

            if not page_rows:
                print(f"Page {page_num}: no jobs found, stopping.")
                break

            all_rows.extend(page_rows)
            print(f"Page {page_num}: collected {len(page_rows)} jobs")

            # No request follows the last allowed page, so do not wait after it.
            if sleep_seconds > 0 and page_num < max_pages:
                time.sleep(sleep_seconds)

    # Passing ``columns`` fixes both the column order and the schema of an
    # empty result.
    return pd.DataFrame(all_rows, columns=columns).drop_duplicates(subset=["job_url"])


# Run the scrape with the settings defined at the top of the cell.
jobs_df = scrape_jobthai(SEARCH_URL, max_pages=MAX_PAGES, sleep_seconds=SLEEP_SECONDS)
print(f"Total unique jobs: {len(jobs_df)}")

# Create the target folder if it does not exist, then write the rows
# without the DataFrame index. "utf-8-sig" prefixes the file with a BOM -
# presumably so spreadsheet tools detect UTF-8 for the Thai text; confirm.
output_path = Path(OUTPUT_CSV)
output_path.parent.mkdir(parents=True, exist_ok=True)
jobs_df.to_csv(output_path, index=False, encoding="utf-8-sig")
print(f"Saved CSV: {output_path.resolve()}")

# Last expression of the cell: the first 10 rows, shown as the cell output.
jobs_df.head(10)

Page 1: collected 20 jobs
Page 2: collected 20 jobs
Page 3: collected 20 jobs
Page 4: collected 20 jobs
Page 5: collected 20 jobs
Page 6: collected 17 jobs
Page 7: no jobs found, stopping.
Total unique jobs: 117
Saved CSV: G:\Users\Moss\Documents\PYTHON_PROJECT\Job_Market_Analyzer_Web_Scraping\Moss\jobsdb_data-scientist_jobthai_20260217.csv


Unnamed: 0,keyword,page,job_title,company,location,salary,posted_date,job_url,raw_text
0,Data Scientist,1,Data Scientist,บริษัท จี.เอ็ม.เอส. คอร์เปอเรชั่น จำกัด,เขตยานนาวา กรุงเทพมหานคร,"25,000 - 30,000 บาท",16 ก.พ. 69,https://www.jobthai.com/th/company/job/1822474,Data Scientist บริษัท จี.เอ็ม.เอส. คอร์เปอเรชั...
1,Data Scientist,1,Data Scientist,"Digital Dialogue Co., Ltd.",,ตามโครงสร้างบริษัทฯ,16 ก.พ. 69,https://www.jobthai.com/th/company/job/1685397,"Data Scientist Digital Dialogue Co., Ltd. Digi..."
2,Data Scientist,1,Specialist - DATA Scientist,บริษัท เคซีจี คอร์ปอเรชั่น จำกัด (มหาชน) / KCG...,,ตามโครงสร้างบริษัทฯ,16 ก.พ. 69,https://www.jobthai.com/th/company/job/1848401,Specialist - DATA Scientist บริษัท เคซีจี คอร์...
3,Data Scientist,1,Data Scientist – AI & Research,"Lief Capital Asset Management Co., Ltd.",เขตคลองสาน กรุงเทพมหานคร,ตามโครงสร้างบริษัทฯ,16 ก.พ. 69,https://www.jobthai.com/th/company/job/1584351,Data Scientist – AI & Research Lief Capital As...
4,Data Scientist,1,Data Scientist - WFH 95% + Office 5%,"THiNKNET Co., Ltd.",,"25,000 - 70,000 บาท",16 ก.พ. 69,https://www.jobthai.com/th/company/job/1439476,Data Scientist - WFH 95% + Office 5% THiNKNET ...
5,Data Scientist,1,Data Engineer (Contractor),"The Prodigy (Thailand) Co., Ltd.",เขตพญาไท กรุงเทพมหานคร,"25,000 - 35,000 บาท",9 ก.พ. 69,https://www.jobthai.com/th/company/job/1821205,Data Engineer (Contractor) The Prodigy (Thaila...
6,Data Scientist,1,Data Analyst,บริษัท ซาบีน่า จำกัด (มหาชน),เขตบางกอกน้อย กรุงเทพมหานคร,ตามตกลง,16 ก.พ. 69,https://www.jobthai.com/th/company/job/1862880,Data Analyst บริษัท ซาบีน่า จำกัด (มหาชน) บริษ...
7,Data Scientist,1,Network Engineer,OxygenAI (บริษัท ออมนิ ดีล จำกัด),,ตามตกลง,16 ก.พ. 69,https://www.jobthai.com/th/company/job/1815773,Network Engineer 16 ก.พ. 69 OxygenAI (บริษัท อ...
8,Data Scientist,1,Data Analyst,บริษัท ชับบ์สามัคคีประกันภัย จำกัด (มหาชน),เขตหลักสี่ กรุงเทพมหานคร,ตามตกลง,16 ก.พ. 69,https://www.jobthai.com/th/company/job/1824648,Data Analyst บริษัท ชับบ์สามัคคีประกันภัย จำกั...
9,Data Scientist,1,Business Intelligence (BI) Analyst,"Sasom Co., Ltd.",เขตพระโขนง กรุงเทพมหานคร,"30,000 - 35,000 บาท",13 ก.พ. 69,https://www.jobthai.com/th/company/job/1842618,"Business Intelligence (BI) Analyst Sasom Co., ..."
