# JobThai Web Scraper (Data Analyst)

This notebook scrapes JobThai search results for **Data Analyst**, extracts structured fields, and saves them to CSV.

In [1]:
import re
import time
from pathlib import Path
from urllib.parse import parse_qs, urlencode, urlparse, urlunparse

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Search results URL to crawl. Its `keyword` query parameter is reused as the
# title filter, and its `page` parameter is overwritten for every page requested.
SEARCH_URL = "https://www.jobthai.com/th/jobs?keyword=Data%20Analyst&page=1&orderBy=RELEVANCE_SEARCH"
# File the scraped rows are written to.
OUTPUT_CSV = "jobthai_data-analyst_20260218.csv"
# Upper bound on the number of search result pages requested.
MAX_PAGES = 10
# Pause, in seconds, after each search result page.
SLEEP_SECONDS = 1.0
# Pause, in seconds, after each job detail page.
DETAIL_SLEEP_SECONDS = 0.5

# Skill dictionary used by extract_skills().
# Each key is a canonical skill name; its value lists the text variants that
# count as a mention of that skill. Variants go through the same normalization
# as the posting text before they are compared (see variant_matches_text()).
SKILLS = {
    # ---------------- Core Programming ----------------
    "python": ["python"],
    "r": [" r ", " r,", " r\n", " r/"],
    "java": ["java"],
    "scala": ["scala"],
    "c++": ["c++"],

    # ---------------- SQL & Databases ----------------
    "sql": [" sql ", "mysql", "postgres", "postgresql", "oracle", "sql server", "mssql", "sqlite"],
    "mongodb": ["mongodb", "mongo"],
    "redis": ["redis"],
    "cassandra": ["cassandra"],
    "elasticsearch": ["elasticsearch", "elastic search"],

    # ---------------- Data Libraries ----------------
    "pandas": ["pandas"],
    "numpy": ["numpy"],
    "scipy": ["scipy"],
    "sklearn": ["scikit-learn", "sklearn"],

    # ---------------- Machine Learning ----------------
    "machine_learning": [
        "machine learning", "supervised", "unsupervised",
        "random forest", "xgboost", "lightgbm", "catboost"
    ],

    # ---------------- Deep Learning ----------------
    "deep_learning": [
        "deep learning", "neural network", "cnn", "rnn", "lstm", "transformer"
    ],

    # ---------------- GenAI / LLM ----------------
    "llm": ["llm", "large language model"],
    "rag": ["rag", "retrieval augmented generation"],
    "langchain": ["langchain"],
    "openai": ["openai"],
    "huggingface": ["huggingface"],
    "prompt_engineering": ["prompt engineering"],
    "vector_db": ["vector database", "pinecone", "faiss", "weaviate", "milvus"],

    # ---------------- Visualization / BI ----------------
    "excel": ["excel", "vlookup", "pivot table", "power query"],
    "powerbi": ["power bi", "powerbi", "dax"],
    "tableau": ["tableau"],
    "matplotlib": ["matplotlib"],
    "seaborn": ["seaborn"],
    "plotly": ["plotly"],

    # ---------------- Big Data ----------------
    "spark": ["spark", "pyspark"],
    "hadoop": ["hadoop"],
    "kafka": ["kafka"],

    # ---------------- Cloud ----------------
    "aws": ["aws", "amazon web services", "s3", "redshift", "athena", "glue", "lambda"],
    "gcp": ["gcp", "google cloud", "bigquery", "cloud storage"],
    "azure": ["azure", "synapse", "databricks"],

    # ---------------- Data Engineering ----------------
    "etl": ["etl", "elt", "data pipeline"],
    "airflow": ["airflow"],
    "dbt": ["dbt"],
    "snowflake": ["snowflake"],

    # ---------------- MLOps / Deployment ----------------
    "docker": ["docker"],
    "kubernetes": ["kubernetes", "k8s"],
    "mlflow": ["mlflow"],
    "fastapi": ["fastapi"],
    "flask": ["flask"],
    "streamlit": ["streamlit"],

    # ---------------- Statistics ----------------
    "statistics": [
        "statistics", "statistical", "hypothesis testing",
        "regression", "anova", "probability"
    ],

    # ---------------- Version Control ----------------
    "git": ["git", "github", "gitlab"],

    # ---------------- APIs ----------------
    "api": ["api", "rest api"],

    # ---------------- Linux ----------------
    "linux": ["linux", "unix"],    

}
# One 0/1 indicator column name per skill, in SKILLS insertion order.
SKILL_COLUMNS = [f"skill_{name}" for name in SKILLS]


def normalize_province_code(value) -> str:
    """Return ``value`` as a zero-padded province code of at least two digits.

    The value is converted to a string and stripped of surrounding whitespace
    before validation.

    Raises:
        ValueError: If the stripped text is not made up solely of digits, or
            if it denotes a number that is not positive.
    """
    digits = str(value).strip()
    if not digits.isdigit():
        raise ValueError(f"Invalid province code: {value}")

    code = int(digits)
    if code <= 0:
        raise ValueError(f"Province must be positive, got: {value}")

    return f"{code:02d}"


def normalize_for_match(text: str) -> str:
    """Lower-case ``text`` and reduce it to space-separated ``a-z0-9`` runs.

    Every run of characters outside ``a-z`` / ``0-9`` becomes a single space,
    and leading/trailing spaces are removed.
    """
    lowered = text.lower()
    spaced = re.sub(r"[^a-z0-9]+", " ", lowered)
    return spaced.strip()


def normalize_for_skill_match(text: str) -> str:
    """Normalize ``text`` for skill-variant matching.

    The text is lower-cased and every run of characters other than ``a-z``,
    ``0-9``, ``+`` and ``#`` is collapsed into a single space; surrounding
    spaces are stripped.

    ``+`` and ``#`` are kept on purpose: dropping them turns "c++" and "c#"
    into a bare "c", which makes unrelated text (for example "Vitamin C")
    indistinguishable from the "c++" skill variant.
    """
    return re.sub(r"[^a-z0-9+#]+", " ", text.lower()).strip()


def keyword_match_groups_from_query(keyword: str) -> list[list[str]]:
    """Expand a search keyword into one list of accepted variants per word.

    The keyword is normalized with ``normalize_for_match`` and split on
    whitespace. A word with an entry in the variant table is replaced by that
    entry; any other word becomes a single-element group containing itself.
    """
    variants_by_word = {
        "data": ["data"],
        "scientist": ["scientist", "science", "scien", "scient"],
        "science": ["science", "scientist", "scien", "scient"],
        "engineer": ["engineer", "engineering", "eng"],
        "analyst": ["analyst", "analytics", "analysis"],
        "developer": ["developer", "development", "dev"],
    }

    words = normalize_for_match(keyword).split()
    return [variants_by_word.get(word, [word]) for word in words]


def title_matches_keyword(title: str, keyword_groups: list[list[str]]) -> bool:
    """Tell whether ``title`` satisfies every keyword group.

    A group is satisfied when at least one of its variants occurs as a
    substring of the normalized title. With no groups at all, every title
    matches.
    """
    if not keyword_groups:
        return True

    haystack = normalize_for_match(title)
    for group in keyword_groups:
        if not any(variant in haystack for variant in group):
            return False
    return True


def variant_matches_text(variant: str, normalized_text: str) -> bool:
    """Tell whether ``variant`` occurs in ``normalized_text`` as a whole term.

    Args:
        variant: Raw skill variant; it is normalized with
            ``normalize_for_skill_match`` before matching.
        normalized_text: Text to search, expected to be normalized already.

    Returns:
        True when the normalized variant is found and is not directly
        preceded or followed by an ``a-z`` / ``0-9`` character. A variant
        that normalizes to an empty string never matches.
    """
    variant_norm = normalize_for_skill_match(variant)
    if not variant_norm:
        return False

    # Build the pattern outside the f-string: a backslash inside an f-string
    # replacement field is a SyntaxError before Python 3.12. Joining the
    # escaped tokens with \s+ lets multi-word variants span any whitespace.
    body = r"\s+".join(re.escape(token) for token in variant_norm.split())
    pattern = rf"(?<![a-z0-9]){body}(?![a-z0-9])"
    return re.search(pattern, normalized_text) is not None


def extract_skills(text: str) -> dict:
    """Detect which SKILLS entries are mentioned in ``text``.

    Returns:
        A dict holding ``matched_skills`` (matched skill names joined with
        ``|`` in SKILLS order), ``matched_skill_count``, and one
        ``skill_<name>`` 0/1 flag for every entry in SKILLS.
    """
    haystack = normalize_for_skill_match(text)

    hits = []
    flags = {}
    for skill, variants in SKILLS.items():
        found = False
        for variant in variants:
            if variant_matches_text(variant, haystack):
                found = True
                break
        flags[f"skill_{skill}"] = int(found)
        if found:
            hits.append(skill)

    result = {
        "matched_skills": "|".join(hits),
        "matched_skill_count": len(hits),
    }
    result.update(flags)
    return result


def normalize_jobthai_detail_url(job_url: str) -> str:
    """Return the job URL with company-scoped path segments rewritten.

    ``/th/company/job/`` becomes ``/th/job/`` and ``/company/job/`` becomes
    ``/job/``. Params, query string and fragment are dropped. An empty input
    yields an empty string.
    """
    if not job_url:
        return ""

    parts = urlparse(job_url)

    canonical_path = parts.path
    rewrites = (
        ("/th/company/job/", "/th/job/"),
        ("/company/job/", "/job/"),
    )
    for company_segment, plain_segment in rewrites:
        canonical_path = canonical_path.replace(company_segment, plain_segment)

    return urlunparse((parts.scheme, parts.netloc, canonical_path, "", "", ""))


def update_query_in_url(url: str, **params) -> str:
    """Return ``url`` with the given query parameters set or replaced.

    Args:
        url: URL whose query string should be updated.
        **params: Parameter names mapped to their new values; each value is
            converted with ``str``.

    Returns:
        The rebuilt URL. Parameters not named in ``params`` are preserved,
        including repeated keys and keys with blank values. The query string
        is decoded and re-encoded, so its escaping may change form (for
        example ``%20`` becomes ``+``).
    """
    parsed = urlparse(url)
    # keep_blank_values=True: by default parse_qs drops "key=" entries, which
    # would silently delete unrelated parameters from the URL.
    query = parse_qs(parsed.query, keep_blank_values=True)

    for key, value in params.items():
        query[key] = [str(value)]

    new_query = urlencode(query, doseq=True)
    return urlunparse((parsed.scheme, parsed.netloc, parsed.path, parsed.params, new_query, parsed.fragment))


def clean_text(text: str) -> str:
    """Collapse each whitespace run in ``text`` to one space and trim the ends."""
    single_spaced = re.sub(r"\s+", " ", text)
    return single_spaced.strip()


def extract_salary(text: str) -> str:
    """Return the first salary expression found in ``text``, or ``""``.

    Patterns are tried in priority order (numeric range, single amount, then
    the fixed Thai phrases); the first pattern that matches anywhere in the
    text wins, and its match is returned whitespace-cleaned.
    """
    salary_patterns = (
        r"\d[\d,\s]*\s*-\s*\d[\d,\s]*\s*บาท",
        r"\d[\d,\s]*\s*บาท",
        r"ตามโครงสร้างบริษัทฯ",
        r"ตามประสบการณ์",
        r"ตามตกลง",
    )

    # Lazy generator: later patterns are only searched if earlier ones miss.
    attempts = (re.search(pattern, text) for pattern in salary_patterns)
    first_hit = next((hit for hit in attempts if hit), None)
    if first_hit is None:
        return ""
    return clean_text(first_hit.group(0))


def extract_posted_date(text: str) -> str:
    """Return the first ``<1-2 digits> <word> <2 digits>`` date in ``text``, or ``""``.

    The middle word may contain Thai characters, ASCII letters and dots. The
    match is returned whitespace-cleaned.
    """
    found = re.search(r"\b\d{1,2}\s+[ก-๙A-Za-z\.]+\s+\d{2}\b", text)
    if not found:
        return ""
    return clean_text(found.group(0))


def pick_text(parent, selectors: list[str]) -> str:
    """Return the cleaned text of the first selector that yields non-empty text.

    Selectors are tried in order against ``parent`` via ``select_one``; a
    selector that finds nothing, or only blank text, is skipped. Returns
    ``""`` when no selector produces text.
    """
    for css in selectors:
        node = parent.select_one(css)
        if not node:
            continue
        cleaned = clean_text(node.get_text(" ", strip=True))
        if cleaned:
            return cleaned
    return ""


def guess_location(lines: list[str], title: str, company: str, salary: str) -> str:
    """Guess which of a card's text lines is the location.

    Lines equal to the title, company or salary are ignored for the keyword
    passes. Resolution order:

    1. the first line containing an administrative-area keyword;
    2. otherwise the first line containing a transit keyword;
    3. otherwise, if the salary appears among the lines, the closest line
       above it that is neither the title nor the company;
    4. otherwise ``""``.
    """
    area_markers = ["เขต", "กรุงเทพ", "จังหวัด", "อำเภอ", "อ.", "ต."]
    transit_markers = ["BTS", "MRT", "SRT", "BRT", "Airport Rail Link"]

    known_fields = {title, company, salary}
    candidates = [line for line in lines if line not in known_fields]

    # Area keywords take priority over transit keywords across the whole card.
    for markers in (area_markers, transit_markers):
        for line in candidates:
            if any(marker in line for marker in markers):
                return line

    if salary and salary in lines:
        above_salary = lines[: lines.index(salary)]
        for candidate in reversed(above_salary):
            if candidate not in {title, company}:
                return candidate

    return ""


def parse_card_from_title(title_node, page_num: int, keyword: str) -> dict:
    """Build one result row from a job card's title node.

    Args:
        title_node: Parsed HTML node holding the job title; the closest
            enclosing ``<a href>`` (if any) is treated as the card.
        page_num: Search result page the card was found on.
        keyword: Search keyword, copied into the row as-is.

    Returns:
        A dict with the keys ``keyword``, ``page``, ``job_title``,
        ``company``, ``location``, ``salary``, ``posted_date``, ``job_url``
        and ``raw_text``. ``job_url`` is ``""`` when the card has no link, so
        callers can skip such rows.
    """
    title = clean_text(title_node.get_text(" ", strip=True))

    anchor = title_node.find_parent("a", href=True)
    href = anchor.get("href", "") if anchor is not None else ""
    if not href:
        # No link: leave the URL empty rather than fabricating the bare site
        # root, which would pass the caller's empty-URL check.
        job_url = ""
    elif href.startswith("http"):
        job_url = href
    else:
        job_url = f"https://www.jobthai.com{href}"
    job_url = normalize_jobthai_detail_url(job_url)

    # Without an enclosing anchor, fall back to searching inside the title node.
    card = anchor if anchor is not None else title_node

    company = pick_text(card, [
        'span[id^="job-list-company-name-"]',
        'h2.ohgq7e-0.enAWkF',
    ])

    location = pick_text(card, [
        "h3#location-text",
        "h3.location-text",
    ])

    salary = pick_text(card, [
        "span.salary-text",
        "div.msklqa-20",
        "div.msklqa-17",
    ])

    posted_date = pick_text(card, [
        "span.msklqa-9",
    ])

    raw_lines = [clean_text(x) for x in card.get_text("\n", strip=True).splitlines() if clean_text(x)]
    raw_text = clean_text(" ".join(raw_lines))

    # Selector misses fall back to heuristics over the card's plain text.
    if not salary:
        salary = extract_salary(raw_text)
    if not posted_date:
        posted_date = extract_posted_date(raw_text)
    if not location:
        location = guess_location(raw_lines, title=title, company=company, salary=salary)

    return {
        "keyword": keyword,
        "page": page_num,
        "job_title": title,
        "company": company,
        "location": location,
        "salary": salary,
        "posted_date": posted_date,
        "job_url": job_url,
        "raw_text": raw_text,
    }


def extract_detail_from_job_page(job_url: str, headers: dict) -> dict:
    """Fetch a job detail page and extract province, texts and skill flags.

    Args:
        job_url: URL of the job detail page.
        headers: HTTP headers sent with the request.

    Returns:
        A dict with ``province_code``, ``province_name``,
        ``job_detail_text``, ``job_qualification_text``, ``matched_skills``,
        ``matched_skill_count`` and one entry per name in SKILL_COLUMNS.
        If the page cannot be fetched, the failure is printed and the dict
        holds empty strings and zeros instead, so one bad page does not abort
        a whole crawl.
    """
    base_detail = {
        "province_code": "",
        "province_name": "",
        "job_detail_text": "",
        "job_qualification_text": "",
        "matched_skills": "",
        "matched_skill_count": 0,
        **{column: 0 for column in SKILL_COLUMNS},
    }

    try:
        response = requests.get(job_url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Best effort: report the failure and fall back to defaults. Only
        # request/HTTP errors are handled so programming errors still surface.
        print(f"[Detail] Request failed for {job_url}: {exc}")
        return base_detail

    soup = BeautifulSoup(response.text, "html.parser")

    province_code = ""
    province_name = ""
    # Use the first link that carries a numeric ``province`` query value and
    # wraps a job-detail tag heading.
    for anchor in soup.select('a[href*="province="]'):
        tag = anchor.select_one('h3[id^="job-detail-tag-"]')
        if not tag:
            continue

        href = anchor.get("href", "")
        name = clean_text(tag.get_text(" ", strip=True))
        if not href or not name:
            continue

        province_value = parse_qs(urlparse(href).query).get("province", [""])[0]
        if not province_value or not province_value.isdigit():
            continue

        try:
            province_code = normalize_province_code(province_value)
        except ValueError:
            # Rejected by normalize_province_code (e.g. "0"); try the next link.
            continue

        province_name = name
        break

    jd_node = soup.select_one("span#job-detail")
    job_detail_text = clean_text(jd_node.get_text("\n", strip=True)) if jd_node else ""

    qualification_node = soup.select_one("#job-properties-wrapper")
    job_qualification_text = clean_text(qualification_node.get_text(" ", strip=True)) if qualification_node else ""

    # Skills are detected over the description and qualifications together.
    combined_text = " ".join([text for text in [job_detail_text, job_qualification_text] if text])
    skill_info = extract_skills(combined_text)

    return {
        "province_code": province_code,
        "province_name": province_name,
        "job_detail_text": job_detail_text,
        "job_qualification_text": job_qualification_text,
        **skill_info,
    }


def scrape_jobthai_all_locations(
    search_url: str,
    max_pages: int = 10,
    sleep_seconds: float = 1.0,
    detail_sleep_seconds: float = 0.5,
) -> pd.DataFrame:
    """Crawl JobThai search results and enrich each job with detail-page data.

    The crawl runs in two phases. First, up to ``max_pages`` search result
    pages are requested; cards whose title does not match the ``keyword``
    query parameter of ``search_url``, or whose URL was already seen, are
    dropped. Paging stops early when the response URL contains
    ``nodata=true`` or when a page contributes no new rows. Second, every
    kept job's detail page is fetched and merged into its row.

    Args:
        search_url: Search results URL; its ``page`` parameter is replaced
            for each request.
        max_pages: Maximum number of search result pages to request.
        sleep_seconds: Pause after each search result page (skipped if <= 0).
        detail_sleep_seconds: Pause after each detail page (skipped if <= 0).

    Returns:
        A DataFrame with one row per unique ``job_url`` and a fixed column
        order. When nothing is found the frame has zero rows but still
        carries every column, so the saved CSV keeps its header.

    Raises:
        requests.RequestException: If a search result page cannot be
            fetched (detail-page failures are handled per job instead).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        "Accept-Language": "th-TH,th;q=0.9,en-US;q=0.8,en;q=0.7",
    }

    ordered_columns = [
        "keyword",
        "province_code",
        "province_name",
        "page",
        "job_title",
        "company",
        "location",
        "salary",
        "posted_date",
        "job_url",
        "raw_text",
        "job_detail_text",
        "job_qualification_text",
        "matched_skills",
        "matched_skill_count",
        *SKILL_COLUMNS,
    ]

    keyword = parse_qs(urlparse(search_url).query).get("keyword", [""])[0]
    keyword_groups = keyword_match_groups_from_query(keyword)

    all_rows = []
    seen_urls = set()
    start_time = time.time()

    print(f"[Search] Starting page crawl: max_pages={max_pages}")

    for page_num in range(1, max_pages + 1):
        page_url = update_query_in_url(search_url, page=page_num)
        print(f"[Search] Page {page_num}/{max_pages} -> request")

        response = requests.get(page_url, headers=headers, timeout=30)
        response.raise_for_status()

        # The final (post-redirect) URL is checked for the no-results marker.
        if "nodata=true" in response.url.lower():
            print(f"[Search] Page {page_num}/{max_pages} -> nodata=true, stopping")
            break

        soup = BeautifulSoup(response.text, "html.parser")
        title_nodes = soup.select('h2[id^="job-card-item-"]')
        print(f"[Search] Page {page_num}/{max_pages} -> found cards: {len(title_nodes)}")

        page_rows = []
        for title_node in title_nodes:
            row = parse_card_from_title(
                title_node,
                page_num=page_num,
                keyword=keyword,
            )
            if not row["job_url"]:
                continue
            if not title_matches_keyword(row["job_title"], keyword_groups):
                continue
            if row["job_url"] in seen_urls:
                continue

            seen_urls.add(row["job_url"])
            page_rows.append(row)

        # A page that adds nothing ends the crawl.
        if not page_rows:
            print(f"[Search] Page {page_num}/{max_pages} -> no keyword matches, stopping")
            break

        all_rows.extend(page_rows)
        print(
            f"[Search] Page {page_num}/{max_pages} -> kept {len(page_rows)} | cumulative={len(all_rows)}"
        )

        if sleep_seconds > 0:
            time.sleep(sleep_seconds)

    total_details = len(all_rows)
    print(f"[Detail] Starting detail extraction for {total_details} jobs")

    for index, row in enumerate(all_rows, start=1):
        detail_info = extract_detail_from_job_page(row["job_url"], headers=headers)
        row.update(detail_info)

        # Report every job for small runs, otherwise every 10th and the last.
        if total_details <= 50 or index % 10 == 0 or index == total_details:
            percent = (index / total_details) * 100
            print(f"[Detail] {index}/{total_details} ({percent:.1f}%)")

        if detail_sleep_seconds > 0:
            time.sleep(detail_sleep_seconds)

    elapsed = time.time() - start_time
    print(f"[Done] Completed in {elapsed:.1f}s")

    if not all_rows:
        # Keep the schema stable even when nothing was scraped.
        return pd.DataFrame(columns=ordered_columns)

    df = pd.DataFrame(all_rows)

    # Backfill any column no row provided: 0 for counters/flags, "" for text.
    zero_default_columns = {"matched_skill_count", *SKILL_COLUMNS}
    for column in ordered_columns:
        if column not in df.columns:
            df[column] = 0 if column in zero_default_columns else ""

    return df[ordered_columns].drop_duplicates(subset=["job_url"])


# Run the full crawl with the settings defined at the top of the cell.
jobs_df = scrape_jobthai_all_locations(
    SEARCH_URL,
    max_pages=MAX_PAGES,
    sleep_seconds=SLEEP_SECONDS,
    detail_sleep_seconds=DETAIL_SLEEP_SECONDS,
)
print(f"\nTotal unique jobs: {len(jobs_df)}")

# Write the result to OUTPUT_CSV, creating the parent directory if needed.
# The file is encoded as "utf-8-sig" (UTF-8 with a byte-order mark).
output_path = Path(OUTPUT_CSV)
output_path.parent.mkdir(parents=True, exist_ok=True)
jobs_df.to_csv(output_path, index=False, encoding="utf-8-sig")
print(f"Saved CSV: {output_path.resolve()}")

# Last expression of the cell: shows the first 10 rows as the cell output.
jobs_df.head(10)

[Search] Starting page crawl: max_pages=10
[Search] Page 1/10 -> request
[Search] Page 1/10 -> found cards: 20
[Search] Page 1/10 -> kept 20 | cumulative=20
[Search] Page 2/10 -> request
[Search] Page 2/10 -> found cards: 20
[Search] Page 2/10 -> kept 20 | cumulative=40
[Search] Page 3/10 -> request
[Search] Page 3/10 -> found cards: 20
[Search] Page 3/10 -> kept 20 | cumulative=60
[Search] Page 4/10 -> request
[Search] Page 4/10 -> found cards: 20
[Search] Page 4/10 -> kept 2 | cumulative=62
[Search] Page 5/10 -> request
[Search] Page 5/10 -> found cards: 20
[Search] Page 5/10 -> no keyword matches, stopping
[Detail] Starting detail extraction for 62 jobs
[Detail] 10/62 (16.1%)
[Detail] 20/62 (32.3%)
[Detail] 30/62 (48.4%)
[Detail] 40/62 (64.5%)
[Detail] 50/62 (80.6%)
[Detail] 60/62 (96.8%)
[Detail] 62/62 (100.0%)
[Done] Completed in 124.1s

Total unique jobs: 62
Saved CSV: G:\Users\Moss\Documents\PYTHON_PROJECT\Job_Market_Analyzer_Web_Scraping\Moss\jobthai_data-analyst_20260218.csv


Unnamed: 0,keyword,province_code,province_name,page,job_title,company,location,salary,posted_date,job_url,...,skill_r,skill_pandas,skill_numpy,skill_spark,skill_aws,skill_azure,skill_gcp,skill_machine_learning,skill_deep_learning,skill_llm
0,Data Analyst,1,กรุงเทพมหานคร,1,Data Analyst,บริษัท เอฟซีซี เซอร์วิสเซส จำกัด,MRT ภาวนา,"17,000 - 22,000 บาท",17 ก.พ. 69,https://www.jobthai.com/th/job/1755307,...,0,0,0,0,0,0,0,0,0,0
1,Data Analyst,32,จ.พระนครศรีอยุธยา,1,Data Analyst,"Tech Solution AI Co., Ltd.",อ.วังน้อย จ.พระนครศรีอยุธยา,ตามตกลง,18 ก.พ. 69,https://www.jobthai.com/th/job/1857706,...,1,0,0,0,0,0,0,0,0,0
2,Data Analyst,1,กรุงเทพมหานคร,1,Data Analyst,บริษัท เมืองไทย แคปปิตอล จำกัด (มหาชน),MRT บางพลัด,"22,000 - 23,000 บาท",18 ก.พ. 69,https://www.jobthai.com/th/job/1752582,...,0,0,0,0,0,0,0,0,0,0
3,Data Analyst,1,กรุงเทพมหานคร,1,Data Analyst,S45 Clinic,BTS พร้อมพงษ์,ตามตกลง,18 ก.พ. 69,https://www.jobthai.com/th/job/1844306,...,0,0,0,0,0,0,0,0,0,0
4,Data Analyst,28,จ.ปทุมธานี,1,Data Analyst,บริษัท ดูโฮม จำกัด (มหาชน),อ.เมืองปทุมธานี จ.ปทุมธานี,"18,000 - 30,000 บาท",18 ก.พ. 69,https://www.jobthai.com/th/job/1809674,...,0,0,0,0,0,0,0,0,0,0
5,Data Analyst,1,กรุงเทพมหานคร,1,Data Analyst,บริษัท ซาบีน่า จำกัด (มหาชน),เขตบางกอกน้อย กรุงเทพมหานคร,ตามตกลง,18 ก.พ. 69,https://www.jobthai.com/th/job/1862880,...,0,0,0,0,0,0,0,1,0,0
6,Data Analyst,1,กรุงเทพมหานคร,1,Data Analyst,บริษัท ชับบ์สามัคคีประกันภัย จำกัด (มหาชน),เขตหลักสี่ กรุงเทพมหานคร,ตามตกลง,18 ก.พ. 69,https://www.jobthai.com/th/job/1824648,...,0,0,0,0,0,0,0,0,0,0
7,Data Analyst,1,กรุงเทพมหานคร,1,Data Analyst,บริษัท เริ่มใหม่ จำกัด,เขตสายไหม กรุงเทพมหานคร,ตามความสามารถและประสบการณ์,18 ก.พ. 69,https://www.jobthai.com/th/job/1643374,...,0,0,0,0,0,0,0,0,0,0
8,Data Analyst,1,กรุงเทพมหานคร,1,Data Analyst / Business Data Analyst,บริษัท ฟังก์ชั่น อินเตอร์เนชั่นแนล จำกัด (มหาชน),เขตคลองสามวา กรุงเทพมหานคร,"20,000 - 30,000 บาท",17 ก.พ. 69,https://www.jobthai.com/th/job/1854175,...,0,0,0,0,0,0,0,0,0,0
9,Data Analyst,1,กรุงเทพมหานคร,1,Data Analyst - BNG,บริษัท เบญจจินดา โฮลดิ้ง จำกัด,SRT บางเขน,ตามประสบการณ์,16 ก.พ. 69,https://www.jobthai.com/th/job/1858004,...,0,0,0,0,0,0,0,0,0,0
