# JobBKK Web Scraper (Data Analyst)

This notebook scrapes JobBKK search-result pages, extracts structured fields from each job posting, and exports a CSV that follows the same schema as your JobThai/JobsDB outputs.

In [3]:
import re
import time
from datetime import datetime
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Listing URL to crawl. update_page_in_search_url() rewrites the number after
# "/jobs/lists/" to walk the result pages, and extract_keyword_from_url() reads
# the search keyword from the second comma-separated field of the path.
SEARCH_URL = "https://jobbkk.com/jobs/lists/1/หางาน,data%20analyst,ทุกจังหวัด,ทั้งหมด.html?keyword_type=3&sort=4"
# Destination CSV. When this is empty, the export step builds a file name from
# the search keyword and the current date instead.
OUTPUT_CSV = "jobbkk_data-analyst_like-jobthai_20260218.csv"
# Upper bound on the number of listing pages requested by scrape_jobbkk().
MAX_PAGES = 10
# Pause, in seconds, after each listing page request.
SLEEP_SECONDS = 1.0
# Pause, in seconds, after each job-detail page request.
DETAIL_SLEEP_SECONDS = 0.5

# HTTP headers sent with both the listing and the job-detail requests.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/121.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "th-TH,th;q=0.9,en-US;q=0.8,en;q=0.7",
}

# Skill taxonomy: canonical skill name -> variant spellings searched for in the
# job-detail text. extract_skills() flags a skill when any one variant matches.
# NOTE(review): very short variants such as "r" match any standalone token
# after normalization (e.g. "R&D" normalizes to "r d"), so those flags are
# prone to false positives.
SKILLS = {
    "python": ["python"],
    "r": ["r", "r language"],
    "java": ["java"],
    "scala": ["scala"],
    "c++": ["c++"],
    "sql": ["sql", "mysql", "postgres", "postgresql", "oracle", "sql server", "mssql", "sqlite", "ms access"],
    "mongodb": ["mongodb", "mongo"],
    "redis": ["redis"],
    "pandas": ["pandas"],
    "numpy": ["numpy"],
    "scipy": ["scipy"],
    "sklearn": ["scikit-learn", "sklearn"],
    "machine_learning": ["machine learning", "supervised", "unsupervised", "random forest", "xgboost", "lightgbm", "catboost"],
    "deep_learning": ["deep learning", "neural network", "cnn", "rnn", "lstm", "transformer"],
    "llm": ["llm", "large language model", "large language models", "gpt"],
    "rag": ["rag", "retrieval augmented generation"],
    "langchain": ["langchain"],
    "openai": ["openai"],
    "huggingface": ["huggingface"],
    "prompt_engineering": ["prompt engineering"],
    "vector_db": ["vector database", "pinecone", "faiss", "weaviate", "milvus"],
    "excel": ["excel", "vlookup", "pivot table", "power query"],
    "powerbi": ["power bi", "powerbi", "dax"],
    "tableau": ["tableau"],
    "matplotlib": ["matplotlib"],
    "seaborn": ["seaborn"],
    "plotly": ["plotly"],
    "spark": ["spark", "pyspark"],
    "hadoop": ["hadoop"],
    "kafka": ["kafka"],
    "aws": ["aws", "amazon web services", "s3", "redshift", "athena", "glue", "lambda"],
    "gcp": ["gcp", "google cloud", "bigquery", "cloud storage"],
    "azure": ["azure", "synapse", "databricks"],
    "etl": ["etl", "elt", "data pipeline"],
    "airflow": ["airflow"],
    "dbt": ["dbt"],
    "snowflake": ["snowflake"],
    "docker": ["docker"],
    "kubernetes": ["kubernetes", "k8s"],
    "mlflow": ["mlflow"],
    "fastapi": ["fastapi"],
    "flask": ["flask"],
    "streamlit": ["streamlit"],
    "statistics": ["statistics", "statistical", "hypothesis testing", "regression", "anova", "probability"],
    "git": ["git", "github", "gitlab"],
    "api": ["api", "rest api"],
    "linux": ["linux", "unix"],
}
# One 0/1 output column per skill, in SKILLS insertion order.
SKILL_COLUMNS = [f"skill_{name}" for name in SKILLS]


def clean_text(text: str) -> str:
    """Collapse each run of whitespace in *text* to one space and trim the ends.

    Falsy input (``None`` or an empty string) yields an empty string.
    """
    if not text:
        return ""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()


def normalize_for_match(text: str) -> str:
    """Lower-case *text* and reduce it to ASCII letters/digits separated by spaces.

    Every run of characters outside ``a-z0-9`` (punctuation, whitespace,
    non-ASCII text) becomes a single space; the result is trimmed. Falsy
    input yields an empty string.
    """
    lowered = (text or "").lower()
    spaced = re.sub(r"[^a-z0-9]+", " ", lowered)
    return spaced.strip()


def normalize_for_skill_match(text: str) -> str:
    """Normalize *text* for skill matching while keeping language suffixes.

    The text is lower-cased and split into runs of ASCII letters/digits, which
    are joined by single spaces. A ``++`` or ``#`` directly after a run is kept
    as part of that token, so "C++" stays "c++" instead of collapsing to a bare
    "c" that would match any standalone letter "c". Every other character,
    including a lone "+", acts as a separator.

    Args:
        text: Raw text or skill variant; falsy values are treated as "".

    Returns:
        The normalized, space-separated token string.
    """
    lowered = (text or "").lower()
    # Only the exact suffixes "++" and "#" are preserved; a single "+" (as in
    # "power+bi") is still treated as a separator.
    tokens = re.findall(r"[a-z0-9]+(?:\+\+|#)?", lowered)
    return " ".join(tokens)


def keyword_match_groups_from_query(search_keyword: str) -> list[list[str]]:
    """Expand a search keyword into per-word groups of acceptable variants.

    The keyword is normalized with ``normalize_for_match`` and split into
    words. Each word maps to its list of known variants, or to a one-element
    list holding the word itself when no variants are registered.

    Returns:
        One list of variants per keyword word, in keyword order.
    """
    synonyms = {
        "data": ["data"],
        "scientist": ["scientist", "science", "scien", "scient"],
        "science": ["science", "scientist", "scien", "scient"],
        "engineer": ["engineer", "engineering", "eng"],
        "analyst": ["analyst", "analytics", "analysis"],
        "developer": ["developer", "development", "dev"],
    }

    # str.split() with no argument never yields empty strings.
    words = normalize_for_match(search_keyword).split()
    return [synonyms[word] if word in synonyms else [word] for word in words]


def title_matches_keyword(title: str, keyword_groups: list[list[str]]) -> bool:
    """Return True when *title* satisfies every keyword group.

    A group is satisfied when at least one of its variants occurs as a
    substring of the normalized title. An empty *keyword_groups* accepts
    every title.
    """
    if not keyword_groups:
        return True

    haystack = normalize_for_match(title)
    for group in keyword_groups:
        # Every group must contribute at least one hit.
        if not any(variant in haystack for variant in group):
            return False
    return True


def variant_matches_text(variant: str, normalized_text: str) -> bool:
    """Check whether a skill variant occurs as a whole token in the text.

    Args:
        variant: Raw variant spelling (e.g. ``"power bi"``); it is normalized
            with ``normalize_for_skill_match`` before matching.
        normalized_text: Text that has already been normalized the same way.

    Returns:
        True when the variant appears with no ASCII letter or digit directly
        before or after it; False otherwise, or when the variant normalizes
        to an empty string.
    """
    variant_norm = normalize_for_skill_match(variant)
    if not variant_norm:
        return False

    # Build the pattern body outside the f-string: backslashes inside f-string
    # replacement fields are a syntax error before Python 3.12. Words of a
    # multi-word variant may be separated by any amount of whitespace.
    body = r"\s+".join(re.escape(word) for word in variant_norm.split())
    pattern = rf"(?<![a-z0-9]){body}(?![a-z0-9])"
    return re.search(pattern, normalized_text) is not None


def extract_skills(text: str) -> dict:
    """Detect which skills from ``SKILLS`` are mentioned in *text*.

    Returns:
        A dict with ``matched_skills`` (skill names joined by "|", in
        ``SKILLS`` order), ``matched_skill_count``, and one ``skill_<name>``
        0/1 flag for every skill in ``SKILLS``.
    """
    haystack = normalize_for_skill_match(text)

    hits = [
        skill
        for skill, variants in SKILLS.items()
        if any(variant_matches_text(candidate, haystack) for candidate in variants)
    ]

    result = {
        "matched_skills": "|".join(hits),
        "matched_skill_count": len(hits),
    }
    # Flags are emitted for every skill, matched or not, to keep the schema fixed.
    for skill in SKILLS:
        result[f"skill_{skill}"] = int(skill in hits)
    return result


def extract_salary(text: str) -> str:
    """Pull a salary phrase out of free text.

    Patterns are tried in priority order over the whole text: a numeric range
    ending in "บาท", then a single amount ending in "บาท", then one of the
    fixed Thai phrases. The first pattern that matches anywhere wins.

    Returns:
        The matched phrase with whitespace collapsed, or "" when none match.
    """
    salary_patterns = (
        r"\d[\d,\s]*\s*[-–]\s*\d[\d,\s]*\s*บาท",
        r"\d[\d,\s]*\s*บาท",
        r"ตามตกลง|ตามประสบการณ์|ไม่ระบุเงินเดือน",
    )
    # Lazy generator: later patterns are only searched if earlier ones miss.
    attempts = (re.search(candidate, text) for candidate in salary_patterns)
    hit = next((found for found in attempts if found), None)
    if hit is None:
        return ""
    return clean_text(hit.group(0))


def update_page_in_search_url(url: str, page_num: int) -> str:
    """Return *url* with the page number after ``/jobs/lists/`` set to *page_num*.

    A URL that does not contain a ``/jobs/lists/<digits>/`` segment is
    returned unchanged.
    """
    page_segment = re.compile(r"/jobs/lists/\d+/")
    replacement = f"/jobs/lists/{page_num}/"
    return page_segment.sub(replacement, url)


def extract_keyword_from_url(url: str) -> str:
    """Extract the search keyword from a JobBKK listing URL.

    The keyword is the second comma-separated field of the URL with its query
    string removed. Percent-escapes are decoded, so ``data%20analyst`` becomes
    ``data analyst`` and ``c%2B%2B`` becomes ``c++``.

    Args:
        url: Listing URL, with or without a query string.

    Returns:
        The decoded keyword, or "" when the URL has fewer than two
        comma-separated fields.
    """
    # Imported locally so the function does not depend on the file's import block.
    from urllib.parse import unquote

    path = url.split("?", 1)[0]
    fields = path.split(",")
    if len(fields) < 2:
        return ""
    return unquote(fields[1])


def guess_province_name(location_text: str) -> str:
    """Return the Thai province named in a location string.

    When several province names occur in the text, the one that starts
    earliest wins (ties go to the longer name). This keeps a short province
    name embedded in a later word from shadowing the real province: for
    "นราธิวาส ตากใบ" the result is "นราธิวาส", not "ตาก".

    Args:
        location_text: Location text from a job card; whitespace is collapsed
            before matching.

    Returns:
        The matched province name, or "" when the text is empty or names no
        known province.
    """
    location_text = clean_text(location_text)
    if not location_text:
        return ""

    province_names = [
        "กรุงเทพมหานคร", "กระบี่", "กาญจนบุรี", "กาฬสินธุ์", "กำแพงเพชร", "ขอนแก่น", "จันทบุรี",
        "ฉะเชิงเทรา", "ชลบุรี", "ชัยนาท", "ชัยภูมิ", "ชุมพร", "เชียงราย", "เชียงใหม่", "ตรัง",
        "ตราด", "ตาก", "นครนายก", "นครปฐม", "นครพนม", "นครราชสีมา", "นครศรีธรรมราช", "นครสวรรค์",
        "นนทบุรี", "นราธิวาส", "น่าน", "บึงกาฬ", "บุรีรัมย์", "ปทุมธานี", "ประจวบคีรีขันธ์", "ปราจีนบุรี",
        "ปัตตานี", "พระนครศรีอยุธยา", "พังงา", "พัทลุง", "พิจิตร", "พิษณุโลก", "เพชรบุรี", "เพชรบูรณ์",
        "แพร่", "พะเยา", "ภูเก็ต", "มหาสารคาม", "มุกดาหาร", "แม่ฮ่องสอน", "ยะลา", "ยโสธร", "ร้อยเอ็ด",
        "ระนอง", "ระยอง", "ราชบุรี", "ลพบุรี", "ลำปาง", "ลำพูน", "เลย", "ศรีสะเกษ", "สกลนคร", "สงขลา",
        "สตูล", "สมุทรปราการ", "สมุทรสงคราม", "สมุทรสาคร", "สระแก้ว", "สระบุรี", "สิงห์บุรี", "สุโขทัย",
        "สุพรรณบุรี", "สุราษฎร์ธานี", "สุรินทร์", "หนองคาย", "หนองบัวลำภู", "อ่างทอง", "อุดรธานี", "อุตรดิตถ์",
        "อุทัยธานี", "อุบลราชธานี", "อำนาจเจริญ",
    ]

    best_province = ""
    best_rank = None
    for province in province_names:
        position = location_text.find(province)
        if position == -1:
            continue
        # Rank by earliest start, then by longest name. List order alone would
        # let a short name found later in the text win.
        rank = (position, -len(province))
        if best_rank is None or rank < best_rank:
            best_rank = rank
            best_province = province
    return best_province


def parse_jobbkk_card(card, page_num: int, keyword: str) -> dict:
    """Convert one listing-card element into a flat row dict.

    Args:
        card: Parsed HTML element for a single job card.
        page_num: Listing page the card was found on.
        keyword: Search keyword recorded on the row.

    Returns:
        A dict with the keys ``keyword``, ``province_code`` (always ""),
        ``province_name``, ``page``, ``job_title``, ``company``, ``location``,
        ``salary``, ``posted_date``, ``job_url`` and ``raw_text``. Fields that
        cannot be found are empty strings.
    """

    def element_text(element) -> str:
        # A missing element contributes an empty string.
        if not element:
            return ""
        return clean_text(element.get_text(" ", strip=True))

    title_link = card.select_one(".joblist-name-urgent a[href*='/jobs/detail']")
    updated_link = card.select_one(".joblist-updatetime-md-upper a")

    job_title = element_text(title_link)
    company = element_text(card.select_one(".joblist-company-name a"))
    location = element_text(card.select_one(".position-location span:last-child"))
    salary = element_text(card.select_one(".position-salary span:last-child"))

    # Prefer the link's "title" attribute for the posted date.
    title_attr = updated_link.get("title") if updated_link else None
    posted_date = clean_text(title_attr) if title_attr else ""

    href = title_link.get("href", "") if title_link else ""
    if not href:
        job_url = ""
    elif href.startswith("http"):
        job_url = href
    else:
        job_url = f"https://jobbkk.com{href}"

    if not job_url:
        # No usable link: rebuild the URL from the card's data attributes.
        company_id = card.get("data-com-id", "")
        job_id = card.get("data-job-id", "")
        if company_id and job_id:
            job_url = f"https://jobbkk.com/jobs/detailurgent/{company_id}/{job_id}"

    raw_text = clean_text(card.get_text("\n", strip=True))

    # Fall back to the card's full text when the dedicated elements are empty.
    if not salary:
        salary = extract_salary(raw_text)
    if not posted_date and updated_link:
        posted_date = element_text(updated_link)

    return {
        "keyword": keyword,
        "province_code": "",
        "province_name": guess_province_name(location),
        "page": page_num,
        "job_title": job_title,
        "company": company,
        "location": location,
        "salary": salary,
        "posted_date": posted_date,
        "job_url": job_url,
        "raw_text": raw_text,
    }


def collect_list_items_text(container) -> str:
    """Return the text of a container's ``<li>`` items, one per line.

    Empty items are dropped. When the container has no non-empty ``<li>``
    items, its whole text is returned with whitespace collapsed. A ``None``
    container yields "".
    """
    if container is None:
        return ""

    lines = []
    for list_item in container.select("li"):
        line = clean_text(list_item.get_text(" ", strip=True))
        if line:
            lines.append(line)

    if not lines:
        return clean_text(container.get_text("\n", strip=True))
    return "\n".join(lines)


def find_section_by_heading(root, heading_pattern: str):
    """Locate the content container that belongs to a heading.

    The first ``p``/``span``/``h2``/``h3``/``strong`` tag whose text matches
    *heading_pattern* is taken as the heading. The result is, in order of
    preference: the heading's enclosing ``section`` or ``div`` if it contains
    ``<li>`` items, the next ``<ul>`` after the heading, or the enclosing
    ``section``/``div`` itself.

    Returns:
        The chosen element, or ``None`` when no heading matches (or no
        container exists).
    """
    heading_regex = re.compile(heading_pattern)
    heading_tags = ("p", "span", "h2", "h3", "strong")

    def is_heading(tag) -> bool:
        if tag.name not in heading_tags:
            return False
        tag_text = clean_text(tag.get_text(" ", strip=True))
        return heading_regex.search(tag_text) is not None

    heading = root.find(is_heading)
    if not heading:
        return None

    section_parent = heading.find_parent("section")
    div_parent = heading.find_parent("div")

    for candidate in (section_parent, div_parent):
        if candidate and candidate.select("li"):
            return candidate

    following_list = heading.find_next("ul")
    if following_list:
        return following_list

    return section_parent or div_parent


def extract_jobbkk_detail(job_url: str, headers: dict) -> dict:
    """Fetch a job-detail page and extract its text and skill flags.

    Args:
        job_url: URL of the job-detail page.
        headers: HTTP headers to send with the request.

    Returns:
        A dict with ``job_detail_full_text``, ``matched_skills``,
        ``matched_skill_count`` and one ``skill_<name>`` flag per skill. When
        the request fails, the text fields are empty and all counts/flags are
        0, so one bad page does not abort the crawl.
    """
    base_detail = {
        "job_detail_full_text": "",
        "matched_skills": "",
        "matched_skill_count": 0,
        **{column: 0 for column in SKILL_COLUMNS},
    }

    try:
        response = requests.get(job_url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Best effort: report the failure instead of hiding it, and keep the
        # row with empty detail fields.
        print(f"[Detail] Skipping {job_url}: {exc}")
        return base_detail

    soup = BeautifulSoup(response.text, "html.parser")

    # Use the main article element when present; otherwise use the whole page.
    detail_root = soup.select_one("article.row") or soup
    job_detail_full_text = clean_text(detail_root.get_text("\n", strip=True))

    skill_info = extract_skills(job_detail_full_text)

    return {
        "job_detail_full_text": job_detail_full_text,
        **skill_info,
    }


def scrape_jobbkk(
    search_url: str,
    max_pages: int = 10,
    sleep_seconds: float = 1.0,
    detail_sleep_seconds: float = 0.5,
) -> pd.DataFrame:
    """Crawl JobBKK listing pages, then enrich each kept job with detail data.

    Listing pages are requested in order, using the module-level ``headers``.
    A card is kept when it has a title and URL, its title matches the search
    keyword, and its URL has not been seen before. Paging stops at
    ``max_pages``, at the first page without cards, at the first page that
    yields no kept rows, or when a listing request fails after some rows have
    already been collected. Each kept job's detail page is then fetched.

    Args:
        search_url: Listing URL containing a ``/jobs/lists/<page>/`` segment.
        max_pages: Maximum number of listing pages to request.
        sleep_seconds: Pause after each listing page; no pause when <= 0.
        detail_sleep_seconds: Pause after each detail request; no pause when <= 0.

    Returns:
        A DataFrame with one row per unique job URL and a fixed column order.
        The columns are present even when no jobs were kept.

    Raises:
        requests.RequestException: If a listing request fails before any row
            has been collected.
    """
    ordered_columns = [
        "keyword",
        "province_code",
        "province_name",
        "page",
        "job_title",
        "company",
        "location",
        "salary",
        "posted_date",
        "job_url",
        "raw_text",
        "job_detail_full_text",
        "matched_skills",
        "matched_skill_count",
        *SKILL_COLUMNS,
    ]

    keyword = extract_keyword_from_url(search_url)
    keyword_groups = keyword_match_groups_from_query(keyword)

    all_rows = []
    seen_urls = set()

    print(f"[Search] Starting JobBKK crawl: max_pages={max_pages}")

    for page_num in range(1, max_pages + 1):
        page_url = update_page_in_search_url(search_url, page_num)
        print(f"[Search] Page {page_num}/{max_pages} -> request")

        try:
            response = requests.get(page_url, headers=headers, timeout=30)
            response.raise_for_status()
        except requests.RequestException as exc:
            if not all_rows:
                # Nothing to salvage, so surface the failure to the caller.
                raise
            # Keep the rows gathered so far instead of losing the whole crawl.
            print(f"[Search] Page {page_num}/{max_pages} -> request failed ({exc}), stopping")
            break

        soup = BeautifulSoup(response.text, "html.parser")
        cards = soup.select("div.joblist-pos.jobbkk-list-company")
        print(f"[Search] Page {page_num}/{max_pages} -> found cards: {len(cards)}")

        if not cards:
            print(f"[Search] Page {page_num}/{max_pages} -> no cards, stopping")
            break

        page_rows = []
        for card in cards:
            row = parse_jobbkk_card(card, page_num=page_num, keyword=keyword)

            if not row["job_title"] or not row["job_url"]:
                continue
            if not title_matches_keyword(row["job_title"], keyword_groups):
                continue
            if row["job_url"] in seen_urls:
                continue

            seen_urls.add(row["job_url"])
            page_rows.append(row)

        if not page_rows:
            # NOTE(review): this also stops when a page holds only non-matching
            # or already-seen cards, even if later pages would match; confirm
            # that is the intended heuristic.
            print(f"[Search] Page {page_num}/{max_pages} -> no keyword matches, stopping")
            break

        all_rows.extend(page_rows)
        print(f"[Search] Page {page_num}/{max_pages} -> kept {len(page_rows)} | cumulative={len(all_rows)}")

        if sleep_seconds > 0:
            time.sleep(sleep_seconds)

    total_details = len(all_rows)
    print(f"[Detail] Starting detail extraction for {total_details} jobs")

    for index, row in enumerate(all_rows, start=1):
        detail_info = extract_jobbkk_detail(row["job_url"], headers=headers)
        row.update(detail_info)

        # Report every job for small crawls, otherwise every 10th and the last.
        if total_details <= 50 or index % 10 == 0 or index == total_details:
            percent = (index / total_details) * 100
            print(f"[Detail] {index}/{total_details} ({percent:.1f}%)")

        if detail_sleep_seconds > 0:
            time.sleep(detail_sleep_seconds)

    if not all_rows:
        # Keep the schema so an empty export still has its header row.
        return pd.DataFrame(columns=ordered_columns)

    df = pd.DataFrame(all_rows)

    numeric_columns = {"matched_skill_count", *SKILL_COLUMNS}
    for column in ordered_columns:
        if column not in df.columns:
            df[column] = 0 if column in numeric_columns else ""

    return df[ordered_columns].drop_duplicates(subset=["job_url"])


# Run the crawl with the settings defined at the top of the notebook.
jobs_df = scrape_jobbkk(
    SEARCH_URL,
    max_pages=MAX_PAGES,
    sleep_seconds=SLEEP_SECONDS,
    detail_sleep_seconds=DETAIL_SLEEP_SECONDS,
)
print(f"\nTotal unique jobs: {len(jobs_df)}")

# Build the output path: an explicit OUTPUT_CSV wins; otherwise the name is
# derived from the search keyword (words joined by "-") and today's date.
keyword_slug = "-".join(extract_keyword_from_url(SEARCH_URL).lower().split())
output_file = OUTPUT_CSV if OUTPUT_CSV else f"jobbkk_{keyword_slug}_{datetime.now().strftime('%Y%m%d')}.csv"
output_path = Path(output_file)
# Create the target directory if it does not exist yet.
output_path.parent.mkdir(parents=True, exist_ok=True)
# "utf-8-sig" writes a byte-order mark; presumably so spreadsheet tools detect
# the encoding of the Thai text (confirm with the CSV's consumers).
jobs_df.to_csv(output_path, index=False, encoding="utf-8-sig")
print(f"Saved to: {output_path.resolve()}")

# Notebook display of the first rows.
jobs_df.head(20)

[Search] Starting JobBKK crawl: max_pages=10
[Search] Page 1/10 -> request
[Search] Page 1/10 -> found cards: 15
[Search] Page 1/10 -> kept 9 | cumulative=9
[Search] Page 2/10 -> request
[Search] Page 2/10 -> found cards: 15
[Search] Page 2/10 -> kept 5 | cumulative=14
[Search] Page 3/10 -> request
[Search] Page 3/10 -> found cards: 15
[Search] Page 3/10 -> kept 1 | cumulative=15
[Search] Page 4/10 -> request
[Search] Page 4/10 -> found cards: 15
[Search] Page 4/10 -> kept 4 | cumulative=19
[Search] Page 5/10 -> request
[Search] Page 5/10 -> found cards: 15
[Search] Page 5/10 -> kept 2 | cumulative=21
[Search] Page 6/10 -> request
[Search] Page 6/10 -> found cards: 15
[Search] Page 6/10 -> kept 3 | cumulative=24
[Search] Page 7/10 -> request
[Search] Page 7/10 -> found cards: 15
[Search] Page 7/10 -> kept 3 | cumulative=27
[Search] Page 8/10 -> request
[Search] Page 8/10 -> found cards: 15
[Search] Page 8/10 -> kept 2 | cumulative=29
[Search] Page 9/10 -> request
[Search] Page 9/10 -> 

Unnamed: 0,keyword,province_code,province_name,page,job_title,company,location,salary,posted_date,job_url,...,skill_docker,skill_kubernetes,skill_mlflow,skill_fastapi,skill_flask,skill_streamlit,skill_statistics,skill_git,skill_api,skill_linux
0,data analyst,,กรุงเทพมหานคร,1,Data Analyst,บริษัท พูนธนามาร์เก็ตติ้ง จำกัด,กรุงเทพมหานคร เขตลาดพร้าว,"20,000 - 30,000 บาท",18/02/2026 16:57,https://jobbkk.com/jobs/detailurgent/42373/128...,...,0,0,0,0,0,0,0,0,0,0
1,data analyst,,กรุงเทพมหานคร,1,Data Analyst,บริษัท แมส ช้อยส์ คอร์ปอเรชั่น จำกัด,กรุงเทพมหานคร เขตตลิ่งชัน,ตามตกลง,18/02/2026 16:21,https://jobbkk.com/jobs/detailurgent/43586/126...,...,0,0,0,0,0,0,0,0,0,0
2,data analyst,,กรุงเทพมหานคร,1,Data Analyst,บริษัท อาร์เอสยู เฮลท์ แคร์ จำกัด,กรุงเทพมหานคร เขตคลองเตย,ตามตกลง,18/02/2026 15:51,https://jobbkk.com/jobs/detailurgent/40139/135...,...,0,0,0,0,0,0,0,0,0,0
3,data analyst,,กรุงเทพมหานคร,1,Data Analyst,บริษัท เอฟซีซี เซอร์วิสเซส จำกัด,กรุงเทพมหานคร เขตจตุจักร,"17,000 - 22,000 บาท",18/02/2026 10:21,https://jobbkk.com/jobs/detailurgent/33965/356851,...,0,0,0,0,0,0,0,0,0,0
4,data analyst,,กรุงเทพมหานคร,1,Data Analyst,บริษัท ไวท์ไลน์แอคทิเวชัน จำกัด,กรุงเทพมหานคร เขตราชเทวี,ตามตกลง,17/02/2026 09:51,https://jobbkk.com/jobs/detailurgent/58677/131...,...,0,0,0,0,0,0,0,0,0,0
5,data analyst,,สมุทรสาคร,1,DATA Analyst,EP Asia Group Company Limited,สมุทรสาคร เมืองสมุทรสาคร,ตามตกลง,16/02/2026 14:57,https://jobbkk.com/jobs/detailurgent/193275/10...,...,0,0,0,0,0,0,0,0,0,0
6,data analyst,,กรุงเทพมหานคร,1,เจ้าหน้าที่วิเคราะห์ข้อมูล (MIS / Data Analyst),บริษัท เมืองไทยประกันชีวิต จำกัด (มหาชน),กรุงเทพมหานคร เขตจตุจักร,ตามตกลง,17/02/2026 18:03,https://jobbkk.com/jobs/detailurgent/161183/13...,...,0,0,0,0,0,0,0,0,0,0
7,data analyst,,กรุงเทพมหานคร,1,นักวิเคราะห์ข้อมูล (Data Analyst),บริษัท อวิเอเท็ค จำกัด,กรุงเทพมหานคร เขตบางนา,ตามตกลง,18/02/2026 08:39,https://jobbkk.com/jobs/detailurgent/180147/13...,...,0,0,0,0,0,0,0,0,0,0
8,data analyst,,กรุงเทพมหานคร,1,นักวิเคราะห์ข้อมูล(Data Analyst),บริษัท มาราธอน (ประเทศไทย) จำกัด,กรุงเทพมหานคร เขตบางกอกน้อย,ตามตกลง,11/02/2026 10:03,https://jobbkk.com/jobs/detailurgent/205132/12...,...,0,0,0,0,0,0,0,0,0,0
9,data analyst,,กรุงเทพมหานคร,2,Data Analyst,บริษัท ไอโคเน็กซ์ จำกัด,กรุงเทพมหานคร เขตคลองเตย,ตามตกลง,17/02/2026 05:33,https://jobbkk.com/jobs/detail/55877/1349731,...,0,0,0,0,0,0,1,0,0,0
