In [74]:
import re
import time
from datetime import datetime, timedelta
from urllib.parse import parse_qs, quote, urlencode, urlparse, urlunparse

import pandas as pd
import requests
from bs4 import BeautifulSoup

## Initialize basic variables

In [75]:
# Job titles to search for; each title produces one search URL per platform.
JOBS_LIST = ['Data Scientist', 'Data Analyst', 'Data Engineer']

# Skill taxonomy: maps a canonical skill name to the text variants that count
# as a mention of it. Every key also becomes a "skill_<name>" indicator column.
# Variants are normalized before matching (see variant_matches_text), which
# trims them and replaces punctuation such as "," "/" "-" and newlines with
# spaces, and matching is done on whole alphanumeric tokens.
# NOTE(review): because of that normalization the padded variants below
# (" r ", " r,", " sql ", ...) all reduce to the bare token, so the padding
# does not add any extra context constraint — confirm this is intended.
SKILLS = {
    # ---------------- Core Programming ----------------
    "python": ["python"],
    "r": [" r ", " r,", " r\n", " r/"],
    "java": ["java"],
    "scala": ["scala"],
    "c++": ["c++"],

    # ---------------- SQL & Databases ----------------
    "sql & database": [" sql ", "mysql", "postgres", "postgresql", "oracle", "sql server", "mssql", "sqlite"],
    "mongodb": ["mongodb", "mongo"],
    "elasticsearch": ["elasticsearch", "elastic search"],

    # ---------------- Data Libraries ----------------
    "pandas": ["pandas"],
    "numpy": ["numpy"],
    "scipy": ["scipy"],
    "sklearn": ["scikit-learn", "sklearn"],

    # ---------------- Machine Learning ----------------
    "machine_learning": [
        "machine learning", "supervised", "unsupervised",
        "random forest", "xgboost", "lightgbm", "catboost"
    ],

    # ---------------- Deep Learning ----------------
    "deep_learning": [
        "deep learning", "neural network", "cnn", "rnn", "lstm", "transformer"
    ],

    # ---------------- GenAI / LLM ----------------
    "llm": ["llm", "large language model"],
    "rag": ["rag", "retrieval augmented generation"],
    "langchain": ["langchain"],
    "openai": ["openai"],
    "huggingface": ["huggingface"],
    "prompt_engineering": ["prompt engineering"],
    "vector_db": ["vector database", "pinecone", "faiss", "weaviate", "milvus"],

    # ---------------- Visualization / BI ----------------
    "excel": ["excel", "vlookup", "pivot table", "power query"],
    "powerbi": ["power bi", "powerbi", "dax"],
    "tableau": ["tableau"],
    "matplotlib": ["matplotlib"],
    "seaborn": ["seaborn"],
    "plotly": ["plotly"],

    # ---------------- Big Data ----------------
    "spark": ["spark", "pyspark"],
    "hadoop": ["hadoop"],
    "kafka": ["kafka"],

    # ---------------- Cloud ----------------
    "aws": ["aws", "amazon web services", "s3", "redshift", "athena", "glue", "lambda"],
    "gcp": ["gcp", "google cloud", "bigquery", "cloud storage"],
    "azure": ["azure", "synapse", "databricks"],

    # ---------------- Data Engineering ----------------
    "etl": ["etl", "elt", "data pipeline"],
    "airflow": ["airflow"],

    # ---------------- MLOps / Deployment ----------------
    "docker": ["docker"],
    "kubernetes": ["kubernetes", "k8s"],
    "mlflow": ["mlflow"],
    "fastapi": ["fastapi"],
    "flask": ["flask"],
    "streamlit": ["streamlit"],

    # ---------------- Statistics ----------------
    "statistics": [
        "statistics", "statistical", "hypothesis testing",
        "regression", "anova", "probability"
    ],

    # ---------------- Version Control ----------------
    "git": ["git", "github", "gitlab"],

    # ---------------- APIs ----------------
    "api": ["api", "rest api"],

    # ---------------- Linux ----------------
    "linux": ["linux", "unix"],    
}

# Search entry points per job board. Each platform maps to a list of
# [job_title, first_page_url] pairs, one pair per title in JOBS_LIST.
#
# The title is percent-encoded with urllib.parse.quote(safe="") before it is
# inserted, so reserved characters in a title ("&", "+", "#", "/", ",", ...)
# cannot change the structure of the URL. For plain titles this produces the
# same "%20"-encoded URLs as a simple space replacement.
SEARCH_URLS = {
    # The literal "page=1" is rewritten per page by the JobThai scraper.
    "JobThai": [
        [
            job_title,
            f"https://www.jobthai.com/th/jobs?keyword={quote(job_title, safe='')}&page=1&orderBy=RELEVANCE_SEARCH",
        ]
        for job_title in JOBS_LIST
    ],
    # JobsDB addresses a search through a hyphen-separated slug.
    "JobsDB": [
        [
            job_title,
            f"https://th.jobsdb.com/th/{quote(job_title.replace(' ', '-'), safe='')}-jobs",
        ]
        for job_title in JOBS_LIST
    ],
    # JOBBKK embeds the keyword as one comma-separated field of the path.
    "JOBBKK": [
        [
            job_title,
            f"https://jobbkk.com/jobs/lists/1/หางาน,{quote(job_title, safe='')},ทุกจังหวัด,ทั้งหมด.html?keyword_type=3&sort=4",
        ]
        for job_title in JOBS_LIST
    ],
}

# Alternative spellings accepted for each search-keyword token when job titles
# are filtered. Tokens without an entry here are matched literally (see
# keyword_match_groups_from_query).
# NOTE(review): title_matches_keyword uses substring search, so short variants
# such as "eng" or "dev" also hit longer words that merely contain them —
# confirm this looseness is intended.
KEY_VARIANTS = {
    "data": ["data"],
    "scientist": ["scientist", "science", "scien", "scient"],
    "engineer": ["engineer", "engineering", "eng"],
    "analyst": ["analyst", "analytics", "analysis"],
    "developer": ["developer", "development", "dev"],
}

# Names of the per-skill 0/1 indicator columns, in SKILLS insertion order.
SKILL_COLUMNS = [f"skill_{name}" for name in SKILLS]

# For Debugging Start
# Dump every generated search entry, grouped by platform, for a visual check.
print("Search URLs:")
for site_name, entries in SEARCH_URLS.items():
    print(f"{site_name}:")
    for entry in entries:
        # Each entry is a [job_title, url] pair and is printed as a list.
        print(f"  {entry}")
# For Debugging End

Search URLs:
JobThai:
  ['Data Scientist', 'https://www.jobthai.com/th/jobs?keyword=Data%20Scientist&page=1&orderBy=RELEVANCE_SEARCH']
  ['Data Analyst', 'https://www.jobthai.com/th/jobs?keyword=Data%20Analyst&page=1&orderBy=RELEVANCE_SEARCH']
  ['Data Engineer', 'https://www.jobthai.com/th/jobs?keyword=Data%20Engineer&page=1&orderBy=RELEVANCE_SEARCH']
JobsDB:
  ['Data Scientist', 'https://th.jobsdb.com/th/Data-Scientist-jobs']
  ['Data Analyst', 'https://th.jobsdb.com/th/Data-Analyst-jobs']
  ['Data Engineer', 'https://th.jobsdb.com/th/Data-Engineer-jobs']
JOBBKK:
  ['Data Scientist', 'https://jobbkk.com/jobs/lists/1/หางาน,Data%20Scientist,ทุกจังหวัด,ทั้งหมด.html?keyword_type=3&sort=4']
  ['Data Analyst', 'https://jobbkk.com/jobs/lists/1/หางาน,Data%20Analyst,ทุกจังหวัด,ทั้งหมด.html?keyword_type=3&sort=4']
  ['Data Engineer', 'https://jobbkk.com/jobs/lists/1/หางาน,Data%20Engineer,ทุกจังหวัด,ทั้งหมด.html?keyword_type=3&sort=4']


## JobThai Scraper

In [None]:
def normalize_province_code(value) -> str:
    """Return *value* as a province code zero-padded to at least two digits.

    The value is converted with ``str`` and stripped of surrounding
    whitespace before validation.

    Raises:
        ValueError: if the stripped text is not made of digits only, or if it
            denotes a number that is not positive.
    """
    raw = str(value).strip()
    if not raw.isdigit():
        raise ValueError(f"Invalid province code: {value}")

    code = int(raw)
    if code <= 0:
        raise ValueError(f"Province must be positive, got: {value}")

    return f"{code:02d}"


def normalize_for_match(text: str) -> str:
    """Lower-case *text* and reduce it to space-separated ASCII alphanumerics.

    Every run of characters other than ``a-z`` / ``0-9`` (punctuation,
    whitespace, non-Latin script) becomes a single space, and leading or
    trailing spaces are removed.
    """
    lowered = text.lower()
    collapsed = re.sub(r"[^a-z0-9]+", " ", lowered)
    return collapsed.strip()


def normalize_for_skill_match(text: str) -> str:
    """Lower-case *text* and reduce it to space-separated skill tokens.

    Runs of characters other than ``a-z``, ``0-9``, ``+`` and ``#`` become a
    single space, and leading/trailing spaces are removed.

    ``+`` and ``#`` are kept on purpose: stripping them would collapse
    language names such as "C++" or "C#" to the bare letter "c", so any
    standalone "c" in a job description would be counted as that skill.
    """
    return re.sub(r"[^a-z0-9+#]+", " ", text.lower()).strip()


def keyword_match_groups_from_query(keyword: str) -> list[list[str]]:
    """Turn a search keyword into ordered groups of acceptable title variants.

    The keyword is normalized and split into tokens. Each token yields one
    group: its KEY_VARIANTS entry when one exists, otherwise a single-item
    list holding the token itself. Group order follows token order.
    """
    return [
        KEY_VARIANTS.get(token, [token])
        for token in normalize_for_match(keyword).split()
    ]


def title_matches_keyword(title: str, keyword_groups: list[list[str]]) -> bool:
    """Check that *title* contains one variant of every group, in order.

    Matching is a substring search on the normalized title. For each group
    the earliest-occurring variant at or after the current cursor is chosen
    (the first listed variant wins ties), and the cursor then moves past it,
    so later groups must appear after earlier ones.

    Returns True when there are no groups at all.
    """
    if not keyword_groups:
        return True

    haystack = normalize_for_match(title)
    cursor = 0

    for group in keyword_groups:
        # Collect (position, variant) for every variant present past the cursor.
        hits = []
        for variant in group:
            needle = normalize_for_match(variant)
            if not needle:
                continue
            found_at = haystack.find(needle, cursor)
            if found_at != -1:
                hits.append((found_at, needle))

        if not hits:
            return False

        # min() keeps the first of equally-early hits, i.e. the earliest-listed variant.
        found_at, needle = min(hits, key=lambda hit: hit[0])
        cursor = found_at + len(needle)

    return True


def variant_matches_text(variant: str, normalized_text: str) -> bool:
    """Return True if *variant* occurs in *normalized_text* as whole tokens.

    Args:
        variant: A raw skill variant; it is normalized here before matching.
        normalized_text: Text expected to be already passed through
            ``normalize_for_skill_match``.

    A variant that normalizes to an empty string never matches. The match
    must not be directly preceded or followed by ``a-z`` / ``0-9``, which
    prevents hits inside longer words (e.g. "java" inside "javascript").
    """
    variant_norm = normalize_for_skill_match(variant)
    if not variant_norm:
        return False

    # Escape word by word and join with \s+ so a multi-word variant tolerates
    # any whitespace run between its words. The pattern body is built outside
    # the f-string: backslashes inside an f-string expression are a syntax
    # error before Python 3.12.
    body = r"\s+".join(re.escape(word) for word in variant_norm.split())
    pattern = rf"(?<![a-z0-9]){body}(?![a-z0-9])"
    return re.search(pattern, normalized_text) is not None


def extract_skills(text: str) -> dict:
    """Detect which SKILLS are mentioned in *text*.

    Returns a dict with:
        * ``matched_skills``: matched skill names joined by "|", in SKILLS order
        * ``matched_skill_count``: number of matched skills
        * one ``skill_<name>`` key per skill holding 1 (matched) or 0
    """
    haystack = normalize_for_skill_match(text)

    hits = []
    flags = {}
    for skill_name, variants in SKILLS.items():
        # A skill counts as soon as any one of its variants is found.
        found = any(variant_matches_text(variant, haystack) for variant in variants)
        flags[f"skill_{skill_name}"] = int(found)
        if found:
            hits.append(skill_name)

    summary = {
        "matched_skills": "|".join(hits),
        "matched_skill_count": len(hits),
    }
    summary.update(flags)
    return summary


def normalize_jobthai_detail_url(job_url: str) -> str:
    """Return the canonical form of a JobThai job URL.

    The ``/company/job/`` path form is rewritten to ``/job/`` and any params,
    query string and fragment are dropped. A falsy *job_url* yields "".
    """
    if not job_url:
        return ""

    parsed = urlparse(job_url)

    canonical_path = parsed.path
    rewrites = (
        ("/th/company/job/", "/th/job/"),
        ("/company/job/", "/job/"),
    )
    for old, new in rewrites:
        canonical_path = canonical_path.replace(old, new)

    stripped = parsed._replace(path=canonical_path, params="", query="", fragment="")
    return urlunparse(stripped)


def clean_text(text: str) -> str:
    """Collapse every whitespace run in *text* to one space and trim the ends.

    A falsy value (``None`` or "") yields "".
    """
    if not text:
        return ""
    words = text.split()
    return " ".join(words)


def extract_salary(text: str) -> str:
    """Return the first salary-like fragment found in *text*, or "".

    Patterns are tried in priority order, and the first pattern that matches
    anywhere in the text wins, regardless of where a lower-priority pattern
    would match. The matched fragment is whitespace-normalized.
    """
    salary_patterns = (
        r"\d[\d,\s]*\s*-\s*\d[\d,\s]*\s*บาท",  # numeric range followed by "บาท"
        r"\d[\d,\s]*\s*บาท",  # single amount followed by "บาท"
        # Fixed phrases tried when no numeric amount is found.
        r"ตามโครงสร้างบริษัทฯ",
        r"ตามประสบการณ์",
        r"ตามตกลง",
    )

    # Lazy generator: later patterns are only searched if earlier ones miss.
    attempts = (re.search(pattern, text) for pattern in salary_patterns)
    first_hit = next((hit for hit in attempts if hit), None)

    return clean_text(first_hit.group(0)) if first_hit else ""


def extract_posted_date(text: str) -> str:
    """Return the first "<day> <month> <2-digit year>" fragment in *text*, or "".

    The month part is any run of Thai or Latin letters and dots, so a value
    such as "17 ก.พ. 69" is recognised.
    """
    found = re.search(r"\b\d{1,2}\s+[ก-๙A-Za-z\.]+\s+\d{2}\b", text)
    if not found:
        return ""
    return clean_text(found.group(0))


def pick_text(parent, selectors: list[str]) -> str:
    """Return the cleaned text of the first selector that yields non-empty text.

    Selectors are tried in order with ``parent.select_one``; a selector that
    finds nothing, or finds an element without text, is skipped. Returns ""
    when no selector produces text.
    """
    for css in selectors:
        node = parent.select_one(css)
        if not node:
            continue

        content = clean_text(node.get_text(" ", strip=True))
        if content:
            return content

    return ""


def guess_location(lines: list[str], title: str, company: str, salary: str) -> str:
    """Guess which of a card's text lines is the job location.

    Lines equal to the title, company or salary are never candidates for the
    keyword passes. Heuristics, in priority order:

    1. the first line containing an administrative-area keyword;
    2. otherwise the first line containing a transit-system keyword;
    3. otherwise the closest line above the salary line that is neither the
       title nor the company.

    Returns "" when nothing qualifies.
    """
    priority_keywords = ["เขต", "กรุงเทพ", "จังหวัด", "อำเภอ", "อ.", "ต."]
    transit_keywords = ["BTS", "MRT", "SRT", "BRT", "Airport Rail Link"]

    known_fields = {title, company, salary}
    candidates = [line for line in lines if line not in known_fields]

    # All candidates are checked against the priority keywords before any
    # transit keyword is considered.
    for keywords in (priority_keywords, transit_keywords):
        hit = next(
            (line for line in candidates if any(keyword in line for keyword in keywords)),
            None,
        )
        if hit is not None:
            return hit

    if salary and salary in lines:
        # Walk upwards from the line just above the first salary occurrence.
        above_salary = lines[:lines.index(salary)]
        for candidate in reversed(above_salary):
            if candidate not in {title, company}:
                return candidate

    return ""


def parse_card_from_title(title_node, page_num: int, keyword: str) -> dict:
    """Build one result row from a job card's title element.

    Args:
        title_node: The parsed title element of a search-result card. Its
            nearest enclosing ``<a href>`` is treated as the whole card; when
            there is none, the title element itself is used as the card.
        page_num: Search-result page the card was found on.
        keyword: Search keyword that produced the card.

    Returns:
        A dict with keys ``keyword``, ``page``, ``job_title``, ``company``,
        ``location``, ``salary``, ``posted_date``, ``job_url`` and
        ``raw_text``. ``job_url`` is "" when the card carries no link, so
        callers can skip such rows.
    """
    title = clean_text(title_node.get_text(" ", strip=True))

    anchor = title_node.find_parent("a", href=True)
    href = (anchor.get("href", "") or "").strip() if anchor else ""
    if href:
        absolute_url = href if href.startswith("http") else f"https://www.jobthai.com{href}"
        job_url = normalize_jobthai_detail_url(absolute_url)
    else:
        # Without a link there is no job page; leaving this empty (instead of
        # falling back to the bare site root) lets the caller drop the row.
        job_url = ""

    card = anchor if anchor is not None else title_node

    # Preferred source for each field: dedicated elements inside the card.
    company = pick_text(card, [
        'span[id^="job-list-company-name-"]',
        'h2.ohgq7e-0.enAWkF',
    ])

    location = pick_text(card, [
        "h3#location-text",
        "h3.location-text",
    ])

    salary = pick_text(card, [
        "span.salary-text",
        "div.msklqa-20",
        "div.msklqa-17",
    ])

    posted_date = pick_text(card, [
        "span.msklqa-9",
    ])

    raw_lines = [clean_text(x) for x in card.get_text("\n", strip=True).splitlines() if clean_text(x)]
    raw_text = clean_text(" ".join(raw_lines))

    # Fallbacks: recover missing fields from the card's plain text.
    if not salary:
        salary = extract_salary(raw_text)
    if not posted_date:
        posted_date = extract_posted_date(raw_text)
    if not location:
        location = guess_location(raw_lines, title=title, company=company, salary=salary)

    return {
        "keyword": keyword,
        "page": page_num,
        "job_title": title,
        "company": company,
        "location": location,
        "salary": salary,
        "posted_date": posted_date,
        "job_url": job_url,
        "raw_text": raw_text,
    }


def extract_detail_from_job_page(job_url: str, headers: dict) -> dict:
    """Fetch one job detail page and extract province, texts and skill flags.

    Args:
        job_url: URL of the job detail page.
        headers: HTTP headers sent with the request.

    Returns:
        A dict with ``province_code``, ``province_name``, ``job_detail_text``,
        ``job_qualification_text``, ``matched_skills``,
        ``matched_skill_count`` and one ``skill_<name>`` flag per skill.
        When the page cannot be fetched, the same keys are returned with
        empty strings and zeros, and the failure is printed.
    """
    base_detail = {
        "province_code": "",
        "province_name": "",
        "job_detail_text": "",
        "job_qualification_text": "",
        "matched_skills": "",
        "matched_skill_count": 0,
        **{column: 0 for column in SKILL_COLUMNS},
    }

    try:
        response = requests.get(job_url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Best-effort: one unreachable page must not abort the whole run, but
        # report it so all-zero skill flags are not mistaken for "no skills".
        print(f"[Detail] Failed to fetch {job_url}: {exc}")
        return base_detail

    soup = BeautifulSoup(response.text, "html.parser")

    # The province is read from the first tag link whose href carries a
    # numeric "province" query parameter and whose tag has visible text.
    province_code = ""
    province_name = ""
    for anchor in soup.select('a[href*="province="]'):
        tag = anchor.select_one('h3[id^="job-detail-tag-"]')
        if not tag:
            continue

        href = anchor.get("href", "")
        name = clean_text(tag.get_text(" ", strip=True))
        if not href or not name:
            continue

        province_value = parse_qs(urlparse(href).query).get("province", [""])[0]
        if not province_value or not province_value.isdigit():
            continue

        try:
            province_code = normalize_province_code(province_value)
        except ValueError:
            # e.g. "0" is numeric but not a valid code; try the next link.
            continue

        province_name = name
        break

    jd_node = soup.select_one("span#job-detail")
    job_detail_text = clean_text(jd_node.get_text("\n", strip=True)) if jd_node else ""

    qualification_node = soup.select_one("#job-properties-wrapper")
    job_qualification_text = clean_text(qualification_node.get_text(" ", strip=True)) if qualification_node else ""

    # Skills are searched in the description and qualification text together.
    combined_text = " ".join([text for text in [job_detail_text, job_qualification_text] if text])
    skill_info = extract_skills(combined_text)

    return {
        "province_code": province_code,
        "province_name": province_name,
        "job_detail_text": job_detail_text,
        "job_qualification_text": job_qualification_text,
        **skill_info,
    }


def scrape_job_jobthai(
    SEARCH_URLS: dict[str, list[list[str]]],
    SLEEP_SEC: float = 0.5,
) -> pd.DataFrame:
    """Scrape JobThai search results and job detail pages into one DataFrame.

    Args:
        SEARCH_URLS: Mapping of platform name to ``[keyword, search_url]``
            pairs; only the ``"JobThai"`` entry is read. Each search URL must
            contain the literal ``page=1``, which is rewritten per page.
        SLEEP_SEC: Pause in seconds after each listing page and after each
            detail page; no pause when not positive.

    Returns:
        One row per unique ``job_url`` with listing fields, detail fields and
        skill flags. An empty DataFrame when nothing was collected.

    Any exception raised while scraping is caught and printed; frames already
    completed for earlier keywords are still returned.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        "Accept-Language": "th-TH,th;q=0.9,en-US;q=0.8,en;q=0.7",
    }

    # One DataFrame per keyword that produced at least one row.
    collected_frames = []

    try:
        for job in SEARCH_URLS["JobThai"]:
            keyword = job[0]
            search_url = job[1]

            keyword_groups = keyword_match_groups_from_query(keyword)
            print(f"Scraping JobThai for '{keyword}'")

            all_rows = []
            # Detail URLs already accepted for this keyword (de-duplication).
            seen_urls = set()

            # Listing phase: pages 1..49 at most.
            for page_no in range(1, 50):
                page_url = search_url.replace("page=1", f"page={page_no}")
                print(f"\tFetching page {page_no}")

                response = requests.get(page_url, headers=headers, timeout=30)
                response.raise_for_status()

                # The final (possibly redirected) URL signals an empty result.
                if "nodata=true" in response.url.lower():
                    print("No data found for this keyword.")
                    break

                soup = BeautifulSoup(response.text, "html.parser")
                title_cards_html = soup.select('h2[id^="job-card-item-"]')

                page_rows = []
                for title_card_html in title_cards_html:
                    row = parse_card_from_title(
                        title_card_html,
                        page_num=page_no,
                        keyword=keyword,
                    )

                    # Keep only linked cards whose title matches the keyword
                    # and whose URL has not been seen yet.
                    if not row["job_url"]:
                        continue
                    if not title_matches_keyword(row["job_title"], keyword_groups):
                        continue
                    if row["job_url"] in seen_urls:
                        continue

                    seen_urls.add(row["job_url"])
                    page_rows.append(row)

                # Stop paging at the first page that contributes no new row,
                # even if that page did contain (non-matching) cards.
                if not page_rows:
                    break

                all_rows.extend(page_rows)

                if SLEEP_SEC > 0:
                    time.sleep(SLEEP_SEC)

            total_details = len(all_rows)
            print(f"[Detail] Starting detail extraction for {total_details} jobs")

            # Detail phase: enrich every listing row in place.
            for row in all_rows:
                detail_info = extract_detail_from_job_page(row["job_url"], headers=headers)
                row.update(detail_info)

                if SLEEP_SEC > 0:
                    time.sleep(SLEEP_SEC)

            job_df = pd.DataFrame(all_rows)
            if job_df.empty:
                continue

            ordered_columns = [
                "keyword",
                "province_code",
                "province_name",
                "page",
                "job_title",
                "company",
                "location",
                "salary",
                "posted_date",
                "job_url",
                "raw_text",
                "job_detail_text",
                "job_qualification_text",
                "matched_skills",
                "matched_skill_count",
                *SKILL_COLUMNS,
            ]
            # Guarantee a stable schema: add any missing column with a
            # type-appropriate default ("" for text, 0 for counts/flags).
            for column in ordered_columns:
                if column not in job_df.columns:
                    job_df[column] = "" if column not in {"matched_skill_count", *SKILL_COLUMNS} else 0

            job_df = job_df[ordered_columns].drop_duplicates(subset=["job_url"])
            collected_frames.append(job_df)
            print(f"[Done] Collected {len(job_df)} rows for '{keyword}'")

    except Exception as e:
        # Rows of the keyword in progress are not kept; only frames appended
        # before the failure are returned.
        print(f"Error occurred on JobThai scraping: {e}")
        print("Skipping JobThai and returning collected data so far.")

    if not collected_frames:
        return pd.DataFrame()

    final_df = pd.concat(collected_frames, ignore_index=True)
    # A job found under several keywords is kept once (first occurrence).
    final_df = final_df.drop_duplicates(subset=["job_url"])
    return final_df

In [79]:
# Run the JobThai scraper over the configured search URLs; the trailing bare
# name makes the notebook display the resulting DataFrame.
jobs_df = scrape_job_jobthai(SEARCH_URLS)
jobs_df

Keyword Groups for 'Data Scientist': [['data'], ['scientist', 'science', 'scien', 'scient']]
Scraping JobThai for 'Data Scientist'
	Fetching page 1: found cards=20
	Fetching page 2: found cards=20
[Detail] Starting detail extraction for 8 jobs
[Done] Collected 8 rows for 'Data Scientist'
Keyword Groups for 'Data Analyst': [['data'], ['analyst', 'analytics', 'analysis']]
Scraping JobThai for 'Data Analyst'
	Fetching page 1: found cards=20
	Fetching page 2: found cards=20
	Fetching page 3: found cards=20
	Fetching page 4: found cards=20
	Fetching page 5: found cards=20
[Detail] Starting detail extraction for 61 jobs
[Done] Collected 61 rows for 'Data Analyst'
Keyword Groups for 'Data Engineer': [['data'], ['engineer', 'engineering', 'eng']]
Scraping JobThai for 'Data Engineer'
	Fetching page 1: found cards=20
	Fetching page 2: found cards=20
	Fetching page 3: found cards=20
[Detail] Starting detail extraction for 27 jobs
[Done] Collected 27 rows for 'Data Engineer'


Unnamed: 0,keyword,province_code,province_name,page,job_title,company,location,salary,posted_date,job_url,...,skill_docker,skill_kubernetes,skill_mlflow,skill_fastapi,skill_flask,skill_streamlit,skill_statistics,skill_git,skill_api,skill_linux
0,Data Scientist,01,กรุงเทพมหานคร,1,Data Scientist,บริษัท จี.เอ็ม.เอส. คอร์เปอเรชั่น จำกัด,เขตยานนาวา กรุงเทพมหานคร,"25,000 - 30,000 บาท",17 ก.พ. 69,https://www.jobthai.com/th/job/1822474,...,0,0,0,0,0,0,0,0,0,0
1,Data Scientist,28,จ.ปทุมธานี,1,Data Scientist,บริษัท ดูโฮม จำกัด (มหาชน),อ.เมืองปทุมธานี จ.ปทุมธานี,ตามโครงสร้างบริษัท/ประสบการณ์,18 ก.พ. 69,https://www.jobthai.com/th/job/1533922,...,0,0,0,0,0,0,0,0,0,0
2,Data Scientist,01,กรุงเทพมหานคร,1,Data Scientist,BSL Clinic บีเอสแอลคลินิก Bangkok skin and las...,BTS ศาลาแดง,ตามตกลง,18 ก.พ. 69,https://www.jobthai.com/th/job/1867086,...,0,0,0,0,0,0,1,0,0,0
3,Data Scientist,01,กรุงเทพมหานคร,1,Data Scientist,"Digital Dialogue Co., Ltd.","MRT พระราม 9, ศูนย์วัฒนธรรมแห่งประเทศไทย",ตามโครงสร้างบริษัทฯ,18 ก.พ. 69,https://www.jobthai.com/th/job/1685397,...,0,0,0,0,0,0,0,0,0,0
4,Data Scientist,01,กรุงเทพมหานคร,1,Specialist - DATA Scientist,บริษัท เคซีจี คอร์ปอเรชั่น จำกัด (มหาชน) / KCG...,"BTS บางจาก, ปุณณวิถี",ตามโครงสร้างบริษัทฯ,18 ก.พ. 69,https://www.jobthai.com/th/job/1848401,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,Data Engineer,01,กรุงเทพมหานคร,2,Senior Data Engineer - WFH 95% + Office 5%,"THiNKNET Co., Ltd.","BTS ช่องนนทรี, ศาลาแดง","40,000 - 100,000 บาท",18 ก.พ. 69,https://www.jobthai.com/th/job/1446835,...,1,1,0,0,0,0,0,1,0,0
91,Data Engineer,49,จ.ระยอง,2,Data Center Extra-Low Voltage Engineer,"RENRUI SERVICES RECRUITMENT (THAILAND) CO., LTD.",อ.ปลวกแดง จ.ระยอง,"100,000 - 350,000 บาท",11 ก.พ. 69,https://www.jobthai.com/th/job/1811128,...,0,0,0,0,0,0,0,0,0,0
92,Data Engineer,01,กรุงเทพมหานคร,2,Data Center HVAC & CFD Simulation Engineer,"RENRUI SERVICES RECRUITMENT (THAILAND) CO., LTD.",MRT พระราม 9,"100,000 - 350,000 บาท",11 ก.พ. 69,https://www.jobthai.com/th/job/1811152,...,0,0,0,0,0,0,0,0,0,0
93,Data Engineer,01,กรุงเทพมหานคร,2,ฝึกงาน ตำแหน่ง Data Engineer - WFH 95% + Offic...,"THiNKNET Co., Ltd.","BTS ช่องนนทรี, ศาลาแดง",400 บาท (รายวัน),18 ก.พ. 69,https://www.jobthai.com/th/job/1813949,...,0,0,0,0,0,0,0,0,0,0
