In [0]:
# 1. Cleaning + Extract core JD
import re
from typing import Optional

# Case-insensitive, word-bounded regexes for phrases that signal the start of
# site boilerplate (footer, legal text, related links). extract_core_jd cuts
# the text at the earliest qualifying hit.
# NOTE(review): "share this job" and "saved jobs" are also listed in
# NOISE_PHRASES, and extract_core_jd strips noise phrases before it searches
# for end markers, so those two markers are effectively dead there - confirm
# which list they belong in.
END_MARKERS = [
    rf"\b{phrase}\b"
    for phrase in (
        "about our internships",
        "about ing",
        "related content",
        "cookie statement",
        "privacy",
        "diversity",
        "equal opportunity",
        "load more",
        "sign up",
        "email address",
        "share this job",
        "saved jobs",
        "explore the area",
        "view all of our available opportunities",
    )
]

# Phrases that are usually not part of the core JD. They are deleted in place
# inside the text; the line that contains them is kept.
NOISE_PHRASES = [
    rf"\b{phrase}\b"
    for phrase in (
        "apply now",
        "save for later",
        "scroll down",
        "share this job",
        "saved jobs",
        "your place of work",
        "more for you",
        "open in gmail",
        "linkedin",
    )
]

def normalize_text(t: str) -> str:
    """Normalize line endings and whitespace of a raw description.

    Args:
        t: Raw text; ``None`` is treated as an empty string and any other
            non-string value is converted with ``str()``.

    Returns:
        The text with ``\\r\\n`` / ``\\r`` converted to ``\\n``, runs of spaces
        and tabs collapsed to one space, spaces next to a line break removed,
        three or more consecutive line breaks reduced to two, and leading /
        trailing whitespace stripped.
    """
    t = "" if t is None else str(t)
    # Works even if the input is one long line.
    t = t.replace("\r\n", "\n").replace("\r", "\n")
    t = re.sub(r"[ \t]+", " ", t)
    # Drop the (at most one) space left on either side of a line break.
    # Without this, whitespace-only lines such as "\n \n" are not seen as blank
    # lines: they escape the collapse below and hide paragraph breaks from
    # anything that later splits on "\n\n".
    t = re.sub(r" ?\n ?", "\n", t)
    t = re.sub(r"\n{3,}", "\n\n", t)
    return t.strip()

def remove_noise_phrases(text: str, phrases: Optional[list] = None) -> str:
    """Delete noise phrases from the text while keeping its line structure.

    Args:
        text: Text to clean.
        phrases: Regex patterns to delete (matched case-insensitively).
            Defaults to the module-level ``NOISE_PHRASES``.

    Returns:
        The text with every match replaced by a space, the leftover spacing
        tidied, and leading / trailing whitespace stripped.
    """
    if phrases is None:
        phrases = NOISE_PHRASES

    t = text
    for p in phrases:
        t = re.sub(p, " ", t, flags=re.IGNORECASE)

    # Clean up only the whitespace the removals left behind. Collapsing every
    # run of whitespace to one space would also flatten paragraph breaks
    # ("\n\n") and any line break that touches a space.
    t = re.sub(r"[^\S\n]*\n[^\S\n]*", "\n", t)  # trim spacing around line breaks
    t = re.sub(r"[^\S\n]{2,}", " ", t)          # collapse runs of horizontal whitespace
    t = re.sub(r"\n{3,}", "\n\n", t)            # at most one blank line in a row
    return t.strip()

def find_end_after_min(text_lc: str, patterns, min_pos: int) -> Optional[int]:
    """Return the earliest offset at or after ``min_pos`` where any pattern matches.

    For each pattern only its first match starting at or after ``min_pos``
    counts; the smallest of those offsets is returned, or ``None`` when no
    pattern qualifies. Matching is case-insensitive.
    """
    earliest = None
    for pattern in patterns:
        first_hit = next(
            (
                match.start()
                for match in re.finditer(pattern, text_lc, flags=re.IGNORECASE)
                if match.start() >= min_pos
            ),
            None,
        )
        if first_hit is None:
            continue
        if earliest is None or first_hit < earliest:
            earliest = first_hit
    return earliest

def extract_core_jd(
    text: str,
    min_end_pos: int = 800,     # ignore END_MARKERS that appear too early (menu/header)
    min_core_chars: int = 400,  # if the cut result is too short, do not cut
    max_chars: int = 12000
) -> str:
    """Extract the core job-description text from a raw page dump.

    The text is normalized, noise phrases are removed, and the result is cut
    at the earliest END_MARKERS hit located at or after ``min_end_pos``. The
    text always starts from the beginning; only the tail is trimmed.

    Args:
        text: Raw description text (``None`` yields ``""``).
        min_end_pos: Smallest offset at which an end marker is honoured.
        min_core_chars: If the cut text is shorter than this, the marker is
            treated as a false positive and the uncut text is used.
        max_chars: Hard cap on the length of the returned text.

    Returns:
        The cleaned and truncated text, or ``""`` when nothing is left.
    """
    t = normalize_text(text)
    if not t:
        return ""

    # Important: remove noise phrases without dropping whole lines.
    t = remove_noise_phrases(t)
    if not t:
        return ""

    # Search ``t`` itself rather than ``t.lower()``: lowercasing can change the
    # string length (e.g. U+0130), which would shift the returned offset
    # relative to ``t``. find_end_after_min already matches case-insensitively.
    end = find_end_after_min(t, END_MARKERS, min_pos=min_end_pos)

    # ``is not None`` so that a legitimate hit at offset 0 is not mistaken for
    # "no marker found".
    core = t[:end].strip() if end is not None else t.strip()

    # Fallback: if the cut left too little text, assume the END_MARKER hit was
    # a false positive and keep everything.
    if len(core) < min_core_chars:
        core = t.strip()

    if len(core) > max_chars:
        core = core[:max_chars]

    return core

In [0]:
# Take the one sample row used earlier: the third of the first three rows
# that have a description.
# NOTE(review): there is no ordering, so which row ends up at index 2 is not
# pinned down by this query - confirm that is acceptable for a spot check.
row = (
    spark.table("skills_intelligence.02_silver.financial_sector_data_or_bi_analyst")
    .select("Company", "description_text")
    .where("description_text is not null")
    .take(3)[2]
)

raw = row["description_text"]
core = extract_core_jd(raw)

print("RAW length:", len(raw))
print("CORE length:", len(core))
print("\n--- CORE preview ---\n")
print(core[:6000], "...")

In [0]:
# Step 1: Imports + cleaning and chunking functions
import re
from typing import Optional, List
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, ArrayType

# ----------------------------
# CLEANING (no START_MARKERS)
# ----------------------------
# Case-insensitive, word-bounded regexes for phrases that signal the start of
# site boilerplate; extract_core_jd cuts the text at the earliest qualifying hit.
# NOTE(review): "share this job" and "saved jobs" are also listed in
# NOISE_PHRASES, and extract_core_jd strips noise phrases before it searches
# for end markers, so those two markers are effectively dead there - confirm
# which list they belong in.
END_MARKERS = [
    rf"\b{phrase}\b"
    for phrase in (
        "about our internships",
        "about ing",
        "related content",
        "cookie statement",
        "privacy",
        "diversity",
        "equal opportunity",
        "load more",
        "sign up",
        "email address",
        "share this job",
        "saved jobs",
        "explore the area",
        "view all of our available opportunities",
    )
]

# Phrases that are deleted in place inside the text (the surrounding line is kept).
NOISE_PHRASES = [
    rf"\b{phrase}\b"
    for phrase in (
        "apply now",
        "save for later",
        "scroll down",
        "share this job",
        "saved jobs",
        "your place of work",
        "more for you",
        "linkedin",
    )
]

def normalize_text(t: str) -> str:
    """Normalize line endings and whitespace of a raw description.

    Args:
        t: Raw text; ``None`` is treated as an empty string and any other
            non-string value is converted with ``str()``.

    Returns:
        The text with ``\\r\\n`` / ``\\r`` converted to ``\\n``, runs of spaces
        and tabs collapsed to one space, spaces next to a line break removed,
        three or more consecutive line breaks reduced to two, and leading /
        trailing whitespace stripped.
    """
    t = "" if t is None else str(t)
    t = t.replace("\r\n", "\n").replace("\r", "\n")
    t = re.sub(r"[ \t]+", " ", t)
    # Drop the (at most one) space left on either side of a line break.
    # Without this, whitespace-only lines such as "\n \n" are not seen as blank
    # lines: they escape the collapse below and hide paragraph breaks from
    # anything that later splits on "\n\n".
    t = re.sub(r" ?\n ?", "\n", t)
    t = re.sub(r"\n{3,}", "\n\n", t)
    return t.strip()

def remove_noise_phrases(text: str, phrases: Optional[list] = None) -> str:
    """Delete noise phrases from the text while keeping its line structure.

    Args:
        text: Text to clean.
        phrases: Regex patterns to delete (matched case-insensitively).
            Defaults to the module-level ``NOISE_PHRASES``.

    Returns:
        The text with every match replaced by a space, the leftover spacing
        tidied, and leading / trailing whitespace stripped.
    """
    if phrases is None:
        phrases = NOISE_PHRASES

    t = text
    for p in phrases:
        t = re.sub(p, " ", t, flags=re.IGNORECASE)

    # Clean up only the whitespace the removals left behind. Collapsing every
    # run of whitespace to one space would also flatten paragraph breaks
    # ("\n\n"), which the chunker's paragraph split relies on.
    t = re.sub(r"[^\S\n]*\n[^\S\n]*", "\n", t)  # trim spacing around line breaks
    t = re.sub(r"[^\S\n]{2,}", " ", t)          # collapse runs of horizontal whitespace
    t = re.sub(r"\n{3,}", "\n\n", t)            # at most one blank line in a row
    return t.strip()

def find_end_after_min(text_lc: str, patterns, min_pos: int) -> Optional[int]:
    """Return the earliest offset at or after ``min_pos`` where any pattern matches.

    For each pattern only its first match starting at or after ``min_pos``
    counts; the smallest of those offsets is returned, or ``None`` when no
    pattern qualifies. Matching is case-insensitive.
    """
    earliest = None
    for pattern in patterns:
        first_hit = next(
            (
                match.start()
                for match in re.finditer(pattern, text_lc, flags=re.IGNORECASE)
                if match.start() >= min_pos
            ),
            None,
        )
        if first_hit is None:
            continue
        if earliest is None or first_hit < earliest:
            earliest = first_hit
    return earliest

def extract_core_jd(
    text: str,
    min_end_pos: int = 800,
    min_core_chars: int = 400,
    max_chars: int = 12000
) -> str:
    """Extract the core job-description text from a raw page dump.

    The text is normalized, noise phrases are removed, and the result is cut
    at the earliest END_MARKERS hit located at or after ``min_end_pos``.

    Args:
        text: Raw description text (``None`` yields ``""``).
        min_end_pos: Smallest offset at which an end marker is honoured, so
            that markers in a leading menu/header are ignored.
        min_core_chars: If the cut text is shorter than this, the marker is
            treated as a false positive and the uncut text is used.
        max_chars: Hard cap on the length of the returned text.

    Returns:
        The cleaned and truncated text, or ``""`` when nothing is left.
    """
    t = normalize_text(text)
    if not t:
        return ""

    t = remove_noise_phrases(t)
    if not t:
        return ""

    # Search ``t`` itself rather than ``t.lower()``: lowercasing can change the
    # string length (e.g. U+0130), which would shift the returned offset
    # relative to ``t``. find_end_after_min already matches case-insensitively.
    end = find_end_after_min(t, END_MARKERS, min_pos=min_end_pos)

    # ``is not None`` so that a legitimate hit at offset 0 is not mistaken for
    # "no marker found".
    core = t[:end].strip() if end is not None else t.strip()

    # Fallback when the cut left too little text (marker sat in the header).
    if len(core) < min_core_chars:
        core = t.strip()

    if len(core) > max_chars:
        core = core[:max_chars]

    return core


# ----------------------------
# CHUNKING (per-row)
# ----------------------------
def _split_blocks(text: str) -> List[str]:
    """Split text into blocks of consecutive bullet or non-bullet lines.

    The text is normalized, divided into paragraphs on blank lines, and each
    paragraph is divided again wherever its lines switch between bullet lines
    (``-``, ``•``, ``*`` or ``1.`` / ``1)`` prefixes) and plain lines. The
    lines of one block are joined with single spaces.

    Returns:
        The blocks in document order; an empty list for empty input.
    """
    cleaned = normalize_text(text)
    if not cleaned:
        return []

    match_bullet = re.compile(r"^(\s*[-•*]\s+|\s*\d+[\.\)]\s+)").match
    result: List[str] = []

    for paragraph in cleaned.split("\n\n"):
        # Keep only the non-empty, stripped lines of this paragraph.
        para_lines = [s for s in (raw.strip() for raw in paragraph.split("\n")) if s]
        if not para_lines:
            continue

        run: List[str] = []
        run_is_bullets = match_bullet(para_lines[0]) is not None

        for line in para_lines:
            line_is_bullet = match_bullet(line) is not None
            # A switch between bullet and plain lines closes the current run.
            if run and line_is_bullet != run_is_bullets:
                result.append(" ".join(run).strip())
                run = []
                run_is_bullets = line_is_bullet
            run.append(line)

        if run:
            result.append(" ".join(run).strip())

    return result

def chunk_job_description(
    text: str,
    chunk_size: int = 1500,
    overlap: int = 200,
    min_chunk_chars: int = 300
) -> List[str]:
    """Pack the blocks of a job description into overlapping text chunks.

    Blocks from ``_split_blocks`` are appended to the current chunk while it
    stays within ``chunk_size``. When the next block does not fit, the current
    chunk is emitted and the new chunk starts with the last ``overlap``
    characters of the previous one followed by that block.

    A single block longer than ``chunk_size`` (for example text that arrived
    as one long line) is first cut on word boundaries into smaller pieces, so
    it no longer comes out as one oversized chunk.

    Args:
        text: Text to chunk.
        chunk_size: Target maximum number of characters per chunk.
        overlap: Number of trailing characters repeated at the start of the
            next chunk; values <= 0 disable the overlap.
        min_chunk_chars: Chunks shorter than this are dropped.

    Returns:
        The chunks in document order; an empty list when there is no text.
    """
    blocks = _split_blocks(text)
    if not blocks:
        return []

    # Pieces cut from an oversized block are sized so that
    # "overlap tail + space + piece" still fits within chunk_size.
    room = chunk_size - max(overlap, 0) - 1
    piece_limit = room if room > 0 else max(1, chunk_size)

    pieces: List[str] = []
    for b in blocks:
        if len(b) > chunk_size:
            pieces.extend(_split_oversized_block(b, piece_limit))
        else:
            pieces.append(b)

    chunks: List[str] = []
    current = ""

    def flush(cur: str):
        # Emit the chunk unless it is too short to be useful.
        cur = cur.strip()
        if len(cur) >= min_chunk_chars:
            chunks.append(cur)

    for b in pieces:
        if not current:
            current = b
            continue

        if len(current) + 1 + len(b) <= chunk_size:
            current = current + " " + b
        else:
            flush(current)
            tail = current[-overlap:] if overlap > 0 else ""
            current = (tail + " " + b).strip()

    flush(current)
    return chunks


def _split_oversized_block(block: str, limit: int) -> List[str]:
    """Cut ``block`` into pieces of at most ``limit`` characters, breaking on spaces.

    A single token longer than ``limit`` is hard-cut. ``limit`` must be >= 1.
    """
    if len(block) <= limit:
        return [block]

    pieces: List[str] = []
    current = ""
    for word in block.split(" "):
        # A token that cannot fit in any piece is sliced into limit-sized parts.
        while len(word) > limit:
            if current:
                pieces.append(current)
                current = ""
            pieces.append(word[:limit])
            word = word[limit:]
        if not word:
            continue
        if not current:
            current = word
        elif len(current) + 1 + len(word) <= limit:
            current = current + " " + word
        else:
            pieces.append(current)
            current = word

    if current:
        pieces.append(current)
    return pieces


# ----------------------------
# Spark UDF wrappers
# ----------------------------
@F.udf(returnType=StringType())
def extract_core_udf(x):
    """Spark UDF: run extract_core_jd on one description with its default settings."""
    return extract_core_jd(x)


@F.udf(returnType=ArrayType(StringType()))
def chunk_udf(x):
    """Spark UDF: split one cleaned description into a list of text chunks."""
    return chunk_job_description(x, chunk_size=1500, overlap=200, min_chunk_chars=300)

In [0]:
# Step 2: Load the table -> clean every row

src = "skills_intelligence.02_silver.financial_sector_data_or_bi_analyst"

df = (
    spark.table(src)
    .select(F.col("Company"), F.col("description_text"))
    .where(F.col("description_text").isNotNull())
    .withColumn(
        "core_description_text",
        extract_core_udf(F.col("description_text")),
    )
)

# Optional: drop rows whose cleaned text is too short (noise)
df_clean = df.where(F.length(F.col("core_description_text")) >= 400)

display(
    df_clean
    .select(
        F.col("Company"),
        F.length(F.col("description_text")).alias("raw_len"),
        F.length(F.col("core_description_text")).alias("core_len"),
    )
    .limit(20)
)


In [0]:
# Step 3: Chunk each row -> explode into one row per chunk
df_chunks = (
    df_clean
    .select("*", chunk_udf(F.col("core_description_text")).alias("chunks"))
    .select("*", F.size(F.col("chunks")).alias("chunk_count"))
    .select("*", F.explode(F.col("chunks")).alias("chunk"))
    .select("*", F.length(F.col("chunk")).alias("chunk_len"))
    .drop("chunks")  # the array is no longer needed once exploded; saves storage
)

display(
    df_chunks
    .select("Company", "chunk_count", "chunk_len", "chunk")
    .limit(20)
)

In [0]:
# Step 4: Add metadata + chunk ids (stable)
# row_hash   = SHA-256 of Company and the cleaned description joined with "||"
# chunk_hash = SHA-256 of the chunk text
# NOTE(review): chunk_hash covers the chunk text only, so identical chunk text
# from different rows gets the same hash - confirm that is the intended id.
df_chunks2 = df_chunks.select(
    "Company",
    F.sha2(
        F.concat_ws("||", F.col("Company"), F.col("core_description_text")),
        256,
    ).alias("row_hash"),
    F.sha2(F.col("chunk"), 256).alias("chunk_hash"),
    "chunk",
    "chunk_len",
    "chunk_count",
    F.current_timestamp().alias("created_at"),
)

display(df_chunks2.limit(20))

In [0]:
# Step 5: Save the chunk rows to a table (replaces any existing contents)
target = "skills_intelligence.02_silver.financial_sector_data_or_bi_analyst_core_chunks"

df_chunks2.write.saveAsTable(target, mode="overwrite")

print("Saved to:", target)

In [0]:
# Step 6: Quick quality check - chunk volume per company
total_chunks = F.count("*").alias("total_chunks")
avg_chunk_len = F.avg("chunk_len").alias("avg_chunk_len")
distinct_rows = F.countDistinct("row_hash").alias("distinct_rows")

qc = (
    df_chunks2
    .groupBy("Company")
    .agg(total_chunks, avg_chunk_len, distinct_rows)
    .orderBy(F.col("total_chunks").desc())
)

display(qc)
