In [0]:
# Step 1 Install dependencies
# Installs the Chroma client and the OpenAI SDK into the notebook environment.
# NOTE(review): no versions are pinned, so a fresh run may pull different releases — consider pinning.
%pip install chromadb openai
# Upgrades hnswlib to the latest release — presumably for compatibility with chromadb; TODO confirm why.
%pip install -U hnswlib


In [0]:
# Step 2 Init the Chroma client and collection (persist directory is under /Volumes)
import chromadb
from chromadb.utils import embedding_functions
from pyspark.sql import functions as F
import os
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions

# Where the Chroma store lives and which collection holds the job-description chunks.
PERSIST_DIR = "/Volumes/skills_intelligence/temp/temp/chroma_skills_intelligence"
COLLECTION_NAME = "financial_sector_data_or_bi_analyst_chunks"

# Make sure the directory exists; this is a no-op when it is already there.
os.makedirs(PERSIST_DIR, exist_ok=True)

# Embedded/local mode: Chroma runs inside this Python process.
# NOTE(review): only persist_directory is set here. Confirm that the installed
# chromadb version really writes to PERSIST_DIR with this configuration; newer
# releases may need is_persistent=True or chromadb.PersistentClient. If that is
# changed, every cell that builds a client must be changed the same way.
settings = Settings(
    persist_directory=PERSIST_DIR,
    chroma_api_impl="chromadb.api.segment.SegmentAPI",
)
chroma_client = chromadb.Client(settings)

# Embeddings come from OpenAI; the API key is read from the Databricks secret scope.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    model_name="text-embedding-3-small",
    api_key=dbutils.secrets.get("openai", "api_key"),
)

collection = chroma_client.get_or_create_collection(
    embedding_function=openai_ef,
    name=COLLECTION_NAME,
)

# Sanity output: store location, collection name and current number of entries.
print("Chroma path:", PERSIST_DIR)
print("Collection:", COLLECTION_NAME)
print("Count:", collection.count())

In [0]:
# Step 3 Load the chunks that will be embedded

SRC_CHUNKS = "skills_intelligence.02_silver.financial_sector_data_or_bi_analyst_core_chunks"

# Keep rows whose chunk text is present and non-empty, then keep one row per
# chunk_hash. Important: chunk_hash is used as the Chroma ID, so it must be unique.
df = (
    spark.table(SRC_CHUNKS)
    .select("chunk_hash", "chunk", "Company", "row_hash", "chunk_len", "created_at")
    .filter(F.col("chunk").isNotNull() & (F.length("chunk") > 0))
    .dropDuplicates(["chunk_hash"])
)

print("Rows to embed:", df.count())
display(df.limit(5))

In [0]:
# Step 4 Upsert to Chroma

BATCH_SIZE = 128  # 64/128/256 also work; 128 is a stable starting point.


def _row_metadata(row):
    """Build the Chroma metadata dict for one chunk row.

    Missing values are left out instead of being stored as None or as the
    literal string "None": some chromadb versions reject None metadata values,
    and a "None" company name would look like real data downstream.
    "source_table" is always present, so the dict is never empty.
    """
    candidate = {
        "Company": None if row["Company"] is None else str(row["Company"]),
        "row_hash": None if row["row_hash"] is None else str(row["row_hash"]),
        "chunk_len": None if row["chunk_len"] is None else int(row["chunk_len"]),
        "created_at": None if row["created_at"] is None else str(row["created_at"]),
        "source_table": SRC_CHUNKS,
    }
    return {key: value for key, value in candidate.items() if value is not None}


def _flush_batch(ids, docs, metas):
    """Write one batch to the collection and return how many items were written.

    Uses upsert when the collection object offers it, otherwise falls back to add.
    An empty batch is skipped.
    """
    if not ids:
        return 0
    write = collection.upsert if hasattr(collection, "upsert") else collection.add
    write(ids=ids, documents=docs, metadatas=metas)
    return len(ids)


batch_ids, batch_docs, batch_metas = [], [], []
total = 0

# Stream rows to the driver instead of collecting the whole table at once.
rows_iter = df.toLocalIterator()

for r in rows_iter:
    batch_ids.append(str(r["chunk_hash"]))
    batch_docs.append(str(r["chunk"]))
    batch_metas.append(_row_metadata(r))

    if len(batch_ids) >= BATCH_SIZE:
        total += _flush_batch(batch_ids, batch_docs, batch_metas)
        batch_ids, batch_docs, batch_metas = [], [], []
        print("Upserted:", total)

# Flush whatever is left in the last, partially filled batch.
total += _flush_batch(batch_ids, batch_docs, batch_metas)

print("DONE. Total upserted:", total)
print("Collection count now:", collection.count())

In [0]:
# Step 5 Query test to Chroma

q = "stakeholder management communication data analysis reporting"
res = collection.query(
    query_texts=[q],
    n_results=5,
    include=["documents", "metadatas", "distances"]
)

# Walk over the hits that actually came back instead of a fixed count, so all
# n_results hits are shown and a smaller result set does not raise IndexError.
hits = zip(res["distances"][0], res["metadatas"][0], res["documents"][0])
for rank, (distance, meta, doc) in enumerate(hits, start=1):
    print("\n---", rank, "---")
    print("distance:", distance)
    print("Company :", (meta or {}).get("Company"))
    print(doc[:350], "...")

In [0]:
# Step 6 Set up the OpenAI client and load the livrea_soft_skill table

from openai import OpenAI
import json
import time
import pandas as pd
from pyspark.sql import functions as F

# Chat model used for scoring, and the OpenAI client (key read from the secret scope).
CHAT_MODEL = "gpt-4o-mini"
client = OpenAI(api_key=dbutils.secrets.get("openai", "api_key"))

SOFT_TABLE = "skills_intelligence.01_bronze.livrea_soft_skill"

# Bring the soft-skill catalogue to the driver as a pandas frame;
# rows without a soft_skill value are dropped.
pdf_soft = (
    spark.table(SOFT_TABLE)
    .filter(F.col("soft_skill").isNotNull())
    .select("soft_skill", "description")
    .toPandas()
)

pdf_soft.head()

In [0]:
# Step 7 Re-open the Chroma collection and check its count

import os
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions

# Same location and collection name as in Step 2, repeated so this cell can
# rebuild the client and collection objects by itself.
PERSIST_DIR = "/Volumes/skills_intelligence/temp/temp/chroma_skills_intelligence"
COLLECTION_NAME = "financial_sector_data_or_bi_analyst_chunks"

os.makedirs(PERSIST_DIR, exist_ok=True)

# NOTE(review): as in Step 2, only persist_directory is set. Confirm the store is
# really persisted for the installed chromadb version, and keep both cells in
# sync if the settings change.
settings = Settings(
    persist_directory=PERSIST_DIR,
    chroma_api_impl="chromadb.api.segment.SegmentAPI",
)
chroma_client = chromadb.Client(settings)

# Same OpenAI embedding function as used when the chunks were written.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    model_name="text-embedding-3-small",
    api_key=dbutils.secrets.get("openai", "api_key"),
)

collection = chroma_client.get_or_create_collection(
    embedding_function=openai_ef,
    name=COLLECTION_NAME,
)

# A count of 0 here means the scoring step would run without any evidence.
print("Chroma count:", collection.count())

In [0]:
# Step 8 Set up the System Prompt for the GPT

# System message sent with every scoring request (used by call_gpt_score).
# It tells the model to judge a soft skill only from the supplied evidence
# snippets and to answer with a single JSON object containing soft_skill,
# score (0..1), rationale and evidence_companies.
SYSTEM_PROMPT = """
You are an HR analytics rater.
Rate how relevant a given soft skill is for a Financial Sector Data/BI Analyst job market,
based ONLY on the provided evidence snippets from job descriptions.

Return ONLY valid JSON with this schema:
{
  "soft_skill": "string",
  "score": number,              // 0..1
  "rationale": "string",        // 1-2 short sentences
  "evidence_companies": ["string", ...]
}

Rules:
- score must be between 0 and 1 inclusive.
- be consistent: 0.0 = not present at all, 1.0 = strongly and repeatedly required.
- do not output anything other than JSON.
"""

In [0]:

# Step 8b Helper functions: build the evidence block and call GPT for a score

def build_evidence_block(res, max_chars_per_snippet=450):
    """Format a Chroma query result into bullet-point evidence lines.

    Only the first entry of "documents", "metadatas" and "distances" is used,
    i.e. the hits for a single query text.

    Args:
        res: Mapping with the keys "documents", "metadatas" and "distances".
            A key may be missing or hold None (for example when that field was
            not requested); missing metadata yields the company "Unknown" and
            missing distances are rendered as None.
        max_chars_per_snippet: Snippets longer than this are cut to this length
            and suffixed with "...".

    Returns:
        A tuple (evidence_text, companies): the newline-joined lines of the form
        "- (<company>) [distance=<d>] <snippet>", and the sorted list of unique
        company names seen in the hits.
    """
    # `res.get(key, default)` would still return None when the key exists with a
    # None value, so fall back with `or` instead of relying on the default.
    docs = (res.get("documents") or [[]])[0] or []
    metas = (res.get("metadatas") or [[]])[0] or []
    dists = (res.get("distances") or [[]])[0] or []

    lines = []
    companies = set()
    for i, doc in enumerate(docs):
        meta = metas[i] if i < len(metas) else None
        # str() keeps the final sort safe even if a company value is not a string.
        comp = str((meta or {}).get("Company") or "Unknown")
        companies.add(comp)

        # Collapse newlines so every snippet stays on one bullet line.
        snippet = (doc or "").strip().replace("\n", " ")
        if len(snippet) > max_chars_per_snippet:
            snippet = snippet[:max_chars_per_snippet].rstrip() + "..."

        dist = dists[i] if i < len(dists) else None
        lines.append(f"- ({comp}) [distance={dist}] {snippet}")

    return "\n".join(lines), sorted(companies)

def call_gpt_score(soft_skill: str, desc: str, evidence_text: str, companies: list, max_retries=3):
    """Ask the chat model to score one soft skill against the evidence snippets.

    Sends SYSTEM_PROMPT plus a user prompt built from the arguments to CHAT_MODEL
    through the module-level ``client``, parses the JSON reply and normalizes it.

    Args:
        soft_skill: Name of the soft skill being rated.
        desc: Description of the soft skill (may be an empty string).
        evidence_text: Pre-formatted evidence lines to show to the model.
        companies: Fallback list of company names, used when the reply carries no
            usable "evidence_companies" list.
        max_retries: Number of attempts before giving up; must be at least 1.

    Returns:
        A tuple (out, raw). ``out`` is a dict with the keys "soft_skill" (always
        the argument, not the model's echo), "score" (float clamped to [0, 1]),
        "rationale" and "evidence_companies". ``raw`` is the JSON text that was
        parsed, with any code fence removed.

    Raises:
        ValueError: If max_retries is smaller than 1.
        Exception: The error of the last attempt (API failure or a reply that
            could not be parsed) once all attempts have failed.
    """
    if max_retries < 1:
        # Without this guard the loop never runs and the function ends in `raise None`.
        raise ValueError("max_retries must be at least 1")

    user_prompt = f"""
Soft skill:
- soft_skill: {soft_skill}
- description: {desc}

Evidence snippets (top matches from job descriptions):
{evidence_text}

Provide the JSON result now.
"""

    last_err = None
    for attempt in range(1, max_retries + 1):
        try:
            resp = client.chat.completions.create(
                model=CHAT_MODEL,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": user_prompt},
                ],
                temperature=0.0
            )
            raw = (resp.choices[0].message.content or "").strip()

            # Remove a markdown code fence if the model added one. Stripping the
            # backticks alone leaves the language tag of a ```json fence behind,
            # which makes json.loads fail, so drop that tag as well. Valid JSON
            # never starts with the letters "json", so this cannot eat real data.
            raw = raw.strip("`").strip()
            if raw[:4].lower() == "json":
                raw = raw[4:].strip()
            data = json.loads(raw)

            # Enforce schema and bounds.
            score = float(data.get("score", 0.0))
            score = max(0.0, min(1.0, score))

            # Accept only a non-empty list from the model; anything else (missing,
            # empty, a bare string, ...) falls back to the retrieved companies.
            reported = data.get("evidence_companies")
            if isinstance(reported, list) and reported:
                evidence_companies = [str(name) for name in reported]
            else:
                evidence_companies = companies

            out = {
                "soft_skill": soft_skill,
                "score": score,
                "rationale": str(data.get("rationale", "")).strip(),
                "evidence_companies": evidence_companies
            }
            return out, raw

        except Exception as e:
            last_err = e
            # Linear back-off between attempts; no point waiting after the last one.
            if attempt < max_retries:
                time.sleep(1.2 * attempt)

    raise last_err

In [0]:
# Step 9 For each soft skill, get the top-K matching chunks from Chroma and let GPT
# score the skill's relevance on a 0..1 scale

TOP_K = 5
results = []  # reset on every run so re-executing the cell does not duplicate rows
n_soft_skills = len(pdf_soft)

# enumerate() supplies a positional counter for the progress message, so it does
# not depend on whatever index labels pdf_soft happens to carry.
for position, (_, r) in enumerate(pdf_soft.iterrows(), start=1):
    soft_skill = str(r["soft_skill"]).strip()
    desc = "" if pd.isna(r["description"]) else str(r["description"]).strip()

    # Skill name plus description is used as the retrieval query.
    query = f"{soft_skill}. {desc}".strip()

    res = collection.query(
        query_texts=[query],
        n_results=TOP_K,
        include=["documents", "metadatas", "distances"]
    )

    evidence_text, companies = build_evidence_block(res, max_chars_per_snippet=450)

    # raw_json is the model's parsed reply text; it is not stored at the moment.
    scored, raw_json = call_gpt_score(
        soft_skill=soft_skill,
        desc=desc,
        evidence_text=evidence_text,
        companies=companies,
        max_retries=3
    )

    # Attach provenance so each row records how and when it was produced.
    scored["top_k"] = TOP_K
    scored["query_text"] = query
    # Timestamp.utcnow() is deprecated in recent pandas; now(tz="UTC") yields the
    # same timezone-aware value.
    scored["scored_at"] = pd.Timestamp.now(tz="UTC").isoformat()
    scored["model"] = CHAT_MODEL

    results.append(scored)

    # Report every 5 skills, and once more at the end so the final count is visible.
    if position % 5 == 0 or position == n_soft_skills:
        print(f"Scored {position}/{n_soft_skills} soft skills")

In [0]:
# Step 10 Check the result

# Show the first 20 scored records (the dicts appended to `results` in Step 9) for a manual check.
results[:20]

In [0]:
# Step 11 Store the scores in the golden layer

target = "skills_intelligence.03_golden.financial_sector_data_or_bi_analyst_x_livrea_soft_skill"

# One row per scored soft skill, plus a column holding the write time.
df_out = spark.createDataFrame(pd.DataFrame(results))
df_out = df_out.withColumn("scored_at_ts", F.current_timestamp())

# Every run replaces the table contents.
df_out.write.mode("overwrite").saveAsTable(target)

print("Saved:", target)

# Read the table back and show the 20 highest-scoring soft skills.
display(spark.table(target).orderBy(F.desc("score")).limit(20))