
# HR Career Advisor — PG Vector + AWS KB (Prototype v2)

**What’s new in v2**
- Placeholder **user name/email** for quick testing (no auth token/JWT needed).
- If profile isn’t found, the agent **still answers** with best-effort results.
- For the common question *“What jobs and courses should I look at for data engineering?”*,
  we include **fallback roles/courses** if retrieval returns nothing.
- After answering, the agent **asks for profile details** (email or name+division).
- If not found, we offer a **30‑sec Quick Profile** (role, skills, interests).

Run order:
1) Configure env (or use defaults in this notebook).  
2) Execute smoke tests at bottom or call `run_workflow(...)` interactively.


## 0) Setup & Environment

In [None]:

import os, json, time
from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass
import numpy as np
from IPython.display import display, Markdown

# %pip install -q psycopg[binary] boto3 python-dotenv

from dotenv import load_dotenv
load_dotenv()

# Runtime configuration. Values come from the process environment (after
# load_dotenv() above has run), falling back to the literals shown here.
AWS_REGION = os.getenv("AWS_REGION", "us-west-2")
# NOTE(review): AWS_MODEL_ID is hard-coded (not env-overridable) and is not
# referenced by any cell visible in this notebook section.
AWS_MODEL_ID = "us.anthropic.claude-3-7-sonnet-20250219-v1:0"  # per requirement

# --- PG & KBs ---
# An empty PG_DSN disables the PG-backed lookups: get_pg_conn() raises and
# profile_lookup() returns [] when it is unset.
PG_DSN = os.getenv("PG_DSN","")  # postgresql://user:pass@host:5432/dbname (URL-encode password)
# Collection names matched against ai.langchain_pg_collection.name.
PG_COLLECTIONS = [
    "internal_private_employee_profiles_vectorstore",
    "internal_curated_informa_vectorstore",
]
# Knowledge Base ids; an empty value disables retrieval for that KB.
JOB_KB_ID = os.getenv("JOB_KB_ID","")
COURSES_KB_ID = os.getenv("COURSES_KB_ID","")

# --- Prototype placeholders (safe defaults; override via env) ---
# Identity used by setup_state() when the caller does not supply one.
DEFAULT_USER_NAME  = os.getenv("DEFAULT_USER_NAME",  "Kedar Santosh Prabhu")
DEFAULT_USER_EMAIL = os.getenv("DEFAULT_USER_EMAIL", "kedarsantosh.prabhu@informa.com")
DEFAULT_USER_DIV   = os.getenv("DEFAULT_USER_DIVISION", "")

# Status line: connection settings are reported only as present/absent
# booleans, while the placeholder name and email are printed verbatim.
print("Env present:", dict(
    PG=bool(PG_DSN),
    JOB_KB=bool(JOB_KB_ID),
    COURSES_KB=bool(COURSES_KB_ID),
    DEFAULT_USER_NAME=DEFAULT_USER_NAME,
    DEFAULT_USER_EMAIL=DEFAULT_USER_EMAIL
))


## 1) PGVector Retrieval (uses stored `embedding` + `document`)

In [None]:

import psycopg

def get_pg_conn():
    """Open and return a new psycopg connection for ``PG_DSN``.

    Raises:
        RuntimeError: if ``PG_DSN`` is empty.
    """
    if PG_DSN:
        return psycopg.connect(PG_DSN)
    raise RuntimeError("PG_DSN not set")

# NOTE: e.id does not exist in your schema — use e.uuid AS id
# Keyword pre-filter used by pg_search_hybrid(). Named parameters:
#   collection - exact collection name to search
#   query      - text that must occur (case-insensitively) in the document or
#                in the metadata rendered as text
#   k          - maximum number of rows returned
# `%%` is the escaped form of a literal `%` in a parameterised query string,
# so the pattern sent to the server is '%' || query || '%'.
# There is no ORDER BY, so which matching rows survive the LIMIT is up to the
# database. `%` and `_` inside `query` are not escaped and therefore act as
# ILIKE wildcards.
KEYWORD_PREFILTER_SQL = """SELECT e.uuid AS id, e.embedding, e.document, e.cmetadata, c.name as collection
FROM ai.langchain_pg_embedding e
JOIN ai.langchain_pg_collection c ON c.uuid = e.collection_id
WHERE c.name = %(collection)s
  AND (e.document ILIKE '%%' || %(query)s || '%%'
       OR CAST(e.cmetadata AS TEXT) ILIKE '%%' || %(query)s || '%%')
LIMIT %(k)s;
"""

def _to_meta(meta):
    if isinstance(meta,(dict,list)): return meta
    try: return json.loads(meta)
    except: return {"raw": str(meta)}

def _cosine(a: np.ndarray, b: np.ndarray) -> float:
    denom = (np.linalg.norm(a) * np.linalg.norm(b))
    if denom == 0: return 0.0
    return float(np.dot(a, b) / denom)

def pg_search_hybrid(collection: str, query: str, pre_k: int = 24, top_k: int = 8) -> List[Dict[str,Any]]:
    """Keyword-filter one collection, then rank the matches by embedding similarity.

    Up to ``pre_k`` rows are fetched with ``KEYWORD_PREFILTER_SQL``. Each row is
    then scored by the cosine similarity between its stored embedding and the
    centroid (mean) of all fetched embeddings. The query text itself is never
    embedded, so the score measures how typical a row is among the keyword
    matches, not its similarity to the query.

    Args:
        collection: collection name to search.
        query: keyword text for the ILIKE pre-filter.
        pre_k: maximum number of rows fetched from the database.
        top_k: maximum number of ranked items returned.

    Returns:
        A list of dicts with keys ``id``, ``embedding``, ``document``,
        ``metadata``, ``collection`` and ``score``, best score first. Empty
        when nothing matches or no row carries an embedding.

    Raises:
        RuntimeError: from ``get_pg_conn`` when ``PG_DSN`` is not set.
        ValueError: when an embedding cannot be converted to floats.
    """
    with get_pg_conn() as conn, conn.cursor() as cur:
        cur.execute(KEYWORD_PREFILTER_SQL, {"collection": collection, "query": query, "k": pre_k})
        rows = cur.fetchall()

    items: List[Dict[str, Any]] = []
    vectors: List[np.ndarray] = []
    for _id, emb, doc, meta, coll in rows:
        if emb is None:
            # A row without an embedding cannot be scored.
            continue
        if isinstance(emb, str):
            # When no pgvector adapter is registered on the connection the
            # driver hands the column back as text ("[0.1,0.2,...]"), which
            # np.array(..., dtype=float32) cannot convert. That text form is
            # valid JSON, so decode it first.
            emb = json.loads(emb)
        vectors.append(np.asarray(emb, dtype=np.float32))
        items.append({"id": _id, "embedding": emb, "document": doc, "metadata": _to_meta(meta), "collection": coll})

    if not items:
        return []

    centroid = np.mean(vectors, axis=0)
    # Reuse the vectors built above instead of converting every embedding twice.
    for item, vec in zip(items, vectors):
        item["score"] = _cosine(centroid, vec)
    items.sort(key=lambda x: x.get("score", 0.0), reverse=True)
    return items[:top_k]

def pg_multi_search(query: str, collections: List[str]) -> List[Dict[str,Any]]:
    """Run ``pg_search_hybrid`` over several collections and merge the hits.

    A failure in one collection is printed and skipped so the remaining
    collections still contribute. The merged hits are ordered by ``score``
    (highest first) and cut to ``max(6, len(collections))`` entries.
    """
    merged: List[Dict[str, Any]] = []
    for coll_name in collections:
        try:
            merged.extend(pg_search_hybrid(coll_name, query, 24, 8))
        except Exception as e:
            print(f"⚠️ PG search failed for {coll_name}: {e}")
    limit = max(6, len(collections))
    ranked = sorted(merged, key=lambda hit: hit.get("score", 0.0), reverse=True)
    return ranked[:limit]

# Profile lookup: prefer name+division if available; email as tiebreaker
def _rows_to_profiles(rows) -> List[Dict[str,Any]]:
    """Convert ``(document, cmetadata)`` rows into ``{"document", "metadata"}`` dicts."""
    profiles: List[Dict[str, Any]] = []
    for doc, raw_meta in rows:
        if isinstance(raw_meta, dict):
            meta = raw_meta
        else:
            try:
                meta = json.loads(raw_meta)
            except (TypeError, ValueError):
                meta = {"raw": str(raw_meta)}
            if not isinstance(meta, dict):
                # Callers use ``metadata.get(...)``; wrap scalars and lists so
                # that always works.
                meta = {"raw": str(raw_meta)}
        profiles.append({"document": doc, "metadata": meta})
    return profiles


def profile_lookup(email: Optional[str] = None,
                   name: Optional[str] = None,
                   division: Optional[str] = None) -> List[Dict[str,Any]]:
    """Find employee-profile rows by name (+ optional division) or by email.

    When ``name`` is given, rows whose metadata ``name`` contains it
    (case-insensitive) are fetched, optionally narrowed by ``division``. If
    ``email`` is also given and some of those rows carry exactly that email,
    only those rows are returned; otherwise all name matches are returned.
    When only ``email`` is given, rows with exactly that metadata ``email``
    are returned.

    Args:
        email: exact email to match (case-sensitive).
        name: substring of the profile name.
        division: substring of the profile division; ignored when empty.

    Returns:
        A list of ``{"document": ..., "metadata": dict}`` items. Empty when
        ``PG_DSN`` is unset, when neither ``name`` nor ``email`` is given, or
        when nothing matches.
    """
    # Nothing to look up: return before opening a connection.
    if not PG_DSN or not (name or email):
        return []

    with get_pg_conn() as conn, conn.cursor() as cur:
        if name:
            q_name = """
            SELECT e.document, e.cmetadata
            FROM ai.langchain_pg_embedding e
            JOIN ai.langchain_pg_collection c ON c.uuid = e.collection_id
            WHERE c.name = 'internal_private_employee_profiles_vectorstore'
              AND (e.cmetadata->>'name') ILIKE %(name)s
              AND (%(division)s IS NULL OR (e.cmetadata->>'division') ILIKE %(division)s)
            LIMIT 25;
            """
            cur.execute(q_name, {"name": f"%{name}%", "division": None if not division else f"%{division}%"})
            results = _rows_to_profiles(cur.fetchall())
            if email:
                exact = [r for r in results if r["metadata"].get("email") == email]
                if exact:
                    return exact
            return results

        q_email = """
        SELECT e.document, e.cmetadata
        FROM ai.langchain_pg_embedding e
        JOIN ai.langchain_pg_collection c ON c.uuid = e.collection_id
        WHERE c.name = 'internal_private_employee_profiles_vectorstore'
          AND (e.cmetadata->>'email') = %(email)s
        LIMIT 10;
        """
        cur.execute(q_email, {"email": email})
        return _rows_to_profiles(cur.fetchall())


## 2) AWS Knowledge Bases Retrieval

In [None]:

import boto3
# Create the Bedrock Agent Runtime client only when at least one KB id is
# configured; otherwise kb_rt is None and kb_retrieve() returns [].
try:
    kb_rt = boto3.client("bedrock-agent-runtime", region_name=AWS_REGION) if (JOB_KB_ID or COURSES_KB_ID) else None
except Exception as e:
    # Best-effort: a client-construction failure disables KB retrieval
    # instead of stopping the notebook.
    kb_rt = None
    print("⚠️ AWS KB unavailable:", e)

def kb_retrieve(kb_id: str, query: str, top_k: int = 5) -> List[Dict[str,Any]]:
    """Query one Knowledge Base through ``kb_rt.retrieve`` and flatten the results.

    Args:
        kb_id: Knowledge Base id; an empty value short-circuits to ``[]``.
        query: free-text retrieval query.
        top_k: requested ``numberOfResults``.

    Returns:
        One dict per retrieval result with keys ``title``, ``snippet``,
        ``score``, ``kb_id``, ``metadata``, ``source`` and ``type``. Returns
        ``[]`` when the client or ``kb_id`` is missing, or when the call or
        the parsing fails (the error is printed).
    """
    if not kb_id or not kb_rt:
        return []
    try:
        resp = kb_rt.retrieve(
            knowledgeBaseId=kb_id,
            retrievalConfiguration={"vectorSearchConfiguration": {"numberOfResults": top_k}},
            retrievalQuery={"text": query},
        )
        out = []
        for r in resp.get("retrievalResults") or []:
            # ``x.get(key, {})`` only covers a missing key. A key that is
            # present with value None would raise AttributeError below and
            # the outer except would then discard every result.
            content = r.get("content") or {}
            text = content.get("text") or ""
            meta = r.get("metadata") or {}
            s3_location = (r.get("location") or {}).get("s3Location") or {}
            out.append({
                # Fall back to the first line of the text (max 80 chars).
                "title": content.get("title") or text.split("\n")[0][:80].strip(),
                "snippet": content.get("snippetText") or text[:240],
                "score": r.get("score"),
                "kb_id": kb_id,
                "metadata": meta,
                "source": s3_location.get("uri"),
                "type": meta.get("type"),
            })
        return out
    except Exception as e:
        # Deliberate best-effort: retrieval problems degrade to "no results".
        print("⚠️ KB retrieve failed:", e)
        return []

def kb_search_all(query: str) -> Dict[str, List[Dict[str,Any]]]:
    """Query the jobs and courses Knowledge Bases (6 results each).

    A Knowledge Base whose id is not configured contributes an empty list.
    The result always has the keys ``"jobs"`` and ``"courses"``.
    """
    job_hits: List[Dict[str, Any]] = []
    if JOB_KB_ID:
        job_hits = kb_retrieve(JOB_KB_ID, query, 6)
    course_hits: List[Dict[str, Any]] = []
    if COURSES_KB_ID:
        course_hits = kb_retrieve(COURSES_KB_ID, query, 6)
    return {"jobs": job_hits, "courses": course_hits}


## 3) Prohibitor, State, Intent, Onboarding Helpers

In [None]:

# Intent labels the workflow recognises; intent_persona() drops any label
# that is not in this set.
AllowedIntents = {"courses","job","development_plan","manager_toolkit","leadership_strategy","career"}

def prohibitor(user_text: str) -> Dict[str,Any]:
    """Scope gate: decide whether a prompt is in scope and tag its intents.

    Matching is a case-insensitive substring test against fixed keyword
    lists. A prompt is allowed when it triggers at least one intent or hits a
    general scope keyword. A prompt that is in scope but triggers no specific
    intent gets the generic ``"career"`` intent.

    Args:
        user_text: the raw user prompt; ``None`` is treated as empty.

    Returns:
        ``{"allowed": bool, "intents": list[str], "rationale": str}``.
        Intents appear in a fixed order: job, courses, manager_toolkit,
        leadership_strategy, development_plan.
    """
    t = (user_text or "").lower()

    scope_keywords = ("career","course","job","role","roles","learn","upskill","development",
                      "manager","leadership","okr","coaching","promotion","ladder","mentoring",
                      "objective","okrs")
    intent_keywords = (
        ("job", ("job","jobs","opening","openings","role","roles")),
        ("courses", ("course","courses","learn","training","upskill")),
        ("manager_toolkit", ("mentoring","mentor")),
        ("leadership_strategy", ("objective","okr","okrs")),
        ("development_plan", ("development plan","30-day","60-day","90-day","dev plan")),
    )

    intents = [intent for intent, words in intent_keywords if any(w in t for w in words)]
    # A recognised intent is itself proof the prompt is in scope. Previously
    # prompts such as "What training is available?" or "find me a mentor"
    # produced an intent yet were blocked, because "training" and "mentor"
    # were missing from the scope list.
    allowed = bool(intents) or any(k in t for k in scope_keywords)
    if allowed and not intents:
        intents.append("career")
    return {"allowed": allowed, "intents": intents, "rationale": "heuristic v0.2"}

@dataclass
class AgentState:
    """Per-request identity and context produced by ``setup_state``."""
    # Identity used for the profile lookup.
    email: Optional[str] = None
    name: Optional[str] = None
    division: Optional[str] = None
    # Taken from the profile metadata key "employee_id" when a profile is found.
    employee_id: Optional[str] = None
    # Caller override when provided, otherwise derived from profile metadata.
    is_manager: bool = False
    # The raw user prompt for this request.
    prompt: Optional[str] = None
    # NOTE(review): not populated by any code visible in this notebook section;
    # presumably meant for the output of quick_profile_wizard_stub().
    quick_profile: Optional[Dict[str,Any]] = None

def derive_is_manager_from_profile(meta: dict) -> bool:
    """Guess from profile metadata whether the person manages others.

    The checks run in order; the first that matches wins:
      1. ``is_manager`` is one of "true" / "1" / "yes" (case-insensitive).
      2. ``direct_reports`` parses to an integer greater than zero.
      3. ``title`` contains a management keyword (substring match).

    Args:
        meta: profile metadata; ``None`` or empty yields ``False``.

    Returns:
        ``True`` when any check matches, else ``False``.
    """
    meta = meta or {}
    if str(meta.get("is_manager", "")).lower() in {"true", "1", "yes"}:
        return True

    try:
        direct_reports = int(meta.get("direct_reports") or 0)
    except (TypeError, ValueError):
        # Values such as "n/a" must not abort the whole request.
        direct_reports = 0
    if direct_reports > 0:
        return True

    # " manager" carries a leading space, so pad the title to let a title
    # that *starts* with "Manager" (e.g. "Manager, Data Platform") match too.
    title = " " + str(meta.get("title") or "").lower()
    return any(k in title for k in (" manager", "lead", "head of", "director", "vp"))

def setup_state(email: Optional[str], name: Optional[str], division: Optional[str],
                override_is_manager: Optional[bool], user_text: str) -> Tuple[AgentState, dict]:
    """Resolve the caller's identity, look up the profile and build the state.

    The placeholder identity (``DEFAULT_USER_*``) is used only when the caller
    supplies neither ``email`` nor ``name``. Mixing placeholders into a
    partially supplied identity is wrong: an email-only caller would be
    searched under the placeholder name, and could be handed the placeholder
    user's profile.

    Args:
        email: caller's email, if known.
        name: caller's name, if known.
        division: caller's division, if known.
        override_is_manager: forces ``is_manager`` when not ``None``.
        user_text: the raw prompt, stored on the state.

    Returns:
        ``(state, meta)`` where ``meta`` is the first matching profile's
        metadata, or ``{}`` when no profile was found.
    """
    if not email and not name:
        email, name = DEFAULT_USER_EMAIL, DEFAULT_USER_NAME
        division = division or DEFAULT_USER_DIV

    rows = profile_lookup(email=email, name=name, division=division)
    meta = rows[0]["metadata"] if rows else {}
    is_mgr = override_is_manager if override_is_manager is not None else derive_is_manager_from_profile(meta)
    st = AgentState(email=email, name=name, division=division,
                    employee_id=meta.get("employee_id"),
                    is_manager=is_mgr, prompt=user_text)
    return st, meta

def intent_persona(intents: List[str]) -> List[str]:
    """Return the recognised intents, de-duplicated and sorted alphabetically.

    Labels that are not in ``AllowedIntents`` are dropped.
    """
    recognised = {label for label in intents if label in AllowedIntents}
    return sorted(recognised)

def onboarding_message() -> str:
    """Return the Markdown prompt asking the user to identify themselves.

    Two paragraphs separated by a blank line: the request for email or
    name + division, then the Quick Profile fallback note.
    """
    request = "**To tailor this to you**, please enter your corporate **email** or **name + division**."
    fallback_note = "If we can’t find you, I’ll run a 30‑sec Quick Profile wizard (role, top 5 skills, interests)."
    return "\n\n".join((request, fallback_note))

def quick_profile_wizard_stub() -> Dict[str,Any]:
    """Return a canned Quick Profile (role, skills, interests) for the prototype."""
    skills = ["Python", "SQL", "AWS", "Airflow", "Data Modeling"]
    interests = ["Data Engineering", "Platform", "Analytics"]
    return dict(role="Software Engineer", skills=skills, interests=interests)


## 4) Tools (jobs & courses) + Fallbacks + Reflexion

In [None]:

def tool_pg_search(query: str, k: int = 8) -> List[Dict[str,Any]]:
    """Search every collection in ``PG_COLLECTIONS`` and keep the first ``k`` hits."""
    hits = pg_multi_search(query, PG_COLLECTIONS)
    return hits[:k]

def tool_kb_search(query: str, top_k: int = 6) -> Dict[str, List[Dict[str,Any]]]:
    """Search the jobs and courses Knowledge Bases.

    Args:
        query: free-text retrieval query.
        top_k: maximum number of hits kept per section. ``kb_search_all``
            requests 6 results per Knowledge Base, so larger values cannot
            return more than that.

    Returns:
        The mapping returned by ``kb_search_all`` with each list truncated to
        ``top_k`` entries.
    """
    # ``top_k`` used to be accepted and silently ignored; honour it by
    # truncating. The default of 6 matches what kb_search_all fetches.
    limit = max(top_k, 0)
    return {section: hits[:limit] for section, hits in kb_search_all(query).items()}

# Lower-case phrases that mark content (or a prompt) as manager/leadership oriented.
MANAGER_KEYWORDS = {"manager","leadership","org design","hiring","coaching","performance review","okr","okrs","succession"}

def looks_manager_only(item: Dict[str,Any]) -> bool:
    """Return True when an item appears to target managers or leaders.

    An item qualifies when its metadata ``audience`` is "manager" or
    "leadership", or when any ``MANAGER_KEYWORDS`` phrase occurs
    (case-insensitive substring) in its title/document text or its tags.

    Args:
        item: a retrieval hit; may carry ``title``, ``document`` and a
            ``metadata`` dict with ``audience`` and ``tags``.
    """
    meta = item.get("metadata") or {}
    if not isinstance(meta, dict):
        meta = {}

    if str(meta.get("audience", "")).lower() in {"manager", "leadership"}:
        return True

    raw_tags = meta.get("tags") or []
    # Tags may arrive as one string instead of a list. Joining a bare string
    # would splice spaces between its characters and hide every keyword, and
    # None or non-string entries would raise.
    if not isinstance(raw_tags, (list, tuple, set)):
        raw_tags = [raw_tags]
    tags = " ".join(str(tag) for tag in raw_tags).lower()

    title = str(item.get("title") or item.get("document") or "").lower()
    haystack = f"{title} {tags}"
    return any(kw in haystack for kw in MANAGER_KEYWORDS)

def explicit_manager_request(prompt: str) -> bool:
    """Return True when the prompt itself mentions a ``MANAGER_KEYWORDS`` phrase.

    Matching is a case-insensitive substring test; an empty or ``None``
    prompt yields ``False``.
    """
    text = prompt.lower() if prompt else ""
    for keyword in MANAGER_KEYWORDS:
        if keyword in text:
            return True
    return False

# Canned suggestions keyed by the topic string that infer_topic() returns.
# job_tool() and courses_tool() use these only when retrieval yields nothing.
# Entries carry just a "title" (no score, metadata or source).
FALLBACKS = {
    "data engineering": {
        "jobs": [
            {"title": "Data Engineer (Platform)"},
            {"title": "Analytics Engineer"},
        ],
        "courses": [
            {"title": "Data Engineering on AWS — Foundations"},
            {"title": "Modern Data Pipelines with Python & Airflow"},
        ]
    }
}

def infer_topic(user_text: str) -> Optional[str]:
    """Map a prompt to a known fallback topic, or ``None`` when none applies.

    Only "data engineering" is recognised, triggered by the phrase
    "data engineering" or "data engineer" (case-insensitive).
    """
    lowered = user_text.lower()
    triggers = ("data engineering", "data engineer")
    return "data engineering" if any(phrase in lowered for phrase in triggers) else None

def job_tool(query: str) -> List[Dict[str,Any]]:
    """Collect up to four distinct job/role suggestions for ``query``.

    Candidates are the first six Knowledge Base job hits followed by the
    first six PG hits whose metadata ``type`` is "job" or "role". Items
    without a title, or whose title (trimmed, lower-cased) was already seen,
    are dropped. If nothing is left, the ``FALLBACKS`` jobs for the inferred
    topic are used when available.
    """
    kb_hits = tool_kb_search(query).get("jobs", [])
    pg_hits = [
        hit for hit in tool_pg_search(query, 12)
        if (hit.get("metadata") or {}).get("type") in {"job", "role"}
    ]
    candidates = [*(kb_hits[:6] or []), *(pg_hits[:6] or [])]

    unique: List[Dict[str, Any]] = []
    titles_seen = set()
    for cand in candidates:
        key = (cand.get("title") or (cand.get("metadata") or {}).get("title") or "").strip().lower()
        if key and key not in titles_seen:
            titles_seen.add(key)
            unique.append(cand)

    if not unique:
        topic = infer_topic(query)
        if topic and FALLBACKS.get(topic, {}).get("jobs"):
            unique = FALLBACKS[topic]["jobs"]
    return unique[:4]

def courses_tool(query: str, state: 'AgentState') -> List[Dict[str,Any]]:
    """Collect up to four distinct course suggestions for ``query``.

    Candidates are the first eight Knowledge Base course hits followed by the
    first six PG hits whose metadata ``type`` is "course". Manager-oriented
    courses are removed unless the user is a manager or the prompt explicitly
    asks for manager content. If nothing is left, the ``FALLBACKS`` courses
    for the inferred topic are used when available.

    Args:
        query: the user's prompt.
        state: request state; ``is_manager`` and ``prompt`` are read.

    Returns:
        Dicts with ``title``, ``metadata``, ``source`` and ``score``.
        Fallback entries carry only ``title``.
    """
    kb = tool_kb_search(query).get("courses", [])
    pg = [h for h in tool_pg_search(query, 12) if (h.get("metadata") or {}).get("type") == "course"]
    courses = [*(kb[:8] or []), *(pg[:6] or [])]

    if not (state.is_manager or explicit_manager_request(state.prompt or "")):
        courses = [c for c in courses if not looks_manager_only(c)]

    out, seen = [], set()
    for c in courses:
        title = str(c.get("title") or (c.get("metadata") or {}).get("title") or "Course")
        # Trimmed, case-folded key, consistent with job_tool's de-duplication.
        key = title.strip().lower()
        if key in seen:
            continue
        seen.add(key)
        out.append({
            "title": title,
            "metadata": c.get("metadata") or {},
            "source": c.get("source") or "KB/PG",
            # Keep the retrieval score: courses_reflexion sorts on it, and
            # dropping it made that ordering a no-op.
            # NOTE(review): KB scores and PG centroid-cosine scores come from
            # different scorers and may not be directly comparable.
            "score": c.get("score"),
        })

    if not out:
        topic = infer_topic(query)
        fallback = FALLBACKS.get(topic, {}).get("courses") if topic else None
        if fallback:
            # Copy so callers cannot mutate the module-level FALLBACKS entries.
            out = [dict(entry) for entry in fallback]
    return out[:4]

def job_reflexion(items: List[Dict[str,Any]]) -> List[Dict[str,Any]]:
    """Return a new list of jobs ordered by score (high first), then title length.

    A missing or falsy ``score`` counts as 0.0 and a missing ``title`` as an
    empty string. The input list is not modified.
    """
    def sort_key(job):
        score = float(job.get("score") or 0.0)
        return (-score, len(job.get("title", "")))

    ranked = list(items)
    ranked.sort(key=sort_key)
    return ranked

def courses_reflexion(items: List[Dict[str,Any]], is_manager: bool) -> List[Dict[str,Any]]:
    """Return a new list of courses, demoting manager-audience ones for non-managers.

    For non-managers, courses whose metadata ``audience`` is "manager" or
    "leadership" sort after all others. Within each group, courses are
    ordered by ``score`` (high first; missing counts as 0.0).
    """
    manager_audiences = {"manager", "leadership"}

    def sort_key(course):
        audience = ((course.get("metadata") or {}).get("audience") or "").lower()
        demote = int(not is_manager and audience in manager_audiences)
        return (demote, -float(course.get("score") or 0.0))

    return sorted(items, key=sort_key)


## 5) Consolidation & Compose

In [None]:

def dopamine_block() -> str:
    """Return the fixed Markdown intro: value pitch, privacy note and a 3-item checklist."""
    lines = [
        "**Let’s make this worth your time.**",
        "_I don’t store your info — anything you share is used only for this session._",
        "",
        "In 2 minutes, I’ll:",
        "✅ Recommend 2 career paths in Informa",
        "✅ Show the 3 most valuable skills to build next",
        "✅ Give you 2 courses to start this month",
    ]
    # Every line, including the last, is newline-terminated.
    return "\n".join(lines) + "\n"

def consolidation_summary(sections: Dict[str,Any], include_onboarding: bool = True) -> str:
    """Compose the final Markdown answer from the collected sections.

    Only the ``"jobs"`` and ``"courses"`` entries of ``sections`` are
    rendered (at most two of each). The intro block always comes first and
    the closing question always comes last; the onboarding prompt is added
    before the closing question when ``include_onboarding`` is true.
    """
    def role_label(job):
        return job.get("title") or (job.get("metadata") or {}).get("title") or "Role"

    parts = [dopamine_block()]

    role_hits = sections.get("jobs") or []
    if role_hits:
        parts.append("\n### Closest roles to explore")
        parts.extend(f"- {role_label(job)}" for job in role_hits[:2])

    course_hits = sections.get("courses") or []
    if course_hits:
        parts.append("\n### Courses to start this month")
        parts.extend(f"- {course.get('title','Course')}" for course in course_hits[:2])

    if include_onboarding:
        parts.append("\n" + onboarding_message())
    parts.append("\n**Would you like to explore other courses or open jobs?**")
    return "\n".join(parts)


## 6) Orchestrator

In [None]:

def run_workflow(user_text: str,
                 email: Optional[str] = None,
                 name: Optional[str] = None,
                 division: Optional[str] = None,
                 override_is_manager: Optional[bool] = None) -> Dict[str,Any]:
    """Run one request end to end: gate, identify, retrieve, compose.

    Returns:
        For out-of-scope prompts: ``{"blocked": True, "gate", "answer"}``.
        Otherwise: ``{"blocked": False, "gate", "state", "profile_found",
        "sections", "answer"}``, where ``sections`` holds the retrieval
        results per detected intent and ``answer`` is the Markdown reply.
    """
    gate = prohibitor(user_text)
    if not gate.get("allowed"):
        return {
            "blocked": True,
            "gate": gate,
            "answer": "I’m scoped to career planning (roles/jobs, courses, development plans, manager/leadership toolkits, leadership strategy). Ask me one of those and I’ll help fast.",
        }

    state, profile_meta = setup_state(email=email, name=name, division=division,
                                      override_is_manager=override_is_manager, user_text=user_text)
    intents = intent_persona(gate.get("intents", []))

    sections = {}
    if "job" in intents:
        sections["jobs"] = job_reflexion(job_tool(user_text))
    if "courses" in intents:
        sections["courses"] = courses_reflexion(courses_tool(user_text, state), state.is_manager)

    # Intents served by a plain PG context search: (intent/section key, query prefix).
    pg_context_intents = (
        ("development_plan", "development plan "),
        ("manager_toolkit", "manager coaching "),
        ("leadership_strategy", "capability gaps portfolio "),
    )
    for intent, prefix in pg_context_intents:
        if intent in intents:
            sections[intent] = tool_pg_search(prefix + (user_text or ""), 6)[:5]

    profile_found = bool(profile_meta)
    # Ask the user to identify themselves only when no profile was found.
    answer = consolidation_summary(sections, include_onboarding=not profile_found)
    return {
        "blocked": False,
        "gate": gate,
        "state": state,
        "profile_found": profile_found,
        "sections": sections,
        "answer": answer,
    }


## 7) Smoke Tests

In [None]:

# Smoke-test cases: (prompt, email, name, division, override_is_manager).
# The first prompt is out of scope and is meant to exercise the blocked path.
# The other two repeat the data-engineering question with the manager flag
# forced off and on.
tests = [
    ("Reset my laptop password", None, None, None, None),
    ("What jobs and courses should I look at for data engineering?", None, None, None, False),
    ("What jobs and courses should I look at for data engineering?", None, None, None, True),
]

# Run each case through the full workflow and show the result inline.
for text, email, name, div, is_mgr in tests:
    print("\n---\nQ:", text, "| override_is_manager:", is_mgr)
    out = run_workflow(text, email=email, name=name, division=div, override_is_manager=is_mgr)
    if out.get("blocked"):
        print("BLOCKED:", out["answer"])
    else:
        # Render the Markdown answer, then a one-line diagnostic summary.
        display(Markdown(out["answer"]))
        print("intents:", out["gate"]["intents"], "| is_manager:", out["state"].is_manager, "| profile_found:", out["profile_found"])
