
# Career Advisor — **Connected** Jupyter Prototype (PGVector + AWS KB)

**LLM fixed:** `us.anthropic.claude-3-7-sonnet-20250219-v1:0`

This notebook is designed to be engaging *and* grounded in enterprise data sources:
- Rotating learning quote + trust-first greeting
- Identity Option A (email/ID) and *Find-me* (name + division) via PGVector
- **Always** queries every collection listed in `PG_COLLECTIONS` for extra context (PGVector search is skipped only when `PG_DSN` is unset)
- Pulls from AWS Knowledge Bases (Jobs & Courses) if IDs are provided
- Contextual benchmark (the benchmarked capability is inferred from the user's title, skills, and interests rather than defaulting to cloud), persona-aware synthesis
- Simple, dopamine-friendly outputs (roles, gaps, courses)


In [None]:

# --- Setup & Config ---
import os, json, re, time, math, statistics, random
from typing import List, Dict, Any, Optional, Literal, Tuple
from dataclasses import dataclass, field

import pandas as pd
import numpy as np
from IPython.display import display, Markdown

# %pip install -q psycopg[binary] boto3 python-dotenv

from dotenv import load_dotenv
load_dotenv()

# Closed sets of labels used in type hints throughout the notebook.
# Persona is carried on SessionState and passed to run_session();
# Intent values are the labels returned by detect_intents().
Persona = Literal["IC","Manager","SeniorLeader"]
Intent = Literal["job","courses","development_plan","manager_toolkit","leadership_strategy","profile"]

# Region used for both Bedrock clients; defaults to us-west-2.
AWS_REGION = os.getenv("AWS_REGION", "us-west-2")
# LLM is fixed as requested (deliberately not read from the environment).
# NOTE(review): the cells shown here only print this id; no visible cell invokes the LLM.
AWS_MODEL_ID = "us.anthropic.claude-3-7-sonnet-20250219-v1:0"
# Embedding model id, overridable through BEDROCK_EMBEDDING_MODEL.
EMBEDDING_MODEL = os.getenv("BEDROCK_EMBEDDING_MODEL", "amazon.titan-embed-text-v2:0")

# AWS Knowledge Base ids. An empty string disables retrieval for that KB
# (kb_search_all() only queries ids that are set).
JOB_KB_ID = os.getenv("JOB_KB_ID", "")
COURSES_KB_ID = os.getenv("COURSES_KB_ID", "")

# Postgres connection string. When empty, get_pg_conn() raises and
# retrieve_everywhere() skips PGVector search.
PG_DSN = os.getenv("PG_DSN", "")

# Collections to always search per query
PG_COLLECTIONS = [
    "internal_private_employee_profiles_vectorstore",
    "internal_curated_informa_vectorstore",
    # add more collections here if needed
]

# NOTE(review): this flag is not read anywhere in the visible cells.
SESSION_ONLY = True
print("Configured LLM:", AWS_MODEL_ID)


In [None]:

# --- AWS Clients (Bedrock Runtime + Knowledge Bases) ---
import boto3

# Bedrock runtime client used by embed_text(). If it cannot be created the
# name is bound to None, and embed_text() then returns None.
try:
    bedrock_rt = boto3.client("bedrock-runtime", region_name=AWS_REGION)
except Exception as e:
    bedrock_rt = None
    print("⚠️ Bedrock runtime client not available:", e)

# Knowledge Base runtime client. It is only created when at least one KB id
# is configured; kb_retrieve() returns [] whenever this is None.
try:
    kb_rt = boto3.client("bedrock-agent-runtime", region_name=AWS_REGION) if (JOB_KB_ID or COURSES_KB_ID) else None
except Exception as e:
    kb_rt = None
    print("⚠️ Bedrock KB client not available:", e)


In [None]:

# --- Embedding helper (Titan) ---
def embed_text(text: str) -> Optional[List[float]]:
    """Embed *text* with the configured Bedrock embedding model.

    The model id is taken from ``EMBEDDING_MODEL`` so the
    ``BEDROCK_EMBEDDING_MODEL`` environment override is honoured. The request
    body uses the Titan ``{"inputText": ...}`` shape, so the override should
    name a Titan-compatible embedding model.

    Returns:
        The embedding vector, or ``None`` when the Bedrock client is
        unavailable, *text* is blank, the call fails, or the response carries
        no embedding. Callers treat ``None`` as "fall back to keyword search".
    """
    # A blank input cannot be embedded; skip the remote call entirely.
    if not bedrock_rt or not text or not text.strip():
        return None
    try:
        resp = bedrock_rt.invoke_model(
            modelId=EMBEDDING_MODEL,
            body=json.dumps({"inputText": text}),
            contentType="application/json",
            accept="application/json",
        )
        body = json.loads(resp.get("body").read())
        vec = body.get("embedding")
        if not vec:
            # Some responses wrap the vector in an "embeddings" list; tolerate
            # that list being missing or empty instead of raising IndexError.
            candidates = body.get("embeddings") or [{}]
            vec = candidates[0].get("embedding")
        return vec or None
    except Exception as e:
        # Best-effort: embedding problems degrade to keyword search upstream.
        print("⚠️ Embedding failed:", e)
        return None


In [None]:

# --- Postgres (PGVector) connection & helpers ---
import psycopg

def get_pg_conn():
    """Open and return a new psycopg connection built from ``PG_DSN``.

    Raises:
        RuntimeError: if ``PG_DSN`` is empty.
    """
    if PG_DSN:
        return psycopg.connect(PG_DSN)
    raise RuntimeError("PG_DSN not set in environment.")

# Nearest-neighbour search inside one named collection. `<=>` is pgvector's
# cosine-distance operator, so `1 - distance` is reported as the score and the
# rows are ordered by ascending distance.
# The query vector is bound as a plain Python list, which the driver sends as a
# PostgreSQL array. pgvector defines no `vector <=> array` operator, so the
# parameter must be cast explicitly with ::vector or the statement fails with
# "operator does not exist".
SIMILARITY_SQL = """SELECT e.id,
       e.document,
       e.cmetadata,
       1 - (e.embedding <=> %(query_vec)s::vector) AS score
FROM ai.langchain_pg_embedding e
JOIN ai.langchain_pg_collection c ON c.uuid = e.collection_id
WHERE c.name = %(collection)s
ORDER BY e.embedding <=> %(query_vec)s::vector
LIMIT %(k)s;
"""

# Keyword fallback for one named collection: a case-insensitive substring
# match of the query against the document text or against the metadata
# rendered as text. `%%` is a literal percent sign, doubled because `%` is the
# driver's placeholder character. Unlike SIMILARITY_SQL this returns no score
# column, and any `%` or `_` inside the query acts as an ILIKE wildcard.
KEYWORD_SQL = """SELECT e.id,
       e.document,
       e.cmetadata
FROM ai.langchain_pg_embedding e
JOIN ai.langchain_pg_collection c ON c.uuid = e.collection_id
WHERE c.name = %(collection)s
  AND (e.document ILIKE '%%' || %(query)s || '%%'
       OR CAST(e.cmetadata AS TEXT) ILIKE '%%' || %(query)s || '%%')
LIMIT %(k)s;
"""

def pg_similarity_search(collection: str, query: str, k: int = 6) -> List[Dict[str, Any]]:
    """Search one PGVector collection for rows relevant to *query*.

    The query is embedded with ``embed_text``; when no embedding is available
    the function falls back to the keyword (ILIKE) statement instead.

    Args:
        collection: Collection name to restrict the search to.
        query: Free-text query.
        k: Maximum number of rows to return.

    Returns:
        A list of dicts with keys ``id``, ``document``, ``metadata``,
        ``score`` and ``collection``. ``score`` is ``None`` for keyword hits.
        ``metadata`` is a dict or ``None``; anything else is wrapped as
        ``{"raw": value}`` so callers can safely use ``.get``.

    Raises:
        RuntimeError: from ``get_pg_conn`` when ``PG_DSN`` is not set.
        Database errors are not caught here and propagate to the caller.
    """
    vec = embed_text(query)
    use_embed = vec is not None
    if use_embed:
        sql, params = SIMILARITY_SQL, {"collection": collection, "query_vec": vec, "k": k}
    else:
        sql, params = KEYWORD_SQL, {"collection": collection, "query": query, "k": k}

    with get_pg_conn() as conn, conn.cursor() as cur:
        cur.execute(sql, params)
        rows = cur.fetchall()

    hits = []
    for row in rows:
        if use_embed:
            _id, doc, meta, score = row
        else:
            # The keyword statement selects no score column.
            _id, doc, meta = row
            score = None
        if isinstance(meta, str):
            try:
                meta = json.loads(meta)
            except ValueError:
                # json.JSONDecodeError is a ValueError; keep the raw text.
                meta = {"raw": meta}
        if meta is not None and not isinstance(meta, dict):
            # Valid JSON that is not an object (list, number, string).
            meta = {"raw": meta}
        hits.append({"id": _id, "document": doc, "metadata": meta, "score": score, "collection": collection})
    return hits

def pg_multi_search(query: str, collections: List[str], k_each: int = 4) -> List[Dict[str, Any]]:
    """Run ``pg_similarity_search`` over several collections and merge the hits.

    A failure in one collection is printed and skipped so the remaining
    collections are still searched. The merged hits are ordered by descending
    score (a missing score counts as 0.0) and truncated to
    ``max(5, len(collections))`` entries.
    """
    merged: List[Dict[str, Any]] = []
    for name in collections:
        try:
            merged.extend(pg_similarity_search(name, query, k=k_each))
        except Exception as e:
            print(f"⚠️ PG search failed for {name}:", e)

    limit = max(5, len(collections))
    # list.sort is stable, so ties keep their per-collection arrival order.
    merged.sort(key=lambda hit: hit.get("score") or 0.0, reverse=True)
    return merged[:limit]


In [None]:

# --- AWS Knowledge Bases retrieval ---
def kb_retrieve(kb_id: str, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
    """Retrieve up to *top_k* results for *query* from one AWS Knowledge Base.

    Returns a list of dicts with keys ``title``, ``snippet``, ``score``,
    ``source`` and ``kb_id``. Returns ``[]`` when the KB client or *kb_id* is
    missing, or when the call (or result parsing) raises; the error is printed.
    """
    if not kb_rt or not kb_id:
        return []
    try:
        search_config = {"vectorSearchConfiguration": {"numberOfResults": top_k}}
        resp = kb_rt.retrieve(
            knowledgeBaseId=kb_id,
            retrievalConfiguration=search_config,
            retrievalQuery={"text": query},
        )
        hits = []
        for item in resp.get("retrievalResults", []):
            content = item.get("content", {})

            # Title: explicit title if present, else the first line of the
            # text capped at 80 characters.
            title = content.get("title")
            if not title:
                title = content.get("text", "").split("\n")[0][:80]

            # Snippet: explicit snippet if present, else the first 200
            # characters of the text.
            snippet = content.get("snippetText")
            if not snippet:
                snippet = content.get("text", "")[:200]

            s3_location = item.get("location", {}).get("s3Location", {})
            hits.append({
                "title": title,
                "snippet": snippet,
                "score": item.get("score"),
                "source": s3_location.get("uri"),
                "kb_id": kb_id,
            })
        return hits
    except Exception as e:
        print("⚠️ KB retrieve failed:", e)
        return []

def kb_search_all(query: str) -> Dict[str, List[Dict[str, Any]]]:
    """Query every configured Knowledge Base for *query*.

    The result has a ``"jobs"`` entry only when ``JOB_KB_ID`` is set and a
    ``"courses"`` entry only when ``COURSES_KB_ID`` is set; each holds up to
    five results from ``kb_retrieve``.
    """
    configured = (("jobs", JOB_KB_ID), ("courses", COURSES_KB_ID))
    return {
        label: kb_retrieve(kb_id, query, top_k=5)
        for label, kb_id in configured
        if kb_id
    }


In [None]:

# --- Profile retrieval (Option A) ---
# Identity "Option A": exact-match lookup in the employee-profiles collection.
# A row matches when its custom_id equals the employee id parameter or when
# the 'email' key of its metadata equals the email parameter (both comparisons
# are case-sensitive). Returns at most 50 rows of (document, metadata).
LOOKUP_BY_EMAIL_OR_ID = """SELECT e.document, e.cmetadata
FROM ai.langchain_pg_embedding e
JOIN ai.langchain_pg_collection c ON c.uuid = e.collection_id
WHERE c.name = 'internal_private_employee_profiles_vectorstore'
  AND (e.custom_id = %(employee_id)s OR (e.cmetadata->>'email') = %(email)s)
LIMIT 50;
"""

# "Find-me" fallback in the employee-profiles collection: case-insensitive
# substring match on the metadata 'name', optionally narrowed by a substring
# match on 'division' (the division filter is skipped when the parameter is
# NULL). `%%` is a literal percent sign, doubled for the driver's placeholder
# syntax. Returns at most 10 rows of (document, metadata).
# NOTE(review): an empty name parameter yields the pattern '%', which matches
# every row that has a name.
FIND_ME_FALLBACK = """SELECT e.document, e.cmetadata
FROM ai.langchain_pg_embedding e
JOIN ai.langchain_pg_collection c ON c.uuid = e.collection_id
WHERE c.name = 'internal_private_employee_profiles_vectorstore'
  AND (e.cmetadata->>'name') ILIKE '%%' || %(name)s || '%%'
  AND (%(division)s IS NULL OR (e.cmetadata->>'division') ILIKE '%%' || %(division)s || '%%')
LIMIT 10;
"""

def profile_lookup(email: Optional[str] = None, employee_id: Optional[str] = None, name: Optional[str] = None, division: Optional[str] = None) -> List[Dict[str, Any]]:
    """Look up employee profile rows by exact identity or by name (+ division).

    When *email* or *employee_id* is given, the exact-match statement is used.
    Otherwise the "find-me" statement matches *name* as a case-insensitive
    substring, optionally narrowed by *division*.

    Returns:
        A list of ``{"document": ..., "metadata": ...}`` dicts. The list is
        empty when no identifying input was supplied or ``PG_DSN`` is unset.

    Raises:
        Database errors from the connection or query propagate to the caller.
    """
    name = (name or "").strip()
    division = (division or "").strip() or None
    has_exact_id = bool(email or employee_id)

    # With no email/id and no name, the find-me pattern would be '%' and match
    # every profile, handing the caller an arbitrary employee's record.
    if not has_exact_id and not name:
        return []
    if not PG_DSN:
        return []

    if has_exact_id:
        sql, params = LOOKUP_BY_EMAIL_OR_ID, {"email": email, "employee_id": employee_id}
    else:
        sql, params = FIND_ME_FALLBACK, {"name": name, "division": division}

    with get_pg_conn() as conn, conn.cursor() as cur:
        cur.execute(sql, params)
        rows = cur.fetchall()

    out = []
    for doc, meta in rows:
        if isinstance(meta, str):
            try:
                meta = json.loads(meta)
            except ValueError:
                # json.JSONDecodeError is a ValueError; keep the raw text.
                meta = {"raw": meta}
        out.append({"document": doc, "metadata": meta})
    return out


In [None]:

# --- State & helpers ---
@dataclass
class Profile:
    """Employee attributes used for greeting, benchmarking and recommendations.

    run_session() builds this either from a caller-supplied ``quick_profile``
    dict or from the metadata of the first ``profile_lookup`` result.
    """
    employee_id: Optional[str] = None
    name: Optional[str] = None  # used in the greeting headline
    title: Optional[str] = None  # matched against CAPABILITY_MAP keys
    band: Optional[str] = None  # passed to fetch_peer_scores()
    division: Optional[str] = None  # passed to fetch_peer_scores()
    skills: List[str] = field(default_factory=list)  # matched against CAPABILITY_MAP keys
    interests: List[str] = field(default_factory=list)  # matched against CAPABILITY_MAP keys; also drives the interest bonus

@dataclass
class SessionState:
    """In-memory state for one advisor session; returned by run_session()."""
    # NOTE(review): employee_id, extracted_skills, confirmed_skills, gaps and
    # correlation_id are declared here but not populated by the visible cells.
    employee_id: Optional[str] = None
    persona: Persona = "IC"
    intents: List[Intent] = field(default_factory=list)  # labels from detect_intents()
    profile: Optional[Profile] = None
    extracted_skills: List[str] = field(default_factory=list)
    confirmed_skills: List[str] = field(default_factory=list)
    gaps: List[str] = field(default_factory=list)
    job_hits: List[Dict[str, Any]] = field(default_factory=list)  # suggested roles
    course_hits: List[Dict[str, Any]] = field(default_factory=list)  # suggested courses
    curated_hits: List[Dict[str, Any]] = field(default_factory=list)  # PGVector hits
    kb_hits: Dict[str, List[Dict[str, Any]]] = field(default_factory=dict)  # Knowledge Base hits keyed by "jobs"/"courses"
    correlation_id: Optional[str] = None


In [None]:

# --- Engagement + Benchmark ---
# (quote, attribution) pairs; greeting_block() picks one at random per call.
# NOTE(review): attributions are reproduced as given — verify before publishing.
QUOTES = [
    ("We are what we repeatedly do. Excellence, then, is not an act but a habit.", "Will Durant"),
    ("Learning never exhausts the mind.", "Leonardo da Vinci"),
    ("What we know is a drop; what we don’t know is an ocean.", "Isaac Newton"),
    ("Once you stop learning, you start dying.", "Albert Einstein"),
    ("The only limit to our realization of tomorrow is our doubts of today.", "F. D. Roosevelt"),
]
# Closing paragraph of the greeting: states what the session will deliver.
VALUE_PROMISE = (
    "In 2 minutes, I’ll:\n"
    "✅ Recommend 2 career paths in Informa\n"
    "✅ Show the 3 most valuable skills to build next\n"
    "✅ Give you 2 courses to start this month\n\n"
    "Shall we begin?"
)

# Keyword -> capability id. The keys are tested as substrings of lower-cased
# title / skill / interest text, and choose_capability() returns the first
# key that matches, so insertion order acts as priority (e.g. "product" wins
# over "manager" for a title containing both).
CAPABILITY_MAP = {
    "data": "analytics_modeling", "analyst": "analytics_modeling", "scientist": "analytics_modeling",
    "ml": "ml_engineering", "backend": "systems_backend", "platform": "systems_backend", "sre": "reliability_engineering",
    "frontend": "frontend_engineering", "product": "product_discovery", "design": "ux_research",
    "manager": "hiring_coaching", "lead": "people_leadership", "director": "portfolio_strategy", "vp": "portfolio_strategy",
    "ops": "process_excellence", "support": "customer_success", "automation": "automation", "cloud": "cloud_platforms",
}
# Capability id -> human-readable label used in benchmark sentences.
# pretty_cap() falls back to replacing underscores when an id is missing here.
PRETTY = {
    "analytics_modeling": "analytics & modeling", "ml_engineering": "ML engineering", "systems_backend": "backend systems",
    "reliability_engineering": "site reliability", "frontend_engineering": "frontend engineering", "product_discovery": "product discovery",
    "ux_research": "UX research", "hiring_coaching": "hiring & coaching", "people_leadership": "people leadership",
    "portfolio_strategy": "portfolio strategy", "process_excellence": "process excellence", "customer_success": "customer success",
    "automation": "automation", "cloud_platforms": "cloud platforms",
}

def pretty_cap(cap: str) -> str:
    """Return the display label for capability id *cap*.

    Ids without an entry in ``PRETTY`` are shown with underscores replaced by
    spaces.
    """
    fallback = cap.replace("_", " ")
    if cap in PRETTY:
        return PRETTY[cap]
    return fallback

def choose_capability(state: SessionState) -> Optional[str]:
    """Pick the capability to benchmark for the session's profile.

    The job title is checked first: the first ``CAPABILITY_MAP`` key found as
    a substring of the lower-cased title wins. If the title matches nothing,
    the same keys are tried against the lower-cased interests and skills.

    Missing data is tolerated: a ``None`` profile, or ``None`` / non-string
    entries in ``skills`` and ``interests``, are treated as empty.

    Returns:
        A capability id, or ``None`` when nothing matches.
    """
    profile = state.profile
    title = (getattr(profile, "title", None) or "").lower()
    # Interests and skills are searched together, so one pooled set suffices.
    terms = {
        term.lower()
        for attr in ("interests", "skills")
        for term in (getattr(profile, attr, None) or [])
        if isinstance(term, str)
    }
    if not title and not terms:
        return None

    for key, cap in CAPABILITY_MAP.items():
        if key in title:
            return cap
    for key, cap in CAPABILITY_MAP.items():
        if any(key in term for term in terms):
            return cap
    return None

def capability_score(profile: Profile, capability: str) -> float:
    """Score how strongly *profile* evidences *capability*, in [0.0, 1.0].

    Each skill containing any ``CAPABILITY_MAP`` keyword of the capability
    adds 0.15. A flat 0.05 bonus is added when the interests include
    "career path", "learning" or "mentoring"; interests are compared
    case-insensitively, the same way skills are. The total is capped at 1.0.

    ``None`` or non-string entries in ``skills`` / ``interests`` are ignored.
    """
    skills = [
        s.lower()
        for s in (getattr(profile, "skills", None) or [])
        if isinstance(s, str)
    ]
    interests = {
        i.strip().lower()
        for i in (getattr(profile, "interests", None) or [])
        if isinstance(i, str)
    }
    interest_bonus = 0.05 if interests & {"career path", "learning", "mentoring"} else 0.0

    hits = 0
    if skills:
        keys = [k for k, cap in CAPABILITY_MAP.items() if cap == capability]
        hits = sum(1 for s in skills if any(k in s for k in keys))
    # Both terms are non-negative, so only the upper bound needs clamping.
    return min(1.0, (hits * 0.15) + interest_bonus)

def rank_percentile(score: float, peers: List[float]) -> float:
    """Return the fraction of *peers* whose score is at or below *score*.

    An empty peer list yields the neutral value 0.5.
    """
    if not peers:
        return 0.5
    at_or_below = len([p for p in peers if p <= score])
    return at_or_below / max(1, len(peers))

def bucketize(score: float, peers: List[float]) -> str:
    """Describe *score* as a coarse bucket relative to *peers*.

    With fewer than 10 peers the quartiles are not computed and a two-way
    label based on a fixed 0.5 threshold is returned instead. Otherwise the
    score is placed against the 25th / 50th / 75th percentiles of *peers*;
    a score equal to a cut-off falls into the lower bucket.
    """
    if not peers or len(peers) < 10:
        if score >= 0.5:
            return "above average"
        return "developing"

    labels = ("bottom quartile", "below median", "above median")
    cutoffs = np.quantile(peers, [0.25, 0.5, 0.75])
    for cutoff, label in zip(cutoffs, labels):
        if score <= cutoff:
            return label
    return "top quartile"

def fetch_peer_scores(persona: Persona, band: Optional[str], division: Optional[str]) -> List[float]:
    """Return peer capability scores for the given cohort.

    Placeholder: the arguments are ignored and a fixed, evenly spaced list
    (0.10, 0.13, ... up to 0.94) is returned.
    """
    scores = []
    for pct in range(10, 95, 3):
        scores.append(pct / 100)
    return scores

def benchmark_line(state: SessionState) -> Optional[str]:
    """Build the one-sentence benchmark shown in the greeting.

    Returns ``None`` when no capability can be chosen or there is no profile.
    With fewer than 50 peer scores a coarse bucket sentence is produced;
    otherwise the sentence is based on the peer percentile.
    """
    cap = choose_capability(state)
    profile = state.profile
    if not (cap and profile):
        return None

    score = capability_score(profile, cap)
    peers = fetch_peer_scores(state.persona, profile.band, profile.division)
    label = pretty_cap(cap)

    if len(peers) >= 50:
        pct = round(100 * rank_percentile(score, peers))
        if pct >= 67:
            return f"You’re stronger in {label} than about {pct}% of your peer group."
        if pct >= 50:
            return f"Your {label} capability is above the median for your peer group."
        return f"Your {label} capability is developing relative to peers; I’ll recommend quick wins."

    # Small cohort: percentiles would be noisy, so report a bucket instead.
    bucket = bucketize(score, peers)
    return f"Your {label} capability looks {bucket} for your band."


In [None]:

def detect_intents(utterance: str) -> List[Intent]:
    """Map a free-text utterance to a sorted list of intent labels.

    An intent is selected when any of its keywords occurs as a substring of
    the lower-cased utterance. When nothing matches, ``["profile"]`` is
    returned.
    """
    keyword_table = (
        ("job", ("job", "role", "opening", "posting")),
        ("courses", ("course", "learn", "upskill", "training")),
        ("development_plan", ("plan", "30-day", "development")),
        ("manager_toolkit", ("manager", "team", "coach")),
        ("leadership_strategy", ("leadership", "strategy", "org")),
    )
    lowered = utterance.lower()
    matched = {
        intent
        for intent, keywords in keyword_table
        if any(word in lowered for word in keywords)
    }
    if not matched:
        return ["profile"]
    return sorted(matched)

def greeting_block(name: Optional[str], persona: Persona, brag: Optional[str]) -> str:
    """Compose the Markdown greeting shown at the start of a session.

    The block contains, separated by blank lines: a randomly chosen quote, a
    bold headline (personalised when *name* is given), the optional *brag*
    line, a privacy note, and the value promise. *persona* is accepted but
    not used.
    """
    quote, author = random.choice(QUOTES)
    if name:
        headline = f"Hi {name}, let’s make this worth your time."
    else:
        headline = "Let’s make this worth your time."

    parts = [f"“{quote}” — {author}", f"**{headline}**"]
    parts += [brag] if brag else []
    parts += [
        "_I don’t store your info — anything you share is used only for this session._\n",
        VALUE_PROMISE,
    ]
    return "\n\n".join(parts)


In [None]:

def retrieve_everywhere(query: str) -> Dict[str, Any]:
    """Gather context for *query* from PGVector and the Knowledge Bases.

    Returns ``{"pg": [...], "kb": {...}}``. The PGVector search is skipped
    (empty list) when ``PG_DSN`` is not set.
    """
    if PG_DSN:
        pg_hits = pg_multi_search(query, PG_COLLECTIONS, k_each=4)
    else:
        pg_hits = []
    return {"pg": pg_hits, "kb": kb_search_all(query)}

def suggest_roles_and_courses(state: SessionState, retrieved: Dict[str, Any]) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Derive role and course suggestions from the retrieved context.

    All retrieved text (PG documents with their metadata, KB snippets and
    titles) is pooled into one lower-cased string. The first keyword rule
    that matches the pooled text selects a fixed pair of roles; if none
    matches, the analytics pair is used. Courses are the first two
    Knowledge Base course hits, or two default courses when there are none.
    *state* is accepted but not used.

    Returns:
        ``(roles, courses)`` as two lists of dicts.
    """
    fragments = [
        (hit.get("document") or "") + " " + json.dumps(hit.get("metadata") or {})
        for hit in retrieved.get("pg", [])
    ]
    for items in (retrieved.get("kb") or {}).values():
        fragments.extend(
            (item.get("snippet") or "") + " " + (item.get("title") or "")
            for item in items
        )
    haystack = " ".join(fragments).lower()

    # Ordered rules: the first rule with any keyword in the text wins.
    role_rules = (
        (("frontend",),
         [{"title": "Senior Frontend Engineer", "match": 0.86}, {"title": "UI Engineer", "match": 0.81}]),
        (("reliability", "slo"),
         [{"title": "Site Reliability Engineer", "match": 0.88}, {"title": "Platform Engineer", "match": 0.82}]),
        (("product", "roadmap"),
         [{"title": "Senior Product Manager", "match": 0.84}, {"title": "Product Lead", "match": 0.80}]),
    )
    default_roles = [{"title": "Senior Data Analyst", "match": 0.85}, {"title": "Analytics Engineer", "match": 0.82}]
    roles = next(
        (picked for keywords, picked in role_rules if any(word in haystack for word in keywords)),
        default_roles,
    )

    kb_courses = retrieved.get("kb", {}).get("courses", []) or []
    courses = [
        {"title": item.get("title"), "source": "KB", "reason": "Matched your gaps/context"}
        for item in kb_courses[:2]
    ]
    if not courses:
        courses = [{"title": "Learning How to Learn", "source": "L&D"}, {"title": "Outcome-Driven Roadmaps", "source": "L&D"}]
    return roles, courses


In [None]:

def _as_str_list(value: Any) -> List[str]:
    """Coerce a metadata field to a list; null becomes [], a lone string becomes [string]."""
    if not value:
        return []
    if isinstance(value, str):
        # Iterating a bare string would yield single characters.
        return [value]
    return list(value)


def _profile_from_metadata(meta: Any) -> Profile:
    """Build a Profile from one profile row's metadata, tolerating null or non-dict metadata."""
    if not isinstance(meta, dict):
        return Profile()
    return Profile(
        employee_id=meta.get("employee_id"),
        name=meta.get("name"),
        title=meta.get("title"),
        band=meta.get("band"),
        division=meta.get("division"),
        skills=_as_str_list(meta.get("skills")),
        interests=_as_str_list(meta.get("interests")),
    )


def _hit_title(hit: Dict[str, Any]) -> Any:
    """Return the metadata title of a PG hit, or 'Internal doc' when it has none."""
    meta = hit.get("metadata")
    if not isinstance(meta, dict):
        return "Internal doc"
    return meta.get("title", "Internal doc")


def run_session(
    utterance: str,
    persona: Persona = "IC",
    email: Optional[str] = None,
    employee_id: Optional[str] = None,
    name: Optional[str] = None,
    division: Optional[str] = None,
    quick_profile: Optional[Dict[str, Any]] = None,
) -> SessionState:
    """Run one advisor session end to end and render the results.

    Steps: resolve the profile (from *quick_profile* if given, otherwise via
    ``profile_lookup``), detect intents, retrieve context from PGVector and
    the Knowledge Bases, display the greeting with the benchmark line, then
    display the grounding sources and the suggested roles and courses.

    Args:
        utterance: The user's request; drives intent detection and retrieval.
        persona: Persona label stored on the session state.
        email, employee_id: Exact identifiers for the profile lookup.
        name, division: "Find-me" inputs used when no exact identifier is given.
        quick_profile: Keyword arguments for ``Profile``; skips the lookup.

    Returns:
        The populated ``SessionState``.
    """
    state = SessionState(persona=persona)
    if quick_profile:
        state.profile = Profile(**quick_profile)
    else:
        # Treat the lookup as best-effort, like the PG search: a database
        # problem should not abort the session, it just leaves the profile
        # empty.
        try:
            results = profile_lookup(email=email, employee_id=employee_id, name=name, division=division)
        except Exception as e:
            print("⚠️ Profile lookup failed:", e)
            results = []
        if results:
            state.profile = _profile_from_metadata(results[0].get("metadata"))
        else:
            state.profile = Profile()

    state.intents = detect_intents(utterance)
    retrieved = retrieve_everywhere(utterance)
    state.curated_hits = retrieved.get("pg", [])
    state.kb_hits = retrieved.get("kb", {})

    brag = benchmark_line(state)
    display(Markdown(greeting_block(state.profile.name, state.persona, brag)))

    roles, courses = suggest_roles_and_courses(state, retrieved)
    state.job_hits = roles
    state.course_hits = courses

    if state.curated_hits:
        bullets = "\n".join(f"• **{_hit_title(h)}** (PG)" for h in state.curated_hits[:2])
        display(Markdown("**I’ll ground recommendations in these internal resources:**\n" + bullets))

    if state.kb_hits.get("jobs") or state.kb_hits.get("courses"):
        lines = [
            f"• **{it.get('title','KB doc')}** (KB: {k})"
            for k, lst in state.kb_hits.items()
            for it in lst[:2]
        ]
        if lines:
            display(Markdown("**Plus these Knowledge Base hits:**\n" + "\n".join(lines)))

    if roles:
        display(Markdown("### Closest roles to explore"))
        display(pd.DataFrame(roles))
    if courses:
        display(Markdown("### Courses to start this month"))
        display(pd.DataFrame(courses))

    return state


## Demo — Quick Profile (no identity persisted)

In [None]:

# An inline profile is supplied, so run_session() skips the profile lookup.
# Retrieval from PGVector / Knowledge Bases still runs when those are configured.
state = run_session(
    utterance="What job postings or courses fit me next quarter?",
    persona="IC",
    quick_profile={"name": "Alex", "title": "Senior Data Scientist", "band": "Band 5", "division": "Analytics",
                   "skills": ["Python","Machine Learning","Visualization"], "interests": ["career path","forecasting"]}
)


## Demo — Find-me (name + division) via PGVector profiles

In [None]:

# Requires PG_DSN configured and your profiles collection populated.
# No quick_profile, email or employee_id is passed, so the profile is resolved
# through the name + division "find-me" match. Without PG_DSN the lookup
# returns nothing and the session continues with an empty profile.
state = run_session(
    utterance="Show roles and a 30-day plan.",
    persona="Manager",
    name="Jordan",
    division="Platform"
)
