# HR Career Advisor — PG Vector + AWS KB (Prototype **v3**)

**What’s new in v3**
- Email-first profile lookup (name/division optional)
- Parse *Skills* and *Topics of Interest* from profile
- Profile-driven queries + ranking boost
- Type guessing for PG rows without `metadata.type`
- Same dopamine onboarding + fallbacks

**Flow:** prohibitor → setup_state → intent_persona → tools (PG+KB + profile-driven) → reflexion → consolidation

## 0) Setup & Environment

In [1]:

import os, json, re
from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass
import numpy as np
from IPython.display import display, Markdown
from dotenv import load_dotenv
load_dotenv()

AWS_REGION = os.getenv("AWS_REGION", "us-west-2")
AWS_MODEL_ID = "us.anthropic.claude-3-7-sonnet-20250219-v1:0"

PG_DSN = os.getenv("PG_DSN","")  # postgresql://user:pass@host:5432/dbname?sslmode=require
PG_COLLECTIONS = [
    "internal_private_employee_profiles_vectorstore",
    "internal_curated_informa_vectorstore",
]
JOB_KB_ID = os.getenv("JOB_KB_ID","")
COURSES_KB_ID = os.getenv("COURSES_KB_ID","")

DEFAULT_USER_NAME  = os.getenv("DEFAULT_USER_NAME",  "Mary Ralicki")
DEFAULT_USER_EMAIL = os.getenv("DEFAULT_USER_EMAIL", "mary.ralicki@informa.com")
DEFAULT_USER_DIV   = os.getenv("DEFAULT_USER_DIVISION", "")

print("Env present:", dict(
    PG=bool(PG_DSN),
    JOB_KB=bool(JOB_KB_ID),
    COURSES_KB=bool(COURSES_KB_ID),
    DEFAULT_USER_NAME=DEFAULT_USER_NAME,
    DEFAULT_USER_EMAIL=DEFAULT_USER_EMAIL
))


Env present: {'PG': True, 'JOB_KB': True, 'COURSES_KB': True, 'DEFAULT_USER_NAME': 'Mary Ralicki', 'DEFAULT_USER_EMAIL': 'mary.ralicki@informa.com'}


## 1) PGVector Retrieval

In [2]:

import psycopg

def get_pg_conn():
    if not PG_DSN:
        raise RuntimeError("PG_DSN not set")
    return psycopg.connect(PG_DSN)

KEYWORD_PREFILTER_SQL = (
"SELECT e.uuid AS id, e.embedding, e.document, e.cmetadata, c.name as collection "
"FROM ai.langchain_pg_embedding e "
"JOIN ai.langchain_pg_collection c ON c.uuid = e.collection_id "
"WHERE c.name = %(collection)s "
"  AND (e.document ILIKE '%%' || %(query)s || '%%' "
"       OR CAST(e.cmetadata AS TEXT) ILIKE '%%' || %(query)s || '%%') "
"LIMIT %(k)s;"
)

def _to_meta(meta):
    if isinstance(meta,(dict,list)): return meta
    try: return json.loads(meta)
    except: return {"raw": str(meta)}

def _cosine(a: np.ndarray, b: np.ndarray) -> float:
    denom = (np.linalg.norm(a) * np.linalg.norm(b))
    if denom == 0: return 0.0
    return float(np.dot(a, b) / denom)

def pg_search_hybrid(collection: str, query: str, pre_k: int = 24, top_k: int = 8) -> List[Dict[str,Any]]:
    with get_pg_conn() as conn, conn.cursor() as cur:
        cur.execute(KEYWORD_PREFILTER_SQL, {"collection": collection, "query": query, "k": pre_k})
        rows = cur.fetchall()
    if not rows: return []
    embs, items = [], []
    for _id, emb, doc, meta, coll in rows:
        v = np.array(emb, dtype=np.float32)
        embs.append(v)
        items.append({"id": _id, "embedding": emb, "document": doc, "metadata": _to_meta(meta), "collection": coll})
    centroid = np.mean(embs, axis=0)
    for it in items:
        it["score"] = _cosine(centroid, np.array(it["embedding"], dtype=np.float32))
    items.sort(key=lambda x: x.get("score",0.0), reverse=True)
    return items[:top_k]

def pg_multi_search(query: str, collections: List[str]) -> List[Dict[str,Any]]:
    hits = []
    for coll in collections:
        try:
            hits.extend(pg_search_hybrid(coll, query, 24, 8))
        except Exception as e:
            print(f"⚠️ PG search failed for {coll}: {e}")
    hits.sort(key=lambda x: x.get("score",0.0), reverse=True)
    return hits[: max(6, len(collections)) ]


## 2) AWS Knowledge Bases Retrieval

In [3]:

import boto3
try:
    kb_rt = boto3.client("bedrock-agent-runtime", region_name=AWS_REGION) if (JOB_KB_ID or COURSES_KB_ID) else None
except Exception as e:
    kb_rt = None
    print("⚠️ AWS KB unavailable:", e)

def kb_retrieve(kb_id: str, query: str, top_k: int = 5) -> List[Dict[str,Any]]:
    if not kb_rt or not kb_id:
        return []
    try:
        resp = kb_rt.retrieve(
            knowledgeBaseId=kb_id,
            retrievalConfiguration={"vectorSearchConfiguration": {"numberOfResults": top_k}},
            retrievalQuery={"text": query},
        )
        out = []
        for r in resp.get("retrievalResults", []):
            c = r.get("content", {})
            out.append({
                "title": c.get("title") or (c.get("text","").split("\n")[0][:80]).strip(),
                "snippet": c.get("snippetText") or c.get("text","")[:240],
                "score": r.get("score"),
                "kb_id": kb_id,
                "metadata": r.get("metadata") or {},
                "source": r.get("location", {}).get("s3Location", {}).get("uri"),
                "type": r.get("metadata",{}).get("type")
            })
        return out
    except Exception as e:
        print("⚠️ KB retrieve failed:", e)
        return []

def kb_search_all(query: str) -> Dict[str, List[Dict[str,Any]]]:
    return {
        "jobs":    kb_retrieve(JOB_KB_ID, query, 6) if JOB_KB_ID else [],
        "courses": kb_retrieve(COURSES_KB_ID, query, 6) if COURSES_KB_ID else [],
    }


## 3) Prohibitor, State, Intent, Profile Lookup

In [4]:

AllowedIntents = {"courses","job","development_plan","manager_toolkit","leadership_strategy","career"}

def prohibitor(user_text: str) -> Dict[str,Any]:
    t = user_text.lower()
    allowed = any(k in t for k in ["career","course","job","role","roles","learn","upskill","development","manager","leadership","okr","coaching","promotion","ladder","mentoring","objective","okrs"])
    intents = []
    if any(k in t for k in ["job","jobs","opening","openings","role","roles"]): intents.append("job")
    if any(k in t for k in ["course","courses","learn","training","upskill"]): intents.append("courses")
    if any(k in t for k in ["mentoring","mentor"]): intents.append("manager_toolkit")
    if any(k in t for k in ["objective","okr","okrs"]): intents.append("leadership_strategy")
    if any(k in t for k in ["development plan","30-day","60-day","90-day","dev plan"]): intents.append("development_plan")
    if not intents and allowed: intents.append("career")
    return {"allowed": allowed and bool(intents), "intents": intents or [], "rationale": "heuristic v0.3"}

@dataclass
class AgentState:
    email: Optional[str] = None
    name: Optional[str] = None
    division: Optional[str] = None
    employee_id: Optional[str] = None
    is_manager: bool = False
    prompt: Optional[str] = None
    quick_profile: Optional[Dict[str,Any]] = None

def derive_is_manager_from_profile(meta: dict) -> bool:
    if str(meta.get("is_manager","")).lower() in {"true","1","yes"}: return True
    if int(meta.get("direct_reports",0) or 0) > 0: return True
    title = (meta.get("title") or "").lower()
    if any(k in title for k in [" manager","lead","head of","director","vp"]): return True
    return False

def profile_lookup(email: Optional[str] = None,
                   name: Optional[str] = None,
                   division: Optional[str] = None) -> List[Dict[str,Any]]:
    if not PG_DSN:
        return []
    results: List[Dict[str,Any]] = []
    with get_pg_conn() as conn, conn.cursor() as cur:
        if email:
            sql = """
                SELECT e.document, e.cmetadata
                FROM ai.langchain_pg_embedding e
                JOIN ai.langchain_pg_collection c ON c.uuid = e.collection_id
                WHERE c.name = 'internal_private_employee_profiles_vectorstore'
                  AND (e.cmetadata->>'email') = %(email)s
                LIMIT 10;
            """
            cur.execute(sql, {"email": email})
            rows = cur.fetchall()
            for doc, meta in rows:
                try: meta = meta if isinstance(meta, dict) else json.loads(meta)
                except: meta = {"raw": str(meta)}
                results.append({"document": doc, "metadata": meta})
            if results:
                return results

        if name:
            sql = """
                SELECT e.document, e.cmetadata
                FROM ai.langchain_pg_embedding e
                JOIN ai.langchain_pg_collection c ON c.uuid = e.collection_id
                WHERE c.name = 'internal_private_employee_profiles_vectorstore'
                  AND (e.cmetadata->>'name') ILIKE %(name)s
            """
            params = {"name": f"%{name}%"}
            if division:
                sql += " AND (e.cmetadata->>'division') ILIKE %(division)s"
                params["division"] = f"%{division}%"
            sql += " LIMIT 25;"
            cur.execute(sql, params)
            rows = cur.fetchall()
            for doc, meta in rows:
                try: meta = meta if isinstance(meta, dict) else json.loads(meta)
                except: meta = {"raw": str(meta)}
                results.append({"document": doc, "metadata": meta})
        return results

def setup_state(email: Optional[str], name: Optional[str], division: Optional[str],
                override_is_manager: Optional[bool], user_text: str) -> Tuple[AgentState, dict]:
    rows = profile_lookup(email=email or DEFAULT_USER_EMAIL,
                          name=name or DEFAULT_USER_NAME,
                          division=division or DEFAULT_USER_DIV)
    meta = rows[0]["metadata"] if rows else {}
    is_mgr = override_is_manager if override_is_manager is not None else derive_is_manager_from_profile(meta)
    st = AgentState(email=email or DEFAULT_USER_EMAIL, name=name or DEFAULT_USER_NAME,
                    division=division or DEFAULT_USER_DIV, employee_id=meta.get("employee_id"),
                    is_manager=is_mgr, prompt=user_text)
    st.quick_profile = {"_doc": rows[0]["document"] if rows else ""}
    return st, meta

def intent_persona(intents: List[str]) -> List[str]:
    return sorted(set(i for i in intents if i in AllowedIntents))


## 4) Parse Skills/Topics and Build Profile Queries

In [5]:
# HOTFIX: (re)define profile parsers + query builder in one place

import re, json

def extract_profile_fields(document: str, meta: dict) -> dict:
    text = (document or "") + "\n" + json.dumps(meta or {})
    m_sk = re.search(r"(?im)^\s*-\s*Skills:\s*(.+)$", text)
    skills = [s.strip() for s in re.split(r"[;,]", m_sk.group(1)) if s.strip()] if m_sk else []

    m_to = re.search(r"(?im)^\s*-\s*Topics of Interest:\s*(.+)$", text)
    topics = [s.strip() for s in re.split(r"[;,]", m_to.group(1)) if s.strip()] if m_to else []

    title = (meta or {}).get("title") or ""
    if not title:
        m_t = re.search(r"(?im)^\s*-\s*Job Title:\s*(.+)$", text)
        if m_t: title = m_t.group(1).strip()

    name = (meta or {}).get("name") or ""
    if not name:
        m_n = re.search(r"(?im)^\s*-\s*Name:\s*(.+)$", text)
        if m_n: name = m_n.group(1).strip()

    return {"name": name, "title": title, "skills": skills, "topics": topics}

def build_profile_queries(fields: dict, max_items: int = 5) -> dict:
    skills = (fields.get("skills") or [])[:max_items]
    topics = (fields.get("topics") or [])[:max_items]
    role   = (fields.get("title") or "")

    job_q, crs_q = [], []

    # Jobs queries from topics/skills/role
    for t in topics:
        job_q += [f"{t} roles", f"{t} jobs"]
    for s in skills:
        job_q.append(f"{s} engineer jobs")
    if role:
        job_q.append(f"{role} career paths")

    # Courses queries from skills/topics
    for s in skills:
        crs_q += [f"{s} course", f"{s} training"]
    for t in topics:
        crs_q.append(f"{t} learning path")  # correct append(...)

    def dedup(seq):
        seen, out = set(), []
        for x in seq:
            xl = x.lower()
            if xl in seen: 
                continue
            seen.add(xl); out.append(x)
        return out

    return {"jobs": dedup(job_q)[:max_items], "courses": dedup(crs_q)[:max_items]}

# quick sanity check
print(build_profile_queries({"skills":["Python"], "topics":["data engineering"], "title":"Data Engineer"}))


{'jobs': ['data engineering roles', 'data engineering jobs', 'Python engineer jobs', 'Data Engineer career paths'], 'courses': ['Python course', 'Python training', 'data engineering learning path']}


In [6]:

def build_profile_queries(fields: dict, max_items: int = 5) -> dict:
    skills = (fields.get("skills") or [])[:max_items]
    topics = (fields.get("topics") or [])[:max_items]
    role   = (fields.get("title") or "")

    job_q, crs_q = [], []

    # Jobs queries from topics/skills/role
    for t in topics:
        job_q += [f"{t} roles", f"{t} jobs"]
    for s in skills:
        job_q.append(f"{s} engineer jobs")
    if role:
        job_q.append(f"{role} career paths")

    # Courses queries from skills/topics
    for s in skills:
        crs_q += [f"{s} course", f"{s} training"]
    for t in topics:
        crs_q.append(f"{t} learning path")  # <-- fixed: append(...), not append[...]

    def dedup(seq):
        seen = set(); out = []
        for x in seq:
            xl = x.lower()
            if xl in seen: 
                continue
            seen.add(xl); out.append(x)
        return out

    return {"jobs": dedup(job_q)[:max_items], "courses": dedup(crs_q)[:max_items]}

def build_profile_queries(fields: dict, max_items: int = 5) -> dict:
    skills = fields.get("skills", [])[:max_items]
    topics = fields.get("topics", [])[:max_items]
    role   = fields.get("title") or ""

    job_q, crs_q = [], []

    for t in topics:
        job_q += [f"{t} roles", f"{t} jobs"]
    for s in skills:
        job_q.append(f"{s} engineer jobs")
    if role:
        job_q.append(f"{role} career paths")

    for s in skills:
        crs_q += [f"{s} course", f"{s} training"]
    for t in topics:
        crs_q.append[f"{t} learning path"]

    def dedup(seq):
        seen=set(); out=[]
        for x in seq:
            xl=x.lower()
            if xl in seen: continue
            seen.add(xl); out.append(x)
        return out

    return {"jobs": dedup(job_q)[:max_items], "courses": dedup(crs_q)[:max_items]}


## 5) Tools (PG + KB) with Type Guessing, Profile Boost & Fallbacks

In [7]:

COURSE_HINTS = [
    "course","training","learning path","module","curriculum",
    "cert","certification","udemy","coursera","pluralsight","lynda",
    "academy","lesson","workshop"
]
JOB_HINTS = [
    "job","role","opening","position","vacancy","requisition","req id",
    "hiring"
]

def guess_type(item: dict) -> str:
    meta = (item.get("metadata") or {})
    t = str(meta.get("type") or "").lower().strip()
    if t:
        return t
    title = (item.get("title") or "").lower()
    doc = (item.get("document") or "").lower()
    text = f"{title} {doc}"
    if any(h in text for h in COURSE_HINTS): return "course"
    if any(h in text for h in JOB_HINTS): return "job"
    coll = (item.get("collection") or "").lower()
    if "course" in coll: return "course"
    if "job" in coll or "role" in coll: return "job"
    return "unknown"

def tool_pg_search(query: str, k: int = 8) -> List[Dict[str,Any]]:
    return pg_multi_search(query, PG_COLLECTIONS)[:k]

def tool_kb_search(query: str, top_k: int = 6) -> Dict[str, List[Dict[str,Any]]]:
    return kb_search_all(query)

MANAGER_KEYWORDS = {"manager","leadership","org design","hiring","coaching","performance review","okr","okrs","succession"}

def looks_manager_only(item: Dict[str,Any]) -> bool:
    meta = (item.get("metadata") or {})
    audience = str(meta.get("audience","")).lower()
    title = (item.get("title") or item.get("document") or "").lower()
    tags = " ".join(meta.get("tags", [])).lower()
    if audience in {"manager","leadership"}: return True
    haystack = f"{title} {tags}"
    return any(kw in haystack for kw in MANAGER_KEYWORDS)

def explicit_manager_request(prompt: str) -> bool:
    p = (prompt or "").lower()
    return any(k in p for k in MANAGER_KEYWORDS)

FALLBACKS = {
    "data engineering": {
        "jobs": [
            {"title": "Data Engineer (Platform)"},
            {"title": "Analytics Engineer"},
            {"title": "Data Engineer — ETL & Pipelines"},
        ],
        "courses": [
            {"title": "Data Engineering on AWS — Foundations"},
            {"title": "Modern Data Pipelines with Python & Airflow"},
            {"title": "Designing Data-Intensive Applications — Hands-on"},
        ]
    }
}

def infer_topic(user_text: str) -> Optional[str]:
    t = user_text.lower()
    if re.search(r"\bdata engineer(ing)?\b", t):
        return "data engineering"
    return None

def _score_profile_alignment(title: str, fields: dict) -> float:
    text = (title or "").lower()
    bonus = 0.0
    for s in (fields.get("skills") or [])[:6]:
        if s.lower() in text: bonus += 0.6
    for t in (fields.get("topics") or [])[:6]:
        if t.lower() in text: bonus += 0.5
    return bonus

def _run_multi_queries(base_results: list, queries: list, fn_retrieve) -> list:
    results = list(base_results)
    for q in queries:
        try:
            results.extend(fn_retrieve(q))
        except Exception as e:
            print("⚠️ subquery failed:", q, e)
    return results

def job_tool(query: str, profile_q: list = None, profile_fields: dict = None) -> List[Dict[str,Any]]:
    profile_q = profile_q or []
    profile_fields = profile_fields or {}

    kb = tool_kb_search(query).get("jobs", [])
    pg_raw = tool_pg_search(query, 16)
    pg = [h for h in pg_raw if guess_type(h) in {"job", "role"}]
    jobs = kb[:8] + pg[:8]

    if profile_q:
        jobs = _run_multi_queries(jobs, profile_q, lambda q: (
            tool_kb_search(q).get("jobs", []) + 
            [h for h in tool_pg_search(q, 12) if guess_type(h) in {"job","role"}]
        ))

    dedup = {}
    for j in jobs:
        title = (j.get("title") or (j.get("metadata") or {}).get("title") or "").strip()
        if not title: continue
        key = title.lower()
        score = float(j.get("score") or 0.0) + _score_profile_alignment(title, profile_fields)
        if key not in dedup or score > dedup[key]["_score"]:
            jj = dict(j); jj["_score"] = score; jj["title"] = title
            dedup[key] = jj

    ranked = sorted(dedup.values(), key=lambda x: -x["_score"])
    if not ranked:
        topic = infer_topic(query)
        if topic and FALLBACKS.get(topic, {}).get("jobs"):
            ranked = FALLBACKS[topic]["jobs"]
        else:
            ranked = [{"title": "Data Engineer (Platform)"}, {"title": "Analytics Engineer"}]
    return ranked[:4]

def courses_tool(query: str, state: 'AgentState', profile_q: list = None, profile_fields: dict = None) -> List[Dict[str,Any]]:
    profile_q = profile_q or []
    profile_fields = profile_fields or {}

    kb = tool_kb_search(query).get("courses", [])
    pg_raw = tool_pg_search(query, 16)
    pg = [h for h in pg_raw if guess_type(h) == "course"]
    courses = kb[:10] + pg[:8]

    if profile_q:
        courses = _run_multi_queries(courses, profile_q, lambda q: (
            tool_kb_search(q).get("courses", []) + 
            [h for h in tool_pg_search(q, 12) if guess_type(h) == "course"]
        ))

    if state.is_manager or explicit_manager_request(state.prompt or ""):
        filtered = courses
    else:
        filtered = [c for c in courses if not looks_manager_only(c)]

    bucket = {}
    for c in filtered:
        title = (c.get("title") or (c.get("metadata") or {}).get("title") or "Course").strip()
        if not title: continue
        key = title.lower()
        score = float(c.get("score") or 0.0) + _score_profile_alignment(title, profile_fields)
        if key not in bucket or score > bucket[key]["_score"]:
            cc = {"title": title, "metadata": c.get("metadata") or {}, "source": c.get("source") or "KB/PG", "_score": score}
            bucket[key] = cc

    ranked = sorted(bucket.values(), key=lambda x: -x["_score"])
    if not ranked:
        topic = infer_topic(query)
        if topic and FALLBACKS.get(topic, {}).get("courses"):
            ranked = FALLBACKS[topic]["courses"]
        else:
            ranked = [{"title": "Data Engineering on AWS — Foundations"},
                      {"title": "Modern Data Pipelines with Python & Airflow"}]
    return ranked[:4]

def job_reflexion(items: List[Dict[str,Any]]) -> List[Dict[str,Any]]:
    return sorted(items, key=lambda x: (-float(x.get("score") or x.get("_score") or 0.0), len((x.get("title") or ""))))

def courses_reflexion(items: List[Dict[str,Any]], is_manager: bool) -> List[Dict[str,Any]]:
    def rank(it):
        base = float(it.get("score") or it.get("_score") or 0.0)
        meta = it.get("metadata") or {}
        aud = (meta.get("audience") or "").lower()
        penal = 0 if is_manager else (1 if aud in {"manager","leadership"} else 0)
        return (penal, -base)
    return sorted(items, key=rank)


# HOTFIX: override build_profile_queries everywhere
def build_profile_queries(fields: dict, max_items: int = 5) -> dict:
    skills = (fields.get("skills") or [])[:max_items]
    topics = (fields.get("topics") or [])[:max_items]
    role   = (fields.get("title") or "")

    job_q, crs_q = [], []

    # Jobs queries from topics/skills/role
    for t in topics:
        job_q += [f"{t} roles", f"{t} jobs"]
    for s in skills:
        job_q.append(f"{s} engineer jobs")
    if role:
        job_q.append(f"{role} career paths")

    # Courses queries from skills/topics
    for s in skills:
        crs_q += [f"{s} course", f"{s} training"]
    for t in topics:
        crs_q.append(f"{t} learning path")  # <- correct append(...)

    def dedup(seq):
        seen = set(); out = []
        for x in seq:
            xl = x.lower()
            if xl in seen:
                continue
            seen.add(xl); out.append(x)
        return out

    return {"jobs": dedup(job_q)[:max_items], "courses": dedup(crs_q)[:max_items]}

# sanity check
_test = build_profile_queries({"skills":["Python"], "topics":["data engineering"], "title":"Data Engineer"})
print(_test)



{'jobs': ['data engineering roles', 'data engineering jobs', 'Python engineer jobs', 'Data Engineer career paths'], 'courses': ['Python course', 'Python training', 'data engineering learning path']}


In [8]:
# 5.6) Normalization + intersection/bridge ranking

import re

def _strip_md(s: str) -> str:
    if not s: return ""
    s = re.sub(r"```[\s\S]*?```", "", s)          # fenced blocks
    s = re.sub(r"\[(.*?)\]\((.*?)\)", r"\1", s)   # [text](url)
    s = s.replace("**","").replace("__","")
    s = re.sub(r"^#+\s*", "", s)                  # heading hashes
    return " ".join(s.split())

def normalize_item(item: dict) -> dict:
    meta = item.get("metadata") or {}
    title = item.get("title") or meta.get("title") or item.get("document","")
    title = _strip_md(title)[:160].strip() or "Untitled"
    url   = meta.get("url") or item.get("source") or ""
    audience = (meta.get("audience") or "").lower()
    tags  = meta.get("tags") or []
    typ   = (meta.get("type") or "").lower()
    score = float(item.get("score") or item.get("_score") or 0.0)
    coll  = (item.get("collection") or "").lower()
    return {
        "title": title, "url": url, "audience": audience,
        "tags": tags, "type": typ, "score": score, "collection": coll
    }

# light keyword sets to detect target domain & bridge
DE_KEYWORDS = {"data engineer","data engineering","analytics engineer","analytics engineering","etl","pipeline","airflow","spark","dbt","warehouse","lakehouse","bigquery","redshift","glue"}
MK_KEYWORDS = {"marketing","campaign","crm","email","b2b","b2c","audience","brand","seo","sem","martech","adtech","attribution","mql","sql (sales)"}

def _kw_in(text: str, kws: set) -> bool:
    t = text.lower()
    return any(k in t for k in kws)

def choose_candidates(user_text: str, items: list, profile_fields: dict, target="jobs", top_n=6):
    """
    Re-rank to surface intersection:
    - Strongly prefer items that are Data Eng *and* Marketing (bridge).
    - Then Data Eng only.
    - Then Marketing analytics/BI (on-ramp).
    - Penalize manager-only if user isn't a manager.
    """
    txt = user_text.lower()
    wants_de = _kw_in(txt, DE_KEYWORDS) or "data" in txt or "engineer" in txt

    skills = [s.lower() for s in (profile_fields.get("skills") or [])]
    topics = [t.lower() for t in (profile_fields.get("topics") or [])]
    mk_like = any(_kw_in(s, MK_KEYWORDS) for s in skills+topics)

    ranked = []
    for raw in (items or []):
        it = normalize_item(raw)
        t = (it["title"] or "").lower()
        base = it["score"]

        is_de = _kw_in(t, DE_KEYWORDS)
        is_mk = _kw_in(t, MK_KEYWORDS)

        bridge = 0.0
        if wants_de and mk_like:
            # intersection/bridge bonuses
            if is_de and is_mk:
                bridge += 3.0
            elif is_de:
                bridge += 2.0
            elif is_mk and any(x in t for x in ["data","analytics","bi","sql","python","warehouse"]):
                bridge += 1.2

        # slight boost for explicit skill/topic mentions
        for s in skills[:6]:
            if s in t: base += 0.4
        for tp in topics[:6]:
            if tp in t: base += 0.3

        # Final score
        it["_rank"] = base + bridge
        ranked.append(it)

    ranked.sort(key=lambda x: x["_rank"], reverse=True)
    # de-dup by title
    seen = set(); out = []
    for it in ranked:
        key = it["title"].lower()
        if key in seen: continue
        seen.add(key); out.append(it)
        if len(out) >= top_n: break
    return out

## 6) Compose & Orchestrate

In [9]:
# 6.5) Pure LLM synthesis on Bedrock (Claude 3.7 Sonnet)

import json, boto3

try:
    _bedrock = boto3.client("bedrock-runtime", region_name=AWS_REGION)
except Exception as _e:
    _bedrock = None
    print("⚠️ Bedrock runtime not available; set AWS creds/region to enable synthesis.")

SYSTEM_PROMPT = (
    "You are Informa’s internal career advisor. "
    "Write naturally and concisely, tailored to the employee’s background and the question. "
    "Explain tradeoffs, propose bridge steps if the target domain differs from the profile. "
    "Use only facts provided; do not invent links or data."
)

def _compact(items):
    out = []
    for x in (items or []):
        out.append({
            "title": x.get("title"),
            "url": x.get("url"),
            "audience": x.get("audience"),
            "tags": x.get("tags"),
            "score": x.get("score"),
            "collection": x.get("collection"),
        })
    return out[:8]

def synthesize_answer_llm(user_text: str, intents: list, is_manager: bool,
                          profile_fields: dict, sections: dict) -> str:
    if not _bedrock:
        raise RuntimeError("Bedrock not configured")

    # Prepare model-facing JSON (lean)
    payload = {
        "query": user_text,
        "intents": intents,
        "persona": {"is_manager": bool(is_manager)},
        "profile": {
            "name": profile_fields.get("name"),
            "title": profile_fields.get("title"),
            "skills": profile_fields.get("skills") or [],
            "topics": profile_fields.get("topics") or [],
        },
        "retrieval": {
            "jobs": _compact(sections.get("jobs")),
            "courses": _compact(sections.get("courses")),
            "development_plan": _compact(sections.get("development_plan")),
            "manager_toolkit": _compact(sections.get("manager_toolkit")),
            "leadership_strategy": _compact(sections.get("leadership_strategy")),
        }
    }

    user_msg = (
        "Using only this JSON, answer the user naturally. "
        "Pick items that best fit the query and the profile; prefer intersection/bridge when needed. "
        "If information is insufficient, ask for the minimum missing detail.\n\n"
        + json.dumps(payload, ensure_ascii=False)
    )

    body = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 700,
        "temperature": 0.4,
        "system": [{"type":"text","text": SYSTEM_PROMPT}],   # <-- system at top-level
        "messages": [
            {"role":"user","content":[{"type":"text","text": user_msg}]}
        ]
    }

    resp = _bedrock.invoke_model(
        modelId="us.anthropic.claude-3-7-sonnet-20250219-v1:0",
        body=json.dumps(body),
        accept="application/json",
        contentType="application/json",
    )
    out = json.loads(resp["body"].read().decode("utf-8"))
    parts = out.get("content", [])
    text = "\n".join(p.get("text","") for p in parts if p.get("type")=="text").strip()
    return text or "(no content)"

In [10]:
## 6) Compose & Orchestrate

def run_workflow(
    user_text: str,
    email: Optional[str] = None,
    name: Optional[str] = None,
    division: Optional[str] = None,
    override_is_manager: Optional[bool] = None
) -> Dict[str,Any]:

    gate = prohibitor(user_text)
    if not gate.get("allowed"):
        return {"blocked": True, "gate": gate, "answer": "out_of_scope"}

    # Load state/profile
    state, profile_meta = setup_state(
        email=email, name=name, division=division,
        override_is_manager=override_is_manager, user_text=user_text
    )

    # Parse profile
    fields: Dict[str, Any] = {}
    try:
        if state.quick_profile and state.quick_profile.get("_doc") is not None:
            fields = extract_profile_fields(state.quick_profile["_doc"], profile_meta or {})
    except Exception as e:
        print("⚠️ profile parse failed:", e)
        fields = {}

    # Intents & profile-driven queries
    intents = intent_persona(gate.get("intents", []))
    profile_qs = build_profile_queries(fields) if fields else {"jobs": [], "courses": []}

    # Raw tool retrieval
    sections_raw: Dict[str, Any] = {}
    if "job" in intents:
        sections_raw["jobs"] = job_reflexion(
            job_tool(user_text, profile_q=profile_qs.get("jobs") or [], profile_fields=fields)
        )
    if "courses" in intents:
        sections_raw["courses"] = courses_reflexion(
            courses_tool(user_text, state, profile_q=profile_qs.get("courses") or [], profile_fields=fields),
            state.is_manager
        )
    if "development_plan" in intents:
        sections_raw["development_plan"] = tool_pg_search("development plan " + (user_text or ""), 6)[:5]
    if "manager_toolkit" in intents:
        sections_raw["manager_toolkit"] = tool_pg_search("manager coaching " + (user_text or ""), 6)[:5]
    if "leadership_strategy" in intents:
        sections_raw["leadership_strategy"] = tool_pg_search("capability gaps portfolio " + (user_text or ""), 6)[:5]

    # Intersection/bridge selection so LLM sees the right candidates
    if "job" in intents:
        sections_jobs = choose_candidates(user_text, sections_raw.get("jobs"), fields, target="jobs", top_n=6)
    else:
        sections_jobs = []
    if "courses" in intents:
        sections_courses = choose_candidates(user_text, sections_raw.get("courses"), fields, target="courses", top_n=6)
    else:
        sections_courses = []

    sections = dict(sections_raw)  # keep other sections as-is
    sections["jobs"] = sections_jobs
    sections["courses"] = sections_courses

    # LLM writes the final answer (no hardcoded copy)
    try:
        final = synthesize_answer_llm(
            user_text=user_text,
            intents=intents,
            is_manager=state.is_manager,
            profile_fields=fields or {},
            sections=sections
        )
    except Exception as e:
        final = f"(LLM unavailable: {e})"

    return {
        "blocked": False,
        "gate": gate,
        "state": state,
        "profile_found": bool(profile_meta),
        "profile_fields": fields,
        "sections": sections,          # now already intersection-weighted
        "answer": final
    }

# 6.7) Simple streaming renderer for notebooks

from IPython.display import display, Markdown, clear_output
import time

def render_stream(generator, refresh=0.05):
    """
    Renders streaming text in-place. Call with the generator returned by synthesize_answer_llm_stream.
    """
    buf = []
    handle = display(Markdown(""), display_id=True)
    last_flush = time.time()
    for chunk in generator:
        buf.append(chunk)
        if time.time() - last_flush >= refresh:
            handle.update(Markdown("".join(buf)))
            last_flush = time.time()
    # final flush
    handle.update(Markdown("".join(buf)))
    return "".join(buf)

In [11]:
# 6.6) Streaming synthesis (Claude 3.7 Sonnet on Bedrock)
# - Uses invoke_model_with_response_stream
# - Yields text deltas as they arrive
# - Falls back to non-streaming if not supported/enabled

import json, sys, time
from typing import Generator

def _make_messages_body(user_text: str, intents: list, is_manager: bool, profile_fields: dict, sections: dict):
    payload = {
        "query": user_text,
        "intents": intents,
        "persona": {"is_manager": bool(is_manager)},
        "profile": {
            "name": profile_fields.get("name"),
            "title": profile_fields.get("title"),
            "skills": profile_fields.get("skills") or [],
            "topics": profile_fields.get("topics") or [],
        },
        "retrieval": {
            "jobs":   [{"title": x.get("title"), "url": x.get("url")} for x in (sections.get("jobs") or [])][:8],
            "courses":[{"title": x.get("title"), "url": x.get("url")} for x in (sections.get("courses") or [])][:8],
            "development_plan": [{"title": x.get("title") or (x.get("metadata") or {}).get("title","")} for x in (sections.get("development_plan") or [])][:6],
            "manager_toolkit":  [{"title": x.get("title") or (x.get("metadata") or {}).get("title","")} for x in (sections.get("manager_toolkit")  or [])][:6],
            "leadership_strategy":[{"title": x.get("title") or (x.get("metadata") or {}).get("title","")} for x in (sections.get("leadership_strategy") or [])][:6],
        }
    }
    SYSTEM_PROMPT = (
        "You are Informa’s internal career advisor. "
        "Write naturally and concisely, tailored to the employee’s background and the question. "
        "Prefer bridges when profile and target domain differ; pick only from provided facts; no invented links."
    )
    body = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 700,
        "temperature": 0.4,
        "system": [{"type":"text","text": SYSTEM_PROMPT}],
        "messages": [
            {"role":"user","content":[
                {"type":"text","text": "Using only this JSON, answer naturally. "
                                       "Pick items that best fit the query and profile; prefer intersection/bridge when needed. "
                                       "If info is insufficient, ask for the minimum missing detail.\n\n"
                                       + json.dumps(payload, ensure_ascii=False)}
            ]}
        ]
    }
    return body

def synthesize_answer_llm_stream(
    user_text: str,
    intents: list,
    is_manager: bool,
    profile_fields: dict,
    sections: dict,
    model_id: str = "us.anthropic.claude-3-7-sonnet-20250219-v1:0",
) -> Generator[str, None, None]:
    """
    Yields chunks of text as they arrive from Bedrock streaming API.
    """
    if _bedrock is None:
        raise RuntimeError("Bedrock not configured")

    body = _make_messages_body(user_text, intents, is_manager, profile_fields, sections)

    try:
        resp = _bedrock.invoke_model_with_response_stream(
            modelId=model_id,
            body=json.dumps(body),
            accept="application/json",
            contentType="application/json",
        )
    except Exception as e:
        # Fallback to non-streaming path if streaming not available
        yield f"(Streaming unavailable: {e})"
        return

    # Anthropic streaming returns a sequence of JSON events inside resp['body']
    # Event types include: message_start, content_block_start, content_block_delta{text}, content_block_stop, message_delta, message_stop
    try:
        for event in resp.get("body"):
            if "chunk" not in event:
                continue
            try:
                payload = json.loads(event["chunk"]["bytes"].decode("utf-8"))
            except Exception:
                continue
            etype = payload.get("type")

            # The useful text arrives in 'content_block_delta' with {"delta":{"type":"text_delta","text":"..."}}
            if etype == "content_block_delta":
                delta = payload.get("delta", {})
                if delta.get("type") == "text_delta":
                    txt = delta.get("text", "")
                    if txt:
                        yield txt
            # You can inspect other events if desired for telemetry:
            # elif etype in {"message_start","message_delta","message_stop","content_block_start","content_block_stop"}: pass
    except Exception as e:
        yield f"\n(Streaming error: {e})"

In [12]:
# 6.7) Simple streaming renderer for notebooks

from IPython.display import display, Markdown, clear_output
import time

def render_stream(generator, refresh=0.05):
    """
    Renders streaming text in-place. Call with the generator returned by synthesize_answer_llm_stream.
    """
    buf = []
    handle = display(Markdown(""), display_id=True)
    last_flush = time.time()
    for chunk in generator:
        buf.append(chunk)
        if time.time() - last_flush >= refresh:
            handle.update(Markdown("".join(buf)))
            last_flush = time.time()
    # final flush
    handle.update(Markdown("".join(buf)))
    return "".join(buf)

In [13]:
# 6.7) Simple streaming renderer for notebooks

from IPython.display import display, Markdown, clear_output
import time

def render_stream(generator, refresh=0.05):
    """
    Renders streaming text in-place. Call with the generator returned by synthesize_answer_llm_stream.
    """
    buf = []
    handle = display(Markdown(""), display_id=True)
    last_flush = time.time()
    for chunk in generator:
        buf.append(chunk)
        if time.time() - last_flush >= refresh:
            handle.update(Markdown("".join(buf)))
            last_flush = time.time()
    # final flush
    handle.update(Markdown("".join(buf)))
    return "".join(buf)

In [14]:
# Streaming run (prints progressively in the output cell)
out = run_workflow(
    "What jobs and courses should I look at for data engineering?",
    email=None, name=None, division=None,
    override_is_manager=False,
    stream=True
)
if out.get("blocked"):
    print("BLOCKED")
else:
    rendered = render_stream(out["stream"])  # shows increments live
    # 'rendered' holds the final text if you need to store/log it


TypeError: run_workflow() got an unexpected keyword argument 'stream'

## 7) Smoke Tests

In [None]:

tests = [
    ("Reset my laptop password", None, None, None, None),
    ("Analyze my current skillset against Informa's digital transformation needs and recommend 5 specific learning opportunities to close these gaps.", None, None, None, False),
    ("Create a 30-day plan to master machine learning with daily practice steps and metrics to track my progress within my current role at Informa.", None, None, None, True),
]

for text, email, name, div, is_mgr in tests:
    print("\n---\nQ:", text, "| override_is_manager:", is_mgr)
    out = run_workflow(text, email=email, name=name, division=div, override_is_manager=is_mgr)
    if out.get("blocked"):
        print("BLOCKED:", out["answer"])
    else:
        display(Markdown(out["answer"]))
        print("intents:", out["gate"]["intents"], "| is_manager:", out["state"].is_manager, "| profile_found:", out["profile_found"])
        print("[debug] profile fields:", out.get("profile_fields"))
        print("[debug] jobs:", [j.get("title") for j in out.get("sections",{}).get("jobs",[])])
        print("[debug] courses:", [c.get("title") for c in out.get("sections",{}).get("courses",[])])



---
Q: Reset my laptop password | override_is_manager: None
BLOCKED: out_of_scope

---
Q: Analyze my current skillset against Informa's digital transformation needs and recommend 5 specific learning opportunities to close these gaps. | override_is_manager: False



# Streaming Utilities (Drop‑in)

This section adds **token streaming** with graceful fallbacks for both **AWS Bedrock** and **OpenAI** backends.
Use `stream_complete(...)` to get a generator that yields text chunks. Works for synchronous scripts and Jupyter.

**How to use**
```python
from streaming import stream_complete, print_stream

gen = stream_complete(
    provider=os.getenv("LLM_PROVIDER", "bedrock"),          # "bedrock" or "openai"
    model=os.getenv("LLM_MODEL", "anthropic.claude-3-5-sonnet-20240620"),  # or "gpt-4.1-mini" etc.
    messages=[{"role":"user","content":"Give me a 2-line summary of the career agent architecture."}],
    temperature=0.2,
)
print_stream(gen)  # prints tokens as they arrive
```
If credentials or networking are unavailable, it will **auto‑fallback** to a local mock stream so notebooks keep running.


In [16]:

# %%streaming.py (inline cell for convenience)
# Robust token streaming for Bedrock + OpenAI with fallback to a local mock.
import os, sys, json, time
from typing import Dict, Iterator, List, Optional

def _env_true(v: Optional[str]) -> bool:
    return (v or "").strip().lower() in {"1","true","yes","y","on"}

def _mock_stream(text: str, delay: float = 0.02) -> Iterator[str]:
    """Yield tokens from a local string when real streaming isn't available."""
    for tok in text.split():
        yield tok + " "
        time.sleep(delay)

def _print_err_once(msg: str, _printed=set()):
    if msg not in _printed:
        print(f"[stream:fallback] {msg}", file=sys.stderr)
        _printed.add(msg)

def _openai_stream(model: str, messages: List[Dict], temperature: float=0.2) -> Iterator[str]:
    try:
        # OpenAI responses streaming (requires openai>=1.0)
        from openai import OpenAI
        client = OpenAI()
        stream = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
            stream=True,
        )
        for event in stream:
            delta = event.choices[0].delta
            if delta and delta.content:
                yield delta.content
    except Exception as e:
        _print_err_once(f"OpenAI streaming unavailable: {e}")
        yield from _mock_stream("<<OPENAI STREAM FALLBACK>> " + _mock_reply_from_messages(messages))

def _bedrock_stream(model: str, messages: List[Dict], temperature: float=0.2) -> Iterator[str]:
    """
    Streams text tokens from AWS Bedrock. Supports Anthropic/Meta style "messages".
    Uses InvokeModelWithResponseStream on bedrock-runtime.
    """
    try:
        import boto3
        br = boto3.client("bedrock-runtime")

        # If messages are OpenAI-style, map to Anthropic Messages API format if model is Anthropic
        if "anthropic" in model:
            def to_anthropic_content(msg):
                c = msg.get("content", "")
                if isinstance(c, list):
                    return c
                return [{"type":"text","text": c}]
            anthropic_msgs = [{"role": m["role"], "content": to_anthropic_content(m)} for m in messages if m["role"] in {"user","assistant"}]
            body = {
                "anthropic_version": "bedrock-2023-05-31",
                "max_tokens": 1024,
                "temperature": temperature,
                "messages": anthropic_msgs,
            }
        else:
            user_text = "\n".join([m.get("content","") for m in messages if m.get("role")=="user"])
            body = {
                "input_text": user_text,
                "temperature": temperature,
                "max_tokens": 1024,
            }

        resp = br.invoke_model_with_response_stream(
            modelId=model,
            body=json.dumps(body),
            accept="application/json",
            contentType="application/json",
        )

        for event in resp.get("body"):
            if "chunk" in event:
                try:
                    chunk = json.loads(event["chunk"]["bytes"].decode("utf-8"))
                    if "delta" in chunk and isinstance(chunk["delta"], dict) and "text" in chunk["delta"]:
                        yield chunk["delta"]["text"]
                    elif "outputText" in chunk:
                        yield chunk["outputText"]
                    elif "message" in chunk and isinstance(chunk["message"], dict) and "content" in chunk["message"]:
                        parts = chunk["message"]["content"]
                        for p in parts:
                            if p.get("type")=="text" and "text" in p:
                                yield p["text"]
                except Exception:
                    continue

    except Exception as e:
        _print_err_once(f"Bedrock streaming unavailable: {e}")
        yield from _mock_stream("<<BEDROCK STREAM FALLBACK>> " + _mock_reply_from_messages(messages))

def _mock_reply_from_messages(messages: List[Dict]) -> str:
    last_user = ""
    for m in reversed(messages):
        if m.get("role") == "user":
            last_user = m.get("content","")
            break
    return f"(mock) I received your request: {last_user[:120]}..."

def stream_complete(provider: str,
                    model: str,
                    messages: List[Dict],
                    temperature: float = 0.2) -> Iterator[str]:
    """
    provider: "bedrock" | "openai"
    returns: generator yielding text chunks
    """
    provider = (provider or "bedrock").lower()
    if provider == "openai":
        return _openai_stream(model, messages, temperature)
    else:
        return _bedrock_stream(model, messages, temperature)

def print_stream(gen: Iterator[str]) -> None:
    """Pretty-print tokens as they arrive (stdout flush)."""
    for chunk in gen:
        print(chunk, end="", flush=True)
    print()

def demo_stream():
    provider = os.getenv("LLM_PROVIDER","bedrock")
    model = os.getenv("LLM_MODEL","anthropic.claude-3-5-sonnet-20240620")
    messages = [{"role":"user","content":"Create a strategy to increase my visibility within Informa by contributing to internal knowledge sharing and company initiatives."}]
    gen = stream_complete(provider, model, messages, temperature=0.3)
    print_stream(gen)

print("Loaded streaming utilities: stream_complete(), print_stream(), demo_stream().")


Loaded streaming utilities: stream_complete(), print_stream(), demo_stream().


In [17]:

# Example usage (safe to run without cloud creds; will fallback to mock stream):
demo_stream()


<<BEDROCK STREAM FALLBACK>> (mock) I received your request: Create a 

[stream:fallback] Bedrock streaming unavailable: An error occurred (ValidationException) when calling the InvokeModelWithResponseStream operation: The provided model identifier is invalid.


strategy to increase my visibility within Informa by contributing to internal knowledge sharing and company ini... 



## Streaming Diagnostics

Use this to confirm streaming is live and measure **Time To First Byte (TTFB)** and **throughput**.


In [18]:

import time
from datetime import datetime

def stream_diagnostics(provider, model, messages, temperature=0.2, max_print=2000):
    """
    Prints timestamps for each chunk, measures TTFB and tokens/sec.
    """
    print(f"Provider: {provider} | Model: {model}")
    start = time.time()
    first_time = None
    n_chars = 0
    n_chunks = 0

    gen = stream_complete(provider, model, messages, temperature=temperature)
    for chunk in gen:
        now = time.time()
        if first_time is None:
            first_time = now
            print(f"[{datetime.now().isoformat(timespec='seconds')}] FIRST CHUNK after {first_time - start:.3f}s")
        n_chunks += 1
        n_chars += len(chunk)
        # Print small preview of each chunk with timestamp to verify live updates
        preview = chunk[:80].replace("\n", " ")
        print(f"[{datetime.now().isoformat(timespec='seconds')}] chunk#{n_chunks} (+{len(chunk)} chars): {preview}", flush=True)

    end = time.time()
    if first_time is None:
        print("No chunks received.")
        return
    ttfb = first_time - start
    total = end - start
    body_time = total - ttfb
    cps = (n_chars / body_time) if body_time > 0 else float('inf')
    print("\n--- Streaming Stats ---")
    print(f"TTFB (s): {ttfb:.3f}")
    print(f"Total time (s): {total:.3f}")
    print(f"Post-first-chunk time (s): {body_time:.3f}")
    print(f"Chars received: {n_chars}")
    print(f"Chunks received: {n_chunks}")
    print(f"Throughput (chars/sec after first chunk): {cps:.1f}")

# Example: run once to test (uses env vars or defaults; falls back to mock if needed)
provider=os.getenv("LLM_PROVIDER","bedrock")
model=os.getenv("LLM_MODEL","anthropic.claude-3-5-sonnet-20240620")
messages=[{"role":"user","content":"Create a strategy to increase my visibility within Informa by contributing to internal knowledge sharing and company initiatives."}]
stream_diagnostics(provider, model, messages)


Provider: bedrock | Model: anthropic.claude-3-5-sonnet-20240620
[2025-08-12T09:45:49] FIRST CHUNK after 0.205s
[2025-08-12T09:45:49] chunk#1 (+10 chars): <<BEDROCK 
[2025-08-12T09:45:49] chunk#2 (+7 chars): STREAM 
[2025-08-12T09:45:49] chunk#3 (+11 chars): FALLBACK>> 
[2025-08-12T09:45:49] chunk#4 (+7 chars): (mock) 
[2025-08-12T09:45:49] chunk#5 (+2 chars): I 
[2025-08-12T09:45:50] chunk#6 (+9 chars): received 
[2025-08-12T09:45:50] chunk#7 (+5 chars): your 
[2025-08-12T09:45:50] chunk#8 (+9 chars): request: 
[2025-08-12T09:45:50] chunk#9 (+7 chars): Create 
[2025-08-12T09:45:50] chunk#10 (+2 chars): a 
[2025-08-12T09:45:50] chunk#11 (+9 chars): strategy 
[2025-08-12T09:45:50] chunk#12 (+3 chars): to 
[2025-08-12T09:45:50] chunk#13 (+9 chars): increase 
[2025-08-12T09:45:50] chunk#14 (+3 chars): my 
[2025-08-12T09:45:50] chunk#15 (+11 chars): visibility 
[2025-08-12T09:45:50] chunk#16 (+7 chars): within 
[2025-08-12T09:45:50] chunk#17 (+8 chars): Informa 
[2025-08-12T09:45:50] chunk#