Data load

In [2]:
# Path configuration + Loading JSON

import json
from pathlib import Path

# Every data file is resolved relative to the notebook's working directory.
BASE = Path.cwd()

P_SKILLS = BASE.joinpath("skills_index.json")            # skills / course index
P_CAREERS = BASE.joinpath("careers_normalized.json")     # career library (normalised)
P_USERS = BASE.joinpath("users_normalized.json")         # user progress (normalised)
P_UNITS = BASE.joinpath("units_placeholder.json")        # placeholder learning content

# The three cleaned files are mandatory; the units file is optional (see below).
assert all((P_SKILLS.exists(), P_CAREERS.exists(), P_USERS.exists())), "缺少清洗后的数据文件"

skills_index = json.loads(P_SKILLS.read_text(encoding="utf-8"))
careers_doc = json.loads(P_CAREERS.read_text(encoding="utf-8"))
users_doc = json.loads(P_USERS.read_text(encoding="utf-8"))

# A missing units file simply yields an empty candidate list.
if P_UNITS.exists():
    units = json.loads(P_UNITS.read_text(encoding="utf-8"))
else:
    units = []

# Each document is either the payload itself or a dict wrapping it under a named key.
if isinstance(careers_doc, dict) and "careers" in careers_doc:
    careers = careers_doc["careers"]
else:
    careers = careers_doc

if isinstance(users_doc, dict) and "users" in users_doc:
    users = users_doc["users"]
else:
    users = users_doc

# Record counts, displayed as the cell output.
len(skills_index), len(careers), len(users), len(units)


(33, 105, 40, 99)

规则版推荐引擎的核心打分层：用少量可调参数把“用户画像 × 职业/学习单元”转成可排序、可解释的分数。三个参数——penalty_if_unmet（职业未达门槛/阈值时的折扣）、gap_when_zero（用户在某知识点为0级时的缺口权重）、gap_when_ge1（已有基础但薄弱的权重）——控制整体策略。score_career 按“探究技能门槛→知识加权覆盖→与阈值比较”计算职业得分，并返回原因（未达标技能、覆盖/阈值、未满足节点清单）；score_unit 按单元覆盖的知识节点，对“0级缺口/≥1薄弱”分别加权累加得到原始分（再对高难度轻惩罚），并给出“覆盖节点/难度”的解释；aggregate_knowledge_gaps 则在候选单元集合里聚合用户未掌握节点的累计权重，生成“高价值缺口”排序。三者合用即可产出可解释的三类推荐：职业匹配度、下一步学习单元及其理由、优先补齐的知识缺口。

In [3]:
# Core Scoring Function of the Rule Engine

from typing import List, Dict, Any, Tuple

# Tunable weights, looked up at call time by score_career / score_unit below.
# Multiplier applied to a career's base score when the skill gate or the knowledge threshold is not met.
penalty_if_unmet = 0.6
# Gap factor for a unit's knowledge node on which the user's level is 0.
gap_when_zero = 1.0
# Gap factor for a unit's knowledge node on which the user's level is anything other than 0.
gap_when_ge1 = 0.5

def score_career(career: Dict[str, Any], user: Dict[str, Any]) -> Dict[str, Any]:
    """Score how well ``user`` matches ``career``.

    Steps:
    1) Inquiry-skill gate: every entry of ``career["min_skill_levels"]`` must be
       met by ``user["inquiry_skills"]``.
    2) Weighted knowledge coverage: a requirement's ``weight`` is added to
       ``covered`` when the user's level on that node is >= ``min_level``.
    3) ``covered`` (an absolute weight) is compared with ``career["threshold"]``.
       If the gate or the threshold fails, the base ratio ``covered / total_w``
       is multiplied by the module-level ``penalty_if_unmet``.

    A missing or ``None`` ``inquiry_skills`` / ``knowledge`` container is treated
    as empty, and a missing or ``None`` level counts as level 0, so records that
    have not been normalised do not crash the scorer.

    Returns a dict with ``score`` (clamped to [0, 1]), human-readable ``reasons``,
    ``unmet_skills`` as (skill, required_level) tuples, ``unmet_nodes`` sorted by
    descending weight, plus ``base``, ``covered``, ``total_w``,
    ``threshold_pass`` and ``gate_pass``.
    """
    # `or {}` also covers the case where the key exists but holds None.
    skills = user.get("inquiry_skills") or {}
    knowledge = user.get("knowledge") or {}

    gate = career.get("min_skill_levels", {}) or {}
    unmet_skills = [(k, v) for k, v in gate.items() if (skills.get(k) or 0) < v]
    gate_pass = len(unmet_skills) == 0

    required = career.get("required_knowledge", []) or []
    covered = 0.0
    total_w = 0.0
    unmet_nodes = []
    for rk in required:
        node = rk["node"]
        min_lv = int(rk["min_level"])
        w = float(rk["weight"])
        total_w += w
        have = int(knowledge.get(node) or 0)  # None / missing -> level 0
        if have >= min_lv:
            covered += w
        else:
            unmet_nodes.append({"node": node, "need": min_lv, "have": have, "w": w})

    threshold = float(career.get("threshold", 0))
    threshold_pass = (covered >= threshold)
    # A career without weighted requirements has no coverage ratio; its base is 0.
    base = (covered / total_w) if total_w > 0 else 0.0
    score = base if (gate_pass and threshold_pass) else base * penalty_if_unmet

    return {
        "score": max(0.0, min(1.0, score)),
        "reasons": [
            "探究技能：达标" if gate_pass else f"探究技能未达标：{','.join(k for k,_ in unmet_skills)}",
            f"知识覆盖：{covered:.2f} / 阈值 {threshold:.2f}（总权重 {total_w:.2f}）"
        ],
        "unmet_skills": unmet_skills,
        # Heaviest unmet requirement first; callers read element 0 as the top priority.
        "unmet_nodes": sorted(unmet_nodes, key=lambda x: -x["w"]),
        "base": base,
        "covered": covered,
        "total_w": total_w,
        "threshold_pass": threshold_pass,
        "gate_pass": gate_pass,
    }

def score_unit(unit: Dict[str, Any], user: Dict[str, Any]) -> Dict[str, Any]:
    """Score a learning unit for ``user`` (simplified, no career context).

    - For every knowledge node of the unit, ``weight * gap`` is added to the raw
      score, where ``gap`` is the module-level ``gap_when_zero`` if the user's
      level on that node is 0 and ``gap_when_ge1`` otherwise.
    - A unit whose difficulty coerces to the integer 3 gets a slight penalty
      (raw * 0.95). Coercion makes ``3`` and ``"3"`` behave alike; a missing or
      non-numeric difficulty is never penalised.
    - A missing or ``None`` ``knowledge_nodes`` / ``knowledge`` container is
      treated as empty, and a ``None`` level counts as level 0.

    Returns ``{"raw": float, "reasons": [covered-nodes text, difficulty text]}``.
    """
    nodes = unit.get("knowledge_nodes") or []
    knowledge = user.get("knowledge") or {}

    raw = 0.0
    for kn in nodes:
        w = float(kn.get("weight", 1.0))
        have = int(knowledge.get(kn["id"]) or 0)  # None / missing -> level 0
        gap = gap_when_zero if have == 0 else gap_when_ge1
        raw += w * gap

    difficulty = unit.get("difficulty")
    try:
        is_hard = int(difficulty) == 3
    except (TypeError, ValueError):
        # Missing or non-numeric difficulty: no penalty.
        is_hard = False
    if is_hard:
        raw *= 0.95

    reasons = [
        "覆盖节点：" + ", ".join(kn["id"] for kn in nodes),
        f"难度={difficulty}"
    ]
    return {"raw": raw, "reasons": reasons}

def aggregate_knowledge_gaps(units: List[Dict[str, Any]], user: Dict[str, Any]) -> List[Tuple[str, float]]:
    """Accumulate, per knowledge node, the weight of every unit node the user holds at level 0.

    A node's weight defaults to 1.0 when the unit does not specify one. A missing
    or ``None`` ``knowledge_nodes`` / ``knowledge`` container is treated as empty
    and a ``None`` level counts as level 0.

    Returns ``(node_id, cumulative_weight)`` pairs sorted by descending weight;
    ties keep first-seen order.
    """
    knowledge = user.get("knowledge") or {}
    acc: Dict[str, float] = {}
    for u in units:
        for kn in u.get("knowledge_nodes") or []:
            node = kn["id"]
            if int(knowledge.get(node) or 0) == 0:
                acc[node] = acc.get(node, 0.0) + float(kn.get("weight", 1.0))
    return sorted(acc.items(), key=lambda x: -x[1])


Step 3 代码把前面定义的“职业/单元/缺口”打分函数整合成一次可直接返回给前端的 Top-K 推荐结果：对每个职业先用 score_career 得到分数与“门槛/覆盖/阈值”解释，并生成“优先补齐”节点；对每个学习单元用 score_unit 产出原始分 raw，再在本批候选内按 raw/max_raw 归一化、附带“覆盖节点/难度”和动作指引（观看并完成测验，或完成活动）；然后基于候选单元聚合用户 level==0 的知识节点累计权重，形成“高价值缺口 Top-K”。最终返回统一结构 {units, careers, knowledge, meta}：职业与学习单元条目含 score/confidence/reasons/nextSteps 以及 debug 细节，知识缺口条目仅含 score/confidence/reasons；结果既可排序又可解释，前端可直接渲染。

In [4]:
# Produce Top-K recommendations
from datetime import datetime

def get_recommendations(user: Dict[str, Any],
                        careers: List[Dict[str, Any]],
                        units: List[Dict[str, Any]],
                        topK: int = 5) -> Dict[str, Any]:
    """Build the Top-K recommendation payload for one user.

    - careers: each career is scored with ``score_career``; the ``topK`` highest
      scores are kept. ``nextSteps`` names the heaviest unmet knowledge node, or
      is ``None`` when nothing is unmet.
    - units: each unit is scored with ``score_unit``; the reported ``score`` is
      ``raw / max_raw`` within this batch of candidates.
    - knowledge: the ``topK`` entries of ``aggregate_knowledge_gaps``; the
      reported ``score`` is the cumulative weight capped at 1.0.

    Returns ``{"units", "careers", "knowledge", "meta"}``; ``meta.generatedAt``
    is a UTC timestamp in ISO format with a trailing "Z".
    """
    # Local import: the notebook cell only imports `datetime` from the module.
    from datetime import timezone

    # Careers
    career_scored = []
    for c in careers:
        s = score_career(c, user)
        career_scored.append({
            "type": "career",
            "id": c["id"],
            "title": c.get("title", c["id"]),
            "score": s["score"],
            "confidence": 0.85 if (s["gate_pass"] and s["threshold_pass"]) else 0.7,
            "reasons": s["reasons"],
            "nextSteps": [f"优先补齐：{s['unmet_nodes'][0]['node']}"] if s["unmet_nodes"] else None,
            "debug": s
        })
    careers_top = sorted(career_scored, key=lambda x: -x["score"])[:topK]

    # Units
    unit_raw = [{"u": u, **score_unit(u, user)} for u in units]
    # `default` only covers an empty candidate list; a non-empty list whose raw
    # scores are all 0 would otherwise divide by zero below.
    max_raw = max((x["raw"] for x in unit_raw), default=0.0)
    if max_raw == 0:
        max_raw = 1e-6
    units_top = []
    for x in sorted(unit_raw, key=lambda y: -y["raw"])[:topK]:
        units_top.append({
            "type": "unit",
            "id": x["u"]["id"],
            "title": x["u"].get("title", x["u"]["id"]),
            "score": x["raw"] / max_raw,
            "confidence": 0.85 if x["u"].get("difficulty") == 3 else 0.9,
            "reasons": x["reasons"],
            "nextSteps": ["观看并完成测验"] if x["u"].get("kind") == "video" else ["完成该活动"],
            "debug": {"raw": x["raw"], "nodes": x["u"].get("knowledge_nodes", [])}
        })

    # Knowledge gaps
    gaps = aggregate_knowledge_gaps(units, user)[:topK]
    knowledge_top = [{
        "type": "knowledge",
        "id": nid,
        "title": nid,
        "score": min(1.0, w),
        "confidence": 0.8,
        "reasons": [f"高价值缺口（累计权重={w:.2f}）"]
    } for nid, w in gaps]

    # datetime.utcnow() is deprecated since Python 3.12; this yields the same
    # naive-UTC ISO text, so the "...Z" output format is unchanged.
    generated_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"

    return {
        "units": units_top,
        "careers": careers_top,
        "knowledge": knowledge_top,
        "meta": {"engine": "rules", "generatedAt": generated_at}
    }


Step 4（patched）把规则引擎封装成“一键跑通”的完整流程：先加载并规范化用户/职业/内容数据，按学科与难度过滤候选单元，再用“只推下一层级（h→h+1）”生成更可执行的学习列表；对用户做冷启动判定（可配置为冷启动时隐藏职业），随后分别计算三路 Top-K——学习单元（缺口优先、难度轻惩罚并在池内归一化）、职业（探究技能门槛 + 知识加权覆盖与阈值，不达标按比例降分）和知识缺口（基于候选单元聚合 level==0 节点权重）。最终返回结构化 JSON（含 score、confidence、reasons、nextSteps 与元信息），并打印与落盘，便于前端直接渲染与后续调参对比。

In [5]:
# Step 4 (patched) 

import json, re
from pathlib import Path
from typing import Any, Dict, List, Tuple
from datetime import datetime

# setting
ONLY_NEXT_LEVEL_UNITS     = True   # when truthy, get_recommendations narrows candidates via pick_next_level_units
HIDE_CAREERS_ON_COLDSTART = True   # when truthy, cold-start users get an empty careers list
SUBJECT_FILTER            = set()  # when non-empty, filter_units keeps only units whose subject is in this set
MAX_DIFFICULTY            = 3      # filter_units drops units with difficulty above this; a falsy value disables the check
TOPK                      = 5      # passed as topK when the engine is run further down in this cell

penalty_if_unmet      = 0.6   # multiplier on a career's base score when the skill gate or threshold is not met
gap_when_zero         = 1.0   # gap factor for a unit node on which the user's level is 0
gap_when_ge1          = 0.5   # gap factor for a unit node on which the user's level is not 0
difficulty_penalty_3  = 0.95  # multiplier on a unit's raw score when its difficulty is 3

# Data loading
BASE = Path.cwd()
P_SKILLS  = BASE.joinpath("skills_index.json")
P_CAREERS = BASE.joinpath("careers_normalized.json")
P_USERS   = BASE.joinpath("users_normalized.json")
P_UNITS   = BASE.joinpath("units_placeholder.json")

# Skills, careers and users are mandatory; the units file is optional.
assert all((P_SKILLS.exists(), P_CAREERS.exists(), P_USERS.exists())), "缺少清洗后的数据文件"

skills_index = json.loads(P_SKILLS.read_text(encoding="utf-8"))

# Careers: accept either the bare payload or a {"careers": ...} wrapper.
careers_doc = json.loads(P_CAREERS.read_text(encoding="utf-8"))
careers: List[Dict[str, Any]]
if isinstance(careers_doc, dict) and "careers" in careers_doc:
    careers = careers_doc["careers"]
else:
    careers = careers_doc

# Users: same unwrapping; the result is still raw and is normalised further down.
users_doc = json.loads(P_USERS.read_text(encoding="utf-8"))
if isinstance(users_doc, dict) and "users" in users_doc:
    users_raw = users_doc["users"]
else:
    users_raw = users_doc

# Units: an absent file yields an empty candidate list.
units_all: List[Dict[str, Any]]
if P_UNITS.exists():
    units_all = json.loads(P_UNITS.read_text(encoding="utf-8"))
else:
    units_all = []

#  users setting
def _to_nonneg_int(x, default=0) -> int:
    try:
        n = int(round(float(x)))
        return n if n >= 0 else 0
    except Exception:
        return default

def normalize_users_structure(users_in) -> List[Dict[str, Any]]:
    if isinstance(users_in, dict) and "users" in users_in and isinstance(users_in["users"], list):
        arr = users_in["users"]
    elif isinstance(users_in, list):
        arr = users_in
    elif isinstance(users_in, dict):
        arr = []
        for k, v in users_in.items():
            if isinstance(v, dict):
                u = dict(v); u.setdefault("id", k); arr.append(u)
    else:
        raise ValueError("Unsupported users structure")
    out = []
    for i, u in enumerate(arr):
        uid = u.get("id") or u.get("user_id") or f"user-{i+1:03d}"
        grade = u.get("grade", u.get("year", 0))
        inquiry = u.get("inquiry_skills", {}) or {}
        knowledge = u.get("knowledge", u.get("knowledge_progress", {})) or {}
        interests = u.get("career_interests", [])
        out.append({
            "id": uid,
            "grade": _to_nonneg_int(grade, 0),
            "inquiry_skills": {k: _to_nonneg_int(v, 0) for k, v in inquiry.items()},
            "knowledge": {k: _to_nonneg_int(v, 0) for k, v in knowledge.items()},
            "career_interests": interests
        })
    return out


# Use a non-empty `users_norm` if one already exists in the kernel namespace;
# otherwise normalise the raw users loaded above.
# NOTE(review): `users_norm` is not assigned anywhere in the visible notebook, so on a
# fresh kernel this always takes the normalize_users_structure(users_raw) branch — confirm
# whether the lookup is still needed.
users = globals().get("users_norm") or normalize_users_structure(users_raw)

# Utility functions: cold-start detection / filtering / next-level selection
def is_cold_start(user: dict) -> bool:
    """Return True when the user's knowledge levels or inquiry-skill levels sum to 0.

    A missing or empty ``knowledge`` / ``inquiry_skills`` mapping sums to 0.
    """
    def _level_total(field: str) -> int:
        levels = user.get(field) or {}
        return sum(map(int, levels.values()))

    knowledge_total = _level_total("knowledge")
    inquiry_total = _level_total("inquiry_skills")
    # Warm only if both totals are non-zero.
    return not (knowledge_total != 0 and inquiry_total != 0)

def subject_of_unit(u: dict) -> str:
    """Return the text before the first "." in the id of the unit's first knowledge node.

    Yields "" when the unit has no knowledge nodes, the first node has no id,
    or that id contains no ".".
    """
    nodes = u.get("knowledge_nodes") or []
    if nodes:
        first_id = nodes[0].get("id", "")
        if "." in first_id:
            prefix, _sep, _rest = first_id.partition(".")
            return prefix
    return ""

def filter_units(units_in: List[dict]) -> List[dict]:
    """Return the units that pass the module-level difficulty and subject filters.

    A unit is dropped when ``MAX_DIFFICULTY`` is truthy and the unit's difficulty
    (default 1) exceeds it, or when ``SUBJECT_FILTER`` is non-empty and the
    unit's subject (see ``subject_of_unit``) is not in it. Input order is kept.
    """
    kept = []
    for unit in units_in:
        too_hard = bool(MAX_DIFFICULTY) and int(unit.get("difficulty", 1)) > MAX_DIFFICULTY
        if too_hard:
            continue
        subject = subject_of_unit(unit)
        if not SUBJECT_FILTER or subject in SUBJECT_FILTER:
            kept.append(unit)
    return kept

def pick_next_level_units(units_in: List[dict], user: dict) -> List[dict]:
    """Keep, per knowledge node, only the unit for the user's "next level".

    A unit is keyed by its first knowledge node. Its level is read from a
    trailing ``::L<digits>`` in the unit id, falling back to ``difficulty``
    (default 1). Units at or below the user's current level ``h`` on that node
    are skipped. Among the rest, level ``h + 1`` is preferred; otherwise the
    smallest level above ``h`` wins. On ties the first unit seen is kept.
    """
    levels_held = user.get("knowledge", {}) or {}
    level_suffix = re.compile(r"::L(\d+)$")
    chosen = {}  # node id -> (level, unit)

    for unit in units_in:
        nodes = unit.get("knowledge_nodes") or []
        if not nodes:
            continue
        node_id = nodes[0]["id"]
        held = int(levels_held.get(node_id, 0))

        match = level_suffix.search(unit.get("id", ""))
        level = int(unit.get("difficulty", 1)) if match is None else int(match.group(1))
        if level <= held:
            continue

        # Ordering key: exact next level first, then the lower level.
        candidate_key = (level == held + 1, -level)
        if node_id in chosen:
            current_level = chosen[node_id][0]
            if candidate_key <= (current_level == held + 1, -current_level):
                continue
        chosen[node_id] = (level, unit)

    return [unit for _level, unit in chosen.values()]

#  scoring
def score_career(career: Dict[str, Any], user: Dict[str, Any]) -> Dict[str, Any]:
    """Score ``career`` for ``user``.

    The inquiry-skill gate passes when every ``min_skill_levels`` entry is met.
    Knowledge coverage is the summed weight of requirements whose ``min_level``
    the user reaches, and is compared with the career's ``threshold``. The score
    is ``covered / total_w``, multiplied by ``penalty_if_unmet`` when the gate or
    the threshold fails, then clamped to [0, 1].
    """
    skill_levels = user.get("inquiry_skills", {})
    unmet_skills = []
    for skill, minimum in (career.get("min_skill_levels", {}) or {}).items():
        if skill_levels.get(skill, 0) < minimum:
            unmet_skills.append((skill, minimum))
    gate_pass = not unmet_skills

    known = user.get("knowledge", {})
    covered = 0.0
    total_w = 0.0
    missing = []
    for req in career.get("required_knowledge", []) or []:
        node, need, weight = req["node"], int(req["min_level"]), float(req["weight"])
        total_w += weight
        level = int(known.get(node, 0))
        if level < need:
            missing.append({"node": node, "need": need, "have": level, "w": weight})
        else:
            covered += weight

    threshold = float(career.get("threshold", 0))
    threshold_pass = covered >= threshold
    base = covered / total_w if total_w > 0 else 0.0

    score = base
    if not (gate_pass and threshold_pass):
        score = base * penalty_if_unmet

    if gate_pass:
        skill_reason = "探究技能：达标"
    else:
        skill_reason = f"探究技能未达标：{','.join(k for k,_ in unmet_skills)}"
    coverage_reason = f"知识覆盖：{covered:.2f} / 阈值 {threshold:.2f}（总权重 {total_w:.2f}）"

    def _heaviest_first(entry):
        return -entry["w"]

    result = {
        "score": max(0.0, min(1.0, score)),
        "reasons": [skill_reason, coverage_reason],
        "unmet_skills": unmet_skills,
        "unmet_nodes": sorted(missing, key=_heaviest_first),
    }
    result["base"] = base
    result["covered"] = covered
    result["total_w"] = total_w
    result["threshold_pass"] = threshold_pass
    result["gate_pass"] = gate_pass
    return result

def score_unit(unit: Dict[str, Any], user: Dict[str, Any]) -> Dict[str, Any]:
    """Compute a unit's raw gap score for ``user`` plus display reasons.

    Each knowledge node contributes ``weight * gap`` (weight defaults to 1.0),
    with ``gap_when_zero`` when the user's level on the node is 0 and
    ``gap_when_ge1`` otherwise. A unit of difficulty 3 is scaled by
    ``difficulty_penalty_3``.
    """
    nodes = unit.get("knowledge_nodes", [])
    raw = 0.0
    covered_ids = []
    for entry in nodes:
        node_id = entry["id"]
        weight = float(entry.get("weight", 1.0))
        level = int(user.get("knowledge", {}).get(node_id, 0))
        raw += weight * (gap_when_ge1 if level != 0 else gap_when_zero)
        covered_ids.append(node_id)

    hard = int(unit.get("difficulty", 1)) == 3
    if hard:
        raw *= difficulty_penalty_3

    return {
        "raw": raw,
        "reasons": [
            "覆盖节点：" + ", ".join(covered_ids),
            f"难度={unit.get('difficulty')}",
        ],
    }

def aggregate_knowledge_gaps(units_in: List[Dict[str, Any]], user: Dict[str, Any]) -> List[Tuple[str, float]]:
    """Sum, per knowledge node, the weights of all unit nodes the user holds at level 0.

    Weights default to 1.0. The result is a list of ``(node_id, total_weight)``
    pairs ordered by descending total; equal totals keep first-seen order.
    """
    totals: Dict[str, float] = {}
    for unit in units_in:
        for entry in unit.get("knowledge_nodes", []):
            node_id = entry["id"]
            if int(user.get("knowledge", {}).get(node_id, 0)) != 0:
                continue
            totals[node_id] = totals.get(node_id, 0.0) + float(entry.get("weight", 1.0))

    ranked = list(totals.items())
    ranked.sort(key=lambda pair: -pair[1])
    return ranked

# Main entry point
def get_recommendations(user: Dict[str, Any],
                        careers_in: List[Dict[str, Any]],
                        units_in: List[Dict[str, Any]],
                        topK: int = 5) -> Dict[str, Any]:
    """Run the full rule-based pipeline for one user and return the Top-K payload.

    Pipeline:
    1. ``filter_units`` applies the module-level difficulty / subject filters.
    2. If ``ONLY_NEXT_LEVEL_UNITS`` is truthy, ``pick_next_level_units`` narrows
       the candidates further.
    3. Careers are scored with ``score_career`` unless the user is cold-start
       and ``HIDE_CAREERS_ON_COLDSTART`` is truthy, in which case the careers
       list stays empty.
    4. Units are scored with ``score_unit``; the reported ``score`` is
       ``raw / max_raw`` within the candidate pool.
    5. Knowledge gaps come from ``aggregate_knowledge_gaps`` over the same pool.

    The module-level flags are read at call time, so reassigning them in a later
    cell changes the behaviour of subsequent calls.

    Returns ``{"units", "careers", "knowledge", "meta"}``; ``meta`` records the
    engine name, a UTC ISO timestamp with a trailing "Z", and the active filters.
    """
    # Local import: the notebook cell only imports `datetime` from the module.
    from datetime import timezone

    filtered_units = filter_units(units_in)
    if ONLY_NEXT_LEVEL_UNITS:
        filtered_units = pick_next_level_units(filtered_units, user)

    cold = is_cold_start(user)

    # Careers
    careers_top = []
    if not (cold and HIDE_CAREERS_ON_COLDSTART):
        career_scored = []
        for c in careers_in:
            s = score_career(c, user)
            career_scored.append({
                "type": "career",
                "id": c["id"],
                "title": c.get("title", c["id"]),
                "score": s["score"],
                "confidence": 0.85 if (s["gate_pass"] and s["threshold_pass"]) else 0.7,
                "reasons": s["reasons"],
                "nextSteps": [f"优先补齐：{s['unmet_nodes'][0]['node']}"] if s["unmet_nodes"] else None,
                "debug": s
            })
        careers_top = sorted(career_scored, key=lambda x: -x["score"])[:topK]

    # Units
    unit_raw = [{"u": u, **score_unit(u, user)} for u in filtered_units]
    # `default` only covers an empty pool; a non-empty pool whose raw scores are
    # all 0 would otherwise divide by zero below.
    max_raw = max((x["raw"] for x in unit_raw), default=0.0)
    if max_raw == 0:
        max_raw = 1e-6
    units_top = []
    for x in sorted(unit_raw, key=lambda y: -y["raw"])[:topK]:
        units_top.append({
            "type": "unit",
            "id": x["u"]["id"],
            "title": x["u"].get("title", x["u"]["id"]),
            "score": x["raw"] / max_raw,
            "confidence": 0.85 if int(x["u"].get("difficulty",1)) == 3 else 0.9,
            "reasons": x["reasons"],
            "nextSteps": ["观看并完成测验"] if x["u"].get("kind") == "video" else ["完成该活动"],
            "debug": {"raw": x["raw"], "nodes": x["u"].get("knowledge_nodes", [])}
        })

    # Knowledge gaps (exposure across all remaining candidates)
    gaps = aggregate_knowledge_gaps(filtered_units, user)[:topK]
    knowledge_top = [{
        "type": "knowledge",
        "id": nid,
        "title": nid,
        "score": min(1.0, w),
        "confidence": 0.8,
        "reasons": [f"高价值缺口（累计权重={w:.2f}）"]
    } for nid, w in gaps]

    # datetime.utcnow() is deprecated since Python 3.12; this yields the same
    # naive-UTC ISO text, so the "...Z" output format is unchanged.
    generated_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"

    return {
        "units": units_top,
        "careers": careers_top,
        "knowledge": knowledge_top,
        "meta": {
            "engine": "rules",
            "generatedAt": generated_at,
            "filters": {
                "onlyNextLevelUnits": ONLY_NEXT_LEVEL_UNITS,
                "hideCareersOnColdStart": HIDE_CAREERS_ON_COLDSTART,
                "subjectFilter": sorted(SUBJECT_FILTER),
                "maxDifficulty": MAX_DIFFICULTY
            }
        }
    }

# Select a user and run the engine
# Use a non-empty `users_norm` from the kernel namespace if present, else the `users` list built above.
_users = globals().get("users_norm") or users
assert isinstance(_users, list) and _users, "用户列表为空或不是 list"

# IDs are taken from "id", then "user_id", then a positional placeholder.
user_ids = [u.get("id") or u.get("user_id") or f"user-{i+1:03d}" for i, u in enumerate(_users)]
print("可用用户ID（前10）:", user_ids[:10], "…共", len(user_ids), "个")

USER_INDEX = 0  # change this to pick which user to run (0 = the first); USER_ID may also be set to a specific ID string
USER_ID = user_ids[USER_INDEX]
# NOTE(review): a record with neither "id" nor "user_id" only has a placeholder ID in
# user_ids, which this lookup cannot match — next() would then raise StopIteration.
user = next(u for u in _users if (u.get("id") or u.get("user_id")) == USER_ID)

res = get_recommendations(user, careers, units_all, topK=TOPK)

from pprint import pprint
def _brief(items, fields=("id","title","score","reasons")):
    return [{k: (round(v,4) if isinstance(v, float) else v) for k,v in x.items() if k in fields} for x in items]

# Careers and units are printed through _brief (id / title / score / reasons only).
print(f"\n=== Careers（Top-{TOPK}）for {USER_ID} ===")
pprint(_brief(res["careers"]), width=120)

print(f"\n=== Units（Top-{TOPK}）for {USER_ID} ===")
pprint(_brief(res["units"]), width=120)

# Knowledge-gap entries are printed in full.
print(f"\n=== Knowledge Gaps（Top-{TOPK}）for {USER_ID} ===")
pprint(res["knowledge"], width=120)

# Write the complete result as UTF-8 JSON under BASE, one file per user ID
# (a re-run for the same user overwrites the file).
OUT = BASE / f"reco_result_{USER_ID}.json"
OUT.write_text(json.dumps(res, ensure_ascii=False, indent=2), encoding="utf-8")
print("\nSaved:", OUT)


可用用户ID（前10）: ['Y3_U1', 'Y3_U2', 'Y3_U3', 'Y3_U4', 'Y3_U5', 'Y4_U1', 'Y4_U2', 'Y4_U3', 'Y4_U4', 'Y4_U5'] …共 40 个

=== Careers（Top-5）for Y3_U1 ===
[]

=== Units（Top-5）for Y3_U1 ===
[{'id': 'BIO.Y3.AC9S3U01::L1',
  'reasons': ['覆盖节点：BIO.Y3.AC9S3U01', '难度=1'],
  'score': 1.0,
  'title': '提升「Living vs non-living; life cycles (intro)」到 L1'},
 {'id': 'EARTH.Y3.AC9S3U02::L1',
  'reasons': ['覆盖节点：EARTH.Y3.AC9S3U02', '难度=1'],
  'score': 1.0,
  'title': '提升「Soils, rocks, minerals (properties & resources)」到 L1'},
 {'id': 'PHYS.Y3.AC9S3U03::L1',
  'reasons': ['覆盖节点：PHYS.Y3.AC9S3U03', '难度=1'],
  'score': 1.0,
  'title': '提升「Heat sources & temperature change」到 L1'},
 {'id': 'CHEM.Y3.AC9S3U04::L1',
  'reasons': ['覆盖节点：CHEM.Y3.AC9S3U04', '难度=1'],
  'score': 1.0,
  'title': '提升「Solids & liquids; changes of state」到 L1'},
 {'id': 'BIO.Y4.AC9S4U01::L1',
  'reasons': ['覆盖节点：BIO.Y4.AC9S4U01', '难度=1'],
  'score': 1.0,
  'title': '提升「Life cycles (development & variation)」到 L1'}]

=== Knowledge Gaps（Top-5）for Y

In [5]:
# Diagnose the actual structure of `users`
print("type(users) =", type(users), "len =", (len(users) if hasattr(users, "__len__") else "n/a"))
if isinstance(users, list) and users:
    # List form: show the keys of the first three records.
    for i, u in enumerate(users[:3]):
        print(f"[list] users[{i}] keys:", list(u.keys()))
elif isinstance(users, dict):
    # Dict form: show the first three top-level keys and, for dict values, their keys.
    sample_keys = list(users.keys())[:3]
    print("[dict] top-level keys (likely user ids):", sample_keys)
    for k in sample_keys:
        if isinstance(users[k], dict):
            print(f"users['{k}'] keys:", list(users[k].keys()))


type(users) = <class 'list'> len = 40
[list] users[0] keys: ['user_id', 'year', 'skills_levels', 'knowledge_progress', 'career_interests', 'grade', 'inquiry_skills', 'knowledge']
[list] users[1] keys: ['user_id', 'year', 'skills_levels', 'knowledge_progress', 'career_interests', 'grade', 'inquiry_skills', 'knowledge']
[list] users[2] keys: ['user_id', 'year', 'skills_levels', 'knowledge_progress', 'career_interests', 'grade', 'inquiry_skills', 'knowledge']


自定义用户测试

In [12]:
# Warm Start demo
# Hand-written user with non-zero inquiry skills and knowledge levels, so that
# is_cold_start() is False and the career scoring path is exercised.
demo_user_y9 = {
    "id": "demo_y9_progress",
    "grade": 9,

    "inquiry_skills": {"QP": 5, "PC": 6, "PAD": 6, "EVAL": 5, "COMM": 5},

    "knowledge": {
        "BIO.Y9.AC9S9U01": 2,  
        "BIO.Y10.AC9S10U01": 2, 
        "EARTH.Y9.AC9S9U02": 1, 

  
        "PHYS.Y9.AC9S9U03": 0,  
        "CHEM.Y9.AC9S9U04": 1, 
        "BIO.Y8.AC9S8U01": 2   
    },

   
    "career_interests": ["career.001"]
}


# get_recommendations reads this module-level flag at call time, so the override
# also stays in effect for every later call until the flag is reassigned.
HIDE_CAREERS_ON_COLDSTART = False

# NOTE: rebinds the global `user` / `res` set by earlier cells.
user = demo_user_y9
res = get_recommendations(user, careers, units_all, topK=5)

from pprint import pprint
print("\n=== Careers (Top-5) — demo_y9_progress ===")
pprint([{k: x[k] for k in ("id","title","score","reasons")} for x in res["careers"]], width=120)

print("\n=== Units (Top-5) — demo_y9_progress ===")
pprint([{k: x[k] for k in ("id","title","score","reasons")} for x in res["units"]], width=120)

print("\n=== Knowledge Gaps (Top-5) — demo_y9_progress ===")
pprint(res["knowledge"], width=120)



=== Careers (Top-5) — demo_y9_progress ===
[{'id': 'career.001',
  'reasons': ['探究技能：达标', '知识覆盖：2.35 / 阈值 1.50（总权重 2.35）'],
  'score': 1.0,
  'title': 'Marine Biologist'},
 {'id': 'career.002', 'reasons': ['探究技能：达标', '知识覆盖：2.35 / 阈值 1.50（总权重 2.35）'], 'score': 1.0, 'title': 'Wildlife Carer'},
 {'id': 'career.003', 'reasons': ['探究技能：达标', '知识覆盖：2.35 / 阈值 1.50（总权重 2.35）'], 'score': 1.0, 'title': 'Park Ranger'},
 {'id': 'career.004',
  'reasons': ['探究技能：达标', '知识覆盖：2.35 / 阈值 1.50（总权重 2.35）'],
  'score': 1.0,
  'title': 'Ecology Field Assistant'},
 {'id': 'career.005',
  'reasons': ['探究技能：达标', '知识覆盖：2.35 / 阈值 1.50（总权重 2.35）'],
  'score': 1.0,
  'title': 'Conservation Officer'}]

=== Units (Top-5) — demo_y9_progress ===
[{'id': 'BIO.Y3.AC9S3U01::L1',
  'reasons': ['覆盖节点：BIO.Y3.AC9S3U01', '难度=1'],
  'score': 1.0,
  'title': '提升「Living vs non-living; life cycles (intro)」到 L1'},
 {'id': 'EARTH.Y3.AC9S3U02::L1',
  'reasons': ['覆盖节点：EARTH.Y3.AC9S3U02', '难度=1'],
  'score': 1.0,
  'title': '提升「Soils

检测所有的用户

In [10]:
# —— 兼容各种结构，安全求和 —— 
def _safe_sum_levels(x) -> int:
    """把 dict/list/scalar 的等级/数值安全求和为整数。"""
    if x is None:
        return 0
    # dict: 取 values
    if isinstance(x, dict):
        iterable = x.values()
    # list: 支持 [int/float/str] 或 [dict(level=..|value=..|score=..|lvl=..)]
    elif isinstance(x, list):
        buf = []
        for item in x:
            if isinstance(item, dict):
                for key in ("level", "value", "score", "lvl", "min_level", "have", "amount"):
                    if key in item:
                        buf.append(item[key])
                        break
            else:
                buf.append(item)
        iterable = buf
    else:
        iterable = [x]

    total = 0
    for v in iterable:
        try:
            total += int(round(float(v)))
        except Exception:
            pass
    return max(0, total)

# -- Cold-start decision (tolerant of soft/hard variants; thresholds can be tuned as needed) --
# is_cold() treats a user as cold-start when either summed total is <= its threshold below.
COLD_KNOWLEDGE_SUM_THRESH = 0
COLD_INQUIRY_SUM_THRESH   = 0

def is_cold(u: dict) -> bool:
    """Return True when the user's summed knowledge or summed inquiry skills is at or below its threshold.

    Knowledge is read from ``knowledge``, falling back to ``knowledge_progress``
    when that is missing or empty; both sums go through ``_safe_sum_levels``.
    """
    knowledge_total = _safe_sum_levels(u.get("knowledge") or u.get("knowledge_progress") or {})
    inquiry_total = _safe_sum_levels(u.get("inquiry_skills") or {})
    # Warm only if both totals exceed their thresholds.
    return not (knowledge_total > COLD_KNOWLEDGE_SUM_THRESH and inquiry_total > COLD_INQUIRY_SUM_THRESH)

# -- Regroup users into cold-start / warm --
# `users_list` is not assigned by any visible cell, so a fresh Restart & Run All used to
# stop here with a NameError. Reuse it if a previous cell left one behind, otherwise fall
# back to `users`.
users_list = globals().get("users_list") or users

cold_ids, warm_ids = [], []
for i, u in enumerate(users_list):
    # Derive the ID from the record itself (same fallback chain used when listing user IDs)
    # instead of indexing a separately built `user_ids` list that may be out of sync.
    uid = u.get("id") or u.get("user_id") or f"user-{i+1:03d}"
    (cold_ids if is_cold(u) else warm_ids).append(uid)

print("用户总数：", len(users_list))
print("冷启动人数：", len(cold_ids))
print("非冷启动人数：", len(warm_ids))
print("冷启动示例：", cold_ids[:10])
print("非冷启动示例：", warm_ids[:10])


用户总数： 40
冷启动人数： 40
非冷启动人数： 0
冷启动示例： ['Y3_U1', 'Y3_U2', 'Y3_U3', 'Y3_U4', 'Y3_U5', 'Y4_U1', 'Y4_U2', 'Y4_U3', 'Y4_U4', 'Y4_U5']
非冷启动示例： []


用“已存在用户”（默认第一个 Y3_U1），先打印该用户是否冷启动，再跑推荐并打印三类结果；

In [11]:
# === 用已存在用户做一次推荐测试（可直接运行） ===
import json, time
from pathlib import Path
from pprint import pprint

# 1) Take the already-loaded users / careers / units
assert 'users' in globals(), "未找到 users，请先运行前面加载数据的单元"
assert 'careers' in globals(), "未找到 careers"
assert 'units_all' in globals() or 'units' in globals(), "未找到 units/units_all"
# Prefer `units_all`; fall back to `units` when it is missing or empty.
_units = globals().get('units_all') or globals().get('units')

# 2) Pick an existing user (change it here)
user_ids = [u.get("id") or u.get("user_id") for u in users]
print("可选用户（前10）：", user_ids[:10], "…共", len(user_ids), "个")
USER_ID = user_ids[0]   # <- set this to the ID you want to test, e.g. "Y4_U3"

user = next(u for u in users if (u.get("id") or u.get("user_id")) == USER_ID)

# 3) 冷启动检查
def _sum_vals(x):
    if not x: return 0
    if isinstance(x, dict): return sum(int(v) for v in x.values())
    if isinstance(x, list): 
        s=0
        for v in x:
            try: s += int(v if not isinstance(v, dict) else list(v.values())[0])
            except: pass
        return s
    try: return int(x)
    except: return 0

sum_know = _sum_vals(user.get("knowledge") or user.get("knowledge_progress"))
sum_inq  = _sum_vals(user.get("inquiry_skills"))
# NOTE(review): this rebinds the global name `is_cold` to a bool; the cell that defines the
# is_cold(u) function must be re-run before that function can be called again.
is_cold  = (sum_know == 0) or (sum_inq == 0)
print(f"\n测试用户：{USER_ID}｜年级={user.get('grade')}｜sum_knowledge={sum_know}｜sum_inquiry={sum_inq}｜cold_start={is_cold}")

# 4) To observe the career-side logic, show careers even on cold start (set back to True to hide them)
HIDE_CAREERS_ON_COLDSTART = False

# 5) Run the recommendation engine
assert 'get_recommendations' in globals(), "未定义 get_recommendations()，请先运行引擎代码单元"
TOPK = 5
res = get_recommendations(user, careers, _units, topK=TOPK)

# 6) Print the results
print(f"\n=== Careers (Top-{TOPK}) for {USER_ID} ===")
pprint([{k: x[k] for k in ("id","title","score","reasons")} for x in res["careers"]], width=120)

print(f"\n=== Units (Top-{TOPK}) for {USER_ID} ===")
pprint([{k: x[k] for k in ("id","title","score","reasons")} for x in res["units"]], width=120)

print(f"\n=== Knowledge Gaps (Top-{TOPK}) for {USER_ID} ===")
pprint(res["knowledge"], width=120)

# 7) Save the output for record-keeping / comparison (the file name carries a Unix timestamp)
out_path = Path(f"reco_result_{USER_ID}_{int(time.time())}.json")
out_path.write_text(json.dumps(res, ensure_ascii=False, indent=2), encoding="utf-8")
print("\n已保存：", out_path.resolve())


可选用户（前10）： ['Y3_U1', 'Y3_U2', 'Y3_U3', 'Y3_U4', 'Y3_U5', 'Y4_U1', 'Y4_U2', 'Y4_U3', 'Y4_U4', 'Y4_U5'] …共 40 个

测试用户：Y3_U1｜年级=3｜sum_knowledge=0｜sum_inquiry=0｜cold_start=True

=== Careers (Top-5) for Y3_U1 ===
[{'id': 'career.001',
  'reasons': ['探究技能未达标：QP,PC,PAD,EVAL,COMM', '知识覆盖：0.00 / 阈值 1.50（总权重 2.35）'],
  'score': 0.0,
  'title': 'Marine Biologist'},
 {'id': 'career.002',
  'reasons': ['探究技能未达标：QP,PC,PAD,EVAL,COMM', '知识覆盖：0.00 / 阈值 1.50（总权重 2.35）'],
  'score': 0.0,
  'title': 'Wildlife Carer'},
 {'id': 'career.003',
  'reasons': ['探究技能未达标：QP,PC,PAD,EVAL,COMM', '知识覆盖：0.00 / 阈值 1.50（总权重 2.35）'],
  'score': 0.0,
  'title': 'Park Ranger'},
 {'id': 'career.004',
  'reasons': ['探究技能未达标：QP,PC,PAD,EVAL,COMM', '知识覆盖：0.00 / 阈值 1.50（总权重 2.35）'],
  'score': 0.0,
  'title': 'Ecology Field Assistant'},
 {'id': 'career.005',
  'reasons': ['探究技能未达标：QP,PC,PAD,EVAL,COMM', '知识覆盖：0.00 / 阈值 1.50（总权重 2.35）'],
  'score': 0.0,
  'title': 'Conservation Officer'}]

=== Units (Top-5) for Y3_U1 ===
[{'id': 'B