# Các thư viện sử dụng

In [1]:
import json, re
import pandas as pd
from pathlib import Path
from collections import defaultdict

In [4]:


data = []
with open("Data/entities/problem.json", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line:  # bỏ qua dòng trống
            data.append(json.loads(line))

print(data[:10])
print(type(data))  # sẽ là list


[{'problem_id': 1730, 'title': '第一课 导论与三家分晋--习题', 'content': '1、《资治通鉴》卷1记载：智宣子将以瑶为后，智果曰：“……瑶之贤于人者五，其不逮者一也。美鬓长大则贤，射御足力则贤，伎艺毕给则贤，巧文辩惠则贤，强毅果敢则贤；如是而甚不仁。夫以其五贤陵人而以不仁行之，其谁能待之？”这是关于智氏家族立接班人的问题，关于智瑶的特点描述，下列哪一项是不符合实际的：', 'option': {'A': '武艺超群，精通射御之术', 'B': '礼贤下士，虚怀若谷', 'C': '反映敏捷，文辞和口才俱佳', 'D': '敢作敢为，处事果断'}, 'answer': '["B"]', 'score': 1.0, 'type': 1, 'typetext': '单选题', 'location': '1.4', 'context_id': [8045581, 8045582, 8045583], 'exercise_id': 'Ex_856', 'language': 'Chinese'}, {'problem_id': 1731, 'title': '第一课 导论与三家分晋--习题', 'content': '2、《资治通鉴》是一部____史书。', 'option': {'A': '纪传体', 'B': '编年体', 'C': '纪事本末体', 'D': '国别体'}, 'answer': '["B"]', 'score': 1.0, 'type': 1, 'typetext': '单选题', 'location': '1.4', 'context_id': [8045581, 8045582, 8045583], 'exercise_id': 'Ex_856', 'language': 'Chinese'}, {'problem_id': 1732, 'title': '第一课 导论与三家分晋--习题', 'content': '3、《资治通鉴》原名____，后由____赐名“资治通鉴”。', 'option': {'A': '《通鉴》；宋神宗', 'B': '《通志》；宋徽宗', 'C': '《通鉴》；宋徽宗', 'D': '《通志》；宋神宗'}, 'answer': '["D"]', 'score': 1.0, '

<h1>13.3.3.4. test relation concept-other</h1>

In [5]:

BASE = Path("Data")
CONCEPT_JSON = BASE / "entities" / "concept.json"
OTHER_JSON   = BASE / "entities" / "other.json"
REL_FILE     = BASE / "relations" / "concept-other.txt"

# --- utils ---
def norm_id(x: str) -> str:
    """Chuẩn hoá ID: về str, strip khoảng trắng. (Không bóc số vì ID dạng 'K_*' / 'Ot_*')."""
    if x is None: return ""
    return str(x).strip()

def iter_json_items(path: Path):
    """
    Đọc linh hoạt file JSON lớn:
      - JSON array: [ {...}, {...}, ... ]
      - NDJSON: mỗi dòng 1 object
    Bỏ qua dòng hỏng.
    """
    with path.open("r", encoding="utf-8") as f:
        head = f.read(2048); f.seek(0)
        if head.lstrip().startswith("["):
            try:
                arr = json.load(f)
                for obj in arr:
                    if isinstance(obj, dict):
                        yield obj
            except Exception:
                pass
        else:
            for line in f:
                line = line.strip().rstrip(",")
                if not line: 
                    continue
                try:
                    obj = json.loads(line)
                    if isinstance(obj, dict):
                        yield obj
                except json.JSONDecodeError:
                    continue

# --- 1) Load relation (concept-other.txt) ---
pairs = []
with REL_FILE.open("r", encoding="utf-8") as f:
    for ln in f:
        ln = ln.strip()
        if not ln: 
            continue
        parts = ln.split("\t")
        if len(parts) != 2:
            continue
        c_raw, o_raw = parts[0], parts[1]
        cid, oid = norm_id(c_raw), norm_id(o_raw)
        if cid and oid:
            pairs.append((cid, oid))

unique_pairs = set(pairs)
concept_in_rel = {c for c, _ in unique_pairs}
other_in_rel   = {o for _, o in unique_pairs}

# --- 2) Scan concept.json & other.json ---
concept_in_json = set()
for obj in iter_json_items(CONCEPT_JSON):
    cid = norm_id(obj.get("id"))
    if cid:
        concept_in_json.add(cid)

other_in_json = set()
for obj in iter_json_items(OTHER_JSON):
    oid = norm_id(obj.get("id"))
    if oid:
        other_in_json.add(oid)

# --- 3) Mismatch sets ---
missing_concepts      = sorted(concept_in_rel - concept_in_json)   # có trong relation nhưng không có trong concept.json
concepts_without_rel  = sorted(concept_in_json - concept_in_rel)   # có trong concept.json nhưng không xuất hiện trong relation

missing_others        = sorted(other_in_rel - other_in_json)       # có trong relation nhưng không có trong other.json
others_without_rel    = sorted(other_in_json - other_in_rel)       # có trong other.json nhưng không xuất hiện trong relation

# --- 4) Cardinality chỉ trên cặp HỢP LỆ (đôi bên đều tồn tại) ---
valid_pairs = {(c, o) for (c, o) in unique_pairs if (c in concept_in_json) and (o in other_in_json)}

c_to_o = defaultdict(set)
o_to_c = defaultdict(set)
for c, o in valid_pairs:
    c_to_o[c].add(o)
    o_to_c[o].add(c)

sizes_c = [len(v) for v in c_to_o.values()]
sizes_o = [len(v) for v in o_to_c.values()]

min_o_per_c = min(sizes_c) if sizes_c else 0
max_o_per_c = max(sizes_c) if sizes_c else 0
avg_o_per_c = (sum(sizes_c) / len(sizes_c)) if sizes_c else 0.0

min_c_per_o = min(sizes_o) if sizes_o else 0
max_c_per_o = max(sizes_o) if sizes_o else 0  # hữu ích để biết có N–N hay N–1
avg_c_per_o = (sum(sizes_o) / len(sizes_o)) if sizes_o else 0.0

# --- 5) Report ---
print("===== CONCEPT–OTHER CONSISTENCY REPORT =====")
print(f"Rows in relations file:                        {len(pairs):,}")
print(f"Unique (concept, other) pairs:                 {len(unique_pairs):,}\n")

print(f"Concepts in concept.json:                      {len(concept_in_json):,}")
print(f"Concepts referenced in relation:               {len(concept_in_rel):,}")
print(f"→ Missing concepts (rel ↛ concept.json):       {len(missing_concepts):,}")
print(f"→ Concepts without relation (only JSON):       {len(concepts_without_rel):,}\n")

print(f"Others in other.json:                          {len(other_in_json):,}")
print(f"Others referenced in relation:                  {len(other_in_rel):,}")
print(f"→ Missing others (rel ↛ other.json):            {len(missing_others):,}")
print(f"→ Others without relation (only JSON):          {len(others_without_rel):,}\n")

print(f"#Others per Concept (valid pairs only):         min={min_o_per_c}, max={max_o_per_c}, avg≈{avg_o_per_c:.2f}")
print(f"#Concepts per Other (valid pairs only):         min={min_c_per_o}, max={max_c_per_o}, avg={avg_c_per_o:.2f}")

# (tuỳ chọn) kết luận nhanh 1–1 / 1–N / N–1 / N–N
if   max_o_per_c==1 and max_c_per_o==1: rel_type="1–1"
elif max_o_per_c>1 and  max_c_per_o==1: rel_type="1–N (1 concept → nhiều other)"
elif max_o_per_c==1 and max_c_per_o>1:  rel_type="N–1 (nhiều concept → 1 other)"
else:                                   rel_type="N–N (nhiều–nhiều)"
print(f"⇒ Cardinality (valid pairs only):              {rel_type}")

# --- (tuỳ chọn) Top offenders để debug nhanh ---
def show_top(d, title, prefix, k=5):
    if not d:
        print(f"\n{title}: (trống)")
        return
    top = sorted(d.items(), key=lambda kv: -len(kv[1]))[:k]
    print(f"\n{title}:")
    for key, vals in top:
        sample = ", ".join(sorted(vals)[:5])
        print(f"  {prefix}{key} → {len(vals)} items (vd: {sample})")

show_top(c_to_o, "Top concepts có nhiều other nhất", "K_")
show_top(o_to_c, "Top others có nhiều concept nhất", "Ot_")


===== CONCEPT–OTHER CONSISTENCY REPORT =====
Rows in relations file:                        379,926
Unique (concept, other) pairs:                 246,171

Concepts in concept.json:                      637,572
Concepts referenced in relation:               101,548
→ Missing concepts (rel ↛ concept.json):       0
→ Concepts without relation (only JSON):       536,024

Others in other.json:                          207,653
Others referenced in relation:                  64,858
→ Missing others (rel ↛ other.json):            0
→ Others without relation (only JSON):          142,795

#Others per Concept (valid pairs only):         min=1, max=6, avg≈2.42
#Concepts per Other (valid pairs only):         min=1, max=222, avg=3.80
⇒ Cardinality (valid pairs only):              N–N (nhiều–nhiều)

Top concepts có nhiều other nhất:
  K_K_刚体运动_力学 → 6 items (vd: Ot_2wXX5i7TZeiwmKHFdHAAR8, Ot_3uHAuGJww7pSNzuBabzjwm, Ot_DUqUWemgp3N23FiLhm7snm, Ot_QwWP6uipqrSiWJUL2bCXjV, Ot_RRU4GoVWbtasqWMGDWQmg7)
  K_

<h1>1.3.3.5. kiểm tra relation concept-paper</h1>

In [6]:

BASE = Path("Data")
CONCEPT_JSON = BASE / "entities" / "concept.json"
PAPER_JSON   = BASE / "entities" / "paper.json"
REL_FILE     = BASE / "relations" / "concept-paper.txt"

# ---------- utils ----------
NUM = re.compile(r"\d+")

def norm_concept_id(x):
    """Concept ID: giữ nguyên (K_...), chỉ strip."""
    if x is None: return ""
    return str(x).strip()

def digits(x) -> str:
    """Rút chuỗi SỐ từ bất kỳ đầu vào (int/str có tiền tố)."""
    if x is None: return ""
    s = str(x).strip()
    m = NUM.search(s)
    return m.group(0) if m else s  # nếu đã là số thuần thì giữ nguyên

def iter_json_items(path: Path):
    """Đọc được JSON array lớn hoặc NDJSON (mỗi dòng 1 object)."""
    with path.open("r", encoding="utf-8") as f:
        head = f.read(2048); f.seek(0)
        if head.lstrip().startswith("["):
            try:
                for obj in json.load(f):
                    if isinstance(obj, dict): yield obj
            except Exception:
                pass
        else:
            for line in f:
                line = line.strip().rstrip(",")
                if not line: continue
                try:
                    obj = json.loads(line)
                    if isinstance(obj, dict): yield obj
                except json.JSONDecodeError:
                    continue

# --- 1) Load relation (concept-paper.txt) ---
pairs = []
with REL_FILE.open("r", encoding="utf-8") as f:
    for ln in f:
        ln = ln.strip()
        if not ln: 
            continue
        parts = ln.split("\t")
        if len(parts) != 2:
            continue
        c_raw, p_raw = parts[0].strip(), parts[1].strip()
        cid = norm_concept_id(c_raw)   # concept giữ nguyên
        # paper trong REL có tiền tố P_ -> bỏ đi
        pid = p_raw[2:] if p_raw.startswith("P_") else p_raw
        if cid and pid:
            pairs.append((cid, pid))

unique_pairs   = set(pairs)
concept_in_rel = {c for c, _ in unique_pairs}
paper_in_rel   = {p for _, p in unique_pairs}

# ---------- 2) Scan concept.json & paper.json ----------
concept_in_json = set()
for obj in iter_json_items(CONCEPT_JSON):
    cid = norm_concept_id(obj.get("id") or obj.get("concept_id"))
    if cid: concept_in_json.add(cid)

paper_in_json = set()
for obj in iter_json_items(PAPER_JSON):
    pid_raw = obj.get("id") or obj.get("paper_id")
    if pid_raw:
        paper_in_json.add(str(pid_raw).strip())

# ---------- 3) Mismatch sets ----------
missing_concepts      = sorted(concept_in_rel - concept_in_json)
concepts_without_rel  = sorted(concept_in_json - concept_in_rel)

missing_papers        = sorted(paper_in_rel - paper_in_json)
papers_without_rel    = sorted(paper_in_json - paper_in_rel)

# ---------- 4) Cardinality (chỉ trên cặp HỢP LỆ) ----------
valid_pairs = {(c, p) for (c, p) in unique_pairs if (c in concept_in_json) and (p in paper_in_json)}

from collections import defaultdict
c_to_p, p_to_c = defaultdict(set), defaultdict(set)
for c, p in valid_pairs:
    c_to_p[c].add(p)
    p_to_c[p].add(c)

sizes_c = [len(v) for v in c_to_p.values()]  # #papers per concept
sizes_p = [len(v) for v in p_to_c.values()]  # #concepts per paper

min_p_per_c = min(sizes_c) if sizes_c else 0
max_p_per_c = max(sizes_c) if sizes_c else 0
avg_p_per_c = (sum(sizes_c) / len(sizes_c)) if sizes_c else 0.0

min_c_per_p = min(sizes_p) if sizes_p else 0
max_c_per_p = max(sizes_p) if sizes_p else 0
avg_c_per_p = (sum(sizes_p) / len(sizes_p)) if sizes_p else 0.0

# ---------- 5) Report ----------
print("===== CONCEPT–PAPER CONSISTENCY REPORT =====")
print(f"Rows in relations file:                        {len(pairs):,}")
print(f"Unique (concept, paper) pairs:                 {len(unique_pairs):,}\n")

print(f"Concepts in concept.json:                      {len(concept_in_json):,}")
print(f"Concepts referenced in relation:               {len(concept_in_rel):,}")
print(f"→ Missing concepts (rel ↛ concept.json):       {len(missing_concepts):,}")
print(f"→ Concepts without relation (only JSON):       {len(concepts_without_rel):,}\n")

print(f"Papers in paper.json (numeric IDs):            {len(paper_in_json):,}")
print(f"Papers referenced in relation:                 {len(paper_in_rel):,}")
print(f"→ Missing papers (rel ↛ paper.json):           {len(missing_papers):,}")
print(f"→ Papers without relation (only JSON):         {len(papers_without_rel):,}\n")

print(f"#Papers per Concept (valid pairs only):        min={min_p_per_c}, max={max_p_per_c}, avg≈{avg_p_per_c:.2f}")
print(f"#Concepts per Paper (valid pairs only):        min={min_c_per_p}, max={max_c_per_p}, avg≈{avg_c_per_p:.2f}")

if   max_p_per_c==1 and max_c_per_p==1: rel_type="1–1"
elif max_p_per_c>1 and  max_c_per_p==1: rel_type="1–N (1 concept → nhiều paper)"
elif max_p_per_c==1 and max_c_per_p>1:  rel_type="N–1 (nhiều concept → 1 paper)"
else:                                   rel_type="N–N (nhiều–nhiều)"
print(f"⇒ Cardinality (valid pairs only):              {rel_type}")

# Optional: samples để debug
def sample(lst, k=5): return lst[:k] if len(lst)>k else lst
print("\nSamples:")
print("  Missing concepts:", sample(missing_concepts))
print("  Missing papers:", sample(missing_papers))


===== CONCEPT–PAPER CONSISTENCY REPORT =====
Rows in relations file:                        5,410,742
Unique (concept, paper) pairs:                 5,410,742

Concepts in concept.json:                      637,572
Concepts referenced in relation:               542,101
→ Missing concepts (rel ↛ concept.json):       104,636
→ Concepts without relation (only JSON):       200,107

Papers in paper.json (numeric IDs):            1,734,211
Papers referenced in relation:                 1,734,211
→ Missing papers (rel ↛ paper.json):           0
→ Papers without relation (only JSON):         0

#Papers per Concept (valid pairs only):        min=1, max=10, avg≈9.98
#Concepts per Paper (valid pairs only):        min=1, max=6498, avg≈2.71
⇒ Cardinality (valid pairs only):              N–N (nhiều–nhiều)

Samples:
  Missing concepts: ['K_0.618_建筑学', 'K_113号元素_化学', 'K_114号元素_化学', 'K_24节气_地球物理学', 'K_316L不锈钢_化学工程与技术']
  Missing papers: []


In [7]:
path = "Data/entities/paper.json"

# chunksize => pandas chỉ đọc một phần file mỗi lần
reader = pd.read_json(path, lines=True, chunksize=10)  # lấy đúng 10 dòng đầu
first_chunk = next(reader)  # DataFrame 10 dòng
print(first_chunk)


                                            abstract  \
0  Sedimentary pyrite formation during early diag...   
1                                                      
2  Pyrite framboid formation may be the result of...   
3  The pyritization of reactive trace elements in...   
4  Pyrite framboids are densely packed, generally...   
5  Through the use of the electrum-tarnish method...   
6  The reactions of Fe(II) and Fe(III) solutions ...   
7                                                      
8  Rates of aqueous, abiotic pyrite oxidation wer...   
9  Concentrations of magnetite were determined, w...   

                                             authors  \
0  [{'id': '54862cecdabfaed7b5fa27cb', 'name': 'R...   
1  [{'id': '54862cecdabfaed7b5fa27cb', 'name': 'r...   
2  [{'id': '53f4420ddabfaee02ad0ad00', 'name': 'R...   
3  [{'id': '53f43764dabfaeee229b3eb1', 'name': 'M...   
4  [{'id': '53f4420ddabfaee02ad0ad00', 'name': 'R...   
5  [{'id': '53f319b0dabfae9a8442d10d', 'name': 

In [8]:
REL_FILE = Path("Data/relations/concept-paper.txt")

paper_ids = set()   # dùng set để loại bỏ trùng lặp
with REL_FILE.open("r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        parts = line.split("\t")
        if len(parts) != 2:
            continue
        _, p_raw = parts
        paper_ids.add(p_raw.strip())

print(f"Tổng số paper ID khác nhau trong file: {len(paper_ids):,}")


Tổng số paper ID khác nhau trong file: 1,734,211


In [9]:
PAPER_JSON = Path("Data/entities/paper.json")

paper_ids = set()

def iter_json_items(path: Path):
    """Đọc JSON array hoặc NDJSON (mỗi dòng 1 object)."""
    with path.open("r", encoding="utf-8") as f:
        head = f.read(2048); f.seek(0)
        if head.lstrip().startswith("["):
            arr = json.load(f)
            for obj in arr:
                if isinstance(obj, dict):
                    yield obj
        else:
            for line in f:
                line = line.strip().rstrip(",")
                if not line: 
                    continue
                try:
                    obj = json.loads(line)
                    if isinstance(obj, dict):
                        yield obj
                except json.JSONDecodeError:
                    continue

# Thu ID vào set
for obj in iter_json_items(PAPER_JSON):
    pid = obj.get("id") or obj.get("paper_id")
    if pid is not None:
        paper_ids.add(str(pid).strip())

print(f"Tổng số paper ID khác nhau trong paper.json: {len(paper_ids):,}")


Tổng số paper ID khác nhau trong paper.json: 1,734,211


In [10]:
REL_FILE = Path("Data/relations/concept-paper.txt")

unique_lines = set()
with REL_FILE.open("r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        unique_lines.add(line)   # lưu nguyên dòng (concept_id \t paper_id)

print(f"Tổng số dòng khác nhau trong file: {len(unique_lines):,}")


Tổng số dòng khác nhau trong file: 5,410,743


# Kiểm tra relation concept-problem.txt

In [11]:
# --- Paths ---
BASE = Path("Data")
CONCEPT_JSON = BASE / "entities" / "concept.json"
PROBLEM_JSON = BASE / "entities" / "problem.json"
REL_FILE     = BASE / "relations" / "concept-problem.txt"

# --- JSON reader: array hoặc NDJSON ---
def iter_json_items(path: Path):
    with path.open("r", encoding="utf-8") as f:
        head = f.read(2048); f.seek(0)
        if head.lstrip().startswith("["):
            try:
                arr = json.load(f)
                for obj in arr:
                    if isinstance(obj, dict):
                        yield obj
            except Exception:
                pass
        else:
            for line in f:
                line = line.strip().rstrip(",")
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                    if isinstance(obj, dict):
                        yield obj
                except json.JSONDecodeError:
                    continue

# --- 1) Load relation (concept-problem.txt: TSV 2 cột) ---
pairs = []
with REL_FILE.open("r", encoding="utf-8") as f:
    for ln in f:
        ln = ln.strip()
        if not ln:
            continue
        parts = ln.split("\t")
        if len(parts) != 2:
            continue
        c_raw, pr_raw = parts[0].strip(), parts[1].strip()
        if c_raw and pr_raw:
            pairs.append((c_raw, pr_raw))

unique_pairs     = set(pairs)
concept_in_rel   = {c for c, _ in unique_pairs}
problem_in_rel   = {p for _, p in unique_pairs}

# --- 2) Scan concept.json & problem.json (giữ nguyên id) ---
concept_in_json = set()
for obj in iter_json_items(CONCEPT_JSON):
    cid = obj.get("id") or obj.get("concept_id")
    if cid:
        concept_in_json.add(str(cid).strip())

problem_in_json = set()
for obj in iter_json_items(PROBLEM_JSON):
    pid = obj.get("id") or obj.get("problem_id")
    if pid:
        problem_in_json.add(str(pid).strip())

# --- 3) Mismatch sets ---
missing_concepts      = sorted(concept_in_rel - concept_in_json)
concepts_without_rel  = sorted(concept_in_json - concept_in_rel)

missing_problems      = sorted(problem_in_rel - problem_in_json)
problems_without_rel  = sorted(problem_in_json - problem_in_rel)

# --- 4) Cardinality trên cặp hợp lệ ---
valid_pairs = {(c, p) for (c, p) in unique_pairs
               if (c in concept_in_json) and (p in problem_in_json)}

c_to_p = defaultdict(set)  # concept -> {problem}
p_to_c = defaultdict(set)  # problem -> {concept}
for c, p in valid_pairs:
    c_to_p[c].add(p)
    p_to_c[p].add(c)

sizes_c = [len(v) for v in c_to_p.values()]  # #problems per concept
sizes_p = [len(v) for v in p_to_c.values()]  # #concepts per problem

min_p_per_c = min(sizes_c) if sizes_c else 0
max_p_per_c = max(sizes_c) if sizes_c else 0
avg_p_per_c = (sum(sizes_c) / len(sizes_c)) if sizes_c else 0.0

min_c_per_p = min(sizes_p) if sizes_p else 0
max_c_per_p = max(sizes_p) if sizes_p else 0
avg_c_per_p = (sum(sizes_p) / len(sizes_p)) if sizes_p else 0.0

# --- 5) Report ---
print("===== CONCEPT–PROBLEM CONSISTENCY REPORT =====")
print(f"Rows in relations file:                        {len(pairs):,}")
print(f"Unique (concept, problem) pairs:               {len(unique_pairs):,}\n")

print(f"Concepts in concept.json:                      {len(concept_in_json):,}")
print(f"Concepts referenced in relation:               {len(concept_in_rel):,}")
print(f"→ Missing concepts (rel ↛ concept.json):       {len(missing_concepts):,}")
print(f"→ Concepts without relation (only JSON):       {len(concepts_without_rel):,}\n")

print(f"Problems in problem.json:                      {len(problem_in_json):,}")
print(f"Problems referenced in relation:               {len(problem_in_rel):,}")
print(f"→ Missing problems (rel ↛ problem.json):       {len(missing_problems):,}")
print(f"→ Problems without relation (only JSON):       {len(problems_without_rel):,}\n")

print(f"#Problems per Concept (valid pairs only):       min={min_p_per_c}, max={max_p_per_c}, avg≈{avg_p_per_c:.2f}")
print(f"#Concepts per Problem (valid pairs only):       min={min_c_per_p}, max={max_c_per_p}, avg≈{avg_c_per_p:.2f}")

# Kết luận cardinality
if   max_p_per_c==1 and max_c_per_p==1: rel_type="1–1"
elif max_p_per_c>1 and  max_c_per_p==1: rel_type="1–N (1 concept → nhiều problem)"
elif max_p_per_c==1 and max_c_per_p>1:  rel_type="N–1 (nhiều concept → 1 problem)"
else:                                   rel_type="N–N (nhiều–nhiều)"
print(f"⇒ Cardinality (valid pairs only):              {rel_type}")

# (tuỳ chọn) Mẫu để debug nhanh
def sample(lst, k=5): return lst[:k] if len(lst) > k else lst
print("\nSamples:")
print("  Missing concepts:", sample(missing_concepts))
print("  Missing problems:", sample(missing_problems))


===== CONCEPT–PROBLEM CONSISTENCY REPORT =====
Rows in relations file:                        33,180
Unique (concept, problem) pairs:               29,141

Concepts in concept.json:                      637,572
Concepts referenced in relation:               13,822
→ Missing concepts (rel ↛ concept.json):       6,890
→ Concepts without relation (only JSON):       630,640

Problems in problem.json:                      2,454,422
Problems referenced in relation:               19,983
→ Missing problems (rel ↛ problem.json):       19,983
→ Problems without relation (only JSON):       2,454,422

#Problems per Concept (valid pairs only):       min=0, max=0, avg≈0.00
#Concepts per Problem (valid pairs only):       min=0, max=0, avg≈0.00
⇒ Cardinality (valid pairs only):              N–N (nhiều–nhiều)

Samples:
  Missing concepts: ['K_4G技术_计算机科学与技术', 'K_ABS塑料_材料科学与工程', 'K_ATP水解_生物学', 'K_ATP酶_药学', 'K_Android_计算机科学与技术']
  Missing problems: ['Pm_1005436', 'Pm_1007592', 'Pm_1007594', 'Pm_1007606', 

In [12]:
# --- Paths ---
BASE = Path("Data")
CONCEPT_JSON = BASE / "entities" / "concept.json"
PROBLEM_JSON = BASE / "entities" / "problem.json"
REL_FILE     = BASE / "relations" / "concept-problem.txt"

# --- JSON reader: array hoặc NDJSON ---
def iter_json_items(path: Path):
    with path.open("r", encoding="utf-8") as f:
        head = f.read(2048); f.seek(0)
        if head.lstrip().startswith("["):
            try:
                for obj in json.load(f):
                    if isinstance(obj, dict):
                        yield obj
            except Exception:
                pass
        else:
            for line in f:
                line = line.strip().rstrip(",")
                if not line: continue
                try:
                    obj = json.loads(line)
                    if isinstance(obj, dict):
                        yield obj
                except json.JSONDecodeError:
                    continue

# --- helper: bỏ tiền tố ở problem_id trong REL ---
def strip_problem_prefix(s: str) -> str:
    """Bỏ 'Pm_' hoặc 'P_' nếu có; giữ nguyên nếu không có tiền tố."""
    s = s.strip()
    if s.startswith("Pm_"): return s[3:]
    if s.startswith("P_"):  return s[2:]
    return s

# --- 1) Load relation (concept-problem.txt: TSV 2 cột) ---
pairs = []
with REL_FILE.open("r", encoding="utf-8") as f:
    for ln in f:
        ln = ln.strip()
        if not ln: continue
        parts = ln.split("\t")
        if len(parts) != 2: continue
        c_raw, pr_raw = parts[0].strip(), parts[1].strip()
        cid = c_raw                     # concept giữ nguyên (K_…)
        pid = strip_problem_prefix(pr_raw)  # problem: bỏ tiền tố Pm_/P_
        if cid and pid:
            pairs.append((cid, pid))

unique_pairs   = set(pairs)
concept_in_rel = {c for c, _ in unique_pairs}
problem_in_rel = {p for _, p in unique_pairs}

# --- 2) Scan concept.json & problem.json (giữ nguyên id) ---
concept_in_json = set()
for obj in iter_json_items(CONCEPT_JSON):
    cid = obj.get("id") or obj.get("concept_id")
    if cid:
        concept_in_json.add(str(cid).strip())

problem_in_json = set()
for obj in iter_json_items(PROBLEM_JSON):
    pid = obj.get("id") or obj.get("problem_id")
    if pid:
        problem_in_json.add(str(pid).strip())

# --- 3) Mismatch sets ---
missing_concepts      = sorted(concept_in_rel - concept_in_json)
concepts_without_rel  = sorted(concept_in_json - concept_in_rel)

missing_problems      = sorted(problem_in_rel - problem_in_json)
problems_without_rel  = sorted(problem_in_json - problem_in_rel)

# --- 4) Cardinality trên cặp hợp lệ ---
valid_pairs = {(c, p) for (c, p) in unique_pairs
               if (c in concept_in_json) and (p in problem_in_json)}

c_to_p = defaultdict(set)  # concept -> {problem}
p_to_c = defaultdict(set)  # problem -> {concept}
for c, p in valid_pairs:
    c_to_p[c].add(p)
    p_to_c[p].add(c)

sizes_c = [len(v) for v in c_to_p.values()]  # #problems per concept
sizes_p = [len(v) for v in p_to_c.values()]  # #concepts per problem

min_p_per_c = min(sizes_c) if sizes_c else 0
max_p_per_c = max(sizes_c) if sizes_c else 0
avg_p_per_c = (sum(sizes_c) / len(sizes_c)) if sizes_c else 0.0

min_c_per_p = min(sizes_p) if sizes_p else 0
max_c_per_p = max(sizes_p) if sizes_p else 0
avg_c_per_p = (sum(sizes_p) / len(sizes_p)) if sizes_p else 0.0

# --- 5) Report ---
print("===== CONCEPT–PROBLEM CONSISTENCY REPORT =====")
print(f"Rows in relations file:                        {len(pairs):,}")
print(f"Unique (concept, problem) pairs:               {len(unique_pairs):,}\n")

print(f"Concepts in concept.json:                      {len(concept_in_json):,}")
print(f"Concepts referenced in relation:               {len(concept_in_rel):,}")
print(f"→ Missing concepts (rel ↛ concept.json):       {len(missing_concepts):,}")
print(f"→ Concepts without relation (only JSON):       {len(concepts_without_rel):,}\n")

print(f"Problems in problem.json:                      {len(problem_in_json):,}")
print(f"Problems referenced in relation:               {len(problem_in_rel):,}")
print(f"→ Missing problems (rel ↛ problem.json):       {len(missing_problems):,}")
print(f"→ Problems without relation (only JSON):       {len(problems_without_rel):,}\n")

print(f"#Problems per Concept (valid pairs only):       min={min_p_per_c}, max={max_p_per_c}, avg≈{avg_p_per_c:.2f}")
print(f"#Concepts per Problem (valid pairs only):       min={min_c_per_p}, max={max_c_per_p}, avg≈{avg_c_per_p:.2f}")

if   max_p_per_c==1 and max_c_per_p==1: rel_type="1–1"
elif max_p_per_c>1 and  max_c_per_p==1: rel_type="1–N (1 concept → nhiều problem)"
elif max_p_per_c==1 and max_c_per_p>1:  rel_type="N–1 (nhiều concept → 1 problem)"
else:                                   rel_type="N–N (nhiều–nhiều)"
print(f"⇒ Cardinality (valid pairs only):              {rel_type}")

# (tuỳ chọn) mẫu để debug
def sample(lst, k=5): return lst[:k] if len(lst) > k else lst
print("\nSamples:")
print("  Missing concepts:", sample(missing_concepts))
print("  Missing problems:", sample(missing_problems))


===== CONCEPT–PROBLEM CONSISTENCY REPORT =====
Rows in relations file:                        33,180
Unique (concept, problem) pairs:               29,141

Concepts in concept.json:                      637,572
Concepts referenced in relation:               13,822
→ Missing concepts (rel ↛ concept.json):       6,890
→ Concepts without relation (only JSON):       630,640

Problems in problem.json:                      2,454,422
Problems referenced in relation:               19,983
→ Missing problems (rel ↛ problem.json):       3,130
→ Problems without relation (only JSON):       2,437,569

#Problems per Concept (valid pairs only):       min=1, max=6, avg≈2.04
#Concepts per Problem (valid pairs only):       min=1, max=10, avg≈1.35
⇒ Cardinality (valid pairs only):              N–N (nhiều–nhiều)

Samples:
  Missing concepts: ['K_4G技术_计算机科学与技术', 'K_ABS塑料_材料科学与工程', 'K_ATP水解_生物学', 'K_ATP酶_药学', 'K_Android_计算机科学与技术']
  Missing problems: ['1005436', '1007647', '1007652', '1007659', '1007660']


In [13]:
BASE = Path("Data")
PROBLEM_JSON = BASE / "entities" / "problem.json"

def iter_json_items(path: Path):
    """Đọc file JSON lớn: array hoặc NDJSON."""
    with path.open("r", encoding="utf-8") as f:
        head = f.read(2048); f.seek(0)
        if head.lstrip().startswith("["):
            arr = json.load(f)
            for obj in arr:
                if isinstance(obj, dict):
                    yield obj
        else:
            for line in f:
                line = line.strip().rstrip(",")
                if not line: continue
                try:
                    obj = json.loads(line)
                    if isinstance(obj, dict):
                        yield obj
                except json.JSONDecodeError:
                    continue

# --- Đếm số id khác nhau ---
problem_ids = set()
for obj in iter_json_items(PROBLEM_JSON):
    pid = obj.get("id") or obj.get("problem_id")
    if pid:
        problem_ids.add(str(pid).strip())

print("Số lượng id khác nhau trong problem.json:", len(problem_ids))


Số lượng id khác nhau trong problem.json: 2454422


In [14]:
REL_FILE = Path("Data/relations/concept-problem.txt")

problem_ids = set()   # dùng set để loại bỏ trùng lặp
with REL_FILE.open("r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        parts = line.split("\t")
        if len(parts) != 2:
            continue
        _, p_raw = parts
        problem_ids.add(p_raw.strip())

print(f"Tổng số paper ID khác nhau trong file: {len(problem_ids):,}")


Tổng số paper ID khác nhau trong file: 19,983


In [15]:
path = "Data/entities/problem.json"

# chunksize => pandas chỉ đọc một phần file mỗi lần
reader = pd.read_json(path, lines=True, chunksize=50)  # lấy đúng 10 dòng đầu
first_chunk = next(reader)  # DataFrame 10 dòng
print(first_chunk)


    problem_id            title  \
0         1730  第一课 导论与三家分晋--习题   
1         1731  第一课 导论与三家分晋--习题   
2         1732  第一课 导论与三家分晋--习题   
3         1733  第一课 导论与三家分晋--习题   
4         1734  第一课 导论与三家分晋--习题   
5         1735  第一课 导论与三家分晋--习题   
6         1736  第一课 导论与三家分晋--习题   
7         1737  第一课 导论与三家分晋--习题   
8         1738  第一课 导论与三家分晋--习题   
9         1739  第二课 战国前期的政治--习题   
10        1740  第二课 战国前期的政治--习题   
11        1741  第二课 战国前期的政治--习题   
12        1742     第三课 商鞅变法--习题   
13        1743     第三课 商鞅变法--习题   
14        1744     第三课 商鞅变法--习题   
15        1745     第三课 商鞅变法--习题   
16        1746     第三课 商鞅变法--习题   
17        1747     第三课 商鞅变法--习题   
18        1748     第三课 商鞅变法--习题   
19        1749     第三课 商鞅变法--习题   
20        1750    第四课 秦国的外交--习题   
21        1751    第四课 秦国的外交--习题   
22        1752    第四课 秦国的外交--习题   
23        1753    第四课 秦国的外交--习题   
24        1754    第四课 秦国的外交--习题   
25        1755    第四课 秦国的外交--习题   
26        1756    第四课 秦国的外交--习题   
27        1757    第四

# Kiểm tra relation concept-video.txt

In [16]:
# --- Paths ---
BASE = Path("Data")
CONCEPT_JSON = BASE / "entities" / "concept.json"
VIDEO_JSON   = BASE / "entities" / "video.json"
REL_FILE     = BASE / "relations" / "concept-video.txt"

# --- Reader: JSON array hoặc NDJSON ---
def iter_json_items(path: Path):
    with path.open("r", encoding="utf-8") as f:
        head = f.read(2048); f.seek(0)
        if head.lstrip().startswith("["):
            try:
                for obj in json.load(f):
                    if isinstance(obj, dict):
                        yield obj
            except Exception:
                pass
        else:
            for line in f:
                line = line.strip().rstrip(",")
                if not line: 
                    continue
                try:
                    obj = json.loads(line)
                    if isinstance(obj, dict):
                        yield obj
                except json.JSONDecodeError:
                    continue

# --- 1) Load relation (concept-video.txt: TSV 2 cột) ---
pairs = []
with REL_FILE.open("r", encoding="utf-8") as f:
    for ln in f:
        ln = ln.strip()
        if not ln: 
            continue
        parts = ln.split("\t")
        if len(parts) != 2:
            continue
        c_raw, v_raw = parts[0].strip(), parts[1].strip()  # giữ nguyên định dạng
        if c_raw and v_raw:
            pairs.append((c_raw, v_raw))

unique_pairs   = set(pairs)
concept_in_rel = {c for c, _ in unique_pairs}
video_in_rel   = {v for _, v in unique_pairs}

# --- 2) Scan concept.json & video.json (giữ nguyên id) ---
concept_in_json = set()
for obj in iter_json_items(CONCEPT_JSON):
    cid = obj.get("id") or obj.get("concept_id")
    if cid:
        concept_in_json.add(str(cid).strip())

video_in_json = set()
for obj in iter_json_items(VIDEO_JSON):
    vid = obj.get("ccid") or obj.get("id")  # theo schema: ccid là ID chính
    if vid:
        video_in_json.add(str(vid).strip())

# --- 3) Mismatch sets ---
missing_concepts     = sorted(concept_in_rel - concept_in_json)
concepts_without_rel = sorted(concept_in_json - concept_in_rel)

missing_videos       = sorted(video_in_rel - video_in_json)
videos_without_rel   = sorted(video_in_json - video_in_rel)

# --- 4) Cardinality (chỉ trên cặp HỢP LỆ) ---
valid_pairs = {(c, v) for (c, v) in unique_pairs
               if (c in concept_in_json) and (v in video_in_json)}

c_to_v = defaultdict(set)  # concept -> {video}
v_to_c = defaultdict(set)  # video   -> {concept}
for c, v in valid_pairs:
    c_to_v[c].add(v)
    v_to_c[v].add(c)

sizes_c = [len(vs) for vs in c_to_v.values()]  # #videos per concept
sizes_v = [len(cs) for cs in v_to_c.values()]  # #concepts per video

min_v_per_c = min(sizes_c) if sizes_c else 0
max_v_per_c = max(sizes_c) if sizes_c else 0
avg_v_per_c = (sum(sizes_c) / len(sizes_c)) if sizes_c else 0.0

min_c_per_v = min(sizes_v) if sizes_v else 0
max_c_per_v = max(sizes_v) if sizes_v else 0
avg_c_per_v = (sum(sizes_v) / len(sizes_v)) if sizes_v else 0.0

# --- 5) Report ---
print("===== CONCEPT–VIDEO CONSISTENCY REPORT =====")
print(f"Rows in relations file:                        {len(pairs):,}")
print(f"Unique (concept, video) pairs:                 {len(unique_pairs):,}\n")

print(f"Concepts in concept.json:                      {len(concept_in_json):,}")
print(f"Concepts referenced in relation:               {len(concept_in_rel):,}")
print(f"→ Missing concepts (rel ↛ concept.json):       {len(missing_concepts):,}")
print(f"→ Concepts without relation (only JSON):       {len(concepts_without_rel):,}\n")

print(f"Videos in video.json:                          {len(video_in_json):,}")
print(f"Videos referenced in relation:                 {len(video_in_rel):,}")
print(f"→ Missing videos (rel ↛ video.json):           {len(missing_videos):,}")
print(f"→ Videos without relation (only JSON):         {len(videos_without_rel):,}\n")

print(f"#Videos per Concept (valid pairs only):        min={min_v_per_c}, max={max_v_per_c}, avg≈{avg_v_per_c:.2f}")
print(f"#Concepts per Video (valid pairs only):        min={min_c_per_v}, max={max_c_per_v}, avg≈{avg_c_per_v:.2f}")

if   max_v_per_c==1 and max_c_per_v==1: rel_type="1–1"
elif max_v_per_c>1 and  max_c_per_v==1: rel_type="1–N (1 concept → nhiều video)"
elif max_v_per_c==1 and max_c_per_v>1:  rel_type="N–1 (nhiều concept → 1 video)"
else:                                   rel_type="N–N (nhiều–nhiều)"
print(f"⇒ Cardinality (valid pairs only):              {rel_type}")

# (tuỳ chọn) Mẫu để debug nhanh
def sample(lst, k=5): return lst[:k] if len(lst) > k else lst
print("\nSamples:")
print("  Missing concepts:", sample(missing_concepts))
print("  Missing videos:", sample(missing_videos))


===== CONCEPT–VIDEO CONSISTENCY REPORT =====
Rows in relations file:                        624,683
Unique (concept, video) pairs:                 624,683

Concepts in concept.json:                      637,572
Concepts referenced in relation:               217,038
→ Missing concepts (rel ↛ concept.json):       0
→ Concepts without relation (only JSON):       420,534

Videos in video.json:                          59,581
Videos referenced in relation:                 34,101
→ Missing videos (rel ↛ video.json):           0
→ Videos without relation (only JSON):         25,480

#Videos per Concept (valid pairs only):        min=1, max=3647, avg≈2.88
#Concepts per Video (valid pairs only):        min=1, max=183, avg≈18.32
⇒ Cardinality (valid pairs only):              N–N (nhiều–nhiều)

Samples:
  Missing concepts: []
  Missing videos: []


# Kiểm tra relation concept-comment.txt

In [17]:
# --- Paths ---
BASE = Path("Data")
CONCEPT_JSON = BASE / "entities" / "concept.json"
COMMENT_JSON = BASE / "entities" / "comment.json"
REL_FILE     = BASE / "relations" / "concept-comment.txt"

# --- Reader: JSON array hoặc NDJSON ---
def iter_json_items(path: Path):
    with path.open("r", encoding="utf-8") as f:
        head = f.read(2048); f.seek(0)
        if head.lstrip().startswith("["):
            try:
                for obj in json.load(f):
                    if isinstance(obj, dict):
                        yield obj
            except Exception:
                pass
        else:
            for line in f:
                line = line.strip().rstrip(",")
                if not line: 
                    continue
                try:
                    obj = json.loads(line)
                    if isinstance(obj, dict):
                        yield obj
                except json.JSONDecodeError:
                    continue

# --- 1) Load relation (concept-comment.txt: TSV 2 cột) ---
pairs = []
with REL_FILE.open("r", encoding="utf-8") as f:
    for ln in f:
        ln = ln.strip()
        if not ln:
            continue
        parts = ln.split("\t")
        if len(parts) != 2:
            continue
        c_raw, cm_raw = parts[0].strip(), parts[1].strip()  # giữ nguyên định dạng
        if c_raw and cm_raw:
            pairs.append((c_raw, cm_raw))

unique_pairs   = set(pairs)
concept_in_rel = {c for c, _ in unique_pairs}
comment_in_rel = {cm for _, cm in unique_pairs}

# --- 2) Scan concept.json & comment.json (giữ nguyên id) ---
concept_in_json = set()
for obj in iter_json_items(CONCEPT_JSON):
    cid = obj.get("id") or obj.get("concept_id")
    if cid:
        concept_in_json.add(str(cid).strip())

comment_in_json = set()
for obj in iter_json_items(COMMENT_JSON):
    # theo schema: id là khóa chính của comment
    cmid = obj.get("id") or obj.get("comment_id")
    if cmid:
        comment_in_json.add(str(cmid).strip())

# --- 3) Mismatch sets ---
missing_concepts     = sorted(concept_in_rel - concept_in_json)
concepts_without_rel = sorted(concept_in_json - concept_in_rel)

missing_comments     = sorted(comment_in_rel - comment_in_json)
comments_without_rel = sorted(comment_in_json - comment_in_rel)

# --- 4) Cardinality (chỉ trên cặp HỢP LỆ) ---
valid_pairs = {(c, cm) for (c, cm) in unique_pairs
               if (c in concept_in_json) and (cm in comment_in_json)}

c_to_cm = defaultdict(set)  # concept -> {comment}
cm_to_c = defaultdict(set)  # comment -> {concept}
for c, cm in valid_pairs:
    c_to_cm[c].add(cm)
    cm_to_c[cm].add(c)

sizes_c  = [len(vs) for vs in c_to_cm.values()]  # #comments per concept
sizes_cm = [len(cs) for cs in cm_to_c.values()]  # #concepts per comment

min_cm_per_c = min(sizes_c) if sizes_c else 0
max_cm_per_c = max(sizes_c) if sizes_c else 0
avg_cm_per_c = (sum(sizes_c) / len(sizes_c)) if sizes_c else 0.0

min_c_per_cm = min(sizes_cm) if sizes_cm else 0
max_c_per_cm = max(sizes_cm) if sizes_cm else 0
avg_c_per_cm = (sum(sizes_cm) / len(sizes_cm)) if sizes_cm else 0.0

# --- 5) Report ---
print("===== CONCEPT–COMMENT CONSISTENCY REPORT =====")
print(f"Rows in relations file:                        {len(pairs):,}")
print(f"Unique (concept, comment) pairs:               {len(unique_pairs):,}\n")

print(f"Concepts in concept.json:                      {len(concept_in_json):,}")
print(f"Concepts referenced in relation:               {len(concept_in_rel):,}")
print(f"→ Missing concepts (rel ↛ concept.json):       {len(missing_concepts):,}")
print(f"→ Concepts without relation (only JSON):       {len(concepts_without_rel):,}\n")

print(f"Comments in comment.json:                      {len(comment_in_json):,}")
print(f"Comments referenced in relation:               {len(comment_in_rel):,}")
print(f"→ Missing comments (rel ↛ comment.json):       {len(missing_comments):,}")
print(f"→ Comments without relation (only JSON):       {len(comments_without_rel):,}\n")

print(f"#Comments per Concept (valid pairs only):      min={min_cm_per_c}, max={max_cm_per_c}, avg≈{avg_cm_per_c:.2f}")
print(f"#Concepts per Comment (valid pairs only):      min={min_c_per_cm}, max={max_c_per_cm}, avg≈{avg_c_per_cm:.2f}")

if   max_cm_per_c==1 and max_c_per_cm==1: rel_type="1–1"
elif max_cm_per_c>1 and  max_c_per_cm==1: rel_type="1–N (1 concept → nhiều comment)"
elif max_cm_per_c==1 and max_c_per_cm>1:  rel_type="N–1 (nhiều concept → 1 comment)"
else:                                     rel_type="N–N (nhiều–nhiều)"
print(f"⇒ Cardinality (valid pairs only):              {rel_type}")

# (tuỳ chọn) Mẫu để debug nhanh
def sample(lst, k=5): return lst[:k] if len(lst) > k else lst
print("\nSamples:")
print("  Missing concepts:", sample(missing_concepts))
print("  Missing comments:", sample(missing_comments))


===== CONCEPT–COMMENT CONSISTENCY REPORT =====
Rows in relations file:                        31,074
Unique (concept, comment) pairs:               28,229

Concepts in concept.json:                      637,572
Concepts referenced in relation:               9,983
→ Missing concepts (rel ↛ concept.json):       5,314
→ Concepts without relation (only JSON):       632,903

Comments in comment.json:                      8,395,141
Comments referenced in relation:               20,082
→ Missing comments (rel ↛ comment.json):       9,371
→ Comments without relation (only JSON):       8,384,430

#Comments per Concept (valid pairs only):      min=1, max=6, avg≈1.99
#Concepts per Comment (valid pairs only):      min=1, max=19, avg≈1.31
⇒ Cardinality (valid pairs only):              N–N (nhiều–nhiều)

Samples:
  Missing concepts: ['K_4n+2规则_化学', 'K_8-羟基喹啉_药学', 'K_Android_计算机科学与技术', 'K_A股_工商管理', 'K_B/S结构_计算机科学与技术']
  Missing comments: ['Cm_100021', 'Cm_10002289', 'Cm_1000366', 'Cm_1000420', 'Cm_10

In [18]:
path = "Data/entities/comment.json"
# chunksize => pandas chỉ đọc một phần file mỗi lần
reader = pd.read_json(path, lines=True, chunksize=50)  # lấy đúng 10 dòng đầu
first_chunk = next(reader)  # DataFrame 10 dòng
print(first_chunk)


        id   user_id                                               text  \
0     Cm_1  10030806                                               测试评论   
1     Cm_4   1705400                                                 嗯嗯   
2     Cm_5  10031537                                           是的，我也看不到   
3     Cm_7  10031502                                              大师傅as   
4    Cm_12  10031397                                                 点赞   
5    Cm_13  10031397                                                 好滴   
6    Cm_14  10031528                                             很好，赞一个   
7    Cm_16  10031531                                                老师好   
8    Cm_19  10031356                                                 好的   
9    Cm_20  10031356                                                 好的   
10   Cm_23  10031509                       讨论区无直接粘贴功能，无@老师或学生提醒指定人员回答功能   
11   Cm_24  10031531                                                 收到   
12   Cm_29  10031666     