In [1]:
from stark_qa import load_qa, load_skb

# Name of the dataset, plus the local directory handed to load_qa as its
# second positional argument.
dataset_name = 'amazon'
qa_root = "/workspace/yunhai/stark/stark"

# QA pairs and the knowledge base; later cells use both names.
qa_dataset = load_qa(dataset_name, qa_root)
skb = load_skb(dataset_name, download_processed=True)

  from pkg_resources import parse_version


Use file from /workspace/.hf_home/hub/datasets--snap-stanford--stark/snapshots/88269e23e90587f99476c5dd74e235a0877e69be/qa/amazon/stark_qa/stark_qa_human_generated_eval.csv.
Loading from /workspace/.hf_home/hub/datasets--snap-stanford--stark/snapshots/88269e23e90587f99476c5dd74e235a0877e69be/skb/amazon/processed!
Loading cached graph with meta link types ['brand', 'category', 'color']


### Load QA dataset

In [2]:
# Fetch a single QA pair; metadata is masked out to avoid leaking the answer
query, q_id, answer_ids, _ = qa_dataset[1]
print('Query:', query)
print('Query ID:', q_id)
# Resolve every answer id to the title stored on its knowledge-base entry.
answer_titles = [skb[aid].title for aid in answer_ids]
print('Answer:\n', '\n'.join(answer_titles))

Query: Looking for a user-friendly fly fishing knot guide with clear, easy-to-understand illustrations. Ideally, it should be logically organised for easy learning and effective in teaching dependable knot tying techniques. It would be a bonus if it complements the Anglers Accessories Gehrke's Gink that I frequently use. Any recommendations?
Query ID: 1
Answer:
 Lake Products THREE-in-One Knot Tying Tool Fly Fishing
EZ Tie Blood Knot Tying Tool
BenchMaster Pocket Guide - Fly Fishing - Fishing


In [3]:
# Official random split: report the size of the train / validation / test subsets
split_labels = (
    ('Number of training examples:', 'train'),
    ('Number of validation examples:', 'val'),
    ('Number of test examples:', 'test'),
)
for label, split_name in split_labels:
    print(label, len(qa_dataset.get_subset(split_name)))

# Alternatively, you can get the split indices
qa_dataset.get_idx_split()

Number of training examples: 5910
Number of validation examples: 1548
Number of test examples: 1642


{'train': tensor([3885, 4522, 2110,  ..., 6839, 3967, 2814]),
 'val': tensor([1550, 1486, 6591,  ..., 5606, 1204, 3792]),
 'test': tensor([2905, 3863, 4651,  ..., 3891, 7631, 4472]),
 'test-0.1': tensor([   3,   85,  135,  173,  214,  222,  290,  291,  372,  601,  750,  755,
          788,  795,  850,  860,  861,  957, 1080, 1133, 1249, 1330, 1334, 1362,
         1398, 1436, 1524, 1605, 1676, 1815, 1842, 1846, 1938, 1945, 1973, 1991,
         2109, 2117, 2154, 2173, 2186, 2202, 2254, 2415, 2441, 2653, 2679, 2753,
         2759, 2787, 2856, 2992, 3002, 3061, 3123, 3198, 3211, 3293, 3352, 3411,
         3449, 3472, 3724, 3863, 3903, 3913, 4018, 4094, 4270, 4344, 4382, 4398,
         4512, 4568, 4614, 4636, 4637, 4640, 4646, 4811, 4942, 4997, 5001, 5129,
         5161, 5227, 5413, 5433, 5454, 5677, 5696, 5850, 5863, 5915, 5945, 5965,
         6035, 6072, 6094, 6246, 6289, 6312, 6321, 6336, 6369, 6418, 6425, 6609,
         6612, 6621, 6716, 6733, 6753, 6766, 6793, 6829, 6876, 6905, 6915, 7

### Load QA dataset - Human generated split

In [4]:
# Load the human-generated evaluation set (requested via
# human_generated_eval=True) for the same dataset name used above.
# The bare len(...) on the last line is shown as the cell's result.
qa_dataset_hg = load_qa(dataset_name, human_generated_eval=True)
len(qa_dataset_hg)

Use file from /workspace/.hf_home/hub/datasets--snap-stanford--stark/snapshots/88269e23e90587f99476c5dd74e235a0877e69be/qa/amazon/stark_qa/stark_qa_human_generated_eval.csv.


81

In [3]:
#!/usr/bin/env python3
# export_stark_to_sqlite.py

import sqlite3
import json
import math
from stark_qa import load_skb


def to_text(x):
    """
    Convert an arbitrary value into text that SQLite can store.

    Rules, applied in order:
      - None, or a float NaN        -> empty string
      - str / int / float           -> str(x)
      - anything else (list, dict…) -> JSON string with non-ASCII characters
        kept as-is. Float NaN values nested inside lists, tuples or dict
        values are written as ``null``; json.dumps would otherwise emit the
        bare token ``NaN``, which is not valid JSON. Objects json cannot
        serialise are converted with str() via ``default=str``.
      - if JSON serialisation fails -> str(x)

    Returns:
        str: the text representation described above.
    """
    if x is None:
        return ""
    # A top-level NaN is treated as a missing value.
    if isinstance(x, float) and math.isnan(x):
        return ""
    # Scalars are stored through their plain string form.
    if isinstance(x, (str, int, float)):
        return str(x)

    def _nan_to_none(value):
        # Recursively swap float NaN for None inside containers. Tuples become
        # lists, which is what json.dumps turns them into anyway.
        if isinstance(value, float) and math.isnan(value):
            return None
        if isinstance(value, dict):
            return {key: _nan_to_none(item) for key, item in value.items()}
        if isinstance(value, (list, tuple)):
            return [_nan_to_none(item) for item in value]
        return value

    # Containers and other objects: serialise as JSON. The clean-up runs inside
    # the try so that a self-referencing container still falls back to str(x).
    try:
        return json.dumps(_nan_to_none(x), ensure_ascii=False, default=str)
    except Exception:
        return str(x)


def create_tables(conn):
    """
    Create the ``entity``, ``feature`` and ``relation`` tables and the two
    indexes on ``relation`` if they do not exist yet, then commit.

    Args:
        conn: an open sqlite3 connection.
    """
    # Entity table: one row per entity, holding all of the exported fields.
    entity_ddl = """
    CREATE TABLE IF NOT EXISTS entity (
      id               INTEGER PRIMARY KEY,
      review           TEXT,
      qa               TEXT,
      asin             TEXT,
      title            TEXT,
      global_category  TEXT,
      category         TEXT,
      price            TEXT,
      brand            TEXT,
      rank             INTEGER,
      details          TEXT,
      description      TEXT
    );
    """
    # Feature table: one row per item of an entity's feature list.
    feature_ddl = """
    CREATE TABLE IF NOT EXISTS feature (
      entity_id     INTEGER,
      feature_order INTEGER,
      feature_text  TEXT,
      PRIMARY KEY(entity_id, feature_order),
      FOREIGN KEY(entity_id) REFERENCES entity(id)
    );
    """
    # Relation table: one row per (source, destination, relation type) edge.
    relation_ddl = """
    CREATE TABLE IF NOT EXISTS relation (
      src_id   INTEGER,
      dst_id   INTEGER,
      rel_type TEXT,
      PRIMARY KEY(src_id, dst_id, rel_type),
      FOREIGN KEY(src_id) REFERENCES entity(id),
      FOREIGN KEY(dst_id) REFERENCES entity(id)
    );
    """
    # Indexes on both ends of an edge.
    statements = (
        entity_ddl,
        feature_ddl,
        relation_ddl,
        "CREATE INDEX IF NOT EXISTS idx_rel_src ON relation(src_id);",
        "CREATE INDEX IF NOT EXISTS idx_rel_dst ON relation(dst_id);",
    )

    cur = conn.cursor()
    for statement in statements:
        cur.execute(statement)
    conn.commit()


def export_skb_to_sqlite(skb, sqlite_path="stark_amazon.db", limit=1000):
    """
    Export entities, their feature lists and their outgoing relations from
    ``skb`` into a SQLite database.

    Args:
        skb: knowledge base object. This function uses ``len(skb)``,
            ``skb[idx]``, ``skb.rel_type_lst()`` and
            ``skb.get_neighbor_nodes(idx, rel)``.
        sqlite_path: path of the SQLite file to create or extend.
        limit: number of leading entities (indices ``0 .. limit-1``) to
            export. The default of 1000 matches the previously hard-coded
            value; pass ``None`` to export every entity. The value is clamped
            to ``len(skb)`` so a smaller knowledge base does not run past its
            end.

    Notes:
        - All rows are written with ``INSERT OR IGNORE``, so re-running against
          an existing database keeps rows that are already present and does
          not update them.
        - Neighbours are not filtered against ``limit``, so ``relation.dst_id``
          may refer to an entity that was not exported. No
          ``PRAGMA foreign_keys`` is issued here.
        - The connection is always closed, including when an error occurs;
          work that has not been committed by then is not saved.
    """
    total = len(skb)
    count = total if limit is None else max(0, min(int(limit), total))

    # Every column except id and rank is stored through to_text(); price is
    # included so it is converted the same way as the other text fields.
    text_columns = (
        "review", "qa", "asin", "title", "global_category",
        "category", "price", "brand", "details", "description",
    )
    entity_sql = """
        INSERT OR IGNORE INTO entity(
          id, review, qa, asin, title, global_category,
          category, price, brand, rank, details, description
        ) VALUES (
          :id, :review, :qa, :asin, :title, :global_category,
          :category, :price, :brand, :rank, :details, :description
        )
    """
    feature_sql = """
        INSERT OR IGNORE INTO feature(entity_id, feature_order, feature_text)
        VALUES (?,?,?)
    """
    relation_sql = """
        INSERT OR IGNORE INTO relation(src_id, dst_id, rel_type)
        VALUES (?,?,?)
    """

    conn = sqlite3.connect(sqlite_path)
    try:
        create_tables(conn)
        c = conn.cursor()

        print("→ 导出实体及其字段 …")
        for eid in range(count):
            node = skb[eid]

            row = {name: to_text(getattr(node, name, None)) for name in text_columns}
            row["id"] = eid
            # rank is stored as an integer only when its string form is all
            # digits; anything else becomes NULL.
            rank_raw = getattr(node, "rank", None)
            row["rank"] = (
                int(rank_raw)
                if (rank_raw is not None and str(rank_raw).isdigit())
                else None
            )
            c.execute(entity_sql, row)

            # A missing / NaN feature value means "no features"; a lone string
            # is one feature rather than a sequence of characters.
            feats = getattr(node, "feature", None)
            if feats is None or (isinstance(feats, float) and math.isnan(feats)):
                feats = []
            elif isinstance(feats, str):
                feats = [feats]
            c.executemany(
                feature_sql,
                ((eid, order, to_text(ft)) for order, ft in enumerate(feats)),
            )

        conn.commit()
        print("→ 实体导出完成，开始导出关系 …")

        # Export the edges of the same entities. The relation types are read
        # once up front instead of once per entity.
        rel_types = list(skb.rel_type_lst())
        for src_id in range(count):
            for rel in rel_types:
                nbrs = skb.get_neighbor_nodes(src_id, rel)
                if nbrs is None or len(nbrs) == 0:
                    continue
                # int() keeps the bound value a plain Python integer.
                c.executemany(
                    relation_sql,
                    ((src_id, int(dst_id), rel) for dst_id in nbrs),
                )

        conn.commit()
    finally:
        conn.close()
    print(f"√ 导出完成，数据库保存在 {sqlite_path}")


if __name__ == "__main__":
    # Reuse the knowledge base if an earlier cell already loaded it; load it
    # here only when this code runs on a fresh kernel or as a standalone
    # script, where `skb` would otherwise be undefined.
    if "skb" not in globals():
        skb = load_skb("amazon", download_processed=True)
    export_skb_to_sqlite(skb)


→ 导出实体及其字段 …
→ 实体导出完成，开始导出关系 …
√ 导出完成，数据库保存在 stark_amazon.db


In [13]:
# Peek at the first entry only (assumes SKB supports access by integer index)
if len(skb) > 0:
    idx = 0
    node = skb[idx]
    print(node)
    print(node.review)
    print(node.qa)

--review
--qa
--asin
--title
--global_category
--category
--price
--brand
--feature
--rank
--details
--description

[{'reviewerID': 'AN5DRYRTXRBDW', 'summary': 'Looks Great', 'style': nan, 'reviewText': 'I am at the larger end of the size spectrum (size 14), and the waist band was still comfortable.', 'vote': '2', 'overall': 5.0, 'verified': True, 'reviewTime': '05 2, 2016'}, {'reviewerID': 'A2INZEU0RHYV7B', 'summary': 'Five Stars', 'style': nan, 'reviewText': 'Perfect!!', 'vote': nan, 'overall': 5.0, 'verified': True, 'reviewTime': '05 2, 2016'}, {'reviewerID': 'A3AJYZEIM88ILH', 'summary': 'Five Stars', 'style': nan, 'reviewText': 'Great color!', 'vote': nan, 'overall': 5.0, 'verified': True, 'reviewTime': '11 30, 2015'}, {'reviewerID': 'A36NU1M3F6ZD7U', 'summary': 'Three Stars', 'style': nan, 'reviewText': 'The tutu is nice, but I wish it was longer!  Thanks!', 'vote': '2', 'overall': 3.0, 'verified': True, 'reviewTime': '11 16, 2015'}, {'reviewerID': 'A38LGHDAXR0MKL', 'summary': 'On