In [23]:
import os

from stark_qa import load_qa, load_skb

dataset_name = 'amazon'

# Optional location of the QA files. This cell used to pass a user-specific
# absolute path as the second argument of load_qa, which only exists on one
# machine. Set the STARK_QA_ROOT environment variable to supply a directory;
# when it is unset, load_qa is called with the dataset name only, exactly as
# the later cells of this notebook do.
qa_root = os.environ.get('STARK_QA_ROOT')
qa_dataset = load_qa(dataset_name, qa_root) if qa_root else load_qa(dataset_name)

# Load the knowledge base for the same dataset. With download_processed=True
# the log below reports loading from a ".../processed" directory.
skb = load_skb(dataset_name, download_processed=True)

Use file from /workspace/.hf_home/hub/datasets--snap-stanford--stark/snapshots/88269e23e90587f99476c5dd74e235a0877e69be/qa/amazon/stark_qa/stark_qa_human_generated_eval.csv.
Loading from /workspace/.hf_home/hub/datasets--snap-stanford--stark/snapshots/88269e23e90587f99476c5dd74e235a0877e69be/skb/amazon/processed!
Loading cached graph with meta link types ['brand', 'category', 'color']


### Load QA dataset

In [24]:
# Fetch one QA example as (query, query ID, answer node IDs, metadata).
# The metadata slot is masked out to avoid leaking the answer, so it is discarded.
query, q_id, answer_ids, _ = qa_dataset[1]
print('Query:', query)
print('Query ID:', q_id)
# Look up the title of every answer node in the knowledge base.
answer_titles = [skb[aid].title for aid in answer_ids]
# Concatenate rather than passing two arguments to print(): with the default
# sep=' ' the first title would be indented by one stray space.
print('Answer:\n' + '\n'.join(answer_titles))
print('Answer IDs:', answer_ids)

Query: Looking for a user-friendly fly fishing knot guide with clear, easy-to-understand illustrations. Ideally, it should be logically organised for easy learning and effective in teaching dependable knot tying techniques. It would be a bonus if it complements the Anglers Accessories Gehrke's Gink that I frequently use. Any recommendations?
Query ID: 1
Answer:
 Lake Products THREE-in-One Knot Tying Tool Fly Fishing
EZ Tie Blood Knot Tying Tool
BenchMaster Pocket Guide - Fly Fishing - Fishing
Answer IDs: [291785, 416396, 30]


In [3]:
# An official random split into training, validation and test subsets is provided;
# report the size of each subset.
print(f"Number of training examples: {len(qa_dataset.get_subset('train'))}")
print(f"Number of validation examples: {len(qa_dataset.get_subset('val'))}")
print(f"Number of test examples: {len(qa_dataset.get_subset('test'))}")

# The same split is also available as index tensors keyed by split name
qa_dataset.get_idx_split()

Number of training examples: 5910
Number of validation examples: 1548
Number of test examples: 1642


{'train': tensor([3885, 4522, 2110,  ..., 6839, 3967, 2814]),
 'val': tensor([1550, 1486, 6591,  ..., 5606, 1204, 3792]),
 'test': tensor([2905, 3863, 4651,  ..., 3891, 7631, 4472]),
 'test-0.1': tensor([   3,   85,  135,  173,  214,  222,  290,  291,  372,  601,  750,  755,
          788,  795,  850,  860,  861,  957, 1080, 1133, 1249, 1330, 1334, 1362,
         1398, 1436, 1524, 1605, 1676, 1815, 1842, 1846, 1938, 1945, 1973, 1991,
         2109, 2117, 2154, 2173, 2186, 2202, 2254, 2415, 2441, 2653, 2679, 2753,
         2759, 2787, 2856, 2992, 3002, 3061, 3123, 3198, 3211, 3293, 3352, 3411,
         3449, 3472, 3724, 3863, 3903, 3913, 4018, 4094, 4270, 4344, 4382, 4398,
         4512, 4568, 4614, 4636, 4637, 4640, 4646, 4811, 4942, 4997, 5001, 5129,
         5161, 5227, 5413, 5433, 5454, 5677, 5696, 5850, 5863, 5915, 5945, 5965,
         6035, 6072, 6094, 6246, 6289, 6312, 6321, 6336, 6369, 6418, 6425, 6609,
         6612, 6621, 6716, 6733, 6753, 6766, 6793, 6829, 6876, 6905, 6915, 7

### Load QA dataset - Human-generated split

In [4]:
# A human-generated evaluation set is also provided: load it for the current
# dataset_name (human_generated_eval=True) and show how many examples it has.
qa_dataset_hg = load_qa(dataset_name, human_generated_eval=True)
len(qa_dataset_hg)

Use file from /workspace/.hf_home/hub/datasets--snap-stanford--stark/snapshots/88269e23e90587f99476c5dd74e235a0877e69be/qa/amazon/stark_qa/stark_qa_human_generated_eval.csv.


81

In [5]:
# Split indices of the human-generated set, returned as a dict mapping
# split name -> index tensor (see the output below).
qa_dataset_hg.get_idx_split()

{'human_generated_eval': tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
         18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
         36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
         54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71,
         72, 73, 74, 75, 76, 77, 78, 79, 80])}

### Load Knowledge Base

In [6]:
# Part of the knowledge base schema: get_tuples() returns
# (node type, relation type, node type) triples, as shown in the output below.
skb.get_tuples()

[('brand', 'has_brand', 'product'),
 ('category', 'has_category', 'product'),
 ('color', 'has_color', 'product'),
 ('product', 'also_buy', 'product'),
 ('product', 'also_view', 'product'),
 ('product', 'has_brand', 'brand'),
 ('product', 'has_category', 'category'),
 ('product', 'has_color', 'color')]

In [7]:
# Similarly, you can get the node types and the relation types as two lists
skb.node_type_lst(), skb.rel_type_lst()

(['product', 'brand', 'category', 'color'],
 ['also_buy', 'also_view', 'has_brand', 'has_category', 'has_color'])

In [8]:
# Overall size of the knowledge base graph.
print(f'Number of nodes: {skb.num_nodes()}')
print(f'Number of edges: {skb.num_edges()}')

Number of nodes: 1035542
Number of edges: 9443802


In [9]:
# The schema also lists, per node type, the attributes that make up a node's textual information.
# Note that some nodes may lack some of these attributes, while others may have additional ones.
skb.node_attr_dict

{'product': ['title',
  'dimensions',
  'weight',
  'description',
  'features',
  'reviews',
  'Q&A'],
 'brand': ['brand_name'],
 'category': ['category_name'],
 'color': ['color_name']}

In [10]:
# Relation types on their own (the same list that the node/relation type cell
# above shows as the second element of its tuple).
skb.rel_type_lst()

['also_buy', 'also_view', 'has_brand', 'has_category', 'has_color']

In [11]:
# Each node has textual information; print the document of the first answer node.
# NOTE(review): add_rel=True presumably appends relation info to the document —
# the saved output below is truncated before that point, so confirm.
print(skb.get_doc_info(answer_ids[0], add_rel=True))

- product: Lake Products THREE-in-One Knot Tying Tool Fly Fishing
- brand: Lake
- description: NEW & IMPROVED - Replaces the Two-in-One Knot Tying Tool - still ties many over 14 different knots, but now adds a magnetic hook threader; made of Delron and stainless steel; instruction book included.Precision machined contact firmly grips any fishing line, without causing damageUp and down spring action with stainless steel springThe body is manufactured of strong, lightweight Acetel Delrin for years of reliable serviceStainless Steel Shaft, head and loop will not rust or corrodeAttachment loop to clip onto clothing
- features: 
#1: Precision machined contact firmly grips any fishing line, without causing damage
#2: Up and down spring action with stainless steel spring
#3: The body is manufactured of strong, lightweight Acetel Delrin for years of reliable service
#4: Stainless Steel Shaft, head and loop will not rust or corrode
#5: Attachment loop to clip onto clothing
- reviews: 
#9:
summa

In [12]:
# Nodes are linked to one another: collect the neighbors of the first answer
# node (edge_type='*' is presumably a wildcard over all edge types — confirm)
# and report how many there are.
neighbor_lst = skb.get_neighbor_nodes(answer_ids[0], edge_type='*')
print(f'The neighbors of the answer node are: {len(neighbor_lst)}')

The neighbors of the answer node are: 222


In [13]:
# Relation information can be rendered as text as well. With n_rel=5 the
# output below lists five entries under each product-to-product relation.
print(skb.get_rel_info(answer_ids[0], n_rel=5))

- relations:
  products also purchased: 
#1: BenchMaster Pocket Guide - Fly Fishing - Fishing
#2: Rebel Lures Teeny Wee Crawfish Fishing Lure
#3: Eagle Claw Featherlight 3/4 Line Weight Fly Rod, 2 Piece (Yellow, 6-Feet 6-Inch), 4/5 weight
#4: Cortland 444 Classic Double Taper Floating Fly Line
#5: Water Gremlin Gremlin Green/Tin Removable Split Shot Selector, 28ea/BB, 20ea/3/0, 8ea/7, 6ea/5
  products also viewed: 
#1: Loon Outdoors UV Knot Sense
#2: Orvis Tie-fast Knot-tying Tool
#3: Umpqua Nylon Tippet Spools
#4: Umpqua Nylon Tippet Spools
#5: Stonfo Pinza Elite Hackle Plier - Original
  brand: Lake



In [14]:
# Tally how many neighbors there are of each node type
from collections import Counter

# Map every neighbor ID to its node type, then count the occurrences.
neighbor_types = list(map(skb.get_node_type_by_id, neighbor_lst))
type_counts = Counter(neighbor_types)
print(type_counts)

Counter({'product': 221, 'brand': 1})


### Take PrimeKG as another example

In [15]:
# Switch to the 'prime' dataset.
# NOTE: this rebinds dataset_name, qa_dataset and skb, so the Amazon objects
# loaded earlier are no longer reachable through these names once this cell
# has run; re-run the first cell before re-running any Amazon cell.
from stark_qa import load_qa, load_skb
dataset_name = 'prime'

qa_dataset = load_qa(dataset_name)
skb = load_skb(dataset_name, download_processed=True)

Use file from /workspace/.hf_home/hub/datasets--snap-stanford--stark/snapshots/88269e23e90587f99476c5dd74e235a0877e69be/qa/prime/stark_qa/stark_qa_human_generated_eval.csv.
Loading from /workspace/.hf_home/hub/datasets--snap-stanford--stark/snapshots/88269e23e90587f99476c5dd74e235a0877e69be/skb/prime/processed!


In [16]:
# Human-generated evaluation set for the current dataset_name, and its size.
# This rebinds qa_dataset_hg, replacing the set loaded earlier in the notebook.
qa_dataset_hg = load_qa(dataset_name, human_generated_eval=True)
len(qa_dataset_hg)

Use file from /workspace/.hf_home/hub/datasets--snap-stanford--stark/snapshots/88269e23e90587f99476c5dd74e235a0877e69be/qa/prime/stark_qa/stark_qa_human_generated_eval.csv.


98

In [17]:
# (number of nodes, number of edges) of the knowledge base currently bound to skb.
skb.num_nodes(), skb.num_edges()

(129375, 8100498)

In [18]:
# One QA example shown as a raw tuple: (query, query ID, answer node IDs, metadata);
# the metadata slot is None in the output below.
qa_dataset[1]

('What drugs target the CYP3A4 enzyme and are used to treat strongyloidiasis?',
 1,
 [15450],
 None)

In [19]:
# Textual document of node 15450, the single answer ID of the QA example above.
# add_rel=False presumably leaves relation info out of the document — confirm.
print(skb.get_doc_info(15450, add_rel=False))

- name: Ivermectin
- type: drug
- source: DrugBank
- details:
  - description: Ivermectin is a broad-spectrum anti-parasite medication. It was first marketed under the name Stromectol® and used against worms (except tapeworms), but, in 2012, it was approved for the topical treatment of head lice infestations in patients 6 months of age and older, and marketed under the name Sklice™ as well. Ivermectin is mainly used in humans in the treatment of onchocerciasis, but is also effective against other worm infestations (such as strongyloidiasis, ascariasis, trichuriasis and enterobiasis).
  - half_life: 16 hours (also reported at 22-28 hours)
  - indication: For the treatment of intestinal (i.e., nondisseminated) strongyloidiasis due to the nematode parasite <i>Strongyloides stercoralis</i>. Also for the treatment of onchocerciasis (river blindness) due to the nematode parasite <i>Onchocerca volvulus</i>. Can be used to treat scabies caused by <i>Sarcoptes scabiei</i>.
  - mechanism_of_acti

In [20]:
# NOTE(review): this cell repeats the previous cell verbatim (same call, same
# output) — likely a leftover; consider removing it or changing its arguments.
print(skb.get_doc_info(15450, add_rel=False))

- name: Ivermectin
- type: drug
- source: DrugBank
- details:
  - description: Ivermectin is a broad-spectrum anti-parasite medication. It was first marketed under the name Stromectol® and used against worms (except tapeworms), but, in 2012, it was approved for the topical treatment of head lice infestations in patients 6 months of age and older, and marketed under the name Sklice™ as well. Ivermectin is mainly used in humans in the treatment of onchocerciasis, but is also effective against other worm infestations (such as strongyloidiasis, ascariasis, trichuriasis and enterobiasis).
  - half_life: 16 hours (also reported at 22-28 hours)
  - indication: For the treatment of intestinal (i.e., nondisseminated) strongyloidiasis due to the nematode parasite <i>Strongyloides stercoralis</i>. Also for the treatment of onchocerciasis (river blindness) due to the nematode parasite <i>Onchocerca volvulus</i>. Can be used to treat scabies caused by <i>Sarcoptes scabiei</i>.
  - mechanism_of_acti

In [21]:
# Relation information of node 15450 rendered as text.
# NOTE(review): n_rel=-1 presumably means "no limit" on the entries listed per
# relation — TODO confirm against the library documentation.
print(skb.get_rel_info(15450, n_rel=-1))

- relations:
  enzyme: {gene/protein: (CYP3A4),}
  target: {gene/protein: (GABRB3, GLRA3),}
  transporter: {gene/protein: (ABCC2, ABCG2, ABCC1, ABCB1, SLCO1B1, SLCO1B3),}
  contraindication: {disease: (filariasis, loiasis),}
  indication: {disease: (onchocerciasis, strongyloidiasis),}
  synergistic_interaction: {drug: (Beclomethasone dipropionate, Betamethasone, Triamcinolone, Diethylstilbestrol, Liothyronine, Liotrix, Genistein, Ubidecarenone, Torasemide, Nelfinavir, Lovastatin, Ziprasidone, Phenytoin, Metoprolol, Dicoumarol, Conjugated estrogens, Etonogestrel, Desogestrel, Gefitinib, Meperidine, Duloxetine, Chlorpromazine, Raloxifene, Zidovudine, Ritonavir, Erlotinib, Ciprofloxacin, Nortriptyline, Methotrexate, Cephalexin, Clonidine, Enalapril, Medroxyprogesterone acetate, Chloroquine, Imatinib, Testosterone, Stavudine, Estrone, Tamoxifen, Warfarin, Lamivudine, Norethisterone, Irinotecan, Estradiol, Propofol, Clofazimine, Terbinafine, Tacrolimus, Quinidine, Repaglinide, Salmeterol, P

In [22]:
# Follow the 'enzyme' edges from node 15450 and show the resulting neighbor IDs.
enzyme_ids = skb.get_neighbor_nodes(15450, edge_type='enzyme')
print(enzyme_ids)
# Describe each neighbor found above instead of hardcoding an ID copied from
# the printed list, so the cell stays correct if node IDs differ between
# knowledge-base versions (and prints nothing extra when there is no neighbor).
for enzyme_id in enzyme_ids:
    print(skb.get_doc_info(enzyme_id, add_rel=False))

[8974]
- name: CYP3A4
- type: gene/protein
- source: NCBI
- details:
  - query: CYP3A4
  - alias (other gene names): ['CP33', 'CP34', 'CYP3A', 'CYP3A3', 'CYPIIIA3', 'CYPIIIA4', 'HLP', 'NF-25', 'P450C3', 'P450PCN1', 'VDDR3']
  - genomic_pos (genomic position): {'chr': '7', 'end': 99784248, 'ensemblgene': 'ENSG00000160868', 'start': 99756960, 'strand': -1}
  - name (gene name): cytochrome P450 family 3 subfamily A member 4
  - summary (protein summary text): This gene encodes a member of the cytochrome P450 superfamily of enzymes. The cytochrome P450 proteins are monooxygenases that catalyze many reactions involved in drug metabolism and synthesis of cholesterol, steroids and other lipids. This protein localizes to the endoplasmic reticulum and its expression is induced by glucocorticoids and some pharmacological agents. This enzyme is involved in the metabolism of approximately half the drugs in use today, including acetaminophen, codeine, cyclosporin A, diazepam, erythromycin, and chlo