In [7]:

import json
import pandas as pd
import os
from pathlib import Path
import sys

sys.path.append("../")

from config import DATA_DIR
from graph_types.prime import PrimeGraph
from graph_types.mag import MagGraph
from graph_types.amazon import AmazonGraph

# Which benchmark graph to analyse; must be one of the keys of GRAPH_CLASSES.
name = "amazon"

# Dispatch table from dataset name to the graph class that loads it.
GRAPH_CLASSES = {
    "prime": PrimeGraph,
    "mag": MagGraph,
    "amazon": AmazonGraph,
}

# Fail immediately on an unknown name instead of leaving `graph` undefined
# and hitting a NameError in a later cell.
if name not in GRAPH_CLASSES:
    raise ValueError(
        f"Unknown graph name {name!r}; expected one of {sorted(GRAPH_CLASSES)}"
    )

graph = GRAPH_CLASSES[name].load()

In [8]:
# Directory holding one JSON log per question for the selected graph.
logs_dir = DATA_DIR / f"experiments/{graph.name}_logs_2hop_filter_answer_type_and_starting_node_name"
json_files = sorted(logs_dir.glob("*.json"))

# Without any logs the DataFrame below would have no "file_id" column and
# sort_values would fail with an unhelpful KeyError; report the real cause.
if not json_files:
    raise FileNotFoundError(f"No JSON log files found in {logs_dir}")

data = []

for json_file in json_files:
    with open(json_file, "r", encoding="utf-8") as f:
        log_data = json.load(f)

    central_nodes_indices = log_data.get("sorted_central_nodes_indices", [])

    # Prefer the logged starting node. Only fall back to the top-ranked central
    # node when the key is absent: a `dict.get` default is evaluated eagerly, so
    # indexing `[0]` inside it would raise IndexError on an empty list even when
    # "starting_node_index" is present.
    if "starting_node_index" in log_data:
        starting_node_index = log_data["starting_node_index"]
    elif central_nodes_indices:
        starting_node_index = central_nodes_indices[0]
    else:
        starting_node_index = None

    # Extract key information from each log entry
    record = {
        # File names are expected to be integers (e.g. "12.json").
        "file_id": int(json_file.stem),
        "question": log_data.get("question", ""),
        "starting_node_index": starting_node_index,
        "sorted_central_nodes_indices": central_nodes_indices,
        "sorted_candidates_indices": log_data.get("sorted_candidates_indices", []),
        "answer_type": log_data.get("answer_type", ""),
        "answer_indices": log_data.get("answer_indices", []),
    }

    data.append(record)

# One row per log file, ordered numerically by file id.
df = pd.DataFrame(data).sort_values(by="file_id").reset_index(drop=True)

In [9]:
# Display the summary table built from the JSON logs (one row per log file).
df

Unnamed: 0,file_id,question,starting_node_index,sorted_central_nodes_indices,sorted_candidates_indices,answer_type,answer_indices
0,0,Looking for a chess strategy guide from The Ho...,16,"[775196, 828679, 637969, 16, 413618, 59254, 96...","[16, 191, 21, 190, 182, 23, 848075, 692678, 17...",product,[16]
1,1,Looking for a user-friendly fly fishing knot g...,30,"[55227, 30, 409757, 492812, 333541, 389275]","[30, 492812, 213621, 234955, 636894, 291785, 1...",product,"[291785, 416396, 30]"
2,2,Can you recommend a map created by the US Fore...,697550,"[697550, 719981, 343250]","[11760, 697550, 386370, 593866, 168917, 783761...",product,[75]
3,3,"Is there a durable, waterproof trail map avail...",414623,"[414623, 93, 93, 414623, 421292, 421292, 69034...","[414623, 421292, 403502, 260893, 113528, 73, 1...",product,"[130947, 629636, 68, 38854, 73, 421292, 256684..."
4,4,What climbing guide do most people purchase wi...,1772,"[1772, 34702, 309614]","[1772, 45869, 94727, 238688, 855388, 142805, 8...",product,[98]
...,...,...,...,...,...,...,...
62,62,Where can I find the TT-24 Spotted Trout fishi...,1142,"[1142, 31150, 478135, 62390, 62850, 394780]","[1142, 31150, 62286, 159253, 394834, 331928, 4...",product,"[31150, 1142]"
63,63,Can you suggest a sturdy stainless steel fishi...,63110,"[155046, 63110, 707516, 392016, 410628, 424238]","[166373, 301313, 62512, 613560, 63110, 377364,...",product,"[377825, 166373, 158954, 5259, 121579, 1165, 6..."
64,64,Do you know of any marine-grade stainless stee...,1173,"[1173, 942354, 905600, 161660, 640259, 567852]","[1173, 901871, 701836, 133912, 506389, 492573,...",product,"[167584, 894403, 331558, 253707, 528908, 48414..."
65,65,Are there quick delivery options for Tempress ...,1176,"[75733, 80921, 701632, 52303, 1176, 133809]","[19373, 1174, 1176, 212603, 80921, 113873, 455...",product,"[19373, 1174]"


In [10]:
def _recall_at_k(answer_indices, candidate_indices, k=None):
    """Fraction of distinct answers found among the first `k` candidates.

    `k=None` considers the full candidate list. Returns NaN when there are no
    answers, because recall is undefined in that case (the plain division would
    raise ZeroDivisionError); pandas' `mean()` skips NaN by default.
    """
    answers = set(answer_indices)
    if not answers:
        return float("nan")
    considered = candidate_indices if k is None else candidate_indices[:k]
    return len(answers.intersection(considered)) / len(answers)


def _hit_at_k(answer_indices, candidate_indices, k):
    """True when at least one of the first `k` candidates is an answer.

    An empty candidate list yields False.
    """
    return not set(answer_indices).isdisjoint(candidate_indices[:k])


df["recall@all"] = df.apply(
    lambda row: _recall_at_k(row["answer_indices"], row["sorted_candidates_indices"]),
    axis=1,
)
df["hit@1"] = df.apply(
    lambda row: _hit_at_k(row["answer_indices"], row["sorted_candidates_indices"], 1),
    axis=1,
)
df["hit@5"] = df.apply(
    lambda row: _hit_at_k(row["answer_indices"], row["sorted_candidates_indices"], 5),
    axis=1,
)
df["recall@20"] = df.apply(
    lambda row: _recall_at_k(row["answer_indices"], row["sorted_candidates_indices"], 20),
    axis=1,
)

### Metrics

In [11]:
# (display label, DataFrame column) pairs, in reporting order.
metric_columns = [
    ("Hit@1", "hit@1"),
    ("Hit@5", "hit@5"),
    ("Recall@20", "recall@20"),
    ("Recall@all", "recall@all"),
]

# Mean of each metric over all questions, rounded to three decimals.
[(label, float(round(df[column].mean(), 3))) for label, column in metric_columns]

[('Hit@1', 0.478), ('Hit@5', 0.687), ('Recall@20', 0.4), ('Recall@all', 0.709)]

### What was the starting node when we didn't hit the correct subgraph?

In [12]:
# Inspect every question where not all answers appear among the candidates
# (recall@all != 1) and show which node ranked first among the central nodes.
# NOTE(review): "Starting node" here is the first entry of
# `sorted_central_nodes_indices`, which can differ from the `starting_node_index`
# column (see row 0 of the table above) — confirm which one is intended.
for _, row in df[df["recall@all"] != 1].iterrows():
    central_nodes = row["sorted_central_nodes_indices"]

    # With no central nodes there is no index to look up; report None rather
    # than passing the placeholder string "None" to the graph as a node index.
    starting_node = graph.get_node_by_index(central_nodes[0]) if central_nodes else None
    other_candidate = central_nodes[1] if len(central_nodes) > 1 else "None"

    print(f"Question: {row['question']}\nStarting node: {starting_node}")
    print(f"Other candidate for starting node: {other_candidate}\n")

Question: Can you recommend a map created by the US Forest Service?
Starting node: ProductNode(name=Colorado Saddlery The Forest Service Special Saddle, index=697550, type=product)
Other candidate for starting node: 719981

Question: Is there a durable, waterproof trail map available for hiking and biking that can withstand rainy conditions?
Starting node: ProductNode(name=WHITE MOUNTAINS WATERPROOF TRAIL MAP, index=414623, type=product)
Other candidate for starting node: 93

Question: What are some high-quality, pocket-sized map sets with detailed features ideal for outdoor adventures?
Starting node: ProductNode(name=Jandd Map Pocket, index=92330, type=product)
Other candidate for starting node: 225079

Question: Can you suggest a high-quality OEM Control unicycle featuring a powder-coated finish steel fork that is also easy to store in a corner?
Starting node: ProductNode(name=Sea Striker Planer with Black Powder Coated Finish, index=458142, type=product)
Other candidate for starting

### When we started in the correct subgraph, how many did we recover?

In [13]:
# Mean recall@20 restricted to questions where every answer is among the candidates.
df.loc[df["recall@all"].eq(1), "recall@20"].mean()

np.float64(0.6681617572242573)

### Can we match any of the central nodes to the question?

In [14]:
def _node_name(node_index):
    """Name of the graph node stored under `node_index`."""
    return graph.get_node_by_index(node_index).name


def _central_nodes_named_in_question(row):
    """Distinct central node indices whose name is a substring of the question."""
    question_text = row["question"]
    return [
        node_index
        for node_index in set(row["sorted_central_nodes_indices"])
        if _node_name(node_index) in question_text
    ]


# Central nodes whose name occurs verbatim (case-sensitive) in the question.
df["exact_matching_nodes_indices"] = df.apply(_central_nodes_named_in_question, axis=1)

# Names of those matching nodes, in the same order as their indices.
df["exact_matching_node_names"] = df["exact_matching_nodes_indices"].apply(
    lambda node_indices: [_node_name(node_index) for node_index in node_indices]
)

# Name of the node each run started from.
df["starting_node_name"] = df["starting_node_index"].apply(_node_name)

# Whether the starting node is one of the nodes named in the question.
df["starting_node_matches"] = df.apply(
    lambda row: row["starting_node_index"] in row["exact_matching_nodes_indices"],
    axis=1,
)

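An interesting result: when the starting node's name literally appears in the question, Hit@1 is higher in this run (0.667 vs. 0.469), although Recall@20 and Hit@5 are not. Only 3 questions fall into this group, so the difference should be read with caution.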

In [15]:
# Average each retrieval metric per group and count the rows in each group
# (via a count of "file_id", renamed to "count" for display).
mean_metric_columns = ["recall@20", "hit@1", "hit@5"]
aggregations = {column: "mean" for column in mean_metric_columns}
aggregations["file_id"] = "count"

df.groupby("starting_node_matches").agg(aggregations).rename(columns={"file_id": "count"})

Unnamed: 0_level_0,recall@20,hit@1,hit@5,count
starting_node_matches,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,0.404259,0.46875,0.6875,64
True,0.316049,0.666667,0.666667,3


Note: in a few rows, more than one central node is named literally in the question.

In [16]:
# Number of central nodes named verbatim in each question, then the
# distribution of that number across all questions.
df["n_of_exact_matching_nodes"] = df["exact_matching_nodes_indices"].map(len)
df["n_of_exact_matching_nodes"].value_counts()

n_of_exact_matching_nodes
0    61
1     4
2     2
Name: count, dtype: int64

In [17]:
# Questions whose starting node is NOT named verbatim in the question text.
unmatched_rows = df[df["starting_node_matches"].eq(False)]

for question, starting_node_name, central_indices in zip(
    unmatched_rows["question"],
    unmatched_rows["starting_node_name"],
    unmatched_rows["sorted_central_nodes_indices"],
):
    # Resolve every ranked central node to its name for side-by-side comparison.
    candidate_names = [graph.get_node_by_index(idx).name for idx in central_indices]

    print(f"Question: {question}")
    print(f"Starting node name: {starting_node_name}\n")
    print(f"Other candidates for starting node: {candidate_names}\n")
    

Question: Looking for a chess strategy guide from The House of Staunton that offers tactics against Old Indian and Modern defenses. Any recommendations?
Starting node name: Beating the King's Indian and Benoni Defense with 5. Bd3

Other candidates for starting node: ['The House of Staunton The Roman Chess Set', 'The House of Staunton Miniature Chess Set Combination', 'The House of Staunton Chess2Go Travel Chess Set', "Beating the King's Indian and Benoni Defense with 5. Bd3", 'Crushing White - The Nimzo-Indian Defense - EMPIRE CHESS Chess DVD', 'DVD - Shuyokan Ryu Aikido Self-Defense for the Modern Warrior', 'Gorgeous 2-Toned Gold on Silver Old U.S. Indian penny - Golf Ball Marker - Hat Clips', 'The Modern Reti', 'Modern Ball Chair Large']

Question: Looking for a user-friendly fly fishing knot guide with clear, easy-to-understand illustrations. Ideally, it should be logically organised for easy learning and effective in teaching dependable knot tying techniques. It would be a bonus if