In [None]:
import json
import pandas as pd
import os
from pathlib import Path
import sys

sys.path.append("../")

from config import DATA_DIR
from graph_types.graph import Graph

# Name of the graph under analysis; it is used below to build the experiment-log path.
graph_name = "amazon"

In [4]:
# Later cells call graph.get_node_by_index(...), so the graph has to be loaded
# here. With this line commented out, a fresh "Restart Kernel -> Run All" fails
# with NameError and the notebook only works off leftover kernel state.
graph = Graph.load(graph_name)

In [6]:
def recall_at_k(answer_indices, candidate_indices, k=None):
    """Return the fraction of distinct answers found among the first ``k`` candidates.

    Args:
        answer_indices: Iterable of ground-truth node indices.
        candidate_indices: Ranked list of retrieved node indices.
        k: Cut-off rank; ``None`` means "consider every candidate".

    Returns:
        A float in [0, 1], or NaN when there are no answers. Recall is undefined
        in that case, and NaN keeps such rows out of ``Series.mean`` instead of
        raising ZeroDivisionError.
    """
    answers = set(answer_indices)
    if not answers:
        return float("nan")
    top_candidates = candidate_indices if k is None else candidate_indices[:k]
    return len(answers.intersection(top_candidates)) / len(answers)


def hit_at_k(answer_indices, candidate_indices, k):
    """Return True when at least one of the first ``k`` candidates is an answer."""
    return not set(answer_indices).isdisjoint(candidate_indices[:k])


logs_dir = DATA_DIR / f"experiments/{graph_name}/2hop"
json_files = sorted(logs_dir.glob("*.json"))
if not json_files:
    # Fail with a clear message instead of a KeyError from sort_values on an empty frame.
    raise FileNotFoundError(f"No experiment logs (*.json) found in {logs_dir}")

data = []

for json_file in json_files:
    with open(json_file, "r", encoding="utf-8") as f:
        log_data = json.load(f)

    central_nodes = log_data.get("sorted_central_nodes_indices", [])
    # Use the logged starting node when present; otherwise fall back to the
    # top-ranked central node. The fallback is only evaluated when needed, so an
    # empty or missing central-node list no longer raises IndexError.
    if "starting_node_index" in log_data:
        starting_node_index = log_data["starting_node_index"]
    else:
        starting_node_index = central_nodes[0] if central_nodes else None

    # Extract key information from each log entry
    record = {
        "file_id": int(json_file.stem),
        "question": log_data.get("question", ""),
        "starting_node_index": starting_node_index,
        "sorted_central_nodes_indices": central_nodes,
        "sorted_candidates_indices": log_data.get("sorted_candidates_indices", []),
        "answer_type": log_data.get("answer_type", ""),
        "answer_indices": log_data.get("answer_indices", []),
    }

    data.append(record)

df = pd.DataFrame(data).sort_values(by="file_id").reset_index(drop=True)

# One (answers, ranked candidates) pair per question, in row order.
answer_candidate_pairs = list(zip(df["answer_indices"], df["sorted_candidates_indices"]))

df["recall@all"] = [recall_at_k(answers, candidates) for answers, candidates in answer_candidate_pairs]
df["hit@1"] = [hit_at_k(answers, candidates, 1) for answers, candidates in answer_candidate_pairs]
df["hit@5"] = [hit_at_k(answers, candidates, 5) for answers, candidates in answer_candidate_pairs]
df["hit@10"] = [hit_at_k(answers, candidates, 10) for answers, candidates in answer_candidate_pairs]
df["recall@20"] = [recall_at_k(answers, candidates, 20) for answers, candidates in answer_candidate_pairs]

# Summary over all questions; the last expression is the cell's displayed output.
[
    ("n", len(df)),
    ("Hit@1", float(round(df["hit@1"].mean(), 3))),
    ("Hit@5", float(round(df["hit@5"].mean(), 3))),
    ("Recall@20", float(round(df["recall@20"].mean(), 3))),
    ("Recall@all", float(round(df["recall@all"].mean(), 3))),
]

[('n', 249),
 ('Hit@1', 0.378),
 ('Hit@5', 0.574),
 ('Recall@20', 0.383),
 ('Recall@all', 0.626)]

### What was the starting node when we didn't hit the correct subgraph?

In [None]:
# For every question whose candidates did not cover all answers, show which node
# the search started from and the runner-up central node.
for _, row in df[df["recall@all"] != 1].iterrows():
    central_nodes = row["sorted_central_nodes_indices"]

    # Only query the graph when a central node exists. Previously the string
    # "None" was passed to get_node_by_index as if it were a node index.
    starting_node = graph.get_node_by_index(central_nodes[0]) if central_nodes else "None"
    other_candidate = central_nodes[1] if len(central_nodes) > 1 else "None"

    print(f"Question: {row['question']}\nStarting node: {starting_node}")
    print(f"Other candidate for starting node: {other_candidate}\n")

Question: Search for publications by Hu Zhiyuan on the impact of radiation on flash memory input/output components.
Starting node: AuthorNode(name=Hu Zhiyuan, index=204626, type=author)
Other candidate for starting node: 1643284

Question: Find publications from Carma researchers that report detections using the Australian Square Kilometre Array Pathfinder (ASKAP) radio telescope.
Starting node: AuthorNode(name=Askap: Australian Ska Pathfinder, index=923663, type=author)
Other candidate for starting node: 1104625

Question: List papers discussing shock wave solution behaviors in the context of weak solutions.
Starting node: FieldOfStudyNode(name=Shock wave, index=1121443, type=field_of_study)
Other candidate for starting node: 1247731

Question: List of research articles on the interaction of drag forces with particles in cavity walls
Starting node: PaperNode(name=Ion drag forces and magnetomechanical effect, index=1440426, type=paper)
Other candidate for starting node: 1214177

Questi

### When we started in the correct subgraph, how many did we recover?

In [None]:
# Keep only the questions whose candidate list contains every answer (recall@all equals 1).
df_when_we_get_correct_subgraph = df.loc[df["recall@all"].eq(1)]

In [None]:
# Number of questions with full recall.
df_when_we_get_correct_subgraph.shape[0]

50

In [None]:
# Mean of each metric over the full-recall subset, rounded to three decimals.
[
    (label, float(round(df_when_we_get_correct_subgraph[column].mean(), 3)))
    for label, column in (
        ("Hit@1", "hit@1"),
        ("Hit@5", "hit@5"),
        ("Recall@20", "recall@20"),
        ("Recall@all", "recall@all"),
    )
]

[('Hit@1', 0.5), ('Hit@5', 0.8), ('Recall@20', 0.861), ('Recall@all', 1.0)]

### Can we match any of the central nodes to the question?

In [None]:
def _exact_matches(question, central_node_indices):
    """Find central nodes whose name appears verbatim in the question.

    Args:
        question: The question text.
        central_node_indices: Ranked list of central-node indices (may repeat).

    Returns:
        A pair ``(indices, names)`` of parallel lists, de-duplicated and kept in
        the original ranking order.
    """
    indices, names = [], []
    # dict.fromkeys de-duplicates while preserving ranking order (a set would not).
    for node_index in dict.fromkeys(central_node_indices):
        # Look each node up once and reuse the name for both output columns.
        name = graph.get_node_by_index(node_index).name
        # Skip empty names: "" is a substring of every string and would always match.
        if name and name in question:
            indices.append(node_index)
            names.append(name)
    return indices, names


exact_matches = [
    _exact_matches(question, central_node_indices)
    for question, central_node_indices in zip(df["question"], df["sorted_central_nodes_indices"])
]

# Wrap in an object Series so each cell holds a list rather than being unpacked.
df["exact_matching_nodes_indices"] = pd.Series(
    [indices for indices, _ in exact_matches], index=df.index, dtype=object
)
df["exact_matching_node_names"] = pd.Series(
    [names for _, names in exact_matches], index=df.index, dtype=object
)
df["starting_node_name"] = df["starting_node_index"].apply(
    lambda i: graph.get_node_by_index(i).name
)
# True when the node the search started from is itself named in the question.
df["starting_node_matches"] = [
    starting_node_index in matching_indices
    for starting_node_index, matching_indices in zip(
        df["starting_node_index"], df["exact_matching_nodes_indices"]
    )
]

An interesting result: when the starting node literally appears in the question, we get better results.

In [None]:
# Compare mean metrics between questions whose starting node is named in the
# question and those where it is not; "count" is the group size.
metric_columns = ["recall@all", "recall@20", "hit@1", "hit@5"]
aggregations = {column: "mean" for column in metric_columns}
aggregations["file_id"] = "count"

df.groupby("starting_node_matches").agg(aggregations).rename(columns={"file_id": "count"})

Unnamed: 0_level_0,recall@all,recall@20,hit@1,hit@5,count
starting_node_matches,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
False,0.633333,0.508333,0.2,0.6,10
True,0.956522,0.832428,0.5,0.76087,46


Look at this: in some rows, more than one central node appears by name in the question.

In [None]:
# Per question, how many central nodes are named verbatim in the question text,
# followed by the distribution of those counts.
df["n_of_exact_matching_nodes"] = df["exact_matching_nodes_indices"].map(len)
df["n_of_exact_matching_nodes"].value_counts()

n_of_exact_matching_nodes
1    45
0     9
2     2
Name: count, dtype: int64

In [None]:
# Inspect the questions whose starting node is not named verbatim in the question.
for _, row in df[~df["starting_node_matches"]].iterrows():
    # Resolve every ranked central node to its display name.
    sorted_central_node_names = []
    for node_index in row["sorted_central_nodes_indices"]:
        sorted_central_node_names.append(graph.get_node_by_index(node_index).name)

    question = row["question"]
    starting_node_name = row["starting_node_name"]
    print(f"Question: {question}")
    print(f"Starting node name: {starting_node_name}\n")
    print(f"Other candidates for starting node: {sorted_central_node_names}\n")

Question: Find publications from Carma researchers that report detections using the Australian Square Kilometre Array Pathfinder (ASKAP) radio telescope.
Starting node name: Askap: Australian Ska Pathfinder

Other candidates for starting node: ['Askap: Australian Ska Pathfinder', 'Carma']

Question: What other research papers investigating effective techniques beyond semilocal density-functional theory are referenced in the paper titled "Analytical First-Order Molecular Properties and Forces within the Adiabatic Connection Random Phase Approximation"?
Starting node name: Analytical First-Order Molecular Properties and Forces within the Adiabatic Connection Random Phase Approximation.

Other candidates for starting node: ['Analytical First-Order Molecular Properties and Forces within the Adiabatic Connection Random Phase Approximation.', 'Orbital localization, charge transfer, and band gaps in semilocal density-functional theory.']

Question: Show me articles on deformable mirror modeli