In [110]:
import json
import pandas as pd
import os
from pathlib import Path
import sys

sys.path.append("../")


from config import DATA_DIR
from graph_types.prime import PrimeGraph
from graph_types.mag import MagGraph

In [None]:
# Which graph to analyse; must match one of the branches below.
name = "mag"

# Load the graph object that corresponds to the configured name.
if name == "prime":
    graph = PrimeGraph.load()
elif name == "mag":
    graph = MagGraph.load()
else:
    # Fail here with a clear message instead of a NameError on `graph`
    # in a later cell.
    raise ValueError(f"Unknown graph name {name!r}; expected 'prime' or 'mag'.")

In [131]:
# Gather the per-question JSON logs of the selected graph into one DataFrame.
logs_dir = DATA_DIR / f"connectedness/{graph.name}_logs_2hop"

# Sorted so that the row order does not depend on filesystem listing order.
json_files = sorted(logs_dir.glob("*.json"))
if not json_files:
    # Without any log file `pd.DataFrame([])` has no columns at all, so the
    # cells below that index into `df` would fail far from the real cause.
    raise FileNotFoundError(f"No JSON log files found in {logs_dir}")

data = []

for json_file in json_files:
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(json_file, "r", encoding="utf-8") as f:
        log_data = json.load(f)

    # Keep only the fields used by the analysis; absent keys fall back to
    # an empty string / empty list.
    record = {
        "file_id": json_file.stem,
        "question": log_data.get("question", ""),
        "sorted_central_nodes": log_data.get("sorted_central_nodes", []),
        "answer_indices": log_data.get("answer_indices", []),
        "sorted_candidates": log_data.get("sorted_candidates", []),
    }

    data.append(record)

df = pd.DataFrame(data)


In [138]:
def _recall(answers, candidates, k=None):
    """Return the fraction of distinct answers found in the first ``k`` candidates.

    Args:
        answers: iterable of answer indices.
        candidates: ranked list of candidate indices.
        k: how many leading candidates to consider; ``None`` means all of them.

    Returns:
        A float in [0, 1], or NaN when ``answers`` is empty (recall is undefined
        there; the unguarded division would raise ZeroDivisionError).
    """
    answer_set = set(answers)
    if not answer_set:
        return float("nan")
    # Slicing with ``k=None`` keeps the whole list.
    return len(answer_set.intersection(candidates[:k])) / len(answer_set)


def _hit(answers, candidates, k):
    """Return True when at least one of the first ``k`` candidates is an answer.

    An empty candidate list yields False instead of raising IndexError.
    """
    return not set(answers).isdisjoint(candidates[:k])


# One column per metric, computed row by row from the ranked candidates.
df["recall@all"] = df.apply(
    lambda row: _recall(row["answer_indices"], row["sorted_candidates"]),
    axis=1,
)
df["hit@1"] = df.apply(
    lambda row: _hit(row["answer_indices"], row["sorted_candidates"], 1),
    axis=1,
)
df["hit@5"] = df.apply(
    lambda row: _hit(row["answer_indices"], row["sorted_candidates"], 5),
    axis=1,
)
df["recall@20"] = df.apply(
    lambda row: _recall(row["answer_indices"], row["sorted_candidates"], 20),
    axis=1,
)

Metrics

In [140]:
# Mean of every metric column over all rows, rounded to three decimals.
metric_columns = {
    "Hit@1": "hit@1",
    "Hit@5": "hit@5",
    "Recall@20": "recall@20",
    "Recall@all": "recall@all",
}
[(label, float(round(df[column].mean(), 3))) for label, column in metric_columns.items()]

[('Hit@1', 0.48), ('Hit@5', 0.76), ('Recall@20', 0.764), ('Recall@all', 0.915)]

What was the starting node when we didn't hit the correct subgraph?

In [134]:
# For every question whose full candidate list did not contain all answers,
# print the question with its top-ranked central node (or "None" if the list is empty).
missed = df.loc[df["recall@all"] != 1, ["question", "sorted_central_nodes"]]
for question, central_nodes in zip(missed["question"], missed["sorted_central_nodes"]):
    if central_nodes:
        starting_node = central_nodes[0]
    else:
        starting_node = "None"
    print(f"Question: {question}\nStarting node: {starting_node} \n")

Question: Show me articles on deformable mirror modeling techniques within the pressure control domain.
Starting node: PaperNode(name=Modeling a MEMS deformable mirror using non-parametric estimation techniques, index=1370572, type=paper) 

Question: List papers discussing shock wave solution behaviors in the context of weak solutions.
Starting node: FieldOfStudyNode(name=Shock wave, index=1121443, type=field_of_study) 

Question: List of research articles on the interaction of drag forces with particles in cavity walls
Starting node: PaperNode(name=Ion drag forces and magnetomechanical effect, index=1440426, type=paper) 

Question: Search for publications by Hu Zhiyuan on the impact of radiation on flash memory input/output components.
Starting node: AuthorNode(name=Hu Zhiyuan, index=204626, type=author) 

Question: Find publications from Carma researchers that report detections using the Australian Square Kilometre Array Pathfinder (ASKAP) radio telescope.
Starting node: AuthorNode(n

In [115]:
# Node indices inspected in the cells below, named instead of inlined.
# NOTE(review): the variable names suggest these indices refer to nodes of the
# "prime" graph; confirm that `graph` is the matching graph before running.
EPITHELIAL_SKIN_NEOPLASM_IDX = 36622
SEBACEOUS_ADENOMA_IDX = 95886

epithelial_skin_neoplasm = graph.get_node_by_index(EPITHELIAL_SKIN_NEOPLASM_IDX)
sebaceous_adenoma = graph.get_node_by_index(SEBACEOUS_ADENOMA_IDX)
# Misspelled name kept as an alias because later cells still reference it.
sebaceous_adenmoa = sebaceous_adenoma

# Result of the k=2 hop query around the first node, reused by later cells.
khop = graph.get_khop_idx(epithelial_skin_neoplasm, k=2)

In [116]:
# Is index 11587 part of the k=2 hop result for the node with index 128493?
probe_node = graph.get_node_by_index(128493)
probe_khop = graph.get_khop_idx(probe_node, k=2)
11587 in probe_khop

False

In [117]:
# Display the k=2 hop result for the node with index 128493.
anchor_node = graph.get_node_by_index(128493)
graph.get_khop_idx(anchor_node, k=2)

{np.int64(65538),
 np.int64(532486),
 np.int64(16391),
 np.int64(1662982),
 np.int64(294923),
 np.int64(155662),
 np.int64(475150),
 np.int64(139278),
 np.int64(180242),
 np.int64(131093),
 np.int64(147478),
 np.int64(565269),
 np.int64(892952),
 np.int64(827421),
 np.int64(1007650),
 np.int64(540707),
 np.int64(606244),
 np.int64(1007651),
 np.int64(90152),
 np.int64(1097768),
 np.int64(1835051),
 np.int64(1769517),
 np.int64(1695790),
 np.int64(1138735),
 np.int64(712753),
 np.int64(622644),
 np.int64(909365),
 np.int64(581687),
 np.int64(679993),
 np.int64(1499194),
 np.int64(991303),
 np.int64(1310791),
 np.int64(147532),
 np.int64(327759),
 np.int64(442448),
 np.int64(671823),
 np.int64(819280),
 np.int64(884819),
 np.int64(368726),
 np.int64(811096),
 np.int64(1253464),
 np.int64(344158),
 np.int64(188514),
 np.int64(204898),
 np.int64(1204322),
 np.int64(245862),
 np.int64(614503),
 np.int64(778348),
 np.int64(983149),
 np.int64(450669),
 np.int64(508017),
 np.int64(1327221),
 n

In [118]:
# Check whether the index of `sebaceous_adenmoa` is contained in `khop`.
sebaceous_adenmoa.index in khop

False

In [119]:
# Display the contents of `khop` (the k=2 hop result computed for `epithelial_skin_neoplasm`).
khop

{np.int64(745),
 np.int64(1379),
 np.int64(2186),
 np.int64(2341),
 np.int64(2374),
 np.int64(2826),
 np.int64(3823),
 np.int64(4088),
 np.int64(4345),
 np.int64(4383),
 np.int64(4794),
 np.int64(5261),
 np.int64(5609),
 np.int64(5758),
 np.int64(6508),
 np.int64(6903),
 np.int64(7243),
 np.int64(7344),
 np.int64(7594),
 np.int64(7655),
 np.int64(8938),
 np.int64(9496),
 np.int64(10405),
 np.int64(10727),
 np.int64(10745),
 np.int64(11745),
 np.int64(11753),
 np.int64(11945),
 np.int64(12172),
 np.int64(14143),
 np.int64(14222),
 np.int64(14573),
 np.int64(14844),
 np.int64(15570),
 np.int64(15572),
 np.int64(16891),
 np.int64(17400),
 np.int64(17883),
 np.int64(19800),
 np.int64(22711),
 np.int64(23726),
 np.int64(23773),
 np.int64(25394),
 np.int64(27367),
 np.int64(27602),
 np.int64(27935),
 np.int64(28504),
 np.int64(28999),
 np.int64(30342),
 np.int64(31137),
 np.int64(32028),
 np.int64(32363),
 np.int64(32753),
 np.int64(33470),
 np.int64(33975),
 np.int64(34078),
 np.int64(34659

In [120]:
# Display the index attribute of the `sebaceous_adenmoa` node.
sebaceous_adenmoa.index

95886

In [121]:
# Same call whose result was stored in `khop`; evaluated again here only to display it.
graph.get_khop_idx(epithelial_skin_neoplasm, k=2)

{np.int64(745),
 np.int64(1379),
 np.int64(2186),
 np.int64(2341),
 np.int64(2374),
 np.int64(2826),
 np.int64(3823),
 np.int64(4088),
 np.int64(4345),
 np.int64(4383),
 np.int64(4794),
 np.int64(5261),
 np.int64(5609),
 np.int64(5758),
 np.int64(6508),
 np.int64(6903),
 np.int64(7243),
 np.int64(7344),
 np.int64(7594),
 np.int64(7655),
 np.int64(8938),
 np.int64(9496),
 np.int64(10405),
 np.int64(10727),
 np.int64(10745),
 np.int64(11745),
 np.int64(11753),
 np.int64(11945),
 np.int64(12172),
 np.int64(14143),
 np.int64(14222),
 np.int64(14573),
 np.int64(14844),
 np.int64(15570),
 np.int64(15572),
 np.int64(16891),
 np.int64(17400),
 np.int64(17883),
 np.int64(19800),
 np.int64(22711),
 np.int64(23726),
 np.int64(23773),
 np.int64(25394),
 np.int64(27367),
 np.int64(27602),
 np.int64(27935),
 np.int64(28504),
 np.int64(28999),
 np.int64(30342),
 np.int64(31137),
 np.int64(32028),
 np.int64(32363),
 np.int64(32753),
 np.int64(33470),
 np.int64(33975),
 np.int64(34078),
 np.int64(34659

In [122]:
# Display what `get_neighbors_idx` returns for the index of `epithelial_skin_neoplasm`.
graph.get_neighbors_idx(epithelial_skin_neoplasm.index)

{np.int64(1110689),
 np.int64(1195363),
 np.int64(1486014),
 np.int64(1563330),
 np.int64(1580228)}

In [123]:
# Display `khop` again (no code between the earlier display and this one reassigns it).
khop

{np.int64(745),
 np.int64(1379),
 np.int64(2186),
 np.int64(2341),
 np.int64(2374),
 np.int64(2826),
 np.int64(3823),
 np.int64(4088),
 np.int64(4345),
 np.int64(4383),
 np.int64(4794),
 np.int64(5261),
 np.int64(5609),
 np.int64(5758),
 np.int64(6508),
 np.int64(6903),
 np.int64(7243),
 np.int64(7344),
 np.int64(7594),
 np.int64(7655),
 np.int64(8938),
 np.int64(9496),
 np.int64(10405),
 np.int64(10727),
 np.int64(10745),
 np.int64(11745),
 np.int64(11753),
 np.int64(11945),
 np.int64(12172),
 np.int64(14143),
 np.int64(14222),
 np.int64(14573),
 np.int64(14844),
 np.int64(15570),
 np.int64(15572),
 np.int64(16891),
 np.int64(17400),
 np.int64(17883),
 np.int64(19800),
 np.int64(22711),
 np.int64(23726),
 np.int64(23773),
 np.int64(25394),
 np.int64(27367),
 np.int64(27602),
 np.int64(27935),
 np.int64(28504),
 np.int64(28999),
 np.int64(30342),
 np.int64(31137),
 np.int64(32028),
 np.int64(32363),
 np.int64(32753),
 np.int64(33470),
 np.int64(33975),
 np.int64(34078),
 np.int64(34659

In [124]:
# Number of ranked candidates stored for each question.
df["sorted_candidates"].map(len)

0       2
1     338
2      66
3     105
4      51
     ... 
67    504
68      9
69    746
70     28
71      8
Name: sorted_candidates, Length: 72, dtype: int64