In [1]:
import json
import pandas as pd
import os
from pathlib import Path
import sys

sys.path.append("../")


from config import DATA_DIR
from graph_types.prime import PrimeGraph
from graph_types.mag import MagGraph

# Select which graph to analyse; must be "prime" or "mag".
name = "mag"

# Load the graph that the retrieval logs below refer to.
if name == "prime":
    graph = PrimeGraph.load()
elif name == "mag":
    graph = MagGraph.load()
else:
    # Fail here with a clear message instead of a NameError on `graph` in a later cell.
    raise ValueError(f"Unknown graph name: {name!r} (expected 'prime' or 'mag')")

# NOTE(review): a stray, mis-indented `nodes_df = pd.read_csv(nodes_file)` used to follow
# here. It made the cell fail to parse and `nodes_file` is not defined before this point,
# so it was dropped. Re-add it with a real path if `nodes_df` is needed.


In [2]:
# Directory with one JSON log per question for the selected graph.
logs_dir = DATA_DIR / f"connectedness/{graph.name}_logs_2hop"
# Sort the paths so the row order is reproducible. The order is lexicographic,
# so e.g. "10.json" comes before "2.json".
json_files = sorted(logs_dir.glob("*.json"))

data = []

for json_file in json_files:
    # Read explicitly as UTF-8 so non-ASCII characters in the questions decode
    # the same way regardless of the platform's default encoding.
    with open(json_file, "r", encoding="utf-8") as f:
        log_data = json.load(f)

    # Keep only the fields used by the analysis; missing keys fall back to
    # an empty string / empty list.
    record = {
        "file_id": json_file.stem,
        "question": log_data.get("question", ""),
        "sorted_central_nodes_indices": log_data.get("sorted_central_nodes_indices", []),
        "sorted_candidates_indices": log_data.get("sorted_candidates_indices", []),
        "answer_indices": log_data.get("answer_indices", []),
    }

    data.append(record)

# One row per log file.
df = pd.DataFrame(data)


In [3]:
# Display the per-question log table built in the previous cell.
df

Unnamed: 0,file_id,question,sorted_central_nodes_indices,sorted_candidates_indices,answer_indices
0,0,Does any research from the Indian Maritime Uni...,"[1106065, 1105912, 1107568, 1141741, 1124967, ...","[1600189, 1254035]",[1600189]
1,1,Show me articles related to magnetic field stu...,"[1143690, 1146931, 1336879, 1350736, 1195050, ...","[1265762, 1390758, 1584269, 1185665, 1275328, ...",[1265762]
2,10,"Show me publications from the co-authors of ""C...","[1422742, 1406271, 1369681, 1267497, 1603422, ...","[1422742, 1361436, 1492642, 1424718, 1365479, ...","[1521384, 1578676, 1289934]"
3,100,Could you show me some research papers focusin...,"[1147504, 1125572, 1152961, 1137317, 1154752, ...","[1439303, 1514233, 1413071, 1274743, 1379062, ...","[1514233, 1439303, 1413071]"
4,101,Do any publications from Algoma University exa...,"[1107265, 1770094, 1740964, 1113596, 1235066, ...","[1346396, 1436150, 1550848, 1542334, 1707999, ...",[1346396]
...,...,...,...,...,...
995,995,Show me the research papers authored by the co...,"[1711330, 1134408, 1747848, 1847182, 1433031, ...","[1711330, 1336126, 1254633, 1528917, 1771815, ...",[1214645]
996,996,Publications on nanostructuring and surface mo...,"[1110874, 1104570, 1105437, 1126566, 1540811, ...","[1487887, 1338402, 1416907, 1429538, 1227173, ...","[1429538, 1416907, 1487887]"
997,997,Could you look for papers that share a co-auth...,"[1365511, 1129253, 1698671, 1125515, 1541881, ...","[1365511, 1541881, 1501814, 1382229, 1275360, ...",[1541881]
998,998,Could you find some articles discussing the de...,"[1126110, 1119270, 1119939, 1369500, 1172741, ...","[1471344, 1472294, 1429989, 1426739, 1505111, ...",[1471344]


In [4]:
def _recall_at_k(answer_indices, candidate_indices, k=None):
    """Fraction of distinct answers found among the first ``k`` candidates.

    Args:
        answer_indices: Ground-truth node indices for one question.
        candidate_indices: Ranked candidate node indices for the same question.
        k: Number of top candidates to consider; ``None`` means all of them.

    Returns:
        A float in [0, 1], or NaN when there are no answers (recall is
        undefined then; the previous inline version raised ZeroDivisionError).
    """
    answers = set(answer_indices)
    if not answers:
        return float("nan")
    return len(answers.intersection(candidate_indices[:k])) / len(answers)


def _hit_at_k(answer_indices, candidate_indices, k):
    """Whether any of the first ``k`` candidates is a ground-truth answer.

    Returns False when there are no candidates at all (the previous inline
    hit@1 raised IndexError on an empty candidate list).
    """
    return not set(answer_indices).isdisjoint(candidate_indices[:k])


# Pair each question's answers with its ranked candidates once, then reuse
# the pairs for every metric column.
_answer_candidate_pairs = list(zip(df["answer_indices"], df["sorted_candidates_indices"]))

df["recall@all"] = [_recall_at_k(answers, candidates) for answers, candidates in _answer_candidate_pairs]
df["hit@1"] = [_hit_at_k(answers, candidates, 1) for answers, candidates in _answer_candidate_pairs]
df["hit@5"] = [_hit_at_k(answers, candidates, 5) for answers, candidates in _answer_candidate_pairs]
df["recall@20"] = [_recall_at_k(answers, candidates, 20) for answers, candidates in _answer_candidate_pairs]

### Metrics

In [5]:
# Mean of each metric column over all questions, rounded to 3 decimals,
# as (label, value) pairs.
[
    (label, float(round(df[column].mean(), 3)))
    for label, column in (
        ("Hit@1", "hit@1"),
        ("Hit@5", "hit@5"),
        ("Recall@20", "recall@20"),
        ("Recall@all", "recall@all"),
    )
]

[('Hit@1', 0.491),
 ('Hit@5', 0.764),
 ('Recall@20', 0.781),
 ('Recall@all', 0.936)]

### What was the starting node when we didn't hit the correct subgraph?

In [6]:
# For every question whose candidates did not contain all answers, show the
# top-ranked central node and the index of the runner-up.
for _, row in df[df["recall@all"] != 1].iterrows():
    central_nodes = row["sorted_central_nodes_indices"]

    # Only look the node up when there is one. Previously the placeholder
    # string "None" was passed to graph.get_node_by_index for empty lists.
    starting_node = graph.get_node_by_index(central_nodes[0]) if central_nodes else "None"
    other_candidate = central_nodes[1] if len(central_nodes) > 1 else "None"

    print(f"Question: {row['question']}\nStarting node: {starting_node}")
    print(f"Other candidate for starting node: {other_candidate}\n")

Question: Show me research articles related to weather stations focusing on the study of mountainous terrain impacts on observational data.
Starting node: PaperNode(name=Daytime Heat Transfer Processes over Mountainous Terrain, index=1521359, type=paper)
Other candidate for starting node: 1582654

Question: I'm seeking research articles on mathematical objects that delve into delineating complex networks' modular configuration. My focus is on those employing graph theory and algebraic topology approaches to scrutinize the communal constructs within empirical networks.
Starting node: FieldOfStudyNode(name=Algebraic topology, index=1129889, type=field_of_study)
Other candidate for starting node: 1123418

Question: Show me publications by M. F. de Andrade that study the interactions of monomers within a lattice structure.
Starting node: AuthorNode(name=M. F. de Andrade, index=557457, type=author)
Other candidate for starting node: 735221

Question: Show me articles on deformable mirror mo

### When we started in the correct subgraph, how many did we recover?

In [7]:
# Mean recall@20 restricted to questions where every answer is among the candidates.
df.loc[df["recall@all"] == 1, "recall@20"].mean()

np.float64(0.8347926013214799)

### Can we match any of the central nodes to the question?

In [8]:
def _exact_name_matches(question, central_node_indices):
    """Find central nodes whose name occurs verbatim in the question.

    Args:
        question: The question text.
        central_node_indices: Ranked central-node indices for that question.

    Returns:
        A list of ``(node_index, node_name)`` pairs, one per distinct index
        whose name is a substring of ``question``.
    """
    matches = []
    # De-duplicate first: the ranked list can contain the same index more than once.
    for node_index in set(central_node_indices):
        # Look each node up once and reuse the name for both output columns.
        node_name = graph.get_node_by_index(node_index).name
        if node_name in question:
            matches.append((node_index, node_name))
    return matches


_matches_per_row = [
    _exact_name_matches(question, central_nodes)
    for question, central_nodes in zip(df["question"], df["sorted_central_nodes_indices"])
]

# Wrap in an object Series so each cell holds a Python list.
df["exact_matching_nodes_indices"] = pd.Series(
    [[node_index for node_index, _ in matches] for matches in _matches_per_row],
    index=df.index,
    dtype=object,
)
df["exact_matching_node_names"] = pd.Series(
    [[node_name for _, node_name in matches] for matches in _matches_per_row],
    index=df.index,
    dtype=object,
)

# Name of the top-ranked central node, or the placeholder "None" when there is none.
df["starting_node_name"] = [
    graph.get_node_by_index(central_nodes[0]).name if central_nodes else "None"
    for central_nodes in df["sorted_central_nodes_indices"]
]

# True when the top-ranked central node is one of the exact matches. Rows with
# no central nodes are False (previously this raised IndexError).
df["starting_node_matches"] = [
    bool(central_nodes) and central_nodes[0] in matched_indices
    for central_nodes, matched_indices in zip(
        df["sorted_central_nodes_indices"], df["exact_matching_nodes_indices"]
    )
]

An interesting result: when the starting node literally appears in the question, we get better results.

In [9]:
# Mean metrics and number of questions, split by whether the starting node's
# name appears verbatim in the question. The dict is unpacked because the
# output column names contain "@" and cannot be written as keyword arguments.
df.groupby("starting_node_matches").agg(
    **{
        "recall@20": ("recall@20", "mean"),
        "hit@1": ("hit@1", "mean"),
        "hit@5": ("hit@5", "mean"),
        "count": ("file_id", "count"),
    }
)

Unnamed: 0_level_0,recall@20,hit@1,hit@5,count
starting_node_matches,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,0.613118,0.406061,0.606061,165
True,0.813687,0.507784,0.79521,835


Note that in many rows more than one node name appears verbatim in the question.

In [10]:
# Number of exactly-matching nodes per question, then how often each count occurs.
df["n_of_exact_matching_nodes"] = df["exact_matching_nodes_indices"].map(len)
df["n_of_exact_matching_nodes"].value_counts()

n_of_exact_matching_nodes
1    699
2    149
0    119
3     31
4      2
Name: count, dtype: int64

In [11]:
# List the questions whose starting node is not an exact match, next to that node's name.
# `== False` is an element-wise comparison on the column, kept as in the original filter.
unmatched = df[df["starting_node_matches"] == False]  # noqa: E712
for question, starting_node_name in zip(unmatched["question"], unmatched["starting_node_name"]):
    print(f"Question: {question}")
    print(f"Starting node name: {starting_node_name}\n")

Question: Could you show me some research papers focusing on soil decontamination from radioactive cesium using magnetic separation methods in the realm of soil pollution?
Starting node name: Magnetic separation

Question: Show me research articles related to weather stations focusing on the study of mountainous terrain impacts on observational data.
Starting node name: Daytime Heat Transfer Processes over Mountainous Terrain

Question: Show me research papers on the evaluation of cone-beam CT for near real-time dosimetry, authored by scholars from China Medical University in Taiwan.
Starting node name: China Medical University (PRC)

Question: I'm seeking research articles on mathematical objects that delve into delineating complex networks' modular configuration. My focus is on those employing graph theory and algebraic topology approaches to scrutinize the communal constructs within empirical networks.
Starting node name: Algebraic topology

Question: What other research papers inve

In [14]:
# Questions in which more than one central node is matched verbatim.
df.query("n_of_exact_matching_nodes > 1")

Unnamed: 0,file_id,question,sorted_central_nodes_indices,sorted_candidates_indices,answer_indices,recall@all,hit@1,hit@5,recall@20,exact_matching_nodes_indices,exact_matching_node_names,starting_node_name,starting_node_matches,n_of_exact_matching_nodes
13,11,Show me publications by Jianxin Ma on new tech...,"[315889, 639910, 695454, 1621377, 1611651, 143...","[1779954, 1622059, 1414886, 1436282, 1300727, ...",[1686364],1.0,False,False,0.0,"[639910, 315889, 695454]","[Jianxin Ma, Jianxin Ma, Jianxin Ma]",Jianxin Ma,True,3
18,114,I'm looking for articles on Embedding Medium t...,"[1152598, 1120450, 1120035, 1469683, 1120733, ...","[1227837, 1599409, 1449987, 1738378, 1831799, ...",[1227837],1.0,True,True,1.0,"[1118026, 1152598]","[Embedding, Embedding Medium]",Embedding Medium,True,2
25,120,Show me publications by Ying Xin focusing on t...,"[380582, 218933, 625232, 920607, 1348654, 1746...","[1348654, 1477046, 1273883, 1266108, 1537473, ...",[1348654],1.0,True,True,1.0,"[380582, 625232, 218933]","[Ying Xin, Ying Xin, Ying Xin]",Ying Xin,True,3
29,124,Show me publications by M. F. de Andrade that ...,"[557457, 735221, 284973, 1307865, 1798821, 176...","[1242884, 1391869, 1457338, 1452506, 1216921, ...","[1338369, 1528066]",0.0,False,False,0.0,"[557457, 735221]","[M. F. de Andrade, M. F. de Andrade]",M. F. de Andrade,True,2
38,132,Can you find me papers that delve into the top...,"[1520715, 1309908, 1459873, 1645029, 1135916, ...","[1520715, 1229120, 1175873, 1522204, 1776967, ...",[1229120],1.0,False,True,1.0,"[1520715, 1135916]",[Solar Cycle 24: Curious Changes in the Relati...,Solar Cycle 24: Curious Changes in the Relativ...,True,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
966,969,Show me publications by Wen-Chen Chen on the t...,"[170057, 246297, 304422, 1287867, 1296249, 123...","[1545119, 1404190, 1525152, 1310094, 1221928, ...","[1279944, 1578697, 1665193, 1600209, 1655732]",0.0,False,False,0.0,"[170057, 246297]","[Wen-Chen Chen, Wen-Chen Chen]",Wen-Chen Chen,True,2
980,981,Does any research from Technische Universität ...,"[1106530, 1105545, 1108252, 1134317, 1146673, ...","[1703398, 1224569, 1347427, 1821259, 1851156, ...","[1201448, 1196314, 1729551]",1.0,False,False,0.0,"[1119616, 1106530]","[Magnetic moment, Technische Universität München]",Technische Universität München,True,2
984,985,Find publications by Zhen Zhang on automated t...,"[1136378, 1125741, 433318, 1842840, 1856921, 3...","[1836965, 1836702, 1852560, 1703266, 1865315, ...",[1836965],1.0,True,True,1.0,"[402698, 353492]","[Zhen Zhang, Zhen Zhang]",Radiation treatment planning,False,2
987,988,I'm searching for articles co-authored by the ...,"[1145668, 1281470, 1352249, 1281470, 1579736, ...","[1281470, 1641185, 1693233, 1580437, 1231371, ...","[1278116, 1826087, 1231817, 1322581, 1324765]",0.2,False,False,0.2,"[1145668, 1281470]","[Foldy–Wouthuysen transformation, Foldy–Wouthu...",Foldy–Wouthuysen transformation,True,2
