In [3]:
import json
import os
import sys
from pathlib import Path

import torch

sys.path.append("../")

import pandas as pd

from config import DATA_DIR
from graph_types.mag import MagGraph
from src.llms.entity_extraction import extract_entities_from_question

qas = pd.read_csv(DATA_DIR / "02_qa_datasets/mag.csv")
mag_graph = MagGraph.load()
doc_embeddings = torch.load(
    DATA_DIR / f"node_embeddings/{mag_graph.name}/text-embedding-ada-002/doc/candidate_emb_dict.pt"
)
query_embeddings = torch.load(
    DATA_DIR / f"node_embeddings/{mag_graph.name}/text-embedding-ada-002/query/query_emb_dict.pt"
)


  nodes_df = pd.read_csv(nodes_file)


In [21]:
question_index = 123
row = qas.iloc[question_index]
question = row["question"]
question_embedding = query_embeddings[question_index][0]
answer_indices = json.loads(row["answer_indices"])

In [22]:
entities = extract_entities_from_question(question)

central_nodes = []
for entity in entities:
    nodes = mag_graph.search_nodes(entity, k=1)
    central_nodes.extend(nodes)

In [34]:
question

'Looking for research papers co-authored by authors of "Bimodal distribution of the magnetic dipole moment in nanoparticles with a monomodal distribution of the physical size" focusing on the unconventional phase transition in unique materials described in the same study.'

In [45]:
mag_graph.get_neighbors(central_nodes[0])

{AuthorNode(name=Ben H. Erné, index=33814, type=author),
 AuthorNode(name=Bonny W. M. Kuipers, index=243701, type=author),
 AuthorNode(name=Jos van Rijssel, index=61131, type=author),
 FieldOfStudyNode(name=Condensed matter physics, index=1116339, type=field_of_study),
 FieldOfStudyNode(name=Demagnetizing field, index=1120271, type=field_of_study),
 FieldOfStudyNode(name=Dipole, index=1133173, type=field_of_study),
 FieldOfStudyNode(name=Magnetic anisotropy, index=1117036, type=field_of_study),
 FieldOfStudyNode(name=Magnetic dipole, index=1132780, type=field_of_study),
 FieldOfStudyNode(name=Magnetic nanoparticles, index=1123287, type=field_of_study),
 FieldOfStudyNode(name=Magnetization, index=1116982, type=field_of_study),
 FieldOfStudyNode(name=Nuclear magnetic resonance, index=1118568, type=field_of_study),
 FieldOfStudyNode(name=Paramagnetism, index=1127607, type=field_of_study),
 FieldOfStudyNode(name=Physics, index=1127241, type=field_of_study),
 FieldOfStudyNode(name=Single do

In [24]:
distances = pd.DataFrame(
    columns=["src_index", "dst_index", "distance"],
)
for node in central_nodes:
    node_distances = mag_graph.distance_to_all(node, d=2)
    node_distances["src_index"] = node.index
    distances = pd.concat([distances, node_distances], ignore_index=True)

In [30]:
dst_counts = distances['dst_index'].value_counts()
filtered_distances = distances[distances['dst_index'].isin(dst_counts[dst_counts > 1].index)]
filtered_distances

Unnamed: 0,src_index,dst_index,distance
0,1559404,1559404,0
2,1559404,1116339,1
5,1559404,1118568,1
8,1559404,1127241,1
12,1559404,1189797,1
...,...,...,...
1400569,1713459,1118568,2
1400572,1713459,1123147,2
1400586,1113662,1753550,1
1400589,1113662,1127241,2


In [32]:
mean_distances = filtered_distances.groupby('dst_index')['distance'].mean().sort_values(ascending=False)

In [42]:
mean_distances[mean_distances.index == 1548176]

dst_index
1548176    2.0
Name: distance, dtype: object

In [27]:
# Get all unique src_index values
all_src_indexes = distances['src_index'].unique()
total_src_count = len(all_src_indexes)

# Find dst_index values that appear with ALL src_index values
dst_counts = distances.groupby('dst_index')['src_index'].nunique()
valid_dst_indexes = dst_counts[dst_counts == total_src_count].index

# Filter to only those dst_index values and calculate mean distance
result = (distances[distances['dst_index'].isin(valid_dst_indexes)]
          .groupby('dst_index')['distance']
          .mean()
          .reset_index()
          .rename(columns={'distance': 'mean_distance'}))

In [28]:
result

Unnamed: 0,dst_index,mean_distance
0,1127241,1.333333
1,1668796,2.0
2,1753550,1.666667


In [29]:
answer_indices

[1548176]

In [12]:
distances

Unnamed: 0,src_index,dst_index,distance
0,1106065,1106065,0
1,1106065,835790,1
2,1106065,1254035,2
3,1106065,1600189,2
4,1106065,1108464,2
...,...,...,...
704088,1141741,1415008,2
704089,1141741,1641159,2
704090,1141741,1647475,2
704091,1141741,6026,2


In [None]:

candidate_df = pd.DataFrame(
    [(node.name, node.index, node.type, value) for node, value in candidate_rank.items()],
    columns=["name", "index", "type", "connectedness"],
)

candidate_df = candidate_df[candidate_df["type"] == "paper"]

candidate_df["embedding"] = candidate_df["index"].apply(lambda index: doc_embeddings[index])
candidate_df["similarity"] = candidate_df["embedding"].apply(
    lambda embedding: torch.matmul(
        question_embedding.detach().clone(), embedding.detach().clone().T
    ).item()
)

candidate_df = candidate_df.sort_values(
    ["connectedness", "similarity"], ascending=[False, False]
)

retrieved_indices = candidate_df["index"].tolist()
hit_1 = retrieved_indices[0] in answer_indices if retrieved_indices else False
hit_5 = any([retrieved_index in answer_indices for retrieved_index in retrieved_indices[:5]])
recall_20 = len(set(retrieved_indices[:20]) & set(answer_indices)) / len(answer_indices)

log = {
    "question": question,
    "entities": [entity for entity in entities],
    "central_nodes": [node.to_doc() for node in central_nodes],
    "retrieved_indices": retrieved_indices,
    "answer_indices": answer_indices,
    "hit@1": hit_1,
    "hit@5": hit_5,
    "recall@20": recall_20,
}

with open(results_dir / f"{question_index}.json", "w") as f:
    json.dump(log, f, indent=4)
    candidate_df.to_csv(results_dir / f"{question_index}_candidates.csv", index=False)

print(f"Processed question {question_index}")

In [60]:
for i in range(10):
    print(f"Question {i}: {qas.iloc[i]['question']}")
    print(f"Answer: {qas.iloc[i]['answer_indices']}")


Question 0: Does any research from the Indian Maritime University touch upon Fe II energy level transitions within the scope of Configuration Interaction?
Answer: [1600189]
Question 1: Show me articles related to magnetic field studies within the Digitized Sky Survey discipline.
Answer: [1265762]
Question 2: Show me publications by J. Karch that evaluate various imaging techniques in paleontological research.
Answer: [1787579]
Question 3: What are the papers that reference "The interaction between feedback from active galactic nuclei and supernovae" and also explore the effects of black hole feedback on galaxy groups, similar to the 2010 study on this subject?
Answer: [1561967]
Question 4: Looking for papers published in 2014 in the field of Optics, pertinent to process observation in selective laser melting and related to the International Federation of Sport Climbing.
Answer: [1572865]
Question 5: Are there any published papers from the coauthors of "On the structure of isomeric stat

In [61]:
node = mag_graph.search_nodes("Observed cumulative time delay between second harmonic and fundamental component of pressure wave fields propagating through ultrasound contrast agents", k=5)[0]
print(node)
mag_graph.distance_to_all(node, d=2)

PaperNode(name=Observed cumulative time delay between second harmonic and fundamental component of pressure wave fields propagating through ultrasound contrast agents, index=1283454, type=paper)


Unnamed: 0,dst_index,distance
0,1283454,0
1,1116128,1
2,1120645,1
3,1127165,1
4,1127241,1
...,...,...
700290,358366,2
700291,504518,2
700292,873026,2
700293,915414,2


In [62]:
mag_graph.get_neighbors(node)

{AuthorNode(name=Giovanna Russo, index=277416, type=author),
 AuthorNode(name=Hessel Wijkstra, index=111353, type=author),
 AuthorNode(name=Libertario Demi, index=151676, type=author),
 AuthorNode(name=M Massimo Mischi, index=118327, type=author),
 FieldOfStudyNode(name=-1.0, index=1164287, type=field_of_study),
 FieldOfStudyNode(name=Acoustics, index=1116128, type=field_of_study),
 FieldOfStudyNode(name=Backscatter, index=1129862, type=field_of_study),
 FieldOfStudyNode(name=Harmonic, index=1127984, type=field_of_study),
 FieldOfStudyNode(name=Mechanical index, index=1129151, type=field_of_study),
 FieldOfStudyNode(name=Microbubbles, index=1120645, type=field_of_study),
 FieldOfStudyNode(name=Optics, index=1127165, type=field_of_study),
 FieldOfStudyNode(name=Path length, index=1128101, type=field_of_study),
 FieldOfStudyNode(name=Physics, index=1127241, type=field_of_study),
 FieldOfStudyNode(name=Ultrasound, index=1129750, type=field_of_study),
 PaperNode(name=Angiogenesis imaging b