In [1]:
import json
import pandas as pd
import sys

# Extend the import search path two directory levels up. The path is relative,
# so it is resolved against the kernel's current working directory.
# NOTE(review): presumably "../../" is the repository root that provides the
# `config` and `graph_types` packages imported below — confirm.
sys.path.append("../../")

from config import DATA_DIR
from graph_types.graph import Graph

# Name of the graph under analysis. It selects the experiment log directory
# (DATA_DIR / "experiments/<graph_name>/bm25") and is the argument passed to
# Graph.load later in the notebook.
graph_name = "mag"

In [2]:
def _hit_at_k(answer_indices, retrieved_indices, k):
    """Return True if any of the first ``k`` retrieved indices is a gold answer.

    An empty retrieval list (or an empty answer list) yields False.
    """
    return not set(answer_indices).isdisjoint(retrieved_indices[:k])


def _recall_at_k(answer_indices, retrieved_indices, k=None):
    """Fraction of distinct gold answers found among the first ``k`` retrieved.

    ``k=None`` considers the complete retrieval list. When there are no gold
    answers the recall is undefined, so NaN is returned instead of dividing by
    zero; pandas skips NaN when averaging the column.
    """
    answers = set(answer_indices)
    if not answers:
        return float("nan")
    return len(answers.intersection(retrieved_indices[:k])) / len(answers)


logs_dir = DATA_DIR / f"experiments/{graph_name}/bm25"
# NOTE: rows are ordered by st_ctime, which is filesystem metadata (metadata
# change time on Unix, creation time on Windows). Copying or re-downloading the
# logs can therefore change the row order, and with it the shuffled order used
# further down. The aggregate metrics below do not depend on the order.
json_files = sorted(logs_dir.glob("*.json"), key=lambda x: x.stat().st_ctime)

data = []

for json_file in json_files:
    # JSON is UTF-8 by specification; do not rely on the locale default.
    with open(json_file, "r", encoding="utf-8") as f:
        log_data = json.load(f)

    # Keep only the fields needed for the retrieval metrics. The file stem is
    # expected to be the numeric id of the question.
    data.append(
        {
            "file_id": int(json_file.stem),
            "question": log_data.get("question", ""),
            "answer_indices": log_data.get("answer_indices", []),
            "bm25_indices": log_data.get("bm25_indices", []),
        }
    )

# Naming the columns explicitly keeps them present even if no log file matched.
df = pd.DataFrame(data, columns=["file_id", "question", "answer_indices", "bm25_indices"])

# (column name, metric function, cut-off k, dtype) — listed in the order in
# which the columns are appended to the frame.
metric_specs = [
    ("recall@all", _recall_at_k, None, float),
    ("hit@1", _hit_at_k, 1, bool),
    ("hit@5", _hit_at_k, 5, bool),
    ("hit@10", _hit_at_k, 10, bool),
    ("recall@10", _recall_at_k, 10, float),
    ("recall@20", _recall_at_k, 20, float),
]

answer_retrieval_pairs = list(zip(df["answer_indices"], df["bm25_indices"]))
for column, metric, k, dtype in metric_specs:
    df[column] = pd.Series(
        [metric(answers, retrieved, k) for answers, retrieved in answer_retrieval_pairs],
        index=df.index,
        dtype=dtype,
    )

# Summary of the mean metrics, rounded to three decimals (last expression of
# the cell, so the notebook displays it).
[
    ("n", len(df)),
    ("Hit@1", float(round(df["hit@1"].mean(), 3))),
    ("Hit@5", float(round(df["hit@5"].mean(), 3))),
    ("Recall@10", float(round(df["recall@10"].mean(), 3))),
    ("Recall@20", float(round(df["recall@20"].mean(), 3))),
    ("Recall@all", float(round(df["recall@all"].mean(), 3))),
]

[('n', 1000),
 ('Hit@1', 0.229),
 ('Hit@5', 0.433),
 ('Recall@10', 0.385),
 ('Recall@20', 0.456),
 ('Recall@all', 0.606)]

In [3]:
# Load the graph only when the kernel does not already hold one, so re-running
# this cell reuses the existing `graph` object instead of calling Graph.load
# again.
if "graph" not in globals():
    graph = Graph.load(graph_name)

In [4]:
# Shuffle the rows with a fixed seed and renumber them from 0.
# NOTE: `df` is reassigned from itself, so running this cell a second time
# permutes the already shuffled frame again.
df = (
    df.sample(frac=1, random_state=42)  # a full-size sample is a seeded permutation
    .reset_index(drop=True)
)

In [5]:
# Print every question for which none of the gold answers appears among the
# first 20 BM25 results, together with the names of its top-10 BM25 nodes.
for _, row in df[df["recall@20"] == 0].iterrows():
    print(f"File ID: {row['file_id']}")
    print(f"Question: {row['question']}")

    bm25_indices = row["bm25_indices"]
    # Slice before the lookup so only the ten nodes that get printed are
    # resolved through the graph.
    nodes = [graph.get_node_by_index(idx) for idx in bm25_indices[:10]]
    print("bm25 Nodes:", "\n".join(node.name for node in nodes))

    print()

File ID: 737
Question: Find papers on the development of optical imaging methods that are referenced in "Fluorescence and SEM correlative microscopy for nanomanipulation of subcellular structures".
bm25 Nodes: Fluorescence and SEM correlative microscopy for nanomanipulation of subcellular structures
Superresolution optical fluctuation imaging (SOFI) aided nanomanipulation of quantum dots using AFM for novel artificial arrangements of chemically functionalized colloidal quantum dots and plasmonic structures
Super-resolution endoscopy for real-time wide-field imaging
Combined multi-plane phase retrieval and super-resolution optical fluctuation imaging for 4D cell microscopy.
Fluorescence and second-harmonic generation correlative microscopy to probe space charge separation and silver cluster stabilization during direct laser writing in a tailored silver-containing glass
The 2018 correlative microscopy techniques roadmap.
Development of functional nanoprobes for optical near-field charact