In [29]:
"""
Author: James McGreivy
Email: mcgreivy@mit.edu
"""

import numpy as np
import os
import matplotlib.pyplot as plt
import matplotlib
import regex as re 
import subprocess
import importlib
from collections import deque
import json
import asyncio
from tqdm import tqdm
import pickle
import glob

os.chdir("/work/submit/mcgreivy/beauty-in-stats/src/graph_rag")

import SystematicsGraph

In [40]:
def build_papers_cache(tex_dir, abstract_dir, cache_dir, max_papers=-1):

    os.makedirs(cache_dir, exist_ok=True)
    # Remove everything currently in the cache directory
    files = glob.glob(os.path.join(cache_dir, "*"))
    for f in files:
        print(files)
        os.remove(f)

    print(f"Loading papers from {tex_dir}")

    tex_files = [f for f in os.listdir(tex_dir) if f.endswith(".tex")]
    
    if max_papers != -1:
        tex_files = tex_files[:max_papers]
    
    for file in tqdm(tex_files, desc="Loading papers"):

        abstract_path = os.path.join(abstract_dir, file)
        try:
            with open(abstract_path, 'r', encoding='utf-8') as f:
                abstract = f.read()
        except Exception as e:
            tqdm.write(f"Warning: Abstract not found for {file} \n {e}")
            continue

        tex_path = os.path.join(tex_dir, file)
        with open(tex_path, 'r', encoding='utf-8') as f:
            text = f.read()

        arxiv_id = file[:file.find(".tex")]

        paper = SystematicsGraph.LHCbPaper(abstract, text, arxiv_id)
        
        with open(os.path.join(cache_dir, paper.arxiv_id + ".pkl"), "wb") as f:
            pickle.dump(paper, f)
            

In [None]:
tex_dir= "../scraper/data/cleaned_tex"
abstract_dir="../scraper/data/abstracts"
cache_dir="../systematics_graph_cache/papers"

build_papers_cache(tex_dir, abstract_dir, cache_dir, 5)

['../systematics_graph_cache/papers/2503.02711.pkl', '../systematics_graph_cache/papers/2502.18987.pkl', '../systematics_graph_cache/papers/2501.12611.pkl', '../systematics_graph_cache/papers/2502.04013.pkl', '../systematics_graph_cache/papers/2501.14943.pkl']
['../systematics_graph_cache/papers/2503.02711.pkl', '../systematics_graph_cache/papers/2502.18987.pkl', '../systematics_graph_cache/papers/2501.12611.pkl', '../systematics_graph_cache/papers/2502.04013.pkl', '../systematics_graph_cache/papers/2501.14943.pkl']
['../systematics_graph_cache/papers/2503.02711.pkl', '../systematics_graph_cache/papers/2502.18987.pkl', '../systematics_graph_cache/papers/2501.12611.pkl', '../systematics_graph_cache/papers/2502.04013.pkl', '../systematics_graph_cache/papers/2501.14943.pkl']
['../systematics_graph_cache/papers/2503.02711.pkl', '../systematics_graph_cache/papers/2502.18987.pkl', '../systematics_graph_cache/papers/2501.12611.pkl', '../systematics_graph_cache/papers/2502.04013.pkl', '../syst

Loading papers: 100%|██████████| 5/5 [00:00<00:00, 175.58it/s]


In [48]:
async def process_papers_cache(cache_dir, threads=6, timeout=360):
    
    files = list(filter(lambda x : ".pkl" in x, os.listdir(cache_dir)))

    # Create progress bar
    pbar = tqdm(total=len(files), desc="Processing papers")
    semaphore = asyncio.Semaphore(threads)
    
    async def process_with_limit(file_path):

        with open(file_path, "rb") as f:
            paper = pickle.load(f)

        """Process a single paper with semaphore limiting."""
        if hasattr(paper, "relationships") and len(paper.relationships) > 0:
            tqdm.write(f"Already processed {paper.arxiv_id}, skipping...")
            pbar.update(1)
            return
        
        async with semaphore:
            try:
                await asyncio.wait_for(
                    paper.process_paper(),
                    timeout=timeout
                )
                tqdm.write(f"Successfully processed: {paper.arxiv_id}")

                with open(file_path, "wb") as f:
                    pickle.dump(paper, f)

            except Exception as e:
                tqdm.write(f"Exception {e} while processing {paper.arxiv_id}")
            finally:
                pbar.update(1)
    
    tasks = [process_with_limit(os.path.join(cache_dir, file)) for file in files]        
    await asyncio.gather(*tasks)
    pbar.close()
    
    print("Finished processing all papers")

import nest_asyncio
nest_asyncio.apply()
asyncio.run(process_papers_cache(cache_dir))

Processing papers: 100%|██████████| 5/5 [00:00<00:00, 454.43it/s]

Already processed 2503.02711, skipping...
Already processed 2502.18987, skipping...
Already processed 2501.12611, skipping...
Already processed 2502.04013, skipping...
Already processed 2501.14943, skipping...
Finished processing all papers





In [None]:
def build_graph(papers):
    graph = SystematicsGraph.SystematicsGraph()
    
    for paper in tqdm(papers, desc="Loading papers into graph"):
        graph.load_paper(paper)
    
    return graph


def merge_entities(graph):
    print("Merging similar entities...")
    
    # Merge uncertainty sources
    print("Merging uncertainty sources...")
    graph.merge_entity_type("uncertainty_source", ["type"], 0.2, True)
    
    # Merge methods
    print("Merging methods...")
    graph.merge_entity_type("method", [], 0.2, True)
    
    print("Entity merging complete")


def push_to_neo4j(graph, uri, username, password):
    print(f"Pushing graph to Neo4j at {uri}")
    graph.push_to_neo4j(uri, username, password)
    print("Successfully pushed graph to Neo4j")


async def main():
    # Neo4j connection details
    NEO4J_URI = "neo4j+s://2d257b33.databases.neo4j.io"
    NEO4J_USERNAME = "neo4j"
    NEO4J_PASSWORD = "LrVuuzEjpH3gmxLAFlOwgZoKnDCnX5AU3rRqS0PW97g"
    
    text_dir="../scraper/data/cleaned_tex/", 
    abstract_dir="../scraper/data/abstracts/"

    save_dir = "./systematics_graph_cache/lhcb_papers.pkl"

    papers = load_papers(text_dir, abstract_dir, save_dir, 4)

    
    await process_papers(papers, threads=3)
    
    save_papers(papers, )
    
    # graph = build_graph(papers)
    
    # merge_entities(graph)
    
    # push_to_neo4j(graph, NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD)


if __name__ == "__main__":
    pass
    #asyncio.run(main())

### Graph Querying

In [1]:
from GraphQueryAgent import GraphQueryAgent

uri="neo4j+s://2d257b33.databases.neo4j.io"
username="neo4j"
password = "LrVuuzEjpH3gmxLAFlOwgZoKnDCnX5AU3rRqS0PW97g"
agent = GraphQueryAgent(uri, username, password)

In [2]:
query = "What systematic tends to result in the largest uncertainty in studies of CP violation? Roughly how large are these uncertainties?"
answer = agent.query(query)

NameError: name 're' is not defined

In [16]:
print("Query:", query)
print("\nCypher Query:", answer["cypher_query"])
print("\nexplanation:", answer["explanation"])
print("\nSynthesized Answer:\n", answer["synthesized_answer"])

Query: What systematic tends to result in the largest uncertainty in studies of CP violation? Roughly how large are these uncertainties?

Cypher Query: MATCH (o:observable) WHERE o.type CONTAINS 'physical_constant' WITH o ORDER BY vector.similarity.cosine(o.embedding, $("CP violation")) DESC LIMIT 20
MATCH (u:uncertainty_source)-[r:affects]-(o)
WITH u, o, r
WHERE u.type <> 'statistical'
WITH u.name AS uncertainty_name, u.description AS uncertainty_description, u.arxiv_id AS arxiv_id, o.name AS observable_name, o.arxiv_id AS observable_arxiv, r.ranking AS ranking, r.magnitude AS magnitude
WITH uncertainty_name, uncertainty_description, arxiv_id, COLLECT(DISTINCT [observable_name, observable_arxiv]) AS observables, AVG(ranking) AS avg_ranking, COLLECT(DISTINCT magnitude) AS magnitudes
RETURN uncertainty_name, uncertainty_description, arxiv_id, observables, avg_ranking, magnitudes
ORDER BY avg_ranking ASC
LIMIT 20

explanation: This Cypher query identifies the systematic uncertainty sourc

In [8]:
print(agent.process_cypher_query("""MATCH (o:observable) WHERE o.type CONTAINS 'physical_constant' WITH o ORDER BY vector.similarity.cosine(o.embedding, $("CP violation")) DESC LIMIT 20
MATCH (u:uncertainty_source)-[r:affects]-(o)
WITH u, o, r
WHERE u.type <> 'statistical'
WITH u.name AS uncertainty_name, u.description AS uncertainty_description, u.arxiv_id AS arxiv_id, o.name AS observable_name, o.arxiv_id AS observable_arxiv, r.ranking AS ranking, r.magnitude AS magnitude
WITH uncertainty_name, uncertainty_description, arxiv_id, COLLECT(DISTINCT [observable_name, observable_arxiv]) AS observables, AVG(ranking) AS avg_ranking, COLLECT(DISTINCT magnitude) AS magnitudes
RETURN uncertainty_name, uncertainty_description, arxiv_id, observables, avg_ranking, magnitudes
ORDER BY avg_ranking ASC
LIMIT 20
"""))

MATCH (o:observable) WHERE o.type CONTAINS 'physical_constant' WITH o ORDER BY vector.similarity.cosine(o.embedding, [-0.002668354, 0.014479429, -0.036080487, 0.022082537, -0.015610004, -0.023077147, -0.022156319, 0.002087259, -0.018545624, -0.0027308273, 0.023635834, -0.03224365, 0.018095437, -0.0036364584, 0.021645857, -0.032337744, 0.031136252, -0.011478264, -0.03174111, -0.020712188, 0.023705246, -0.04409919, -0.0008243269, 0.04496399, -0.039289396, 0.018980563, -0.004292594, -0.016523592, -0.035786852, -0.01370339, -0.027106859, -0.049007773, 0.027676182, -0.06254653, 0.003279017, -0.048825674, 0.0006639421, 0.001900897, -0.052226108, 0.024750035, 0.012985992, -0.06204855, 0.006578718, -0.038730275, 0.0329035, -0.02190374, 0.0044309385, -0.036377124, 0.0009073392, -0.014752998, 0.021476507, 0.043894425, 0.017962892, -0.051165454, 0.057382576, 0.022970937, -0.0036581885, -0.012254066, -0.041973393, -0.04240697, -0.025960611, -0.038928524, 0.023840323, 0.022106225, -0.008647565, 0.1