In [1]:
import ast
import importlib
import os
import subprocess
from collections import deque

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import regex as re

from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Document

from llama_index.llms.openai import OpenAI
# Shared LLM client for the extraction cells below; temperature=0 asks for the
# least random sampling so repeated runs stay as comparable as possible.
llm = OpenAI(temperature=0, model="gpt-4o-mini")

# LHCbKnowledgeGraph is imported by bare module name right after this chdir,
# so presumably the module lives in this directory (verify if the repo moves).
os.chdir("/work/submit/mcgreivy/beauty-in-stats/graph_rag")

import LHCbKnowledgeGraph
# Re-import the module so edits made to it on disk are picked up without
# restarting the kernel.
importlib.reload(LHCbKnowledgeGraph)

# NOTE(review): presumably needed so async llama_index calls can run inside
# Jupyter's already-running event loop -- confirm before removing.
import nest_asyncio
nest_asyncio.apply()

Chunked Stats Corpus

In [2]:
def clean_latex(text):
    r"""Strip non-prose LaTeX from a paper source and normalise its whitespace.

    Steps, in order:
      1. Replace figure / wrapfigure / thebibliography environments,
         ``\label{...}`` commands, and everything up to and including
         ``\begin{document}`` with a single space.
      2. If ``\section{Introduction}`` occurs, drop everything before it.
      3. Remove ``%`` comments together with the newline that ends them.
         An escaped percent sign (``\%``) is kept.
      4. Collapse every newline plus the whitespace following it into one
         newline, and every other whitespace run into one space.

    Args:
        text: LaTeX source as a string.

    Returns:
        The cleaned LaTeX string.
    """
    removable_patterns = [
        r'\\begin\{figure\*?\}.*?\\end\{figure\*?\}',
        r'\\begin\{wrapfigure\*?\}.*?\\end\{wrapfigure\*?\}',
        r'\\begin\{thebibliography\}.*?\\end\{thebibliography\}',
        r'\\label\{.*?\}',
        r'.*\\begin\{document\}',
    ]

    # DOTALL lets '.' cross newlines so multi-line environments match.
    for pattern in removable_patterns:
        text = re.sub(pattern, ' ', text, flags=re.DOTALL)

    marker = "\\section{Introduction}"
    position = text.find(marker)
    if position > 0:
        text = text[position:]

    # A '%' starts a comment only when it is preceded by an even number of
    # backslashes: '\%' is a literal percent sign, while '\\%' is a line break
    # followed by a comment. The captured backslash pairs are put back.
    # '(?:\n|\Z)' also catches a comment on the last line without a newline.
    text = re.sub(r'(?<!\\)((?:\\\\)*)%.*(?:\n|\Z)', r'\1', text)
    text = re.sub(r"\n\s*", "\n", text)
    text = re.sub(r'([^\S\n])+', ' ', text)
    return text

def split_sections(text, depth=0, title="", max_tokens=6000):
    r"""Recursively split LaTeX text into chunks along (sub)section headings.

    At ``depth`` d the text is cut at every ``\`` + ``"sub" * d`` + ``section``
    heading (optionally starred / with a ``[short title]``). Each piece is
    prefixed with the accumulated heading titles. A piece whose token count
    (per ``LHCbKnowledgeGraph.embedding_model.tokenizer``) exceeds
    ``max_tokens`` is split again one heading level deeper.

    Args:
        text: LaTeX text to split.
        depth: Heading level to split on (0 = ``\section``, 1 = ``\subsection``, ...).
        title: Heading text accumulated from the enclosing levels.
        max_tokens: Token budget above which a piece is split further.

    Returns:
        A list of chunk strings. Pieces containing only whitespace are dropped.
    """
    # No finer heading level is tried beyond depth 3: return the text whole,
    # keeping the inherited headings so the chunk does not lose its context.
    if depth > 3:
        return [f"{title}\n{text}" if len(title) > 0 else text]

    # Group 2 matches a brace-balanced {...} title via the recursive (?2) call;
    # this (and the possessive '*+') relies on the third-party `regex` engine.
    pattern = r"(\\" + "sub" * depth + r"section[\*\s]*(?:\[[^\]]*\])?\s*({(?:[^{}]*+|(?2))*}))"
    heading_spans = [(m.start(), m.end()) for m in re.finditer(pattern, text)]
    # Sentinel closing the last piece at the true end of the text. Using -1
    # here would slice text[start:-1] and lose the final character.
    heading_spans.append((len(text), len(text)))

    # When no heading is found at this level the whole text is handled as a
    # single piece, so an oversized text still gets a chance at deeper levels.
    sections = []
    start = 0
    section_title = ""
    for heading_start, heading_end in heading_spans:
        section_text = text[start:heading_start]
        if re.search(r"\S", section_text):
            new_title = f"{title}\n{section_title}" if len(title) > 0 else section_title
            num_tokens = len(LHCbKnowledgeGraph.embedding_model.tokenizer.tokenize(section_text))
            if num_tokens > max_tokens:
                # Forward max_tokens so a caller-supplied budget applies at
                # every level, not only the first.
                sections.extend(
                    split_sections(
                        section_text,
                        depth=depth + 1,
                        title=new_title,
                        max_tokens=max_tokens,
                    )
                )
            else:
                sections.append(new_title + section_text)

        # The heading just passed titles the piece that follows it.
        start = heading_end
        section_title = text[heading_start:heading_end]

    return sections

In [3]:
# Build `chunks` from a single paper of the cleaned-TeX corpus.
corpus_path = "/work/submit/mcgreivy/beauty-in-stats/src/scraper/data/cleaned_tex"
paper_index = 5  # zero-based position of the paper to process among the .tex files

# os.listdir returns entries in arbitrary order; sorting makes "paper number
# paper_index" refer to the same file on every run and every machine.
tex_files = sorted(name for name in os.listdir(corpus_path) if name.endswith(".tex"))

chunks = []
if paper_index < len(tex_files):
    file = tex_files[paper_index]
    file_path = os.path.join(corpus_path, file)
    # Only the selected paper is opened; the others are never read.
    with open(file_path) as f:
        text = clean_latex(f.read())
    sections = split_sections(text)
    chunks.extend(sections)

In [15]:
# Run the LLM entity-extraction prompt on every chunk and pool the results.
entities = []
for chunk in chunks:
    print(chunk)
    prompt = LHCbKnowledgeGraph.get_lhcb_entity_extraction_prompt(chunk)
    # The model reply is untrusted text, so it must not be passed to eval().
    # ast.literal_eval accepts only Python literals and cannot run code.
    # NOTE(review): presumably the prompt asks for a literal list of dicts --
    # confirm against get_lhcb_entity_extraction_prompt.
    found_entities = ast.literal_eval(llm.complete(prompt).text)
    entities.extend(found_entities)

\section{Systematic uncertainties}
Systematic uncertainties on the {{\calB} ( {{{\Lambda} ^0_{ b } } {\to} { J / {\psi} } {{\Xi} ^-} {{ K } ^+} } )/{\calB} ( {{{\Lambda} ^0_{ b } } {\to} { J / {\psi} } {\Lambda} } )} and {{\calB} ( {{{\Xi} ^0_{ b } } {\to} { J / {\psi} } {{\Xi} ^-} {{\pi} ^+} } )/{\calB} ( {{{\Xi} ^-_{ b } } {\to} { J / {\psi} } {{\Xi} ^-} } )} measurements arise from several sources. A summary of the uncertainties is provided in Table~\ref{tab:systematics:summary}.
\begin{table}[!htb]
\caption{Summary of relative systematic uncertainties (in percent) for the measured ratio of branching fractions. The individual sources are described in the text. The total relative uncertainty is determined by adding the individual sources in quadrature.}
\begin{tabular}{lcc}
\hline
Source & {\frac{\mathlarger{\calB}( {{{\Lambda} ^0_{ b } } {\to} { J / {\psi} } {{\Xi} ^-} {{ K } ^+} } )}{\mathlarger{\calB}( {{{\Lambda} ^0_{ b } } {\to} { J / {\psi} } {\Lambda} } )}} [\\hline
Fit model 

In [16]:
# Print every extracted entity record, one per line.
for entity in entities:
    print(entity)

{'entity_name': 'branching fraction ratio of Λ^0_b to J/ψ Ξ^- K^+', 'entity_type': 'measurement_quantity', 'justification': 'This quantity is measured as part of the systematic uncertainties analysis, specifically for the decay process Λ^0_b to J/ψ Ξ^- K^+.'}
{'entity_name': 'branching fraction ratio of Ξ^0_b to J/ψ Ξ^- π^+', 'entity_type': 'measurement_quantity', 'justification': 'This quantity is also measured in the context of systematic uncertainties, specifically for the decay process Ξ^0_b to J/ψ Ξ^- π^+.'}
{'entity_name': 'Λ^0_b to J/ψ Ξ^- K^+', 'entity_type': 'decay_process', 'justification': 'This decay process is part of the branching fraction ratio being measured and analyzed for systematic uncertainties.'}
{'entity_name': 'Ξ^0_b to J/ψ Ξ^- π^+', 'entity_type': 'decay_process', 'justification': 'This decay process is included in the analysis of the branching fraction ratio and associated systematic uncertainties.'}
{'entity_name': 'Λ^0_b to J/ψ Λ', 'entity_type': 'decay_proc

In [17]:
# Show only the entities typed as "measurement_quantity".
for entity in entities:
    if entity["entity_type"] != "measurement_quantity":
        continue
    print(entity)

{'entity_name': 'branching fraction', 'entity_type': 'measurement_quantity'}
{'entity_name': '9 fb^{-1}', 'entity_type': 'measurement_quantity'}
{'entity_name': 'R_{BF}', 'entity_type': 'measurement_quantity'}
{'entity_name': '\\mathcal{R}_{BF} \\equiv \x0crac{\\mathcal{B}(B^{+} \to \\psi(2S) \\phi K^{+})}{\\mathcal{B}(B^{+} \to J/\\psi \\phi K^{+})}', 'entity_type': 'measurement_quantity'}


In [19]:
# Show only the entities typed as "decay_property".
# filter() is lazy, so records are still tested and printed one at a time.
is_decay_property = lambda entity: entity["entity_type"] == "decay_property"
for entity in filter(is_decay_property, entities):
    print(entity)

{'entity_name': '80 times smaller phase space', 'entity_type': 'decay_property'}
{'entity_name': '5 times larger average amplitude squared', 'entity_type': 'decay_property'}
{'entity_name': 'resonant amplitudes in B^{+} \to \\psi(2S) \\phi K^{+}', 'entity_type': 'decay_property'}


In [100]:
from py2neo import Graph, Node, Relationship

# Connection settings are read from the environment so that no secret is
# stored in the notebook. NEO4J_PASSWORD has no default on purpose: a missing
# variable fails here with a KeyError instead of falling back to a stored secret.
# NOTE(review): a password was previously committed in this cell -- rotate it.
uri = os.environ.get("NEO4J_URI", "neo4j+s://2d257b33.databases.neo4j.io")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ["NEO4J_PASSWORD"]

graph = Graph(uri, auth=(username, password))

# WARNING: destructive -- clears the whole database before re-populating it.
graph.delete_all()

# Maps entity name -> created Node, so relationships can look up their endpoints.
entity_nodes = {}

for entity in entities:
    # NOTE(review): this reads entity["description"], while the extraction
    # output printed earlier in the notebook carries a "justification" key --
    # confirm which `entities` list this cell is meant to consume.
    node = Node(
        entity["entity_type"],
        name=entity["entity_name"],
        description=entity["description"],
    )

    graph.create(node)
    entity_nodes[entity["entity_name"]] = node

# NOTE(review): `relationships` is not defined in any cell visible here; it
# must come from a relationship-extraction step run before this cell.
endpoint_keys = ("source", "target", "relation")
for rel in relationships:
    try:
        source_node = entity_nodes[rel["source"]]
        target_node = entity_nodes[rel["target"]]
        relation_type = rel["relation"]
        # Every remaining key becomes a property on the relationship.
        properties = {key: value for key, value in rel.items() if key not in endpoint_keys}

        graph.create(
            Relationship(
                source_node,
                relation_type,
                target_node,
                **properties
            )
        )

    except Exception as e:
        # Best effort: skip a relationship that cannot be created (e.g. an
        # endpoint that is not a known entity), but report which one and why.
        print(f"Skipping relationship {rel!r}: {e!r}")