In [1]:
import numpy as np
import os
import matplotlib.pyplot as plt
import matplotlib
import regex as re 
import subprocess
import importlib
from collections import deque

from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Document

from llama_index.llms.openai import OpenAI
# Deterministic (temperature 0) OpenAI chat model handle.
llm = OpenAI(temperature=0, model="gpt-4o-mini")

# Switch the working directory before importing the local module below; the
# relative "./data/saved_kg/..." paths used later are resolved against it.
# NOTE(review): presumably StatsKnowledgeGraph.py lives in this directory and
# is only importable after the chdir - confirm.
os.chdir("/work/submit/mcgreivy/beauty-in-stats/graph_rag")

import StatsKnowledgeGraph
# Reload so that edits to the module are picked up without restarting the kernel.
importlib.reload(StatsKnowledgeGraph)

import nest_asyncio
# Patch asyncio so nested event loops are allowed (the notebook kernel already
# runs one, and the LLM calls made later are asynchronous).
nest_asyncio.apply()

Chunked Stats Corpus

In [2]:
def clean_latex(text):
    r"""Reduce raw LaTeX source to the prose that should be chunked.

    Processing order:
      1. Replace ``figure`` / ``wrapfigure`` / ``thebibliography`` environments
         and ``\label{...}`` commands with a single space, and drop everything
         up to and including the last ``\begin{document}``.
      2. If the literal ``\section{Introduction}`` occurs, discard everything
         before it.
      3. Remove ``%`` comments (from an unescaped ``%`` through the end of the
         line, newline included). An escaped ``\%`` is a literal percent sign
         and is kept.
      4. Collapse each newline plus following whitespace to a single newline,
         and each run of horizontal whitespace to a single space.

    Args:
        text: LaTeX source as a string.

    Returns:
        The cleaned string.
    """
    removable_spans = [
        r'\\begin\{figure\*?\}.*?\\end\{figure\*?\}',
        r'\\begin\{wrapfigure\*?\}.*?\\end\{wrapfigure\*?\}',
        r'\\begin\{thebibliography\}.*?\\end\{thebibliography\}',
        r'\\label\{.*?\}',
        r'.*\\begin\{document\}',
    ]

    # DOTALL lets the environment patterns span multiple lines.
    for pattern in removable_spans:
        text = re.sub(pattern, ' ', text, flags=re.DOTALL)

    marker = "\\section{Introduction}"
    position = text.find(marker)
    if position != -1:
        text = text[position:]

    # Strip comments. The lookbehind keeps escaped percent signs ("50\%"),
    # which the previous pattern treated as comment starts and thereby deleted
    # the rest of the line. `(?:\n|$)` also removes a comment on the very last
    # line when the file has no trailing newline.
    text = re.sub(r'(?<!\\)%[^\n]*(?:\n|$)', "", text)
    text = re.sub(r"\n\s*", "\n", text)
    text = re.sub(r'([^\S\n])+', ' ', text)
    return text

def split_sections(text, depth=0, title="", max_tokens=6000):
    r"""Recursively split LaTeX text into chunks along its sectioning commands.

    At ``depth`` 0 the text is cut at ``\section`` headings, at depth 1 at
    ``\subsection`` headings, and so on (one more ``sub`` prefix per level).
    Each piece whose token count, measured with
    ``StatsKnowledgeGraph.embedding_model.tokenizer``, exceeds ``max_tokens``
    is split again one level deeper; otherwise it is emitted with the chain of
    enclosing headings prepended.

    The heading pattern uses possessive quantifiers and a recursive group
    reference to match balanced braces, so ``re`` must be the third-party
    ``regex`` module (imported in this notebook as ``re``).

    Args:
        text: Cleaned LaTeX text to split.
        depth: Current sectioning level (0 = ``\section``).
        title: Newline-joined headings of the enclosing sections.
        max_tokens: Token budget above which a piece is split further.

    Returns:
        A list of chunk strings. Whitespace-only pieces are skipped.
    """
    if depth > 3:
        # No deeper sectioning level to try: emit the text as one chunk, but
        # keep the accumulated headings (they used to be dropped here).
        return [title + text]

    pattern = r"(\\" + "sub" * depth + r"section[\*\s]*(?:\[[^\]]*\])?\s*({(?:[^{}]*+|(?2))*}))"
    heading_spans = [(m.start(), m.end()) for m in re.finditer(pattern, text)]
    # Sentinel "heading" at end-of-text so the final piece is flushed by the
    # same loop. It must be len(text): the previous (-1, -1) sentinel sliced
    # text[start:-1] and silently dropped the last character.
    heading_spans.append((len(text), len(text)))

    tokenizer = StatsKnowledgeGraph.embedding_model.tokenizer

    sections = []
    start = 0
    section_title = ""
    for heading_start, heading_end in heading_spans:
        # Body of the previous heading (or the preamble before the first one).
        section_text = text[start:heading_start]
        if section_text.strip():
            new_title = f"{title}\n{section_title}" if len(title) > 0 else section_title
            num_tokens = len(tokenizer.tokenize(section_text))
            if num_tokens > max_tokens:
                # Too large: retry one sectioning level deeper, forwarding the
                # caller's token budget instead of falling back to the default.
                sections.extend(
                    split_sections(
                        section_text,
                        depth=depth + 1,
                        title=new_title,
                        max_tokens=max_tokens,
                    )
                )
            else:
                sections.append(new_title + section_text)

        start = heading_end
        section_title = text[heading_start:heading_end]

    return sections

# Walk the arXiv source tree, clean and chunk every .tex file, and remember
# which paper (the name of the containing directory) each chunk came from.
chunks = []
chunk_to_paper = {}
source_directory = "/work/submit/mcgreivy/beauty-in-stats/graph_rag/data/arXiv"
for root, dirs, files in os.walk(source_directory):
    paper_id = os.path.basename(root)
    for file in files:
        if not file.endswith(".tex"):
            continue
        file_path = os.path.join(root, file)
        # Decode explicitly instead of relying on the locale default, and
        # replace undecodable bytes rather than aborting the whole corpus
        # build. Only the read happens while the file is open.
        with open(file_path, 'r', encoding="utf-8", errors="replace") as f:
            raw_text = f.read()
        text = clean_latex(raw_text)
        sections = split_sections(text)
        chunks.extend(sections)
        for chunk in sections:
            chunk_to_paper[chunk] = paper_id

# Drop degenerate chunks of five characters or fewer.
chunks = [chunk for chunk in chunks if len(chunk) > 5]
print(len(chunks))

stats_entities, stats_relationships = StatsKnowledgeGraph.process_all_sections(chunks, chunk_size=40, sleep_time=10)

# Make sure the output directory exists so the results of the extraction run
# above are not lost to a failed save.
os.makedirs("./data/saved_kg", exist_ok=True)
np.save("./data/saved_kg/stats_entities.npy", stats_entities, allow_pickle=True)
np.save("./data/saved_kg/stats_relationships.npy", stats_relationships, allow_pickle=True)
# To reuse a previous run instead of re-extracting, comment out the extraction and save lines above and load the saved arrays:
# stats_entities, stats_relationships = np.load("./data/saved_kg/stats_entities.npy", allow_pickle=True), np.load("./data/saved_kg/stats_relationships.npy", allow_pickle=True)

Token indices sequence length is longer than the specified maximum sequence length for this model (16530 > 8192). Running this sequence through the model will result in indexing errors


178

Phase 1: Extracting entities from all chunks
Processing chunks 0 to 39
Sleeping for 10 seconds...
Processing chunks 40 to 79
Sleeping for 10 seconds...
Processing chunks 80 to 119
Sleeping for 10 seconds...
Processing chunks 120 to 159
Sleeping for 10 seconds...
Processing chunks 160 to 177

Phase 2: Deduplicating 554 extracted entities
After deduplication: 122 unique entities

Phase 3: Extracting relationships using merged entities
Processing relationships for chunks 0 to 39
Sleeping for 10 seconds...
Processing relationships for chunks 40 to 79
Sleeping for 10 seconds...
Processing relationships for chunks 80 to 119
Sleeping for 10 seconds...
Processing relationships for chunks 120 to 159
Sleeping for 10 seconds...
Processing relationships for chunks 160 to 177

Phase 4: Deduplicating 862 extracted relationships


Retrying llama_index.llms.openai.base.OpenAI._achat in 1.0 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-vbCBUfLxtSLrNODFqTZaltoI on tokens per min (TPM): Limit 200000, Used 199662, Requested 1196. Please try again in 257ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}.
Retrying llama_index.llms.openai.base.OpenAI._achat in 1.0 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-vbCBUfLxtSLrNODFqTZaltoI on tokens per min (TPM): Limit 200000, Used 199647, Requested 1229. Please try again in 262ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}.
Retrying llama_index.llms.openai.base.OpenAI._achat in 1.0 seconds as it raised RateLimitError: Error 

After deduplication: 454 unique relationships


In [3]:
# paper_entities = []
# for paper in set(chunk_to_paper.values()):
#     paper_entity = {"entity_name": paper, "entity_type": "stats_review_paper", "description": "One of the stats review papers"}
#     paper_entities.append(paper_entity)
# paper_entities = np.array(paper_entities)

# paper_relationships = []
# for entity in stats_entities:
#     entity_name_a = entity["entity_name"]
#     entity_type_a = entity["entity_type"]
#     for passage in entity["relevant_passages"]:
#         entity_name_b = chunk_to_paper[passage]
#         entity_type_b = "stats_review_paper"
#         paper_relationship = {"entity_name_a": entity_name_a, "entity_type_a": entity_type_a, "entity_name_b": entity_name_b, "entity_type_b": entity_type_b, "relationship_name": "is described in"}
#         paper_relationships.append(paper_relationship)
# paper_relationships = np.array(paper_relationships)

In [4]:
# chunks = []
# chunk_to_paper = {}

# i = 0
# corpus_path = "/work/submit/mcgreivy/beauty-in-stats/src/scraper/data/cleaned_tex"
# for file in os.listdir(corpus_path):
#     if file.endswith(".tex"):
#         file_path = os.path.join(corpus_path, file)
#         with open(file_path) as f:
#             text = clean_latex(f.read())
#             sections = split_sections(text)
#             chunks.extend(sections)
#             for chunk in sections:
#                 chunk_to_paper[chunk] = file
#             i += 1
#             if i > 20:
#                 break

# print(len(chunks))

# #relevant_entities = [e for e in stats_entities if e["entity_type"] not in  "statistics_concept"]
# #lhcb_entities, lhcb_relationships = StatsKnowledgeGraph.lhcb_kg_extension(chunks, chunk_to_paper, relevant_entities, chunk_size=40, sleep_time=10)

In [5]:
# # all_entities = np.hstack((stats_entities, paper_entities, lhcb_entities))
# # all_relationships = np.hstack((stats_relationships, paper_relationships, lhcb_relationships))
# # np.save("./data/saved_kg/all_entities.npy", all_entities, allow_pickle=True)
# # np.save("./data/saved_kg/all_relationships.npy", all_relationships, allow_pickle=True)
# all_entities = np.load("./data/saved_kg/all_entities.npy", allow_pickle=True)
# all_relationships = np.load("./data/saved_kg/all_relationships.npy", allow_pickle=True)

In [6]:
import getpass

from py2neo import Graph, Node, Relationship

# Connection settings are read from the environment so that no credential is
# stored in the notebook. NEO4J_URI and NEO4J_USERNAME fall back to the values
# this notebook has always used; the password has no default and is prompted
# for when NEO4J_PASSWORD is unset.
# NOTE(review): a password was previously committed in this cell - rotate it.
uri = os.environ.get("NEO4J_URI", "neo4j+s://2d257b33.databases.neo4j.io")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD") or getpass.getpass("Neo4j password: ")

graph = Graph(uri, auth=(username, password))


In [7]:
# Rebuild the remote graph from scratch: wipe it, insert every extracted
# entity as a node, then connect the nodes with the extracted relationships.
# WARNING: delete_all() removes everything currently stored in the database.
graph.delete_all()

# entity_name -> Node, used to resolve relationship endpoints below.
entity_nodes = {}

for entity in stats_entities:
    node = Node(
        entity["entity_type"],
        name=entity["entity_name"],
        description=entity["description"],
    )

    graph.create(node)
    entity_nodes[entity["entity_name"]] = node

# Keys that identify the edge itself (endpoints, relationship type) plus
# 'relevant_passages', which is dropped when present; every other key of a
# relationship record is stored as a property on the edge.
non_property_keys = ("entity_name_a", "entity_name_b", "relationship_name", "relevant_passages")

for rel in stats_relationships:
    name_a = rel.get("entity_name_a")
    name_b = rel.get("entity_name_b")
    try:
        source_node = entity_nodes.get(rel["entity_name_a"])
        target_node = entity_nodes.get(rel["entity_name_b"])
        if source_node is None or target_node is None:
            # Best effort: an edge whose endpoint was never created as a node
            # is skipped, with enough context to see which one was missing.
            missing = [name for name, found in ((name_a, source_node), (name_b, target_node)) if found is None]
            print(f"Skipping relationship {name_a!r} -> {name_b!r}: unknown entity {missing}")
            continue

        properties = {key: value for key, value in rel.items() if key not in non_property_keys}

        relationship = Relationship(
            source_node,
            rel["relationship_name"],
            target_node,
            **properties
        )

        graph.create(relationship)

    except Exception as e:
        # Keep going on a single bad record, but say which one failed.
        print(f"Failed to create relationship {name_a!r} -> {name_b!r}: {e}")

In [8]:
# Show the extracted entities for inspection. The repr includes each entity's
# full 'relevant_passages' text, so this cell's output is very long.
stats_entities

array([{'entity_name': 'likelihood function and related concepts', 'entity_type': 'statistics_concept', 'description': 'A fundamental statistical function that measures the probability of observing given data under various parameter values, extensively used in statistical inference, hypothesis testing, and parameter estimation. This encompasses related concepts such as likelihood ratios, maximum likelihood estimation, and various probability distributions like Poisson, Gaussian, and binomial distributions, which are essential in modeling and analyzing data in fields such as particle physics.', 'relevant_passages': {"\\section{Physics questions formulated in statistical language}\n\\subsection{Discovery as hypothesis tests} \nLet us examine the statistical statement associated to the claim of discovery for new physics. Typically, new physics searches are looking for a signal that is additive on top of the background, though in some cases there are interference effects that need to be ta