In [1]:
from llama_index.llms.openai import OpenAI
import asyncio
import nest_asyncio
import regex as re
nest_asyncio.apply()

In [2]:
def get_prompt_queries(section):
    """Build the prompt that asks the LLM for candidate queries about one paper section.

    Args:
        section: Text of the paper excerpt; it is interpolated verbatim after the
            instructions and the example queries.

    Returns:
        The complete prompt string.
    """
    return f"""You are an expert in particle physics and a member of the LHCb collaboration. You have been tasked with generating high quality queries for a Q&A dataset of particle physics papers. A high quality query is something conceptually simple to ask but difficult to answer. A high quality query is answerable only by digging deep into the contents of the paper, with the answer scattered across multiple different sections.

    Example high quality queries:
    ['What is the dominant systematic uncertainty?',
    'What decay is used in this measurement?',
    'How large is the statistical uncertainty relative to the nominal value?',
    'Is this result compatible with the SM?',
    'How much data was analysed?',
    'Which decays and final states enter the analysis?',
    'What are the dangerous backgrounds in this analysis?',
    'What are all the correcting weights?',
    'Why is this result different to the previous result?',
    'How many resonances does the model contain?',
    'How is the efficiency modelled?',
    'How does this measurement relate to other results?',
    'Why is it so difficult to work with photons at LHCb?',
    'What are the challenges when dealing with amplitude fits?',
    'What are the considered backgrounds and how are they addressed?',
    'How is semileptonic background treated?',
    'How does this analysis deal with the correlation between the three-body mass and the two-body masses?',
    'How does the selection strategy remove background from charm decays or other three-body radiative decays?',
    'Why does this analysis not simply do an angular fit?',
    'How large is the signal purity in the analysis?']

    Please begin by processing the below section of a paper. You may re-use the example high quality queries, if appropriate. Provide your high quality queries as a list of strings: ["query one", "query two", ...]. Do not provide anything else.

    {section}
    """

In [3]:
def read_tex_file_by_linenumber(file_path, start_line=0, end_line=1000000):
    """Read a text file into a ``{line_number: line_text}`` mapping.

    Line numbers are 1-based and each value is the line exactly as yielded by
    iterating the file (so it keeps its trailing newline when present).

    Args:
        file_path: Path of the file to read.
        start_line: First line number to keep (inclusive). Any value below 1
            keeps everything from the first line.
        end_line: Last line number to keep (inclusive). Reading stops as soon
            as this line has been passed.

    Returns:
        dict mapping line number -> line text for the selected range.
    """
    document_by_linenumber = {}
    # Decode explicitly as UTF-8 so the result does not depend on the platform's
    # locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            if line_number < start_line:
                continue
            if line_number > end_line:
                break
            document_by_linenumber[line_number] = line
    return document_by_linenumber

async def queries_from_chunk(chunk):
    """Ask the LLM for candidate queries about one chunk of a paper.

    Args:
        chunk: Text of the chunk; it is embedded in the prompt built by
            ``get_prompt_queries``.

    Returns:
        The reply as a single-line string: the contents of the first
        json-fenced block when the reply has one, otherwise the whole reply,
        with every newline replaced by a space.
    """
    # Change model here #
    llm = OpenAI(temperature=0, model="gpt-4o-mini")
    completion = await llm.acomplete(get_prompt_queries(chunk))

    # Prefer the body of a ```json fenced block when the model wrapped its reply in one
    fenced = re.search(r"(?<=```json)([\s\S]*?)(?=```)", completion.text)
    raw_text = fenced.group(1) if fenced else completion.text
    return raw_text.replace("\n", " ")

async def generate_queries_async(tex_path, lines_per_chunk = 40):
    """Generate candidate queries for a paper, one LLM call per chunk of lines.

    The file is split into consecutive, non-overlapping chunks of
    ``lines_per_chunk`` lines; all chunks are queried concurrently and the
    list-of-strings replies are concatenated in chunk order.

    Args:
        tex_path: Path of the .tex file to read.
        lines_per_chunk: Number of lines per chunk; must be at least 1.

    Returns:
        Flat list with the queries parsed from every chunk reply. Replies that
        cannot be parsed as a Python list literal are reported and skipped.

    Raises:
        ValueError: If ``lines_per_chunk`` is smaller than 1.
    """
    import ast  # local import: only needed to parse the model replies

    if lines_per_chunk < 1:
        raise ValueError(f"lines_per_chunk must be >= 1, got {lines_per_chunk}")

    lines = read_tex_file_by_linenumber(tex_path)
    total_lines = len(lines)
    num_chunks = (total_lines + lines_per_chunk - 1) // lines_per_chunk

    calls = []
    for chunk_idx in range(num_chunks):
        start_idx = chunk_idx * lines_per_chunk + 1
        end_idx = min((chunk_idx + 1) * lines_per_chunk, total_lines)
        chunk = "".join([lines[line_num] for line_num in range(start_idx, end_idx + 1) if line_num in lines])
        calls.append(queries_from_chunk(chunk))

    replies = await asyncio.gather(*calls)

    queries = []
    for chunk_idx, reply in enumerate(replies):
        # SECURITY: the replies are untrusted model output. They used to be passed to
        # eval(), which executes arbitrary code; literal_eval only accepts literals.
        try:
            parsed = ast.literal_eval(reply.strip())
        except (ValueError, SyntaxError) as e:
            # One malformed reply should not discard the replies of every other chunk.
            print(f"Skipping unparsable reply for chunk {chunk_idx + 1}/{num_chunks}: {e}")
            continue
        if not isinstance(parsed, list):
            print(f"Skipping reply for chunk {chunk_idx + 1}/{num_chunks}: expected a list, got {type(parsed).__name__}")
            continue
        queries.extend(parsed)
    return queries

def generate_queries(tex_path, lines_per_chunk = 40):
    """Synchronous entry point: run ``generate_queries_async`` to completion and return its result."""
    pending = generate_queries_async(tex_path, lines_per_chunk)
    return asyncio.run(pending)

In [4]:
def get_prompt_best_queries(candidate_queries):
    """Build the prompt that asks the LLM to keep and reword the 15 best candidate queries.

    Args:
        candidate_queries: The candidate queries; the value is interpolated into
            the prompt using its string representation.

    Returns:
        The complete prompt string.
    """
    return f"""You are an expert in particle physics and a member of the LHCb collaboration. You have been tasked with generating high quality queries for a Q&A dataset of particle physics papers. Provide your queries as a list of strings: ["query one", "query two", ...]. Do not provide anything else.

    Example high quality queries:
    ['What is the dominant systematic uncertainty?',
    'What decay is used in this measurement?',
    'How large is the statistical uncertainty relative to the nominal value?',
    'Is this result compatible with the SM?',
    'How much data was analysed?',
    'Which decays and final states enter the analysis?',
    'What are the dangerous backgrounds in this analysis?',
    'What are all the correcting weights?',
    'Why is this result different to the previous result?',
    'How many resonances does the model contain?',
    'How is the efficiency modelled?',
    'How does this measurement relate to other results?',
    'Why is it so difficult to work with photons at LHCb?',
    'What are the challenges when dealing with amplitude fits?',
    'What are the considered backgrounds and how are they addressed?',
    'How is semileptonic background treated?',
    'How does this analysis deal with the correlation between the three-body mass and the two-body masses?',
    'How does the selection strategy remove background from charm decays or other three-body radiative decays?',
    'Why does this analysis not simply do an angular fit?',
    'How large is the signal purity in the analysis?']

    Please consider the below list of potential queries. Based off of the description and examples above, your task is to consider the below candidate queries and output only the 15 highest quality ones. A high quality query is something conceptually simple to ask but difficult to answer. A high quality query is answerable only by digging deep into the contents of the paper, with the answer scattered across multiple different sections. Once you have selected 15 high quality queries, you should reword each of them slightly. Output a list, in the exact same format as provided, with only the 15 reworded high quality queries kept.

    Candidate queries:
    {candidate_queries}
    """

def get_best_queries(candidate_queries):
    """Ask the LLM to select and reword the best candidate queries.

    Args:
        candidate_queries: Candidate queries, embedded in the prompt built by
            ``get_prompt_best_queries``.

    Returns:
        The Python literal parsed from the model reply (the prompt asks for a
        list of strings).

    Raises:
        ValueError, SyntaxError: If the reply is not a valid Python literal.
    """
    import ast  # local import: only needed to parse the model reply

    prompt = get_prompt_best_queries(candidate_queries)
    # Change model here
    llm = OpenAI(temperature=0, model="gpt-4o-mini")
    answer = llm.complete(prompt)

    # Check if the answer contains a JSON block
    pattern = r"(?<=```json)([\s\S]*?)(?=```)"
    match = re.search(pattern, answer.text)
    answer_text = (match.group(1) if match else answer.text).replace("\n", " ")

    # SECURITY: the reply is untrusted model output. It used to be passed to eval(),
    # which executes arbitrary code; literal_eval only accepts Python literals.
    # strip() because older Python versions reject leading whitespace here.
    return ast.literal_eval(answer_text.strip())

# Query answering with multiple queries per chunk (much cheaper API calls)

In [5]:
import regex as re
import time
import asyncio
import json

def get_multi_query_prompt(document, queries):
    """Build the prompt asking which line numbers of an excerpt answer each query.

    Args:
        document: The excerpt text; the prompt tells the model that each line
            is prefixed with its line number and a colon.
        queries: Iterable of query strings; they are numbered from 1, one per line.

    Returns:
        The complete prompt string.
    """
    numbered_queries = "\n".join(
        f"{position}. {query}" for position, query in enumerate(queries, start=1)
    )

    return f"""
    You are a particle physics expert, enthusiastically answering queries about their paper. Because you have written the paper, you have an extremely detailed understanding of the contents of every single section of the paper.
    
    You will be given:
    
    - queries: A list of questions related to the contents of your paper. The answers to these queries may be scattered across multiple different sections.
    
    Your answer should be a JSON array where each element corresponds to a query and includes the field:

    - relevant_passages: A list of line numbers that directly answer a part of the query. If the query is not answered anywhere in text, provide an empty list. Make sure to consider the full document, do not just provide the first partial answer you see. However, do not provide any more line numbers than is needed or provide line numbers which are ultimately not relevant to the query.

    Structure:
    - You will be given an excerpt from your paper in the following format, with the line number given before the first ':'
        1: text for line 1...
        2: text for line 2...
        3: text for line 3...
    - Your output should be a valid JSON array with one object per query, like this:
        [
            {{
                "query": "What is the dominant systematic uncertainty?",
                "relevant_passages": [relevant_line_1, relevant_line_2, ...],
            }},
            {{
                "query": "What detector was used?",
                "relevant_passages": [relevant_line_1, relevant_line_2, ...],
            }},
            ...
        ]

    The excerpt is given below:
    {document}

    The queries are given below:
    {numbered_queries}
    """

def read_tex_file_by_linenumber(file_path, start_line=0, end_line=1000000):
    """Read a text file into a ``{line_number: line_text}`` mapping.

    Line numbers are 1-based and each value is the line exactly as yielded by
    iterating the file (so it keeps its trailing newline when present).

    Args:
        file_path: Path of the file to read.
        start_line: First line number to keep (inclusive). Any value below 1
            keeps everything from the first line.
        end_line: Last line number to keep (inclusive). Reading stops as soon
            as this line has been passed.

    Returns:
        dict mapping line number -> line text for the selected range.
    """
    document_by_linenumber = {}
    # Decode explicitly as UTF-8 so the result does not depend on the platform's
    # locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            if line_number < start_line:
                continue
            if line_number > end_line:
                break
            document_by_linenumber[line_number] = line
    return document_by_linenumber

def collapse_line_numbers(line_numbers, max_gap=2):
    """Merge line numbers into inclusive ``[start, end]`` ranges.

    Two neighbouring numbers (after sorting and de-duplication) end up in the
    same range when at most ``max_gap`` numbers are missing between them.

    Args:
        line_numbers: Collection of line numbers, in any order, duplicates allowed.
        max_gap: Largest count of missing numbers that is still bridged.

    Returns:
        List of ``[start, end]`` lists in ascending order; empty for empty input.
    """
    if not line_numbers:
        return []

    ordered = sorted(set(line_numbers))

    merged = []
    span_start = span_end = ordered[0]
    for number in ordered[1:]:
        if number > span_end + max_gap + 1:
            # Too far from the open span: close it and open a new one here
            merged.append([span_start, span_end])
            span_start = number
        span_end = number
    merged.append([span_start, span_end])

    return merged

def get_text_for_range(range_start, range_end, full_lines):
    """Concatenate the text of every line in ``range_start..range_end`` (inclusive).

    Args:
        range_start: First line number of the range.
        range_end: Last line number of the range.
        full_lines: Mapping of line number -> line text.

    Returns:
        The joined text; a line number missing from ``full_lines`` contributes
        a "Line N not in document" placeholder line instead.
    """
    pieces = (
        full_lines[line_num] if line_num in full_lines else f"Line {line_num} not in document\n"
        for line_num in range(range_start, range_end + 1)
    )
    return "".join(pieces)

async def find_answers_to_queries_async(lines, queries, model="gpt-4o-mini"):
    """Ask the LLM which lines of one chunk answer each query.

    Args:
        lines: Mapping of line number -> line text for the chunk; each line is
            sent to the model prefixed with ``"<line number>: "``.
        queries: List of query strings.
        model: Name of the OpenAI model to use.

    Returns:
        The list parsed from the model reply (the prompt asks for one object
        per query with "query" and "relevant_passages"). If no list can be
        extracted or parsed, a fallback list with one entry per query and empty
        "relevant_passages" is returned instead.
    """
    import ast   # local imports: only needed to parse the model reply
    import json

    # Used whenever the reply cannot be turned into a list of answers
    fallback_answers = [{"query": query, "relevant_passages": []} for query in queries]

    document = "".join([f"{key}: {value}" for key, value in lines.items()])
    prompt = get_multi_query_prompt(document, queries)

    llm = OpenAI(temperature=0, model=model)
    answer = await llm.acomplete(prompt)

    answer_text = re.sub(r'\n', ' ', answer.text)
    # First balanced [...] span in the reply. NOTE: the recursive (?R) construct is not
    # supported by the standard-library `re`; this relies on `regex` being imported as `re`.
    match = re.search(r"\[(?:[^\[\]]|(?R))*\]", answer_text)

    if not match:
        print(f"Could not extract answer list from response: {answer_text}")
        return fallback_answers

    payload = match.group(0)
    try:
        # SECURITY: the reply is untrusted model output. It used to be passed to eval(),
        # which executes arbitrary code. Parse it as data instead: strict JSON first,
        # then a Python literal (tolerates trailing commas and single quotes).
        try:
            answers = json.loads(payload)
        except ValueError:
            answers = ast.literal_eval(payload)
    except Exception as e:
        print(f"Error evaluating response: {e}")
        # Create a fallback answer
        return fallback_answers

    if not isinstance(answers, list):
        print(f"Error evaluating response: expected a list, got {type(answers).__name__}")
        return fallback_answers

    # Return just the raw line numbers from each chunk
    return answers

async def process_chunk(chunk_idx, total_chunks, chunk, queries, model):
    """Query the LLM for one chunk, falling back to empty answers on any error.

    Args:
        chunk_idx: Zero-based index of the chunk; returned so results can be re-ordered.
        total_chunks: Total number of chunks, used only in the progress message.
        chunk: Non-empty mapping of line number -> line text.
        queries: List of query strings.
        model: Name of the model, forwarded to ``find_answers_to_queries_async``.

    Returns:
        Tuple ``(chunk_idx, answers)``. If the lookup raises, ``answers`` holds
        one entry per query with an empty "relevant_passages" list.
    """
    first_line, last_line = min(chunk.keys()), max(chunk.keys())
    print(f"Processing chunk {chunk_idx+1}/{total_chunks}: lines {first_line}-{last_line}")

    try:
        return chunk_idx, await find_answers_to_queries_async(chunk, queries, model)
    except Exception as e:
        print(f"Error processing chunk {chunk_idx}: {e}")
        # Return empty answers in case of error
        empty_answers = [{"query": q, "relevant_passages": []} for q in queries]
        return chunk_idx, empty_answers

async def process_chunks_for_queries(file_path, queries, lines_per_chunk=50, overlap=5, model="gpt-4o-mini", max_concurrent=5):
    """Find, for every query, the line ranges of a paper that answer it.

    The file is split into overlapping chunks, each chunk is sent to the LLM
    together with all queries (at most ``max_concurrent`` chunks at a time),
    and the returned line numbers are merged per query into ranges.

    Args:
        file_path: Path of the .tex file.
        queries: List of query strings.
        lines_per_chunk: Maximum number of lines per chunk; must be >= 1.
        overlap: Number of lines shared by neighbouring chunks; must satisfy
            ``0 <= overlap < lines_per_chunk``.
        model: Name of the model, forwarded to ``process_chunk``.
        max_concurrent: Number of chunks processed concurrently per batch; must be >= 1.

    Returns:
        List with one dict per query, in the order of ``queries``, holding
        "query", "relevant_passages" (list of ``[start, end]`` ranges) and
        "relevant_passages_text" (the text of each range).

    Raises:
        ValueError: If ``lines_per_chunk``, ``overlap`` or ``max_concurrent`` is out of range.
    """
    # Validate before any file or API access. Without this, overlap == lines_per_chunk
    # divides by zero, a negative overlap leaves lines uncovered between chunks, and
    # max_concurrent == 0 is rejected by range() only after the chunks were built.
    if lines_per_chunk < 1:
        raise ValueError(f"lines_per_chunk must be >= 1, got {lines_per_chunk}")
    if not 0 <= overlap < lines_per_chunk:
        raise ValueError(f"overlap must satisfy 0 <= overlap < lines_per_chunk, got overlap={overlap}, lines_per_chunk={lines_per_chunk}")
    if max_concurrent < 1:
        raise ValueError(f"max_concurrent must be >= 1, got {max_concurrent}")

    # Read all lines from the file
    all_lines = read_tex_file_by_linenumber(file_path)
    total_lines = max(all_lines.keys()) if all_lines else 0

    # Dictionary to store all line numbers for each query
    query_line_numbers = {query: [] for query in queries}

    # Calculate the number of chunks with overlap
    effective_chunk_size = lines_per_chunk - overlap
    num_chunks = (total_lines + effective_chunk_size - 1) // effective_chunk_size

    # Create a list of chunk specifications
    chunks = []
    for chunk_idx in range(num_chunks):
        # NOTE: chunks after the first start `overlap` lines before
        # chunk_idx * effective_chunk_size + 1, so chunk 0 and chunk 1 share
        # 2 * overlap lines while later neighbours share `overlap` lines.
        if chunk_idx == 0:
            start_idx = 1
        else:
            start_idx = chunk_idx * effective_chunk_size - overlap + 1

        # End index is start + chunk size, but not beyond total lines
        end_idx = min(start_idx + lines_per_chunk - 1, total_lines)

        # Extract the chunk (a start index below 1 is harmless: missing numbers are skipped)
        chunk = {line_num: all_lines[line_num] for line_num in range(start_idx, end_idx + 1) if line_num in all_lines}

        if chunk:  # Only add non-empty chunks
            chunks.append((chunk_idx, chunk))

    # Process chunks in batches to limit concurrency
    results = []
    for i in range(0, len(chunks), max_concurrent):
        batch = chunks[i:i+max_concurrent]
        batch_tasks = [process_chunk(idx, num_chunks, chunk, queries, model) for idx, chunk in batch]

        # Wait for all tasks in this batch to complete
        batch_results = await asyncio.gather(*batch_tasks)
        results.extend(batch_results)

        # Optional rate limiting between batches (not between individual API calls within a batch)
        if i + max_concurrent < len(chunks):
            print(f"Completed batch {i//max_concurrent + 1}/{(len(chunks) + max_concurrent - 1)//max_concurrent}. Waiting before starting next batch...")
            await asyncio.sleep(2)

    # Process all results to collect line numbers for each query, in chunk order.
    # Answers are matched to queries by position; surplus answers are ignored.
    for _, chunk_answers in sorted(results, key=lambda result: result[0]):
        if not isinstance(chunk_answers, list):
            continue
        for query, answer in zip(queries, chunk_answers):
            # The answers come from the model, so tolerate malformed entries instead of
            # crashing after every API call has already been made.
            if not isinstance(answer, dict):
                continue
            passages = answer.get('relevant_passages', [])
            if not isinstance(passages, (list, tuple)):
                continue
            query_line_numbers[query].extend(
                number for number in passages
                if isinstance(number, int) and not isinstance(number, bool)
            )

    # Now process all collected line numbers for each query
    consolidated_answers = []
    for query in queries:
        # Remove duplicates and collapse line numbers across all chunks
        collapsed_ranges = collapse_line_numbers(query_line_numbers[query])

        # Generate text for each range
        range_texts = [get_text_for_range(start, end, all_lines) for start, end in collapsed_ranges]

        consolidated_answers.append({
            "query": query,
            "relevant_passages": collapsed_ranges,
            "relevant_passages_text": range_texts
        })

    return consolidated_answers

def find_answers_in_paper(file_path, queries, lines_per_chunk=30, overlap=5, model="gpt-4o-mini", max_concurrent=10):
    """Synchronous entry point: run ``process_chunks_for_queries`` to completion and return its result."""
    pending = process_chunks_for_queries(
        file_path,
        queries,
        lines_per_chunk,
        overlap,
        model,
        max_concurrent,
    )
    return asyncio.run(pending)

In [6]:
def convert_dict_format(input_data):
    """Turn a list of answer dicts into a ``{query: relevant_passages}`` mapping.

    Args:
        input_data: Iterable of dicts with a 'query' entry and, optionally, a
            'relevant_passages' entry (defaults to an empty list).

    Returns:
        dict keyed by query. Entries whose query is missing or falsy are
        dropped; when a query appears more than once, the last entry wins.
    """
    pairs = (
        (item.get('query'), item.get('relevant_passages', []))
        for item in input_data
    )
    return {query: passages for query, passages in pairs if query}

def write_to_json(answers, header, output_dir="/work/submit/mcgreivy/beauty-in-stats/tree_rag/validation_data/autogenerated"):
    """Write the ``{query: relevant_passages}`` mapping of a paper to ``<output_dir>/<header>.json``.

    Args:
        answers: List of answer dicts, converted with ``convert_dict_format``.
        header: Paper identifier, used as the file name (without extension).
        output_dir: Directory to write into; defaults to the location this
            notebook has always used.
    """
    import json
    import os

    # Convert once and write that result (the conversion used to be done twice,
    # with the first result thrown away).
    json_dict = convert_dict_format(answers)
    output_path = os.path.join(output_dir, f"{header}.json")
    with open(output_path, "w") as file:
        json.dump(json_dict, file, indent=4)

In [17]:
# End-to-end run per paper: propose candidate queries, keep the best ones,
# locate the line ranges that answer them, and save the result as <header>.json.
for header in ["1508.00788"]:
    tex_path = f"/work/submit/mcgreivy/paper_trees_cache/split_tex/{header}.tex"
    # One LLM call per chunk of the .tex file
    candidate_queries = generate_queries(tex_path)
    # A single LLM call that selects and rewords the best candidates
    best_queries = get_best_queries(candidate_queries)
    # One LLM call per overlapping chunk, each covering all queries
    answers = find_answers_in_paper(tex_path, best_queries)
    write_to_json(answers, header)

Processing chunk 1/7: lines 1-30
Processing chunk 2/7: lines 21-50
Processing chunk 3/7: lines 46-75
Processing chunk 4/7: lines 71-100
Processing chunk 5/7: lines 96-125
Processing chunk 6/7: lines 121-150
Processing chunk 7/7: lines 146-159


In [20]:
import os
import regex as re
import json

headers = []
validation_data_dir = "/work/submit/mcgreivy/beauty-in-stats/tree_rag/validation_data/autogenerated/"
split_tex_dir = "/work/submit/mcgreivy/paper_trees_cache/split_tex/"

# Collect the paper identifiers from the "<header>.json" file names.
# - The dot is escaped and the whole name must match, so entries that are not
#   "<header>.json" (checkpoint folders, backups, ...) are skipped instead of
#   raising IndexError or yielding a wrong header.
# - os.listdir() returns entries in arbitrary order; sort for reproducible output.
pattern = r"(.*)\.json"
for file in sorted(os.listdir(validation_data_dir)):
    header_match = re.fullmatch(pattern, file)
    if header_match is None:
        continue
    header = header_match.group(1)
    headers.append(header)

print(headers)

# Per paper: question -> list of answer texts, and question -> list of [start, end] line ranges
questions_to_answers = {}
questions_to_linenumbers = {}
for header in headers:
    questions_to_answers[header] = {}
    questions_to_linenumbers[header] = {}

    with open(split_tex_dir + f"{header}.tex", 'r', encoding='utf-8') as file:
        lines = file.readlines()

    with open(validation_data_dir + f'{header}.json', 'r', encoding='utf-8') as file:
        q_to_line = json.load(file)

    for question in q_to_line:
        answers = []
        for start, end in q_to_line[question]:
            # Ranges are 1-based and inclusive, hence the start - 1 in the slice.
            # Raw string: "\s" in a plain string literal is an invalid escape sequence.
            answer = re.sub(r"\s+", " ", "".join(lines[start - 1 : end]))
            answers.append(answer)
        questions_to_answers[header][question] = answers
        questions_to_linenumbers[header][question] = q_to_line[question]

['1102.0206', '1109.0963', '1301.7084', '1402.6852', '1403.1339', '1404.0275', '1405.3219', '1406.2624', '1407.2222', '1408.5373', '1508.00788']


In [26]:
# Collect the questions of one fixed paper. Nothing is collected when no
# validation data has been loaded at all.
all_questions = []
if questions_to_answers:
    key = "1508.00788"
    all_questions.extend(questions_to_answers[key])

In [29]:
# Bare expression so the notebook shows the collected questions as the cell output
all_questions

['What are the main challenges in measuring the $\\Bs \rightarrow \\phi \\phi$ branching fraction using the LHCb Run 1 dataset?',
 'How does the multivariate algorithm contribute to the signal selection process for the $\\Bs \rightarrow \\phi \\phi$ decay?',
 'What theoretical predictions exist for the $\\Bs \rightarrow \\phi \\phi$ branching fraction, and how do they align with experimental findings?',
 'How does the selection strategy balance background reduction and signal efficiency for the $\\Bs \rightarrow \\phi \\phi$ decay?',
 'What role does the $\\Bz \rightarrow \\phi \\Kstar(892)^0$ decay play in the analysis of the $\\Bs \rightarrow \\phi \\phi$ decay?',
 'What are the implications of the $\\Bs \rightarrow \\phi \\phi$ decay for theories beyond the Standard Model?',
 'How is the combinatorial background from hadrons originating at the primary vertex minimized?',
 'What is the function of the ring-imaging Cherenkov detectors in this analysis?',
 'How are the $\\phi$ and $K^*