In [1]:
import os
import json
import re
from datetime import datetime
from dotenv import load_dotenv
from openai import OpenAI

from typing import List, Literal, Optional
from pydantic import BaseModel

from plot_json_graphviz import render_json_graph

client = OpenAI()

In [2]:
# variables
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

# models
cheap = 'gpt-4o'
best = 'o3'

# prompts
from prompts import PROMPT_FULL, PROMPT_DEV, PROMPT_JSON_DEV

# io
input_dir = 'input'
json_dir = 'json'
traces_dir = 'traces'

In [None]:
# run once to upload all papers from /input directory

# all_papers = []

# for paper in os.listdir(input_dir):
#     if not paper.endswith('.pdf'):
#         continue

#     paper_path = os.path.join(input_dir, paper)
    
#     file = client.files.create(
#         file=open(paper_path, "rb"),
#         purpose="user_data"
#     )

#     all_papers.append({
#         "id": paper,
#         "file_id": file.id,
#         "file_name": paper,
#         "file_path": paper_path,
#         "file_size": os.path.getsize(paper_path),
#         "file_created_at": datetime.fromtimestamp(os.path.getctime(paper_path)).isoformat(),
#         "file_modified_at": datetime.fromtimestamp(os.path.getmtime(paper_path)).isoformat(),
#         "file_uploaded_at": datetime.now().isoformat(),
#     })

# # save all_papers to file
# with open(os.path.join('all_papers.json'), 'w') as f:
#     json.dump(all_papers, f, indent=2)

In [3]:
# load all_papers from file
with open(os.path.join('all_papers.json'), 'r') as f:
    all_papers = json.load(f)

In [None]:
# helper functions 

# Save freeform response as trace
def save_reasoning_response(resp: str, file_name: str, model: str): 
    trace_path = os.path.join(traces_dir, f"{file_name}_{model}.txt")
    with open(trace_path, "w", encoding="utf-8") as f:
        f.write(resp.output_text)

# Save structured response as JSON
def save_json_response(resp: str, file_name: str, model: str):
    if model == 'o3':
        structured_data = json.loads(resp.model_dump()['output'][1]['arguments'])
    else:
        structured_data = json.loads(resp.model_dump()['output'][0]['arguments'])

    structured_json_path = os.path.join(json_dir, f"{file_name}_{model}.json")
    with open(structured_json_path, "w", encoding="utf-8") as f:
        json.dump(structured_data, f, ensure_ascii=False, indent=2)

# Get json from Response objects (such as response.usage)
def get_json(obj):
    if hasattr(obj, '__dict__'):
        return obj.__dict__
    return str(obj)

In [6]:
# pydantic classes

# Schema 1
class Node1(BaseModel):
    id: str  # concise description of node
    aliases: List[str] # 2-3 alternative concise descriptions of node
    type: Literal["concept", "intervention"]
    description: str  # detailed technical description of node
    maturity: Optional[int]  # 1-5 (only for intervention nodes)

class Edge1(BaseModel):
    id: str  # relationship label verb
    source_id: str  # source node_id 
    target_id: str  # target node_id
    description: str  # concise description of logical connection
    confidence: int  # 1-5

class LogicalChain1(BaseModel):
    id: str  # concise description of logical chain
    nodes: List[Node1]
    edges: List[Edge1]

class PaperSchema1(BaseModel):
    # paper_doi: Optional[str] = None # exact DOI if available # removed, metadata
    # paper_title: str  # exact paper title # removed, metadata
    logical_chains: List[LogicalChain1]

In [7]:
# get response from model
def get_single_response(file_id: str, prompt_text: str, model: str = 'gpt-4.0'):

    input = [{
        "role": "user",
        "content": [
            {"type": "input_file", "file_id": file_id},
            {"type": "input_text", "text": prompt_text}
        ]
    }]
    
    response = client.responses.create(
        model=model,
        input=input,
    )

    return response

In [8]:
# dual responses from model (reasoning + json in separate requests)
def get_dual_response(file_id: str, prompt_text: str, schema: object, model: str = 'gpt-4.0'):
    """
    Get response from model in two steps:
    1. Freeform analysis of the paper based on the prompt_text.
    2. Structured json using the causal_chain_structure tool based on the freeform analysis.
    """

    # First call - get reasoning
    reasoning_input = [{
        "role": "user",
        "content": [
            {"type": "input_file", "file_id": file_id},
            {"type": "input_text", "text": prompt_text}
        ]
    }]
    
    reasoning_response = client.responses.create(
        model=model,
        input=reasoning_input,
        tools=None  # No tools for reasoning
    )

    # Second call - get structured json
    json_input = [{
        "role": "system",
        "content": "Use the following detailed analysis to help create the structured json:"
    }, {
        "role": "assistant",
        "content": reasoning_response.output_text
    },{
        "role": "user",
        "content": [
            {"type": "input_file", "file_id": file_id},
            {"type": "input_text", "text": "Based on the paper and your analysis, provide a structured representation of the logical chains using the causal_chain_structure tool. Focus only on providing the structured json."}
        ]
    }]

    tools = [{
        "type": "function",
        "name": "causal_chain_structure",
        "description": "Summarize the paper's causal structure into a set of logical chains",
        "parameters": schema.model_json_schema()
    }]
    
    json_response = client.responses.create(
        model=model,
        input=json_input,
        tools=tools,
        tool_choice={"type": "allowed_tools", 
                     "mode": "required",
                     "tools": [{"type": "function", "name": "causal_chain_structure"}]
        }  # force tool use
    )

    return reasoning_response, json_response

In [9]:
# main function to analyze paper
def analyze_paper(file_name: str, file_id: str, prompt_text: str, schema: object, dual: bool = False, label: str = '', model: str = 'gpt-4.0'):
    """
    Analyze a paper using the specified model and prompt. Writes the responses to the json and traces directories.
    
    Args:
        file_name (str): Name of the file to analyze.
        file_id (str): ID of the file in OpenAI.
        prompt_text (str): The prompt to use for analysis.
        model (str): The model to use for analysis. (cheap or best)
    """
    
    if dual:
        print('dual')
        reasoning_response, json_response = get_dual_response(
            file_id=file_id,
            prompt_text=prompt_text,
            schema=schema,
            model=model)
        reasoning_usage = '\n\n' + get_json(reasoning_response)
        json_usage = '\n\n' + get_json(json_response)
        save_reasoning_response(reasoning_response + reasoning_usage, file_name + label, model=model)
        save_json_response(json_response + json_usage, file_name + label, model=model)
        return reasoning_response, json_response
    else:
        print('single')
        response = get_single_response(
            file_id=file_id,
            prompt_text=prompt_text,
            model=model)
        usage = '\n\n' + get_json(response.usage)
        save_reasoning_response(response + usage, file_name + label, model=model)
        return response

In [10]:
# single paper & response
file_name = all_papers[2]['file_name']
file_id = all_papers[2]['file_id']

analyze_paper(
    file_name=file_name,
    file_id=file_id,
    prompt_text=PROMPT_DEV,
    schema=PaperSchema1,
    model=best,
    label='_reason_1'
)

single


TypeError: can only concatenate str (not "dict") to str

In [None]:
# single paper & response
file_name = all_papers[2]['file_name']
file_id = all_papers[2]['file_id']

analyze_paper(
    file_name=file_name,
    file_id=file_id,
    prompt_text=PROMPT_DEV+PROMPT_JSON_DEV,
    schema=PaperSchema1,
    model=best,
    label='_reason+json_1'
)

-----

In [None]:
# test all papers, single response
for paper in all_papers:
    analyze_paper(
        file_name=paper['file_name'],
        file_id=paper['file_id'],
        prompt_text=PROMPT_FREEFORM,
        schema=PaperSchema1,
        model=best
    )

-----

In [None]:
# iterative analysis
def analyze_paper_iteratively(file_name: str, file_id: str, prompt_text: str, iterations: int = 3, schema: object, model: str = 'gpt-4.0'):
    """
    Iteratively analyze a paper multiple times, asking the model to find more connections each time.
    
    Args:
        file_name (str): Name of the file to analyze.
        file_id (str): ID of the file in OpenAI.
        prompt_text (str): The base prompt to use for analysis.
        iterations (int): Number of iterations to perform (default 3).
        model (str): The model to use for analysis.
    
    Returns:
        List of tuples containing (freeform_response, structured_response) for each iteration.
    """
    
    results = []
    current_prompt = prompt_text
    
    for i in range(iterations):
        print(f"\nIteration {i+1}/{iterations}")
        
        # For iterations after the first, add the improvement request
        if i > 0:
            current_prompt = (
                current_prompt + 
                "\n\nIMPORTANT: You missed many causal connections and relationships in your previous analysis. " +
                "Please analyze again more thoroughly, looking specifically for:\n" +
                "1. Additional connections between existing concepts\n" +
                "2. Implicit relationships that weren't directly stated\n" +
                "3. Higher-order effects and consequences\n" +
                "4. Cross-cutting themes and patterns\n" +
                "5. Alternative interpretations of the findings"
            )
        
        # Run the analysis
        freeform_response, structured_response = get_dual_response(
            file_id=file_id,
            prompt_text=current_prompt,
            schema=schema,
            model=model
        )
        
        # Save responses with iteration number in filename
        save_trace_response(freeform_response, f"{file_name}_iter{i+1}", model=model)
        save_json_response(structured_response, f"{file_name}_iter{i+1}", model=model)
        
        results.append((freeform_response, structured_response))
        
    return results

In [None]:
# test iterative analysis
file_name = all_papers[0]['file_name']
file_id = all_papers[0]['file_id']

iterative_results = analyze_paper_iteratively(
    file_name=file_name,
    file_id=file_id,
    prompt_text=PROMPT_FREEFORM,
    iterations=3,
    schema=PaperSchema1,
    model=cheap
)

# Print the freeform responses from each iteration
# for i, (freeform_resp, _) in enumerate(iterative_results, 1):
#     print(f"\n=== Iteration {i} Analysis ===")
#     print(freeform_resp.output_text)

-----

In [None]:
# get all json jsons

all_json = []

for json in os.listdir(json_dir):
    if not json.endswith('.json'):
        continue

    json_path = os.path.join(json_dir, json)

    all_json.append({
        "id": json,
        "file_path": json_path,
        "file_size": os.path.getsize(json_path),
        "file_created_at": datetime.fromtimestamp(os.path.getctime(json_path)).isoformat(),
    })

In [None]:
# graphviz

# render single json file
json_file = '2307.16513v2.pdf_o3.json'
render_json_graph(os.path.join(json_dir, json_file), 'graphs')

In [None]:
# render all json files in json folder
for j in all_json:
    json_file = j['id']
    render_json_graph(os.path.join(json_dir, json_file), 'graphs')
    print(f"Rendered {json_file}")