In [None]:
import os
import json
import re
from datetime import datetime
from dotenv import load_dotenv
from openai import OpenAI

from typing import List, Literal, Optional
from pydantic import BaseModel

# Load .env before building the client: OpenAI() resolves OPENAI_API_KEY from the
# process environment when it is constructed, so on a fresh kernel the key must
# already be present at this point.
load_dotenv()
client = OpenAI()

In [2]:
# environment: copy variables from a local .env file into the process environment
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")

# model names: `cheap` is used by the single-paper test cells, `best` by the all-papers run
cheap, best = 'gpt-4o', 'o3'

# prompt templates (kept in the local prompts module)
from prompts import PROMPT_FULL, PROMPT_FREEFORM

# directories: source PDFs, structured JSON results, freeform text traces
input_dir, output_dir, traces_dir = 'input', 'output', 'traces'
traces_dir = 'traces'

In [None]:
# # run once to upload all papers from /input directory

# all_papers = []

# for paper in os.listdir(input_dir):
#     if not paper.endswith('.pdf'):
#         continue

#     paper_path = os.path.join(input_dir, paper)
    
#     file = client.files.create(
#         file=open(paper_path, "rb"),
#         purpose="user_data"
#     )

#     all_papers.append({
#         "id": paper,
#         "file_id": file.id,
#         "file_name": paper,
#         "file_path": paper_path,
#         "file_size": os.path.getsize(paper_path),
#         "file_created_at": datetime.fromtimestamp(os.path.getctime(paper_path)).isoformat(),
#         "file_modified_at": datetime.fromtimestamp(os.path.getmtime(paper_path)).isoformat(),
#         "file_uploaded_at": datetime.now().isoformat(),
#     })

# # save all_papers to file
# with open(os.path.join('all_papers.json'), 'w') as f:
#     json.dump(all_papers, f, indent=2)

In [7]:
# Read the paper manifest (a JSON list with one record per uploaded paper,
# including its `file_name` and `file_id`) from the working directory.
with open('all_papers.json') as f:  # text/read mode is the default
    all_papers = json.load(f)

In [9]:
# helper functions 

# Save freeform response as trace
def save_trace_response(resp, file_name: str, model: str):
    """
    Write the freeform text of a response to `<traces_dir>/<file_name>_<model>.txt`.

    Args:
        resp: Response object exposing an `output_text` attribute (not a plain string).
        file_name (str): Base name used for the trace file.
        model (str): Model name, appended to the file name.
    """
    # Create the target directory on first use so a fresh checkout does not fail.
    os.makedirs(traces_dir, exist_ok=True)
    trace_output_path = os.path.join(traces_dir, f"{file_name}_{model}.txt")
    with open(trace_output_path, "w", encoding="utf-8") as f:
        f.write(resp.output_text)

# Save structured response as JSON
def save_json_response(resp, file_name: str, model: str):
    """
    Extract the tool-call arguments from a structured response and write them to
    `<output_dir>/<file_name>_<model>.json`.

    Args:
        resp: Response object exposing `model_dump()`, whose result has an `output`
            list of items (not a plain string).
        file_name (str): Base name used for the JSON file.
        model (str): Model name, appended to the file name.

    Raises:
        ValueError: If the response contains no `function_call` output item.
    """
    # Locate the tool call by item type instead of by position: some models emit
    # other items (e.g. a reasoning item) before the function call, so a fixed
    # index keyed on the model name breaks for any model not special-cased.
    output_items = resp.model_dump()['output']
    call_item = next(
        (item for item in output_items if item.get('type') == 'function_call'),
        None,
    )
    if call_item is None:
        raise ValueError(
            f"No function_call item in response for {file_name} ({model})"
        )
    structured_data = json.loads(call_item['arguments'])

    # Create the target directory on first use so a fresh checkout does not fail.
    os.makedirs(output_dir, exist_ok=True)
    structured_output_path = os.path.join(output_dir, f"{file_name}_{model}.json")
    with open(structured_output_path, "w", encoding="utf-8") as f:
        json.dump(structured_data, f, ensure_ascii=False, indent=2)

In [None]:
# pydantic classes

# Schema 1
# Classes are declared leaf-first so every annotation refers to a name that
# already exists when the class body is evaluated.

# Directed relationship between two nodes of a logical chain.
class Edge1(BaseModel):
    source_id: str  # source_node_id
    target_id: str  # target_node_id
    title: str  # relationship_verb
    confidence: int  # 1-5
    description: str  # brief explanation of logical connection

# A single concept or intervention within a logical chain.
class Node1(BaseModel):
    id: str  # unique_node_id
    type: Literal["concept", "intervention"]
    title: str  # concise descriptive phrase
    description: str  # detailed technical description
    maturity: Optional[int] = None  # 1-5 (only for intervention nodes)

# One chain of reasoning: its nodes and the edges connecting them.
class LogicalChain1(BaseModel):
    chain_id: str  # unique_identifier
    description: str  # brief chain summary
    nodes: List[Node1]
    edges: List[Edge1]

# Top-level schema for one paper; its JSON schema is passed as the tool parameters.
class PaperSchema1(BaseModel):
    paper_doi: Optional[str] = None # exact DOI if available
    paper_title: str  # exact paper title
    logical_chains: List[LogicalChain1]

In [None]:
# dual responses from model (freeform + json in separate requests)

def get_dual_response(file_id: str, prompt_text: str, schema: object, model: str = 'gpt-4.0'):
    """
    Get response from model in two steps:
    1. Freeform analysis of the paper based on the prompt_text.
    2. Structured output using the causal_chain_structure tool based on the freeform analysis.

    Args:
        file_id (str): ID of the uploaded file; sent as an `input_file` part in both requests.
        prompt_text (str): Prompt for the freeform analysis request.
        schema: Class exposing `model_json_schema()` (e.g. a pydantic model); the
            returned JSON schema is used as the tool's `parameters`.
        model (str): Model name used for both requests.
            NOTE(review): the default 'gpt-4.0' does not look like a valid model
            ID - confirm before relying on it.

    Returns:
        tuple: (freeform_response, structured_response) as returned by
        `client.responses.create`.
    """

    # First call - get freeform analysis
    freeform_input = [{
        "role": "user",
        "content": [
            {"type": "input_file", "file_id": file_id},
            {"type": "input_text", "text": prompt_text}
        ]
    }]

    # No tools for the freeform analysis: the argument is omitted rather than
    # passed as None, so no explicit null is sent for `tools`.
    freeform_response = client.responses.create(
        model=model,
        input=freeform_input,
    )

    # Second call - get structured output, with the freeform analysis replayed
    # as an assistant turn so the model can build on it.
    structured_input = [{
        "role": "system",
        "content": "Use the following detailed analysis to help create the structured output:"
    }, {
        "role": "assistant",
        "content": freeform_response.output_text
    },{
        "role": "user",
        "content": [
            {"type": "input_file", "file_id": file_id},
            {"type": "input_text", "text": "Based on the paper and your analysis, provide a structured representation of the logical chains using the causal_chain_structure tool. Focus only on providing the structured output."}
        ]
    }]

    tools = [{
        "type": "function",
        "name": "causal_chain_structure",
        "description": "Summarize the paper's causal structure into a set of logical chains",
        "parameters": schema.model_json_schema()
    }]

    structured_response = client.responses.create(
        model=model,
        input=structured_input,
        tools=tools,
        tool_choice={"type": "allowed_tools",
                     "mode": "required",
                     "tools": [{"type": "function", "name": "causal_chain_structure"}]
        }  # force tool use
    )

    return freeform_response, structured_response

In [None]:
# main function to analyze paper
def analyze_paper(file_name: str, file_id: str, prompt_text: str, schema: object, model: str = 'gpt-4.0'):
    """
    Run the two-step analysis for one paper and persist both results.

    The freeform response is handed to `save_trace_response` and the structured
    response to `save_json_response`, each with `file_name` and `model`.

    Args:
        file_name (str): Name of the file to analyze; also used to name the saved outputs.
        file_id (str): ID of the file in OpenAI.
        prompt_text (str): The prompt to use for analysis.
        schema: Schema class forwarded unchanged to `get_dual_response`.
        model (str): The model to use for analysis. (cheap or best)

    Returns:
        tuple: (freeform_response, structured_response)
    """
    trace, structured = get_dual_response(
        file_id, prompt_text, schema=schema, model=model
    )

    # Persist the freeform trace first, then the structured JSON.
    save_trace_response(trace, file_name, model)
    save_json_response(structured, file_name, model)

    return trace, structured

In [None]:
# test first paper: run the two-step analysis on the first manifest entry with the cheap model
file_name = all_papers[0]['file_name']
file_id = all_papers[0]['file_id']

analyze_paper(
    file_name=file_name,
    file_id=file_id,
    prompt_text=PROMPT_FREEFORM,
    schema=PaperSchema1,  # the comma was missing here, which made the cell a SyntaxError
    model=cheap
)

In [None]:
# test all papers: run the two-step analysis on every manifest entry with the best model
for paper in all_papers:
    analyze_paper(
        file_name=paper['file_name'],
        file_id=paper['file_id'],
        prompt_text=PROMPT_FREEFORM,
        schema=PaperSchema1,  # the comma was missing here, which made the cell a SyntaxError
        model=best
    )

In [8]:
# iteration: analyze the first paper once and keep both responses for inspection

file_name = all_papers[0]['file_name']
file_id = all_papers[0]['file_id']

freeform_response, structured_response = analyze_paper(
    file_name=file_name,
    file_id=file_id,
    prompt_text=PROMPT_FREEFORM,
    schema=PaperSchema1,  # required by analyze_paper; omitting it raised TypeError
    model=cheap
)

In [11]:
def analyze_paper_iteratively(file_name: str, file_id: str, prompt_text: str, iterations: int = 3, model: str = 'gpt-4.0', schema: object = None):
    """
    Iteratively analyze a paper multiple times, asking the model to find more connections each time.

    Every iteration is an independent `get_dual_response` call. From the second
    iteration on, a fixed improvement request is appended once to the base prompt.
    NOTE(review): the improvement request refers to "your previous analysis", but
    the previous output is not included in the request - confirm whether it should be.

    Args:
        file_name (str): Name of the file to analyze.
        file_id (str): ID of the file in OpenAI.
        prompt_text (str): The base prompt to use for analysis.
        iterations (int): Number of iterations to perform (default 3).
        model (str): The model to use for analysis.
            NOTE(review): the default 'gpt-4.0' does not look like a valid model
            ID - confirm before relying on it.
        schema: Schema class forwarded to `get_dual_response`. Defaults to
            `PaperSchema1` when omitted.

    Returns:
        List of tuples containing (freeform_response, structured_response) for each iteration.
    """
    if schema is None:
        # Resolved at call time so callers written before `schema` existed keep working.
        schema = PaperSchema1

    # Built once and appended to the *base* prompt, so the request is never
    # duplicated as the iteration count grows.
    improvement_request = (
        "\n\nIMPORTANT: You missed many causal connections and relationships in your previous analysis. " +
        "Please analyze again more thoroughly, looking specifically for:\n" +
        "1. Additional connections between existing concepts\n" +
        "2. Implicit relationships that weren't directly stated\n" +
        "3. Higher-order effects and consequences\n" +
        "4. Cross-cutting themes and patterns\n" +
        "5. Alternative interpretations of the findings"
    )

    results = []

    for i in range(iterations):
        print(f"\nIteration {i+1}/{iterations}")

        # The first pass uses the base prompt; later passes add the improvement request.
        current_prompt = prompt_text if i == 0 else prompt_text + improvement_request

        # Run the analysis
        freeform_response, structured_response = get_dual_response(
            file_id=file_id,
            prompt_text=current_prompt,
            schema=schema,
            model=model
        )

        # Save responses with iteration number in filename
        save_trace_response(freeform_response, f"{file_name}_iter{i+1}", model=model)
        save_json_response(structured_response, f"{file_name}_iter{i+1}", model=model)

        results.append((freeform_response, structured_response))

    return results

In [None]:
# test iterative analysis: three passes over the first manifest entry with the cheap model
file_name, file_id = (all_papers[0][key] for key in ('file_name', 'file_id'))

iterative_results = analyze_paper_iteratively(
    file_name,
    file_id,
    PROMPT_FREEFORM,
    iterations=3,
    model=cheap,
)

# Print the freeform responses from each iteration
# for i, (freeform_resp, _) in enumerate(iterative_results, 1):
#     print(f"\n=== Iteration {i} Analysis ===")
#     print(freeform_resp.output_text)