In [2]:
from openai import OpenAI
from pydantic import BaseModel, Field
from typing import List, Optional
from pypdf import PdfReader

# Shared API client used by every model call in this notebook. It is built with
# no explicit arguments, so credentials/configuration come from the library's
# defaults (presumably environment variables — confirm before running).
client = OpenAI()

# Schema for the structured extraction of one paper. It is passed as
# `text_format` to `client.responses.parse`, so the field names and (presumably)
# the Field descriptions below are what the model is asked to fill in.
# NOTE(review): documented with `#` comments rather than a class docstring on the
# assumption that pydantic would copy a docstring into the generated JSON schema
# and thereby change the request — confirm before converting to a docstring.
class PaperData(BaseModel):
    title: str = Field(description="The title of the paper")
    # NOTE(review): typed as a list of strings although the description asks for
    # a single sentence; downstream cells join the items with "; ", so the list
    # type is relied upon. Confirm which of the two is intended.
    abstract_summary: List[str] = Field(description="One sentence summary of the abstract")
    intro_backgrounds_summary: str = Field(description="One sentence summary of the intro and backgrounds sections")
    methods_summary: str = Field(description="One sentence summary of the methods section")
    results_summary: str = Field(description="One sentence summary of the results section")
    discussion_summary: str = Field(description="One sentence summary of the discussion section")
    

def load_pdf_text(file_path):
    '''Return the text of every page of a PDF as a single string.

    Parameters:
        file_path: location of the PDF, handed unchanged to ``PdfReader``.

    Returns:
        The per-page results of ``extract_text()``, in page order, separated
        by a blank line ("\\n\\n"). A document with no pages yields "".
    '''
    pdf = PdfReader(file_path)

    # Gather the text page by page, preserving document order.
    page_texts = []
    for pdf_page in pdf.pages:
        page_texts.append(pdf_page.extract_text())

    return "\n\n".join(page_texts)

In [1]:
# Download the arXiv PDF used in the first extraction example; -O writes it to
# ../assets/paper.pdf, the path the extraction cell loads.
# NOTE(review): assumes the ../assets directory already exists — confirm.
!wget -O ../assets/paper.pdf https://arxiv.org/pdf/2510.26493

--2025-11-26 13:45:11--  https://arxiv.org/pdf/2510.26493
Resolving arxiv.org (arxiv.org)... 151.101.195.42, 151.101.3.42, 151.101.67.42, ...
Connecting to arxiv.org (arxiv.org)|151.101.195.42|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2371045 (2,3M) [application/pdf]
Saving to: ‘../assets/paper.pdf’


2025-11-26 13:45:11 (32,0 MB/s) - ‘../assets/paper.pdf’ saved [2371045/2371045]



In [3]:
# Extract the full text of the downloaded PDF.
paper_raw_text = load_pdf_text("../assets/paper.pdf")

# Ask the model for a PaperData-shaped answer. The whole extracted text is
# interpolated into the prompt, so the request grows with the paper's length.
response = client.responses.parse(
    model="gpt-5-mini",
    input=f"extract the information from the paper: {paper_raw_text}",
    text_format=PaperData
)

# Prints the entire response object (ids, reasoning items, raw JSON text), not
# just the parsed fields.
print(response)

ParsedResponse[PaperData](id='resp_081192adf42d638900692735ec66088197aa2970ed9be6f7b9', created_at=1764177388.0, error=None, incomplete_details=None, instructions=None, metadata={}, model='gpt-5-mini-2025-08-07', object='response', output=[ResponseReasoningItem(id='rs_081192adf42d638900692735ed864c8197936307c3784efc0b', summary=[], type='reasoning', content=None, encrypted_content=None, status=None), ParsedResponseOutputMessage[PaperData](id='msg_081192adf42d638900692735f236d08197a2c8038b2961bad1', content=[ParsedResponseOutputText[PaperData](annotations=[], text='{"title":"Context Engineering 2.0: The Context of Context Engineering","abstract_summary":["The paper defines and situates context engineering as the systematic process of designing, organizing, and managing contextual information to reduce entropy between human intentions and machine understanding, traces its evolution over four eras from early HCI/context-aware systems to speculative superhuman intelligence, formalizes cont

In [4]:
# Display only the parsed result carried on the response (shown as the cell's
# last expression).
response.output_parsed

PaperData(title='Context Engineering 2.0: The Context of Context Engineering', abstract_summary=['The paper defines and situates context engineering as the systematic process of designing, organizing, and managing contextual information to reduce entropy between human intentions and machine understanding, traces its evolution over four eras from early HCI/context-aware systems to speculative superhuman intelligence, formalizes context and context engineering mathematically, and proposes design considerations for context collection, management, and usage while identifying open challenges and future directions.'], intro_backgrounds_summary='The introduction motivates context engineering by highlighting the impact of context on LLM and agent behavior, argues that context engineering predates modern LLM practices (rooted in ubiquitous computing and HCI), and frames the core task as compressing high-entropy human contexts into machine-understandable low-entropy representations.', methods_su

In [5]:
from IPython.display import Markdown, display

# Layout of the rendered report: a title heading followed by one "##" section
# per extracted summary. Placeholders are filled in from the parsed response.
report_template = """
# {title}

## Abstract Summary
{abstract}

## Introduction & Backgrounds Summary
{intro_backgrounds}

## Methods Summary
{methods}

## Results Summary
{results}

## Discussion Summary
{discussion}
"""

parsed_paper = response.output_parsed

md = report_template.format(
    title=parsed_paper.title,
    # abstract_summary is a list; its items are shown on one line joined by "; ".
    abstract="; ".join(parsed_paper.abstract_summary),
    intro_backgrounds=parsed_paper.intro_backgrounds_summary,
    methods=parsed_paper.methods_summary,
    results=parsed_paper.results_summary,
    discussion=parsed_paper.discussion_summary,
)

display(Markdown(md))


# Context Engineering 2.0: The Context of Context Engineering

## Abstract Summary
The paper defines and situates context engineering as the systematic process of designing, organizing, and managing contextual information to reduce entropy between human intentions and machine understanding, traces its evolution over four eras from early HCI/context-aware systems to speculative superhuman intelligence, formalizes context and context engineering mathematically, and proposes design considerations for context collection, management, and usage while identifying open challenges and future directions.

## Introduction & Backgrounds Summary
The introduction motivates context engineering by highlighting the impact of context on LLM and agent behavior, argues that context engineering predates modern LLM practices (rooted in ubiquitous computing and HCI), and frames the core task as compressing high-entropy human contexts into machine-understandable low-entropy representations.

## Methods Summary
The paper formalizes context with mathematical definitions (entities, Char, Context) and defines context engineering as a transformation CE:(C,T)→f_context composed from modular operations (ϕi), and it proposes a four-stage evolutionary characterization (Era 1.0–4.0) tied to machine intelligence levels.

## Results Summary
Rather than empirical experiments, the paper presents a conceptual and historical analysis: a comparison of Era 1.0 vs 2.0 practices, catalogs of context collection/storage/management/usage techniques (e.g., multimodal encoding, hierarchical memory, subagents, RAG, embeddings), representative system patterns and applications (CLI, deep research agents, BCIs), and a set of emerging engineering practices and trade-offs.

## Discussion Summary
The discussion highlights key challenges for lifelong and large-scale context engineering (storage bottlenecks, processing degradation, system instability, evaluation difficulty), argues for new architectures and a ‘‘semantic operating system’’ for context, and forecasts increasing machine responsibility in interpreting and constructing context as intelligence advances.


In [6]:
import pandas as pd
import os

def upsert_paper_to_csv(paper_data, csv_path="papers_database.csv", key_column="title"):
    """
    Inserts or updates a paper's data in a CSV file database.

    If the file does not exist it is created with a single row. If it exists,
    every row whose ``key_column`` equals the new paper's key is overwritten
    with the new values; when no row matches, the paper is appended.

    The existing CSV is read with every column as text and with empty cells
    kept as empty strings. Without that, pandas' type inference turns a title
    such as "2001" into an integer (so it never matches and is appended again
    as a duplicate) and turns empty summaries into NaN.

    Parameters:
        paper_data: an object with attributes corresponding to extracted fields
            (title, abstract_summary, intro_backgrounds_summary,
            methods_summary, results_summary, discussion_summary). Missing
            attributes are stored as "". A list-valued abstract_summary is
            flattened to one string joined by "; ".
        csv_path: output file path for CSV
        key_column: column name to use as unique identifier (default: "title")

    Raises:
        KeyError: if key_column is not one of the stored fields, or is not a
            column of the existing CSV.
    """
    # Flatten abstract_summary if it's a list; anything else is stored as-is.
    abstract = getattr(paper_data, "abstract_summary", "")
    if isinstance(abstract, list):
        abstract = "; ".join(abstract)

    paper_dict = {
        "title": getattr(paper_data, "title", ""),
        "abstract_summary": abstract,
        "intro_backgrounds_summary": getattr(paper_data, "intro_backgrounds_summary", ""),
        "methods_summary": getattr(paper_data, "methods_summary", ""),
        "results_summary": getattr(paper_data, "results_summary", ""),
        "discussion_summary": getattr(paper_data, "discussion_summary", ""),
    }
    key_value = paper_dict[key_column]

    updated = False
    if os.path.isfile(csv_path):
        # Read everything as text so keys compare as the strings that were
        # written, and so blank cells stay "" instead of becoming NaN.
        df = pd.read_csv(csv_path, dtype=str, keep_default_na=False)
        matches = df[key_column] == key_value
        if matches.any():
            # Update existing entry (all matching rows receive the new values)
            df.loc[matches, list(paper_dict)] = list(paper_dict.values())
            updated = True
        else:
            # Append if not found
            df = pd.concat([df, pd.DataFrame([paper_dict])], ignore_index=True)
    else:
        # Create new dataframe
        df = pd.DataFrame([paper_dict])

    df.to_csv(csv_path, index=False)
    if updated:
        print(f"Updated entry in '{csv_path}' for {key_column}='{paper_dict[key_column]}'")
    else:
        print(f"Added new entry to '{csv_path}' for {key_column}='{paper_dict[key_column]}'")

# Example usage: store the parsed extraction held in `response`. csv_path and
# key_column keep their defaults ("papers_database.csv" and "title"), so the
# file is written relative to the current working directory.
upsert_paper_to_csv(response.output_parsed)

Added new entry to 'papers_database.csv' for title='Context Engineering 2.0: The Context of Context Engineering'


In [None]:
# Download a second arXiv paper; -O writes it to ../assets/paper3.pdf, the path
# passed to extract_paper_data further down.
# NOTE(review): assumes the ../assets directory already exists — confirm.
!wget -O ../assets/paper3.pdf "https://arxiv.org/pdf/1301.3781"

--2025-11-25 16:25:46--  https://arxiv.org/pdf/1301.3781
Resolving arxiv.org (arxiv.org)... 151.101.67.42, 151.101.195.42, 151.101.131.42, ...
Connecting to arxiv.org (arxiv.org)|151.101.67.42|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 228716 (223K) [application/pdf]
Saving to: ‘paper3.pdf’


2025-11-25 16:25:46 (9,24 MB/s) - ‘paper3.pdf’ saved [228716/228716]



In [None]:
def extract_paper_data(paper_path: str):
    """Extract structured data from a paper PDF and store it in the CSV database.

    Loads the PDF text with ``load_pdf_text``, asks the model for a
    ``PaperData``-shaped answer, prints the parsed result, upserts it with
    ``upsert_paper_to_csv`` (default CSV path and key column) and returns it.

    Parameters:
        paper_path: path to the PDF file to process.

    Returns:
        The parsed ``PaperData`` object taken from the response.

    Raises:
        ValueError: if the response carries no parsed output. Nothing is
            written to the CSV in that case; previously a row of empty strings
            would have been stored and ``None`` returned.
    """
    paper_raw_text = load_pdf_text(paper_path)

    response = client.responses.parse(
        model="gpt-5-mini",
        input=f"extract the information from the paper: {paper_raw_text}",
        text_format=PaperData
    )

    output_parsed = response.output_parsed

    # Guard before persisting: upsert_paper_to_csv falls back to "" for every
    # missing attribute, so a None result would silently become a blank row.
    if output_parsed is None:
        raise ValueError(
            f"The model response for '{paper_path}' contained no parsed PaperData"
        )

    print(output_parsed)

    upsert_paper_to_csv(output_parsed)

    return output_parsed

# Run the extract-and-store pipeline on the PDF saved at ../assets/paper3.pdf.
# The function prints the parsed object and, as the cell's last expression, its
# return value is displayed as well — hence two similar outputs.
extract_paper_data("../assets/paper3.pdf")

title='Efﬁcient Estimation of Word Representations in Vector Space' abstract_summary=['The paper introduces two efficient model architectures (Continuous Bag-of-Words — CBOW — and Continuous Skip-gram) for learning continuous word vector representations from very large datasets, achieving state-of-the-art syntactic and semantic analogy performance with much lower computational cost and practical training times on billions of words.'] intro_backgrounds_summary='The authors motivate moving beyond atomic word representations to distributed continuous vectors (learned by neural methods) to capture multiple degrees of similarity and linear regularities (e.g., vector arithmetic like king - man + woman = queen), and set the goal of learning high-quality vectors from massive corpora and large vocabularies.' methods_summary='They propose two log-linear architectures—CBOW (predict current word from averaged context vectors) and Skip-gram (predict surrounding words given the current word)—use hie

PaperData(title='Efﬁcient Estimation of Word Representations in Vector Space', abstract_summary=['The paper introduces two efficient model architectures (Continuous Bag-of-Words — CBOW — and Continuous Skip-gram) for learning continuous word vector representations from very large datasets, achieving state-of-the-art syntactic and semantic analogy performance with much lower computational cost and practical training times on billions of words.'], intro_backgrounds_summary='The authors motivate moving beyond atomic word representations to distributed continuous vectors (learned by neural methods) to capture multiple degrees of similarity and linear regularities (e.g., vector arithmetic like king - man + woman = queen), and set the goal of learning high-quality vectors from massive corpora and large vocabularies.', methods_summary='They propose two log-linear architectures—CBOW (predict current word from averaged context vectors) and Skip-gram (predict surrounding words given the current 

In [None]:
import pandas as pd


# Load the database written by upsert_paper_to_csv. That function's default
# csv_path is "papers_database.csv" (relative to the working directory) and no
# cell in this notebook passes a different path, so read the same file here;
# reading a copy under ../assets would show data this notebook did not write.
df = pd.read_csv("papers_database.csv")
df

Unnamed: 0,title,abstract_summary,intro_backgrounds_summary,methods_summary,results_summary,discussion_summary
0,Context Engineering 2.0: The Context of Contex...,The paper defines and situates context enginee...,The introduction motivates context engineering...,The paper formalizes context with mathematical...,"Rather than empirical experiments, the paper p...",The discussion highlights key challenges for l...
1,Efﬁcient Estimation of Word Representations in...,The paper introduces two efficient model archi...,The authors motivate moving beyond atomic word...,They propose two log-linear architectures—CBOW...,"Experimentally, Skip-gram yields the best sema...","The paper concludes that very simple, computat..."


In [16]:
# Function-tool description handed to the model: it advertises a tool named
# "extract_paper_data" that takes one required string argument, `paper_path`.
tool_schema = dict(
    type="function",
    name="extract_paper_data",
    description="Extracts data from a paper",
    parameters=dict(
        type="object",
        properties=dict(
            paper_path=dict(
                type="string",
                description="The path to the paper",
            ),
        ),
        required=["paper_path"],
    ),
)

tool_schema

{'type': 'function',
 'name': 'extract_paper_data',
 'description': 'Extracts data from a paper',
 'parameters': {'type': 'object',
  'properties': {'paper_path': {'type': 'string',
    'description': 'The path to the paper'}},
  'required': ['paper_path']}}

In [11]:
# Download the PDF referenced in the tool-calling request below; -O writes it
# to ./test_agent_paper.pdf in the current working directory.
!wget -O ./test_agent_paper.pdf "https://arxiv.org/pdf/2412.14161"

--2025-11-26 17:23:32--  https://arxiv.org/pdf/2412.14161
Resolving arxiv.org (arxiv.org)... 151.101.67.42, 151.101.195.42, 151.101.131.42, ...
Connecting to arxiv.org (arxiv.org)|151.101.67.42|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2169754 (2,1M) [application/pdf]
Saving to: ‘./test_agent_paper.pdf’


2025-11-26 17:23:33 (33,7 MB/s) - ‘./test_agent_paper.pdf’ saved [2169754/2169754]



In [17]:
# Send a request that offers the model the extract_paper_data tool described by
# tool_schema. This rebinds `response`, replacing the earlier parse result.
# NOTE: the instructions string uses backslash line continuations inside the
# literal, so the leading spaces of the continued lines are part of the prompt
# text that is sent.
# NOTE(review): nothing in this cell executes extract_paper_data; the model can
# only ask for the call, which the caller would have to run — confirm where
# that happens.
response = client.responses.create(
    model="gpt-5-mini",
    instructions="You are an extraction agent, users will give you paths to files for pdfs\
                  and you will extract the data from the paper using the:\
                  extract_paper_data function.",
    tools=[tool_schema],
    input="Extract data from this paper: ./test_agent_paper.pdf"
)

# Display the text portion of the response (empty in the recorded run, where
# the output consisted of a function call rather than text).
response.output_text

''

In [18]:
# Inspect the raw output items of the response; in the recorded run these are a
# reasoning item and a function call naming extract_paper_data with its
# JSON-encoded arguments.
response.output

[ResponseReasoningItem(id='rs_0cc5319de7b42f93006927383a59588195ab5045551da875a4', summary=[], type='reasoning', content=None, encrypted_content=None, status=None),
 ResponseFunctionToolCall(arguments='{"paper_path":"./test_agent_paper.pdf"}', call_id='call_lBrW8pLIfaO2o8FOCel95PtU', name='extract_paper_data', type='function_call', id='fc_0cc5319de7b42f93006927383af9dc8195944fb0fa775c0ac9', status='completed')]

In [None]:
# Pydantic model describing the arguments of the extract_paper_data tool: one
# string field, `paper_path`, with the same description as the hand-written
# tool_schema. Used below to generate a JSON schema for those arguments.
# NOTE(review): kept as `#` comments rather than a class docstring on the
# assumption that a docstring would appear in the generated schema — confirm.
class ExtractPaperData(BaseModel):
    paper_path: str = Field(description="The path to the paper")


In [26]:
# Generate the JSON schema for the tool arguments from the pydantic model
# instead of writing the dictionary by hand.
extract_paper_data_schema = ExtractPaperData.model_json_schema()

# Display the generated schema for comparison with tool_schema["parameters"].
extract_paper_data_schema


{'properties': {'paper_path': {'description': 'The path to the paper',
   'title': 'Paper Path',
   'type': 'string'}},
 'required': ['paper_path'],
 'title': 'ExtractPaperData',
 'type': 'object'}