In [2]:
from openai import OpenAI
from pydantic import BaseModel, Field
from typing import List, Optional
from pypdf import PdfReader

client = OpenAI()

class PaperData(BaseModel):
    """Structured summary of a research paper.

    This model is passed as ``text_format`` to ``client.responses.parse`` so the
    reply is parsed into an instance of it. The ``Field`` descriptions are the
    per-field instructions attached to the schema.
    """

    title: str = Field(description="The title of the paper")
    # Kept as a list: the notebook flattens it with "; ".join(...) when rendering
    # and when writing the CSV row. The description now matches the list type.
    abstract_summary: List[str] = Field(description="One sentence summary of the abstract, returned as a single-item list")
    intro_backgrounds_summary: str = Field(description="One sentence summary of the intro and backgrounds sections")
    methods_summary: str = Field(description="One sentence summary of the methods section")
    results_summary: str = Field(description="One sentence summary of the results section")
    discussion_summary: str = Field(description="One sentence summary of the discussion section")
    

def load_pdf_text(file_path):
    '''Return the text of every page of a PDF, with pages separated by a blank line.'''
    pdf = PdfReader(file_path)

    # Collect the extracted text page by page, in document order.
    page_texts = []
    for page in pdf.pages:
        page_texts.append(page.extract_text())

    return "\n\n".join(page_texts)

In [1]:
# Download arXiv paper 2510.26493 as a PDF; -O writes it to ../assets/paper.pdf.
!wget -O ../assets/paper.pdf https://arxiv.org/pdf/2510.26493

--2025-11-26 13:45:11--  https://arxiv.org/pdf/2510.26493
Resolving arxiv.org (arxiv.org)... 151.101.195.42, 151.101.3.42, 151.101.67.42, ...
Connecting to arxiv.org (arxiv.org)|151.101.195.42|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2371045 (2,3M) [application/pdf]
Saving to: ‘../assets/paper.pdf’


2025-11-26 13:45:11 (32,0 MB/s) - ‘../assets/paper.pdf’ saved [2371045/2371045]



In [None]:
# Read the downloaded PDF into a single string.
paper_raw_text = load_pdf_text("../assets/paper.pdf")

# Ask the model to fill in the PaperData schema from the raw paper text.
extraction_prompt = f"extract the information from the paper: {paper_raw_text}"
response = client.responses.parse(
    model="gpt-5-mini",
    text_format=PaperData,
    input=extraction_prompt,
)

print(response)

In [27]:
# Show the structured result parsed from the model reply (a PaperData instance).
response.output_parsed

PaperData(title='Context Engineering 2.0: The Context of Context Engineering', abstract_summary=['The paper argues that context engineering—designing, organizing, and managing contextual information for machines—is a long‑evolving discipline (not just a recent LLM-era invention), frames context engineering as an entropy‑reduction problem, presents a formal definition and four-stage evolutionary model (Context Engineering 1.0–4.0), and surveys practical design considerations for context collection, storage, management, and usage to guide future AI systems.'], intro_backgrounds_summary='Introduces context engineering, situates it historically from ubiquitous computing and early HCI to modern LLMs and agents, defines context broadly (information characterizing relevant entities), and frames the core challenge as bridging human intent and machine understanding by compressing high-entropy human contexts into machine‑usable representations.', methods_summary='Presents a formal mathematical f

In [28]:
from IPython.display import Markdown, display

# Build a markdown report from the parsed summary and render it in the notebook.
parsed = response.output_parsed
abstract_text = '; '.join(parsed.abstract_summary)

md = f"""
# {parsed.title}

## Abstract Summary
{abstract_text}

## Introduction & Backgrounds Summary
{parsed.intro_backgrounds_summary}

## Methods Summary
{parsed.methods_summary}

## Results Summary
{parsed.results_summary}

## Discussion Summary
{parsed.discussion_summary}
"""

display(Markdown(md))


# Context Engineering 2.0: The Context of Context Engineering

## Abstract Summary
The paper argues that context engineering—designing, organizing, and managing contextual information for machines—is a long‑evolving discipline (not just a recent LLM-era invention), frames context engineering as an entropy‑reduction problem, presents a formal definition and four-stage evolutionary model (Context Engineering 1.0–4.0), and surveys practical design considerations for context collection, storage, management, and usage to guide future AI systems.

## Introduction & Backgrounds Summary
Introduces context engineering, situates it historically from ubiquitous computing and early HCI to modern LLMs and agents, defines context broadly (information characterizing relevant entities), and frames the core challenge as bridging human intent and machine understanding by compressing high-entropy human contexts into machine‑usable representations.

## Methods Summary
Presents a formal mathematical framework (entity characterization Char, context C as aggregation over relevant entities), defines context engineering as f_context(C,T)=F(ϕ1,…,ϕn) (a composition of modular operations for collection, storage, representation, multimodal handling, selection, sharing and adaptation), and characterizes four developmental stages aligned with machine intelligence (Primitive/1.0, Agent‑centric/2.0, Human‑level/3.0, Superhuman/4.0).

## Results Summary
Through historical analysis and system-level comparison, the paper contrasts Era 1.0 and 2.0 practices (sensor/collection modes, tolerance for raw context, core mechanisms like Context Toolkit vs. prompting/RAG/memory agents), surveys contemporary designs (hierarchical memory, subagents, embeddings, schema extraction, KV caching, multimodal fusion), and synthesizes common patterns and trade-offs for context collection, management, and usage across applications.

## Discussion Summary
Identifies key challenges and future directions—scalable lifelong context storage and semantic indexing, long-context processing and architectural limits of transformers, robustness and evaluation of accumulated memory, cross‑system context sharing standards, richer multimodal (and BCI) sensing, and the need for a semantic operating system and new long-range reasoning architectures to enable reliable, proactive, and human-aligned context engineering. 


In [30]:
import pandas as pd
import os

def upsert_paper_to_csv(paper_data, csv_path="papers_database.csv", key_column="title"):
    """
    Insert or update one paper's summary row in a CSV file database.

    If the file does not exist it is created. If it exists, every row whose
    ``key_column`` equals this paper's value is overwritten; when no row
    matches, the paper is appended as a new row.

    Parameters:
        paper_data: an object with attributes corresponding to extracted fields
            (title, abstract_summary, intro_backgrounds_summary, methods_summary,
            results_summary, discussion_summary). Missing attributes default to "".
        csv_path: output file path for CSV
        key_column: column name to use as unique identifier (default: "title")

    Raises:
        KeyError: if ``key_column`` is not one of the extracted fields, or is not
            a column of the existing CSV.
    """
    # Flatten the paper into a single CSV row; a list-valued abstract is joined with "; ".
    abstract = getattr(paper_data, "abstract_summary", "")
    if isinstance(abstract, (list, tuple)):
        abstract = "; ".join(abstract)

    paper_dict = {
        "title": getattr(paper_data, "title", ""),
        "abstract_summary": abstract,
        "intro_backgrounds_summary": getattr(paper_data, "intro_backgrounds_summary", ""),
        "methods_summary": getattr(paper_data, "methods_summary", ""),
        "results_summary": getattr(paper_data, "results_summary", ""),
        "discussion_summary": getattr(paper_data, "discussion_summary", ""),
    }

    # Validate the key before touching the file, so a bad key cannot leave a
    # freshly written CSV behind and then fail.
    if key_column not in paper_dict:
        raise KeyError(
            f"key_column '{key_column}' is not one of the stored fields: {list(paper_dict)}"
        )
    key_value = paper_dict[key_column]

    updated = False
    if os.path.isfile(csv_path):
        # Read every column as text and keep empty strings as-is. With dtype
        # inference a title such as "1984" comes back as an integer and "NA" or ""
        # as NaN, so the lookup below would miss and a duplicate row would be added.
        df = pd.read_csv(csv_path, dtype=str, keep_default_na=False)
        matches = df[key_column] == key_value
        if matches.any():
            # Update existing entry, one column at a time.
            for column, value in paper_dict.items():
                df.loc[matches, column] = value
            updated = True
        else:
            # Append if not found
            df = pd.concat([df, pd.DataFrame([paper_dict])], ignore_index=True)
    else:
        # Create new dataframe
        df = pd.DataFrame([paper_dict])

    df.to_csv(csv_path, index=False)
    if updated:
        print(f"Updated entry in '{csv_path}' for {key_column}='{paper_dict[key_column]}'")
    else:
        print(f"Added new entry to '{csv_path}' for {key_column}='{paper_dict[key_column]}'")

# Example usage: store the summary held in `response` in the default CSV database.
upsert_paper_to_csv(response.output_parsed)

Added new entry to 'papers_database.csv' for title='Context Engineering 2.0: The Context of Context Engineering'


In [None]:
# Download arXiv paper 1301.3781 as a PDF; -O writes it to ../assets/paper3.pdf.
!wget -O ../assets/paper3.pdf "https://arxiv.org/pdf/1301.3781"

--2025-11-25 16:25:46--  https://arxiv.org/pdf/1301.3781
Resolving arxiv.org (arxiv.org)... 151.101.67.42, 151.101.195.42, 151.101.131.42, ...
Connecting to arxiv.org (arxiv.org)|151.101.67.42|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 228716 (223K) [application/pdf]
Saving to: ‘paper3.pdf’


2025-11-25 16:25:46 (9,24 MB/s) - ‘paper3.pdf’ saved [228716/228716]



In [36]:
def extract_paper_data(paper_path: str):
    """Summarise one PDF paper with the model and store the result in the CSV database.

    Loads the PDF text, asks the model to fill in the ``PaperData`` schema,
    prints the parsed result, upserts it with ``upsert_paper_to_csv`` (default
    CSV path and key) and returns it.

    Parameters:
        paper_path: path of the PDF file to read.

    Returns:
        The parsed ``PaperData`` instance.

    Raises:
        ValueError: if the response carries no parsed output.
    """
    paper_raw_text = load_pdf_text(paper_path)

    response = client.responses.parse(
        model="gpt-5-mini",
        input=f"extract the information from the paper: {paper_raw_text}",
        text_format=PaperData
    )

    output_parsed = response.output_parsed

    # NOTE(review): the SDK appears to type output_parsed as optional (e.g. when the
    # model refuses) — confirm. Without this guard a None result would be written
    # to the CSV as a row of empty strings.
    if output_parsed is None:
        raise ValueError(f"Model returned no parsed PaperData for '{paper_path}'")

    print(output_parsed)

    upsert_paper_to_csv(output_parsed)

    return output_parsed

# Read the PDF from the location the download cell writes to (../assets/paper3.pdf).
extract_paper_data("../assets/paper3.pdf")

title='Efficient Estimation of Word Representations in Vector Space' abstract_summary=['The paper introduces two efficient log-linear architectures (Continuous Bag‑of‑Words and Skip‑gram) for learning continuous word vectors from very large corpora, achieving large accuracy improvements on syntactic and semantic word similarity tasks at much lower computational cost (e.g. learning high‑quality vectors from ~1.6B words in under a day).'] intro_backgrounds_summary='The paper motivates learning distributed continuous word representations (instead of atomic indices) to capture multiple degrees of word similarity and linear regularities (e.g., vector arithmetic like king−man+woman≈queen), surveys prior neural language model work, and sets the goal of scalable, high‑quality word vectors from very large datasets and vocabularies.' methods_summary='They propose two computationally cheap log‑linear models—CBOW (predict target word from averaged context vectors) and Skip‑gram (predict context wo

PaperData(title='Efficient Estimation of Word Representations in Vector Space', abstract_summary=['The paper introduces two efficient log-linear architectures (Continuous Bag‑of‑Words and Skip‑gram) for learning continuous word vectors from very large corpora, achieving large accuracy improvements on syntactic and semantic word similarity tasks at much lower computational cost (e.g. learning high‑quality vectors from ~1.6B words in under a day).'], intro_backgrounds_summary='The paper motivates learning distributed continuous word representations (instead of atomic indices) to capture multiple degrees of word similarity and linear regularities (e.g., vector arithmetic like king−man+woman≈queen), surveys prior neural language model work, and sets the goal of scalable, high‑quality word vectors from very large datasets and vocabularies.', methods_summary='They propose two computationally cheap log‑linear models—CBOW (predict target word from averaged context vectors) and Skip‑gram (predi