# Extraction

In [37]:
from typing import Optional

from pydantic import BaseModel, Field


class Person(BaseModel):
    """Information about a person."""

    # The docstring above is not just documentation: pydantic publishes it as
    # the top-level "description" of the JSON schema for Person, and that text
    # ends up in the prompt sent to the LLM. Wording it well can improve
    # extraction results, so treat it as part of the program.
    #
    # Two conventions are followed for every field below:
    #   * The type is Optional with a None default, which leaves the model free
    #     to skip a value it cannot find instead of inventing one.
    #   * A `description` is supplied; the LLM reads it, so a precise
    #     description helps extraction quality.
    name: Optional[str] = Field(
        description="The name of the person",
        default=None,
    )
    hair_color: Optional[str] = Field(
        description="The color of the person's hair if known",
        default=None,
    )
    height_in_meters: Optional[float] = Field(
        description="Height measured in meters",
        default=None,
    )

In [38]:
# Show the JSON schema pydantic derives from Person; the class docstring and the
# per-field descriptions both appear in it under "description".
Person.model_json_schema()  

{'description': 'Information about a person.',
 'properties': {'name': {'anyOf': [{'type': 'string'}, {'type': 'null'}],
   'default': None,
   'description': 'The name of the person',
   'title': 'Name'},
  'hair_color': {'anyOf': [{'type': 'string'}, {'type': 'null'}],
   'default': None,
   'description': "The color of the person's hair if known",
   'title': 'Hair Color'},
  'height_in_meters': {'anyOf': [{'type': 'number'}, {'type': 'null'}],
   'default': None,
   'description': 'Height measured in meters',
   'title': 'Height In Meters'}},
 'title': 'Person',
 'type': 'object'}

In [39]:
# An Optional[str] field is rendered as an `anyOf` over string and null.
Person.model_json_schema()['properties']['name']['anyOf']

[{'type': 'string'}, {'type': 'null'}]

In [42]:
# `parse_raw` is deprecated since Pydantic 2.0 (slated for removal in 3.0);
# `model_validate_json` is the replacement for JSON input.
Person.model_validate_json('{"name": "Alice", "height_in_meters": 1.7}')


Person(name='Alice', hair_color=None, height_in_meters=1.7)

In [79]:
from openai import OpenAI
from typing import Type
import json
from pydantic import TypeAdapter

class LLM:
    """Minimal structured-extraction wrapper around the OpenAI chat completions API.

    On construction, the JSON schema of ``output_model`` is rendered into a
    system prompt. Each call sends that prompt together with the user's text
    and tries to validate the reply against ``output_model``.

    Args:
        output_model: Pydantic model class describing the data to extract.
        model: Name of the chat model to query.
        temperature: Sampling temperature forwarded to the API.

    Attributes:
        client: The ``OpenAI`` client used for requests.
        output_model: The model class passed to the constructor.
        system_prompt: Chat message dict (``role``/``content``) sent first in
            every request.
    """

    def __init__(
        self,
        output_model: "Type[BaseModel]",
        model: str = "gpt-4o-mini",
        temperature: float = 0.0,
    ):
        self.client = OpenAI()
        self.output_model = output_model
        self.model = model
        self.temperature = temperature
        # Build the validator once instead of on every `process` call.
        self._adapter = TypeAdapter(output_model)
        self.system_prompt = {
            "role": "system",
            "content": self._build_system_prompt(output_model.model_json_schema()),
        }

    @staticmethod
    def _build_system_prompt(schema: dict) -> str:
        """Render a JSON-schema dict into the extraction system prompt text."""
        # A schema may lack "description" (e.g. a model without a docstring);
        # fall back to its title rather than raising KeyError.
        model_description = schema.get("description") or schema.get(
            "title", "the requested fields"
        )
        parts = [
            "Given the user-submitted text, identify the following information.",
            # NOTE(review): a description that already ends with "." yields ".."
            # here; left unchanged so existing prompts stay byte-identical.
            f"\nThe output json will contain {model_description}. Extract the information into JSON format:",
            "\n\n{",
        ]
        for field, spec in schema.get("properties", {}).items():
            # Fields declared without a description fall back to their own name.
            field_description = spec.get("description", field)
            # NOTE(review): the description is emitted twice per line (value
            # slot and trailing comment). Possibly the value slot was meant to
            # hold the field type -- confirm before changing the prompt.
            parts.append(f"\n{field}: {field_description}  # {field_description}")
        parts.append("\n}")
        return "".join(parts)

    def __call__(self, *args, **kwds):
        """Shorthand for :meth:`invoke`."""
        return self.invoke(*args, **kwds)

    def process(self, response: str) -> "BaseModel|str":
        """Validate ``response`` as JSON for ``output_model``.

        Returns:
            The validated model instance, or the raw ``response`` string when
            validation fails.
        """
        try:
            return self._adapter.validate_json(response)
        except Exception as e:
            # Deliberate best-effort: report the problem and hand back the raw
            # text so the caller can still inspect what the model said.
            print(e)
            return response

    def invoke(self, text: str) -> "BaseModel|str":
        """Send ``text`` to the chat model and return the processed reply.

        Returns:
            Whatever :meth:`process` returns for the model's reply.
        """
        completion = self.client.chat.completions.create(
            model=self.model,
            messages=[
                self.system_prompt,
                {"role": "user", "content": text},
            ],
            temperature=self.temperature,
            # JSON mode: ask the API for a syntactically valid JSON object.
            # The API requires the word "JSON" in the messages; the system
            # prompt built above contains it.
            response_format={"type": "json_object"},
        )
        # `content` may be None; normalise to "" so the declared return type
        # (model instance or str) holds.
        response = completion.choices[0].message.content or ""
        return self.process(response)

In [None]:

# Validate a JSON string against Person via TypeAdapter -- the same kind of call
# LLM.process uses on the model's reply. Fields absent from the JSON stay None.
TypeAdapter(Person).validate_json('{"name": "Alice", "height_in_meters": 1.7}')

Person(name='Alice', hair_color=None, height_in_meters=1.7)

In [80]:
# Bind the extractor to the Person schema; the constructor creates the OpenAI
# client and builds the system prompt from Person's JSON schema.
llm = LLM(Person)

In [81]:
# Print (rather than display) the prompt so its newlines render as line breaks.
print(llm.system_prompt['content'])

Given the user-submitted text, identify the following information.
The output json will contain Information about a person.. Extract the information into JSON format:

{
name: The name of the person  # The name of the person
hair_color: The color of the person's hair if known  # The color of the person's hair if known
height_in_meters: Height measured in meters  # Height measured in meters
}


In [82]:
# Metric input: the height is already given in meters. The result is kept in
# `response` and displayed in a later cell.
response = llm.invoke("The person's name is John Doe, they have brown hair, and are 1.8 meters tall.")

In [84]:
# Imperial input: LLM does no unit conversion itself, so turning "6 feet" into
# meters is left to the model, guided by the field description.
llm.invoke("Alan Smith is 6 feet tall and has blond hair.")

Person(name='Alan Smith', hair_color='blond', height_in_meters=1.8288)

In [83]:
# Display the extraction result stored from the John Doe call.
response

Person(name='John Doe', hair_color='brown', height_in_meters=1.8)