# Structural Data Extraction

<https://python.langchain.com/docs/tutorials/extraction/>

## Schema

In [1]:
from typing import Optional
from pydantic import BaseModel, Field


class Person(BaseModel):
    """Information about a person."""

    # ^ Doc-string for the entity Person.
    # This doc-string is sent to the LLM as the description of the schema Person,
    # and it can help to improve extraction results.

    # Note that:
    # 1. Each field is an `optional` -- this allows the model to decline to extract it!
    # 2. Each field has a `description` -- this description is used by the LLM.
    # Having a good description can help improve extraction results.
    name: Optional[str] = Field(default=None, description="The name of the person")
    hair_color: Optional[str] = Field(
        default=None, description="The color of the person's hair if known"
    )
    height_in_meters: Optional[str] = Field(
        default=None, description="Height measured in meters"
    )

In [2]:
from typing import Optional

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from pydantic import BaseModel, Field

# Define a custom prompt to provide instructions and any additional context.
# 1) You can add examples into the prompt template to improve extraction quality
# 2) Introduce additional parameters to take context into account (e.g., include metadata
#    about the document from which the text was extracted.)
prompt_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert extraction algorithm. "
            "Only extract relevant information from the text. "
            "If you do not know the value of an attribute asked to extract, "
            "return null for the attribute's value.",
        ),
        # Please see the how-to about improving performance with
        # reference examples.
        # MessagesPlaceholder('examples'),
        ("human", "{text}"),
    ]
)

## LLM

In [4]:
from langchain.chat_models import init_chat_model

### OpenAI

In [13]:
llm = init_chat_model("gpt-4o-mini", model_provider="openai")

In [14]:
structured_llm = llm.with_structured_output(schema=Person)

text = "Alan Smith is 6 feet tall and has blond hair."
prompt = prompt_template.invoke({"text": text})
structured_llm.invoke(prompt)

Person(name='Alan Smith', hair_color='blond', height_in_meters='1.83')

### Together

In [7]:
llm_tg = init_chat_model("meta-llama/Llama-3.3-70B-Instruct-Turbo", model_provider="together")

structured_llm_tg = llm_tg.with_structured_output(schema=Person)

text = "Alan Smith is 6 feet tall and has blond hair."
prompt = prompt_template.invoke({"text": text})
structured_llm_tg.invoke(prompt)

Person(name='Alan Smith', hair_color='blond', height_in_meters='1.8288')

### Local: `llama3.1:8b`

In [7]:
from langchain.chat_models import init_chat_model

llm_local1 = init_chat_model("llama3.1:8b", model_provider="ollama")

In [8]:
structured_llm_local1 = llm_local1.with_structured_output(schema=Person)

In [9]:
text = "Alan Smith is 6 feet tall and has blond hair."
prompt = prompt_template.invoke({"text": text})
structured_llm_local1.invoke(prompt)

Person(name='Alan Smith', hair_color='blond', height_in_meters=None)

### Local: `mistral:latest`

In [9]:
from langchain.chat_models import init_chat_model

llm_local2 = init_chat_model("mistral:latest", model_provider="ollama")
structured_llm_local2 = llm_local2.with_structured_output(schema=Person)

In [11]:
text = "Alan Smith is 6 feet tall and has blond hair."
prompt = prompt_template.invoke({"text": text})

structured_llm_local2.invoke(prompt)

ValidationError: 1 validation error for Person
height_in_meters
  Input should be a valid string [type=string_type, input_value=5.4, input_type=float]
    For further information visit https://errors.pydantic.dev/2.10/v/string_type