# Schema Engineering with Pydantic

This notebook contains the code for the examples in the blog post *Schema Engineering for Structured Generation with Pydantic*, the second post in the series "Building Reproducible LLM Applications".

In [None]:
!pip3 install pydantic
!pip3 install anthropic

### Writing Schemas

This section covers the basics of using Pydantic to define abstract schemas that can also be used to validate data.

In [None]:
import json
import os
from enum import Enum
from getpass import getpass
from typing import Literal, Optional

from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator

In [None]:
# Minimal Pydantic model: two fields, `breed` and `toy`, both annotated as str.
class Dogs(BaseModel):
    breed: str
    toy: str

# An instance
# Build a Dogs object from keyword arguments and print it.
Bella = Dogs(breed = "Poodle", toy="ball")
print(Bella)

In [None]:
# Enumeration of the hair types used by the Dogs model; each member's value is
# the lowercase string form of its name.
class Hair(Enum):
    Curly = "curly"
    Straight = "straight"
    Shaved = "shaved"

# Redefines Dogs with two additional fields: `hair`, typed as the Hair enum, and
# `color`, restricted by Literal to "brown" or "white".
class Dogs(BaseModel):
    breed: str
    toy: str
    hair: Hair
    color: Literal["brown", "white"]

In [None]:
# Charlie is not valid because of toy (an int instead of a str) and color ("red"
# is not one of the allowed literals). The ValidationError is the point of this
# cell, so it is caught and displayed instead of aborting a "Run All" execution.
try:
    Charlie = Dogs(breed="Corgi", toy=12, hair=Hair.Straight, color="red")
except ValidationError as err:
    print(err)

In [None]:
class Dogs(BaseModel):
    # Field-level constraints are declared with Field(...); the cross-field rule
    # between `hair` and `color` lives in check_hair_and_color below.
    breed: str = Field(description="The name of an existing breed")
    toy: str = Field(max_length=50)
    hair: Hair = Field(description="The hair type of the dog")
    color: Optional[Literal["brown", "white"]] = Field(default=None)
    age: int = Field(default=1, gt=0, description="Age must be greater than 0")

    @field_validator('breed')
    @classmethod
    def check_name(cls, value):
        # Reject breed names that contain any of the characters ! @ # ^ *
        if any(char in value for char in "!@#^*"):
            raise ValueError('Breed cannot contain special characters')
        return value

    # Cross-field validator for the Dogs class.
    # It runs in 'after' mode, i.e. on the already validated instance, so `hair`
    # is always a Hair member here. In 'before' mode the raw input is inspected
    # instead, and a string such as "curly" (dict or JSON input) never compares
    # equal to a Hair member, which silently skipped both checks.
    @model_validator(mode='after')
    def check_hair_and_color(self) -> "Dogs":
        # If hair is shaved, color must be None or absent
        if self.hair == Hair.Shaved and self.color is not None:
            raise ValueError("Dogs with shaved hair should not have a color.")
        # If hair is curly or straight, color must be specified and cannot be None
        if self.hair in {Hair.Curly, Hair.Straight} and self.color is None:
            raise ValueError("Dogs with curly or straight hair must have a color.")
        # An 'after' model validator has to return the validated instance
        return self

In [None]:
# Coco is not valid because of the hair-color inconsistency: curly hair requires
# a color, but none is given. The ValidationError is caught and displayed so the
# notebook can still be executed top to bottom.
try:
    Coco = Dogs(breed="Beagle", toy="socks", hair=Hair.Curly, age=1)
except ValidationError as err:
    print(err)

In [None]:
# Visualize schema in JSON format
# model_json_schema() returns a Python dictionary; serialise it with json.dumps so
# that what is printed is actual (indented) JSON rather than the dict repr.
pydantic_to_json = Dogs.model_json_schema()
print(json.dumps(pydantic_to_json, indent=2))

### Integrating Schemas with Claude Calls

For this tutorial, we'll integrate this schema as a JSON object in the system prompt.

In [None]:
import anthropic
from pydantic import ValidationError

In [None]:
# Setup environment
# Never store the API key in the notebook itself: read it from the
# ANTHROPIC_API_KEY environment variable and, if that is unset or empty, ask for
# it interactively without echoing the input.
anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY") or getpass("Anthropic API key: ")
client = anthropic.Anthropic(api_key=anthropic_api_key)

In [None]:
# Reduced model for the LLM examples: only `breed` and `toy`, with `toy` limited
# to at most 4 characters via max_length.
class BasicDogs(BaseModel):
    breed: str = Field(description="The name of an existing breed")
    toy: str = Field(max_length=4)

# Schema of BasicDogs as returned by model_json_schema(); stored in `schema` and printed.
schema = BasicDogs.model_json_schema()
print(schema)

In [None]:
def call_model(sys_prompt, user_prompt, model="claude-3-5-sonnet-20240620", max_tokens=1024):
    """Send a single-turn request through the global `client` and return the reply text.

    Args:
        sys_prompt: System prompt of the request.
        user_prompt: Content of the single user message.
        model: Model identifier passed to the API. Defaults to the model the
            notebook was written with.
        max_tokens: Value passed as `max_tokens` to the API.

    Returns:
        The text of the first text block of the response.

    Raises:
        RuntimeError: If the response does not contain any text block.
    """
    response = client.messages.create(
        model=model,
        system=sys_prompt,
        messages=[
            {"role": "user", "content": user_prompt}
        ],
        max_tokens=max_tokens,
    )
    # Do not assume that the content list is non-empty or that its first entry is
    # a text block: return the first block that actually carries text.
    for block in response.content:
        if getattr(block, "type", None) == "text":
            return block.text
    raise RuntimeError("The model response did not contain any text block.")

In [None]:
# Embed the schema as real JSON (json.dumps) instead of the Python dict repr: the
# repr uses single quotes, which is not valid JSON and does not match the format
# the answer is later validated against with model_validate_json.
sys_prompt = f"You are a helpful assistant that responds in the same format as the following example schema: {json.dumps(schema)}"
# The user prompt previously ended with a stray, unmatched code fence and newline.
user_prompt = "Extract dog information from: Bella is a curly black Poodle, who loves playing football and chess."


In [None]:
# Call the model 10 times; every answer is validated against BasicDogs and an
# invalid answer gets exactly one retry.
for i in range(10):
    print(f"LLM call number: {i}")
    llm_answer = call_model(sys_prompt, user_prompt)
    try:
        # Pydantic validates all fields at once
        dog = BasicDogs.model_validate_json(llm_answer)
    except ValidationError as e:
        print("Error!")
        print(llm_answer)
        # This gives the chance to retry immediately for errors: extend the system
        # prompt with the rejected answer and the validation errors it produced.
        ret_sys_p = f"{sys_prompt}. Previously, you gave me an incorrect output: {llm_answer}. This was incorrect because of these errors {e}. "
        # The retry must be sent with the extended prompt; sending `sys_prompt`
        # again would just repeat the original request without the feedback.
        retry_answer = call_model(ret_sys_p, user_prompt)
        try:
            dog = BasicDogs.model_validate_json(retry_answer)
        except ValidationError:
            print("Still Error!")

Now we do the same, but without Pydantic.

In [None]:
import json

# Defining the schema by hand, as a plain dictionary
manual_schema = {
    "type": "object",
    "properties": {
        "breed": {"type": "string"},
        "toy": {"type": "string"},
    },
    "required": ["breed", "toy"],
}

# Adding requirements in the system prompt.
# The schema is serialised with json.dumps so that the prompt contains valid JSON
# (double quotes) instead of the Python dict repr; the length limit on `toy`
# has to be spelled out in words.
sys_prompt = f"You are a helpful assistant that responds in the same format as the following example schema: {json.dumps(manual_schema)}. Note that toy can have max four characters."
    


In [None]:
# Calling the model 10 times
for i in range(10):
    print(f"LLM call number: {i}")
    # The API call stays outside the try block so that only problems with the
    # answer itself are reported as validation errors.
    llm_answer = call_model(sys_prompt, user_prompt)
    try:
        # Convert the string into a dictionary; json.JSONDecodeError (a subclass
        # of ValueError) is raised when the answer is not valid JSON.
        llm_dict = json.loads(llm_answer)

        # Manually pick the expected keys: a missing key raises KeyError, an
        # answer that is not a JSON object raises TypeError.
        dog_dictionary = {
            "breed": llm_dict["breed"],
            "toy": llm_dict["toy"],
        }

        # Manually check if constraints are respected. Explicit checks are used
        # instead of assert, which is skipped when Python runs with -O.
        if not isinstance(dog_dictionary["breed"], str):
            raise ValueError("breed must be a string")
        if not isinstance(dog_dictionary["toy"], str):
            raise ValueError("toy must be a string")
        if len(dog_dictionary["toy"]) > 4:
            raise ValueError("toy can have max four characters")
    except (ValueError, KeyError, TypeError) as e:
        print("Error!")
        print(f"{type(e).__name__}: {e}")
        print(llm_answer)
        