In [1]:
from pydantic import BaseModel, Field, HttpUrl, field_validator
from typing import Optional, Literal
from uuid import UUID

# Controlled vocabularies: the only non-empty values accepted for the
# leaf_type and fruit_type fields of a species record.
LEAF_TYPES = set(
    [
        "Simple",
        "Pinnately compound (single)",
        "Pinnately compound (double)",
        "Pinnately compound (triple)",
        "Palmately compound",
    ]
)

FRUIT_TYPES = set(["Drupe", "Capsule", "Follicle", "Pod"])


class SpeciesRecord(BaseModel):
    """Validation schema for a single species record.

    Required fields are ``id`` (a UUID), ``language`` (``"en"`` or ``"tet"``)
    and ``scientific_name`` (2 to 255 characters). Every other field is
    optional and defaults to ``None``.

    Empty strings in the optional text fields, in ``video`` and in the two
    controlled fields are normalised to ``None``, so ``""`` and an absent key
    end up as the same stored value.

    Raises:
        pydantic.ValidationError: on construction, when any field fails
            validation, including a non-empty ``leaf_type`` / ``fruit_type``
            that is not a member of ``LEAF_TYPES`` / ``FRUIT_TYPES``.
    """

    # Required fields
    id: UUID
    language: Literal["en", "tet"]
    scientific_name: str = Field(min_length=2, max_length=255)

    # Optional text fields
    etymology: Optional[str] = None
    common_name: Optional[str] = None
    habitat: Optional[str] = None
    phenology: Optional[str] = None
    identification_characters: Optional[str] = None
    seed_germination: Optional[str] = None
    pest: Optional[str] = None

    # Controlled fields: checked against LEAF_TYPES / FRUIT_TYPES below
    leaf_type: Optional[str] = None
    fruit_type: Optional[str] = None

    # Media
    # NOTE(review): an empty string for `image` is not normalised here and is
    # presumably rejected by HttpUrl - confirm whether "" should mean "missing".
    image: Optional[HttpUrl] = None
    video: Optional[str] = None

    # ---------- Validators ----------

    @field_validator("leaf_type")
    @classmethod
    def validate_leaf_type(cls, v: Optional[str]) -> Optional[str]:
        """Return ``None`` for a missing/empty value, else require a LEAF_TYPES member."""
        if v is None or v == "":
            return None
        if v not in LEAF_TYPES:
            # sorted() keeps the message stable; set iteration order is not.
            raise ValueError(f"leaf_type must be one of {sorted(LEAF_TYPES)}")
        return v

    @field_validator("fruit_type")
    @classmethod
    def validate_fruit_type(cls, v: Optional[str]) -> Optional[str]:
        """Return ``None`` for a missing/empty value, else require a FRUIT_TYPES member."""
        if v is None or v == "":
            return None
        if v not in FRUIT_TYPES:
            # sorted() keeps the message stable; set iteration order is not.
            raise ValueError(f"fruit_type must be one of {sorted(FRUIT_TYPES)}")
        return v

    @field_validator(
        "etymology",
        "common_name",
        "habitat",
        "phenology",
        "identification_characters",
        "seed_germination",
        "pest",
        "video",
        mode="before",
    )
    @classmethod
    def empty_string_to_none(cls, v):
        """Map ``""`` to ``None`` for every optional string field.

        Runs in "before" mode, so it sees the raw input value. Any value other
        than the empty string is passed through unchanged for normal type
        validation.
        """
        if v == "":
            return None
        return v


In [5]:
from pydantic import ValidationError

# Smoke test: run one hand-written record through SpeciesRecord and print
# either the parsed model or the validation errors.
sample_record = dict(
    id="2b2d3d5e-3b8c-4f46-9a5f-4e51dfc10b0a",
    language="en",
    scientific_name="Synthetica plantensis 1",
    leaf_type="Simple",
    fruit_type="Capsule",
    image="https://example.com/species/images/1.jpg",
)

try:
    record = SpeciesRecord(**sample_record)
except ValidationError as exc:
    print("INVALID RECORD ")
    print(exc)
else:
    print("VALID RECORD ")
    print(record)


VALID RECORD ✅
id=UUID('2b2d3d5e-3b8c-4f46-9a5f-4e51dfc10b0a') language='en' scientific_name='Synthetica plantensis 1' etymology=None common_name=None habitat=None phenology=None identification_characters=None seed_germination=None pest=None leaf_type='Simple' fruit_type='Capsule' image=Url('https://example.com/species/images/1.jpg') video=None


In [19]:
import json
import pandas as pd
from pydantic import ValidationError

# Load both JSON exports. The encoding is pinned to UTF-8 so the result does
# not depend on the platform's locale default when the text is non-ASCII.
with open("synthetic_species_en.json", "r", encoding="utf-8") as f:
    records_en = json.load(f)

with open("synthetic_species_tet.json", "r", encoding="utf-8") as f:
    records_tet = json.load(f)

# Combine both datasets. Record_Number in the report is the 1-based position
# in this combined list (the "en" file first, then the "tet" file).
records = records_en + records_tet

# Validate every record, collecting one report row per individual error.
REPORT_COLUMNS = ["Record_Number", "Language", "Field", "Error_Message"]
valid_count = 0
error_rows = []

for idx, record in enumerate(records):
    try:
        SpeciesRecord(**record)
        valid_count += 1
    except ValidationError as e:
        for err in e.errors():
            error_rows.append({
                "Record_Number": idx + 1,
                "Language": record.get("language"),
                # Join the full location path instead of taking loc[0]: a
                # single-part loc yields the same text, nested parts are kept,
                # and an empty loc becomes "" rather than raising IndexError.
                "Field": ".".join(str(part) for part in err["loc"]),
                "Error_Message": err["msg"],
            })

# Summary
print("Total records:", len(records))
print("Valid records:", valid_count)
# A record can yield several error rows, so count distinct record numbers.
print("Invalid records:", len({row["Record_Number"] for row in error_rows}))

# Error report. Explicit columns keep the schema stable even when there are
# no errors (an empty list would otherwise give a frame with no columns).
error_report = pd.DataFrame(error_rows, columns=REPORT_COLUMNS)
error_report


Total records: 40
Valid records: 38
Invalid records: 2


Unnamed: 0,Record_Number,Language,Field,Error_Message
0,1,en,leaf_type,"Value error, leaf_type must be one of ['Palmat..."
1,21,tet,leaf_type,"Value error, leaf_type must be one of ['Palmat..."
