In [1]:
from pydantic import BaseModel, Field, field_validator
from typing import Union, List, Optional, Dict
import datetime
import uuid
import tiktoken
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")


# Base class for all bd_types
class BDType(BaseModel):
    source: str = Field("babydragon", description="The source of the data.")
    timestamp: Optional[datetime.datetime] = Field(None, description="When the data was collected or created. If not provided, the current time is used.")
    id: uuid.UUID = Field(default_factory=uuid.uuid4, description="Unique identifier of the data.")
    data_name: Optional[str] = Field(None, description="Name of the data.")
    elements_name: Optional[List[str]] = Field(None, description="Names of the elements if the data is a list.")

    @field_validator("timestamp")
    def set_timestamp(cls, v):
        return v or datetime.datetime.now()

    @field_validator("id")
    def set_id(cls, values, **kwargs):
        if "id" not in values:
            values["id"] = uuid.uuid4()
        return values


class NaturalLanguageSingle(BDType):
    text: str = Field(..., description="The natural language text. It should be less than or equal to `max_tokens` in length when tokenized.")
    max_tokens: int = Field(8000, description="The maximum allowed length of the text in tokens. The default value is 8000.")
    
    @field_validator("text")
    def validate_text(cls, v, info):
        try:
            # Tokenize the text and get the token count
            token_count = len(tokenizer.encode(v))
        except Exception as e:
            raise ValueError("Failed to tokenize text.") from e

        # Get max_tokens from info.data, if not available, default to 8000
        max_tokens = info.data.get("max_tokens", 8000)

        if token_count > max_tokens:
            raise ValueError(f"Text is longer than {max_tokens} tokens.")

        return v


class NaturalLanguageList(BDType):
    texts: List[NaturalLanguageSingle] = Field(..., description="A list of `NaturalLanguageSingle` objects. Each object should pass the validation requirements of the `NaturalLanguageSingle` class.")



In [2]:
from pydantic import BaseModel, Field, FieldValidationInfo, field_validator
from typing import List, Optional, Set

class DiscreteDataInt(BDType):
    alphabet: Optional[Set[int]] = Field(None, description="Set of allowed discrete variables. All elements should be integers.")
    value: int = Field(..., description="The discrete data value. It should be an integer.")

    @field_validator('alphabet')
    def check_alphabet(cls, v):
        if not all(isinstance(item, int) for item in v):
            raise ValueError("All elements in 'alphabet' should be integers.")
        return v

    @field_validator('value')
    def check_value(cls, v, info: FieldValidationInfo):
        alphabet = info.data.get('alphabet')
        if alphabet is not None and v not in alphabet:
            raise ValueError("Value must be in the alphabet.")
        return v


class DiscreteDataStr(BDType):
    alphabet: Optional[Set[str]] = Field(None, description="Set of allowed discrete variables. All elements should be strings.")
    value: str = Field(..., description="The discrete data value. It should be a string.")

    @field_validator('alphabet')
    def check_alphabet(cls, v):
        if not all(isinstance(item, str) for item in v):
            raise ValueError("All elements in 'alphabet' should be strings.")
        return v

    @field_validator('value')
    def check_value(cls, v, info: FieldValidationInfo):
        alphabet = info.data.get('alphabet')
        if alphabet is not None and v not in alphabet:
            raise ValueError("Value must be in the alphabet.")
        return v


In [3]:
alphabet = ["a", "b", "c"]
data = DiscreteDataStr(value="c", alphabet=alphabet)
#now with try except for out of alphabet
try:
    data = DiscreteDataStr(value="d", alphabet=alphabet)
except Exception as e:
    print(e)
int_alphabet = [1, 2, 3]
int_data = DiscreteDataInt(value=2, alphabet=int_alphabet)
#now with try except for out of alphabet
try:
    int_data = DiscreteDataInt(value=4, alphabet=int_alphabet)
except Exception as e:
    print(e)

1 validation error for DiscreteDataStr
value
  Value error, Value must be in the alphabet. [type=value_error, input_value='d', input_type=str]
    For further information visit https://errors.pydantic.dev/2.1.2/v/value_error
1 validation error for DiscreteDataInt
value
  Value error, Value must be in the alphabet. [type=value_error, input_value=4, input_type=int]
    For further information visit https://errors.pydantic.dev/2.1.2/v/value_error


In [4]:
from pydantic import BaseModel, Field, FieldValidationInfo, field_validator
from typing import List, Optional, Union, Set

# The DiscreteDataInt and DiscreteDataStr models are defined as before

class DiscreteDataList(BaseModel):
    alphabet: Optional[Set[Union[int, str]]] = Field(None, description="Set of allowed discrete variables. All elements should be of the same type (either integers or strings).")
    value: List[Union[DiscreteDataInt, DiscreteDataStr]] = Field(..., description="The list of discrete data values. All elements should be either DiscreteDataInt or DiscreteDataStr, not a mix.")

    @field_validator('value')
    def check_alphabets(cls, value, info: FieldValidationInfo):
        list_alphabet = info.data.get('alphabet')
        if list_alphabet is not None:
            for item in value:
                item_alphabet = item.alphabet
                if item_alphabet is not None and not set(item_alphabet).issubset(list_alphabet):
                    raise ValueError(f"Item alphabet {item_alphabet} is not a subset of the list alphabet {list_alphabet}.")
        return value


In [5]:
# Usage:
alphabet = ["a", "b", "c"]
alphabet_2 = ["d"]
data_items = [
        DiscreteDataStr(value="a", alphabet=alphabet),
        DiscreteDataStr(value="b", alphabet=alphabet),
    ]
data_list = DiscreteDataList(value=data_items, alphabet=alphabet)
data_items_wrong = [
        DiscreteDataStr(value="a", alphabet=alphabet),
        DiscreteDataStr(value="b", alphabet=alphabet),
        DiscreteDataStr(value="d", alphabet=alphabet_2),
    ]
try:
    
    data_list = DiscreteDataList(value=data_items_wrong, alphabet=alphabet)
except Exception as e:
    print(e)

1 validation error for DiscreteDataList
value
  Value error, Item alphabet {'d'} is not a subset of the list alphabet {'a', 'c', 'b'}. [type=value_error, input_value=[DiscreteDataStr(source='...habet={'d'}, value='d')], input_type=list]
    For further information visit https://errors.pydantic.dev/2.1.2/v/value_error


In [6]:
class MultiDimensionalDiscrete(BDType):
    value: List[Union[DiscreteDataInt, DiscreteDataStr]] = Field(..., description="The multidimensional discrete data value. It should be a list of either DiscreteDataInt or DiscreteDataStr.")
    type_dictionary: Dict[int, str] = Field(default_factory=dict, description="The helper dictionary containing the type of each dimension of the value list.")
    def __init__(self, **data):
        super().__init__(**data)
        self.type_dictionary = {i: item.__class__.__name__ for i, item in enumerate(self.value)}
        
    @field_validator('value')
    def check_value(cls, value):
        if len(value) < 2:
            raise ValueError("For multidimensional discrete data, size of the list should be at least 2. For less than 2, use DiscreteDataInt or DiscreteDataStr.")
        return value

    

In [7]:
# Create some examples
discrete_int_1 = DiscreteDataInt(value=1)
discrete_int_2 = DiscreteDataInt(value=2)
discrete_str_1 = DiscreteDataStr(value="a")

# This should work (two DiscreteDataInt)
multi_dim_discrete_1 = MultiDimensionalDiscrete(value=[discrete_int_1, discrete_int_2])
print(multi_dim_discrete_1)

# This should work too (one DiscreteDataInt and one DiscreteDataStr)
multi_dim_discrete_2 = MultiDimensionalDiscrete(value=[discrete_int_1, discrete_str_1])
print(multi_dim_discrete_2)

# This should raise a ValueError (only one DiscreteDataInt)
try:
    multi_dim_discrete_3 = MultiDimensionalDiscrete(value=[discrete_int_1])
except ValueError as e:
    print(f"Caught an expected error: {e}")


source='babydragon' timestamp=None id=UUID('9b78b111-bee7-407a-8a95-c9ae9ff6d98d') data_name=None elements_name=None value=[DiscreteDataInt(source='babydragon', timestamp=None, id=UUID('d07daa29-e2f0-4f5d-95a3-fc1428190e50'), data_name=None, elements_name=None, alphabet=None, value=1), DiscreteDataInt(source='babydragon', timestamp=None, id=UUID('bef822ba-f658-4bf9-bdd6-eaff82f3887d'), data_name=None, elements_name=None, alphabet=None, value=2)] type_dictionary={0: 'DiscreteDataInt', 1: 'DiscreteDataInt'}
source='babydragon' timestamp=None id=UUID('f223e2ba-c45e-4708-8e22-fc8f0e6ad46b') data_name=None elements_name=None value=[DiscreteDataInt(source='babydragon', timestamp=None, id=UUID('d07daa29-e2f0-4f5d-95a3-fc1428190e50'), data_name=None, elements_name=None, alphabet=None, value=1), DiscreteDataStr(source='babydragon', timestamp=None, id=UUID('27612c45-69a5-429a-9e78-6fd0a6e8a809'), data_name=None, elements_name=None, alphabet=None, value='a')] type_dictionary={0: 'DiscreteDataInt'

In [8]:
from pydantic import BaseModel, Field, field_validator
from typing import List, Optional, Union, Tuple, Any, Set

class MultiDimensionalDiscreteList(BDType):
    values: List[MultiDimensionalDiscrete] = Field(..., description="The list of multidimensional discrete data values. All elements should be instances of MultiDimensionalDiscrete.")
    joint_alphabet: Optional[Set[Tuple[Any, ...]]] = Field(None, description="Set of tuples representing allowed discrete variable combinations. All elements should be tuples of the same length as the number of dimensions in each joint discrete variable.")

    @field_validator('values')
    def check_type_dictionaries(cls, values):
        first_type_dictionary = values[0].type_dictionary
        for value in values[1:]:
            if value.type_dictionary != first_type_dictionary:
                raise ValueError("All elements in 'values' should have the same 'type_dictionary'.")
        return values

    @field_validator('joint_alphabet')
    def check_joint_alphabet(cls, v, info):
        if v is not None and "values" in info.data:
            expected_tuple_length = len(info.data["values"][0].value)
            for item in v:
                if not isinstance(item, tuple) or len(item) != expected_tuple_length:
                    raise ValueError(f"Each element in 'joint_alphabet' should be a tuple of length {expected_tuple_length}.")
                for dim_value, dim_alphabet in zip(item, [value.alphabet for value in info.data["values"][0].value]):
                    if dim_alphabet is not None and dim_value not in dim_alphabet:
                        raise ValueError(f"Value {dim_value} is not in the alphabet for its dimension.")
        return v


In [9]:
discrete_int1 = DiscreteDataInt(alphabet=[0, 1, 2, 3, 4], value=2)
discrete_int2 = DiscreteDataInt(alphabet=[0, 1, 2, 3, 4], value=3)
discrete_str1 = DiscreteDataStr(alphabet=['a', 'b', 'c', 'd'], value='b')


In [10]:
multi_dim_discrete1 = MultiDimensionalDiscrete(value=[discrete_int1, discrete_str1])
print(multi_dim_discrete1.model_dump_json())

multi_dim_discrete2 = MultiDimensionalDiscrete(value=[discrete_int2, discrete_str1])
multi_dim_list = MultiDimensionalDiscreteList(values=[multi_dim_discrete1, multi_dim_discrete2], joint_alphabet=[(2, 'b'), (3, 'b')])
print(multi_dim_list.model_dump_json())


{"source":"babydragon","timestamp":null,"id":"46113d80-b664-43c7-8784-4eac154de569","data_name":null,"elements_name":null,"value":[{"source":"babydragon","timestamp":null,"id":"9d4840a5-28b8-4a2b-9a65-1186225e44df","data_name":null,"elements_name":null,"alphabet":[0,1,2,3,4],"value":2},{"source":"babydragon","timestamp":null,"id":"a979058a-2d2b-4710-9de7-aee584d38ce1","data_name":null,"elements_name":null,"alphabet":["d","a","c","b"],"value":"b"}],"type_dictionary":{"0":"DiscreteDataInt","1":"DiscreteDataStr"}}
{"source":"babydragon","timestamp":null,"id":"4bf17151-445c-4c35-b7df-183136cacd9a","data_name":null,"elements_name":null,"values":[{"source":"babydragon","timestamp":null,"id":"46113d80-b664-43c7-8784-4eac154de569","data_name":null,"elements_name":null,"value":[{"source":"babydragon","timestamp":null,"id":"9d4840a5-28b8-4a2b-9a65-1186225e44df","data_name":null,"elements_name":null,"alphabet":[0,1,2,3,4],"value":2},{"source":"babydragon","timestamp":null,"id":"a979058a-2d2b-4710

In [11]:
try:
    multi_dim_list_error = MultiDimensionalDiscreteList(values=[multi_dim_discrete1, multi_dim_discrete2], joint_alphabet=[(5, 'b'), (3, 'b')])
except ValueError as e:
    print(e)


1 validation error for MultiDimensionalDiscreteList
joint_alphabet
  Value error, Value 5 is not in the alphabet for its dimension. [type=value_error, input_value=[(5, 'b'), (3, 'b')], input_type=list]
    For further information visit https://errors.pydantic.dev/2.1.2/v/value_error


In [12]:
try:
    discrete_int3 = DiscreteDataInt(alphabet=[0, 1, 2, 3, 4], value=4)
    multi_dim_discrete_error = MultiDimensionalDiscrete(value=[discrete_int2, discrete_int3])
    print(multi_dim_discrete_error.type_dictionary)
    multi_dim_list_error = MultiDimensionalDiscreteList(values=[multi_dim_discrete1, multi_dim_discrete_error])
except ValueError as e:
    print(e)


{0: 'DiscreteDataInt', 1: 'DiscreteDataInt'}
1 validation error for MultiDimensionalDiscreteList
values
  Value error, All elements in 'values' should have the same 'type_dictionary'. [type=value_error, input_value=[MultiDimensionalDiscrete... 1: 'DiscreteDataInt'})], input_type=list]
    For further information visit https://errors.pydantic.dev/2.1.2/v/value_error


In [30]:
from pydantic import BaseModel, Field, FieldValidationInfo, field_validator
from typing import Union, Optional, Tuple

class RealData(BDType):
    range: Optional[Tuple[Union[float, int], Union[float, int]]] = Field(None, description="An optional inclusive range (min, max) for the value.")
    value: Union[float, int] = Field(..., description="The real value data. It should be a float or an integer.")

    @field_validator("value")
    def validate_value(cls, v, values):
        value_range = values.data["range"]
        if value_range is not None:
            min_value, max_value = value_range
            if not min_value <= v <= max_value:
                raise ValueError(f"Value {v} is not within the specified range {value_range}.")
        return v


In [31]:
realdata = RealData(value=3.5, range=(0, 5))
try:
    realdata = RealData(value=3.5, range=(0, 3))
except ValueError as e:
    print(e)

1 validation error for RealData
value
  Value error, Value 3.5 is not within the specified range (0, 3). [type=value_error, input_value=3.5, input_type=float]
    For further information visit https://errors.pydantic.dev/2.1.2/v/value_error


In [34]:
from pydantic import BaseModel, Field, FieldValidationInfo, field_validator
from typing import List, Union, Optional, Tuple

class RealDataList(BDType):
    range: Optional[Tuple[Union[float, int], Union[float, int]]] = Field(None, description="An optional inclusive range (min, max) for the values.")
    values: List[RealData] = Field(..., description="The list of real value data. Each should be a RealeData object.")

    @field_validator("values")
    def validate_values(cls, values, values_dict):
        list_range = values_dict.data.get("range")
        if list_range is not None:
            min_value, max_value = list_range
            for value in values:
                if not min_value <= value.value <= max_value:
                    raise ValueError(f"Value {value.value} of RealData object is not within the specified range {list_range}.")
        return values


In [35]:
raw_data= list(range(10))
list_realdata = [RealData(value=value, range=(0, 5)) for value in raw_data[0:5]]
range_for_list = (0, 5)
realdata_list = RealDataList(values=list_realdata, range=range_for_list)



In [39]:
out_of_range_list = [RealData(value=value, range=(5, 10)) for value in raw_data[5:10]]
try:
    realdata_list = RealDataList(values=out_of_range_list, range=range_for_list)
except ValueError as e:
    print(e)

1 validation error for RealDataList
values
  Value error, Value 6 of RealData object is not within the specified range (0, 5). [type=value_error, input_value=[RealData(source='babydra...range=(5, 10), value=9)], input_type=list]
    For further information visit https://errors.pydantic.dev/2.1.2/v/value_error


In [54]:
from pydantic import BaseModel, Field, FieldValidationInfo, field_validator
from typing import List, Union, Optional, Tuple

class MultiDimensionalReal(BDType):
    range: Optional[Union[Tuple[Union[float, int], Union[float, int]], List[Tuple[Union[float, int], Union[float, int]]]]]
    values: List[RealData]

    @field_validator("values")
    def validate_values(cls, values, values_dict):
        range_values = values_dict.data.get("range")
        if range_values is not None:
            # If range is a tuple, apply it to all dimensions
            if isinstance(range_values, tuple):
                min_value, max_value = range_values
                for value in values:
                    if not min_value <= value.value <= max_value:
                        raise ValueError(f"Value {value.value} of RealData object is not within the specified range {range_values}.")
            # If range is a list, it must have the same length as values
            elif isinstance(range_values, list):
                if len(values) != len(range_values):
                    raise ValueError("If range is a list, it must have the same length as values.")
                for value, (min_value, max_value) in zip(values, range_values):
                    if not min_value <= value.value <= max_value:
                        raise ValueError(f"Value {value.value} of RealData object is not within the specified range ({min_value}, {max_value}).")
        return values


In [55]:
data = MultiDimensionalReal(
        range=[(0.0, 10.0), (20.0, 30.0)], 
        values=[RealData(value=5.0), RealData(value=25.0)]
    )

In [57]:
data = MultiDimensionalReal(
        range=(0.0, 30.0), 
        values=[RealData(value=5.0), RealData(value=25.0)]
    )

In [58]:
try:
    data = MultiDimensionalReal(
        range=[(0.0, 10.0), (20.0, 30.0)], 
        values=[RealData(value=5.0), RealData(value=15.0)]
    )
except ValueError as e:
    print(f"Validation error: {e}")

try:
    data = MultiDimensionalReal(
        range=(0.0, 10.0), 
        values=[RealData(value=5.0), RealData(value=15.0)]
    )
except ValueError as e:
    print(f"Validation error: {e}")

Validation error: 1 validation error for MultiDimensionalReal
values
  Value error, Value 15.0 of RealData object is not within the specified range (20.0, 30.0). [type=value_error, input_value=[RealData(source='babydra...range=None, value=15.0)], input_type=list]
    For further information visit https://errors.pydantic.dev/2.1.2/v/value_error
Validation error: 1 validation error for MultiDimensionalReal
values
  Value error, Value 15.0 of RealData object is not within the specified range (0.0, 10.0). [type=value_error, input_value=[RealData(source='babydra...range=None, value=15.0)], input_type=list]
    For further information visit https://errors.pydantic.dev/2.1.2/v/value_error


In [59]:
from pydantic import BaseModel, Field, FieldValidationInfo, field_validator
from typing import List, Union, Optional, Tuple

class MultiDimensionalRealList(BDType):
    range: Optional[Union[Tuple[Union[float, int], Union[float, int]], List[Tuple[Union[float, int], Union[float, int]]]]]
    values: List[MultiDimensionalReal]

    @field_validator("values")
    def validate_values(cls, values, values_dict):
        range_values = values_dict.data.get("range")
        dimension_length = len(values[0].values) if values else 0
        if range_values is not None:
            # If range is a tuple, apply it to all dimensions
            if isinstance(range_values, tuple):
                min_value, max_value = range_values
                for multi_real in values:
                    if len(multi_real.values) != dimension_length:
                        raise ValueError("All MultiDimensionalReal in the list must have the same length.")
                    for value in multi_real.values:
                        if not min_value <= value.value <= max_value:
                            raise ValueError(f"Value {value.value} of RealData object is not within the specified range {range_values}.")
            # If range is a list, it must have the same length as values in each dimension
            elif isinstance(range_values, list):
                if len(range_values) != dimension_length:
                    raise ValueError("If range is a list, it must have the same length as values in each dimension.")
                for multi_real in values:
                    if len(multi_real.values) != dimension_length:
                        raise ValueError("All MultiDimensionalReal in the list must have the same length.")
                    for value, (min_value, max_value) in zip(multi_real.values, range_values):
                        if not min_value <= value.value <= max_value:
                            raise ValueError(f"Value {value.value} of RealData object is not within the specified range ({min_value}, {max_value}).")
        return values


In [60]:
from pydantic import ValidationError

data1 = MultiDimensionalReal(
    range=(0.0, 10.0),
    values=[RealData(value=2.0), RealData(value=8.0)]
)

data2 = MultiDimensionalReal(
    range=(0.0, 10.0),
    values=[RealData(value=3.0), RealData(value=7.0)]
)

list_data = MultiDimensionalRealList(
    range=(0.0, 10.0),
    values=[data1, data2]
)

print("Successfully created MultiDimensionalRealList:", list_data)

# Error during creation of MultiDimensionalRealList due to value out of range
try:
    data1 = MultiDimensionalReal(
        range=(0.0, 10.0),
        values=[RealData(value=2.0), RealData(value=8.0)]
    )

    data2 = MultiDimensionalReal(
        range=(0.0, 10.0),
        values=[RealData(value=3.0), RealData(value=15.0)]  # This value is out of range
    )

    list_data = MultiDimensionalRealList(
        range=(0.0, 10.0),
        values=[data1, data2]
    )

    print("Successfully created MultiDimensionalRealList:", list_data)
except ValidationError as e:
    print(str(e))

# Error during creation of MultiDimensionalRealList due to inconsistent dimensions
try:
    data1 = MultiDimensionalReal(
        range=(0.0, 10.0),
        values=[RealData(value=2.0), RealData(value=8.0), RealData(value=5.0)]  # This data has three dimensions
    )

    data2 = MultiDimensionalReal(
        range=(0.0, 10.0),
        values=[RealData(value=3.0), RealData(value=7.0)]  # This data has two dimensions
    )

    list_data = MultiDimensionalRealList(
        range=(0.0, 10.0),
        values=[data1, data2]
    )

    print("Successfully created MultiDimensionalRealList:", list_data)
except ValidationError as e:
    print(str(e))


Successfully created MultiDimensionalRealList: source='babydragon' timestamp=None id=UUID('bab75d15-298b-4889-aa79-a2ffc6f863cc') data_name=None elements_name=None range=(0.0, 10.0) values=[MultiDimensionalReal(source='babydragon', timestamp=None, id=UUID('26f45aa0-4b4d-478f-bf97-cc84535a3f54'), data_name=None, elements_name=None, range=(0.0, 10.0), values=[RealData(source='babydragon', timestamp=None, id=UUID('c31a74c1-14f3-45d5-b94c-19bfd5011b87'), data_name=None, elements_name=None, range=None, value=2.0), RealData(source='babydragon', timestamp=None, id=UUID('771a73fd-7ca3-40de-93ef-6c2b9df615e9'), data_name=None, elements_name=None, range=None, value=8.0)]), MultiDimensionalReal(source='babydragon', timestamp=None, id=UUID('203054cb-39e1-440a-9d8a-c46516f7299b'), data_name=None, elements_name=None, range=(0.0, 10.0), values=[RealData(source='babydragon', timestamp=None, id=UUID('38adc6cc-91ef-40a9-acdc-eb5f7a649bd2'), data_name=None, elements_name=None, range=None, value=3.0), Rea