Add support for pydantic 2.0, polars 0.20.10 and remove duckdb support #32

Merged on Feb 27, 2024
Changes from 17 commits
29 commits
a5ddd94
Downgrade Sphinx to compatible version
JakobGM Jun 21, 2023
2287188
very rough draft for pydantic v2
brendancooley Oct 26, 2023
42299db
fix: `DataFrameValidationError` mirrors pydantic v1 `ValidationError`
brendancooley Oct 26, 2023
4b98da9
wip: subclass `FieldInfo`, extend with patito field attributes
brendancooley Oct 26, 2023
eef74fe
fix: update LDF.collect() for polars==0.19.8
brendancooley Oct 26, 2023
87d57bd
wip: restructure model reconstitution routines, all tests passing
brendancooley Oct 26, 2023
ffe343d
chore: lockfile and formatting
brendancooley Oct 27, 2023
e2bf0d7
fix: handle multiple annotations when `dtype` is present in `Field`
brendancooley Nov 3, 2023
017c59b
check: annotated dtypes match those specified in Field.dtype
brendancooley Nov 3, 2023
161300b
chore: misc typing improvements
brendancooley Nov 3, 2023
9e132bc
fix: search for nested field constraints on validation, ignore nulls
brendancooley Nov 3, 2023
93e171e
fix: better inference for column nullability
brendancooley Nov 3, 2023
e9dbca6
wip: better type mismatch messaging, move Representation to internal
brendancooley Nov 6, 2023
399673e
wip: onto classproperty formulation in lieu of metaclass
brendancooley Nov 6, 2023
a63db3f
wip: custom classproperty (from polars)
brendancooley Nov 6, 2023
db44aa3
wip: robustify array dtype inference, add pt custom fields to `Field()`
brendancooley Nov 6, 2023
438974c
feature: custom Field/FieldInfo constructors for better extensibility
brendancooley Nov 8, 2023
386ba83
fix: make _PatitoFieldInfo private, ensure ext fields -> _schema_props
brendancooley Nov 8, 2023
ee63672
wip: serialization
brendancooley Dec 13, 2023
daf8f59
wip: modularize annotation -> dtype and dtype validation
brendancooley Dec 18, 2023
054d034
chore: remove database/duckdb support
brendancooley Feb 7, 2024
155ce1f
chore: more sql cleanup
brendancooley Feb 7, 2024
6221dd7
chore: more migration/refactor
brendancooley Feb 8, 2024
dbad4cf
chore: add tests, some new features
brendancooley Feb 8, 2024
48ed782
bug: instantiated dtypes, constraints on examples, tz-less datetimes
brendancooley Feb 9, 2024
a2459a9
chore: docs cleanup, doctests running, partial linting
brendancooley Feb 14, 2024
7836d16
chore: wip types
brendancooley Feb 14, 2024
7c36b72
bump version and poetry update
thomasaarholt Feb 27, 2024
b5c7002
Use pl.String instead of pl.Utf8
thomasaarholt Feb 27, 2024
2 changes: 2 additions & 0 deletions .gitignore
@@ -132,3 +132,5 @@ dmypy.json

# Pyre type checker
.pyre/

.vscode/
1,359 changes: 763 additions & 596 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -12,8 +12,8 @@ keywords = ["validation", "dataframe"]

[tool.poetry.dependencies]
python = "^3.8"
pydantic = "^1.7.0"
polars = ">=0.18.7"
pydantic = ">=2.0.0"
polars = ">=0.19.0"
# Required for typing.Literal in python3.7
typing-extensions = "*"
pandas = {version = "*", optional = true, python = "^3.8"}
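The bump above is the core of the migration: patito now requires pydantic 2.x and a recent polars. A quick, hypothetical sanity check (not part of this diff) for verifying an environment against the new floors:

```python
# Hypothetical helper, not part of patito: confirm the installed versions satisfy
# the new lower bounds (pydantic >= 2.0.0, polars >= 0.19.0).
from importlib.metadata import version


def _version_tuple(raw: str) -> tuple:
    """Turn a version string like '0.20.10' into (0, 20, 10) for comparison."""
    return tuple(int(part) for part in raw.split(".")[:3] if part.isdigit())


assert _version_tuple(version("pydantic")) >= (2, 0, 0), "patito now requires pydantic >= 2.0.0"
assert _version_tuple(version("polars")) >= (0, 19, 0), "patito now requires polars >= 0.19.0"
```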
4 changes: 2 additions & 2 deletions src/patito/__init__.py
@@ -2,21 +2,21 @@
from polars import Expr, Series, col

from patito import exceptions, sql
-from patito.exceptions import ValidationError
from patito.polars import DataFrame, LazyFrame
from patito.pydantic import Field, Model
+from patito.exceptions import DataFrameValidationError

_CACHING_AVAILABLE = False
_DUCKDB_AVAILABLE = False
field = col("_")
__all__ = [
"DataFrame",
"DataFrameValidationError",
"Expr",
"Field",
"LazyFrame",
"Model",
"Series",
"ValidationError",
"_CACHING_AVAILABLE",
"_DUCKDB_AVAILABLE",
"col",
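With `ValidationError` replaced by `DataFrameValidationError` in the public exports, downstream except-clauses change accordingly. A minimal sketch, assuming the familiar `Model.validate(df)` entry point still raises on invalid data:

```python
import polars as pl
import patito as pt


class Product(pt.Model):
    name: str
    price: int


# The second row is missing a price, so validation should fail.
df = pl.DataFrame({"name": ["apple", "banana"], "price": [4, None]})

try:
    Product.validate(df)
except pt.DataFrameValidationError as exc:  # was pt.ValidationError under pydantic v1
    print(exc)
```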
Empty file.
144 changes: 144 additions & 0 deletions src/patito/_pydantic/repr.py
@@ -0,0 +1,144 @@
import types
import typing
from typing import (
Any,
Tuple,
Callable,
Generator,
Union,
Sequence,
Iterable,
Optional,
get_origin,
get_args,
Literal,
)
import sys

if typing.TYPE_CHECKING:
Loc = Tuple[Union[int, str], ...]
ReprArgs = Sequence[Tuple[Optional[str], Any]]
RichReprResult = Iterable[
Union[Any, Tuple[Any], Tuple[str, Any], Tuple[str, Any, Any]]
]

try:
from typing import _TypingBase # type: ignore[attr-defined]
except ImportError:
from typing import _Final as _TypingBase # type: ignore[attr-defined]

typing_base = _TypingBase

if sys.version_info < (3, 9):
# python < 3.9 does not have GenericAlias (list[int], tuple[str, ...] and so on)
TypingGenericAlias = ()
else:
from typing import GenericAlias as TypingGenericAlias # type: ignore

if sys.version_info < (3, 10):

def origin_is_union(tp: "type[Any] | None") -> bool:  # quoted so the annotation also parses on Python < 3.10
return tp is typing.Union

WithArgsTypes = (TypingGenericAlias,)

else:

def origin_is_union(tp: type[Any] | None) -> bool:
return tp is typing.Union or tp is types.UnionType

WithArgsTypes = typing._GenericAlias, types.GenericAlias, types.UnionType # type: ignore[attr-defined]


class Representation:
"""
Mixin to provide __str__, __repr__, and __pretty__ methods. See #884 for more details.

__pretty__ is used by [devtools](https://python-devtools.helpmanual.io/) to provide human readable representations
of objects.
"""

__slots__: Tuple[str, ...] = tuple()

def __repr_args__(self) -> "ReprArgs":
"""
Returns the attributes to show in __str__, __repr__, and __pretty__; this is generally overridden.

Can either return:
* name - value pairs, e.g.: `[('foo_name', 'foo'), ('bar_name', ['b', 'a', 'r'])]`
* or, just values, e.g.: `[(None, 'foo'), (None, ['b', 'a', 'r'])]`
"""
attrs = ((s, getattr(self, s)) for s in self.__slots__)
return [(a, v) for a, v in attrs if v is not None]

def __repr_name__(self) -> str:
"""
Name of the instance's class, used in __repr__.
"""
return self.__class__.__name__

def __repr_str__(self, join_str: str) -> str:
return join_str.join(
repr(v) if a is None else f"{a}={v!r}" for a, v in self.__repr_args__()
)

def __pretty__(
self, fmt: Callable[[Any], Any], **kwargs: Any
) -> Generator[Any, None, None]:
"""
Used by devtools (https://python-devtools.helpmanual.io/) to provide a human-readable representation of objects.
"""
yield self.__repr_name__() + "("
yield 1
for name, value in self.__repr_args__():
if name is not None:
yield name + "="
yield fmt(value)
yield ","
yield 0
yield -1
yield ")"

def __str__(self) -> str:
return self.__repr_str__(" ")

def __repr__(self) -> str:
return f'{self.__repr_name__()}({self.__repr_str__(", ")})'

def __rich_repr__(self) -> "RichReprResult":
"""Get fields for Rich library"""
for name, field_repr in self.__repr_args__():
if name is None:
yield field_repr
else:
yield name, field_repr


def display_as_type(obj: Any) -> str:
"""Pretty representation of a type, should be as close as possible to the original type definition string.

Takes some logic from `typing._type_repr`.
"""
if isinstance(obj, types.FunctionType):
return obj.__name__
elif obj is ...:
return "..."
elif isinstance(obj, Representation):
return repr(obj)

if not isinstance(obj, (typing_base, WithArgsTypes, type)):
obj = obj.__class__

if origin_is_union(get_origin(obj)):
args = ", ".join(map(display_as_type, get_args(obj)))
return f"Union[{args}]"
elif isinstance(obj, WithArgsTypes):
if get_origin(obj) == Literal:
args = ", ".join(map(repr, get_args(obj)))
else:
args = ", ".join(map(display_as_type, get_args(obj)))
return f"{obj.__qualname__}[{args}]"
elif isinstance(obj, type):
return obj.__qualname__
else:
return repr(obj).replace("typing.", "").replace("typing_extensions.", "")
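`Representation` and `display_as_type` appear to be adapted from pydantic v1's internal repr utilities so that patito no longer reaches into pydantic internals. An illustrative sketch (the `Constraint` class below is invented, not patito API):

```python
from typing import Optional

from patito._pydantic.repr import Representation, display_as_type


class Constraint(Representation):
    """Toy example: the attributes named in __slots__ feed __repr_args__."""

    __slots__ = ("column", "dtype")

    def __init__(self, column: str, dtype: Optional[str] = None) -> None:
        self.column = column
        self.dtype = dtype


print(repr(Constraint("price", "Int64")))  # Constraint(column='price', dtype='Int64')
print(repr(Constraint("price")))           # Constraint(column='price') -- None-valued slots are dropped
print(display_as_type(Optional[int]))      # Union[int, NoneType]
print(display_as_type(dict))               # dict
```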
169 changes: 163 additions & 6 deletions src/patito/exceptions.py
@@ -1,14 +1,171 @@
"""Module containing all custom exceptions raised by patito."""
import json
from typing import (
TYPE_CHECKING,
Any,
Dict,
Generator,
List,
Optional,
Sequence,
Tuple,
Type,
Union,
Callable,
TypedDict,
Iterable,
)

-import pydantic
from patito._pydantic.repr import Representation

if TYPE_CHECKING:
from pydantic import BaseModel

-class ValidationError(pydantic.ValidationError):
-    """Exception raised when dataframe does not match schema."""
Loc = Tuple[Union[int, str], ...]

class _ErrorDictRequired(TypedDict):
loc: Loc
msg: str
type: str

-class ErrorWrapper(pydantic.error_wrappers.ErrorWrapper):
-    """Wrapper for specific column validation error."""
class ErrorDict(_ErrorDictRequired, total=False):
ctx: Dict[str, Any]

from patito._pydantic.repr import ReprArgs


__all__ = "ErrorWrapper", "DataFrameValidationError"


class ErrorWrapper(Representation):
__slots__ = "exc", "_loc"

def __init__(self, exc: Exception, loc: Union[str, "Loc"]) -> None:
self.exc = exc
self._loc = loc

def loc_tuple(self) -> "Loc":
if isinstance(self._loc, tuple):
return self._loc
else:
return (self._loc,)

def __repr_args__(self) -> "ReprArgs":
return [("exc", self.exc), ("loc", self.loc_tuple())]


# ErrorList is something like Union[List[Union[List[ErrorWrapper], ErrorWrapper]], ErrorWrapper]
# but recursive, therefore just use:
ErrorList = Union[Sequence[Any], ErrorWrapper]


class DataFrameValidationError(Representation, ValueError):
__slots__ = "raw_errors", "model", "_error_cache"

def __init__(self, errors: Sequence[ErrorList], model: Type["BaseModel"]) -> None:
self.raw_errors = errors
self.model = model
self._error_cache: Optional[List["ErrorDict"]] = None

def errors(self) -> List["ErrorDict"]:
if self._error_cache is None:
self._error_cache = list(flatten_errors(self.raw_errors))
return self._error_cache

def __str__(self) -> str:
errors = self.errors()
no_errors = len(errors)
return (
f'{no_errors} validation error{"" if no_errors == 1 else "s"} for {self.model.__name__}\n'
f"{display_errors(errors)}"
)

def __repr_args__(self) -> "ReprArgs":
return [("model", self.model.__name__), ("errors", self.errors())]


def display_errors(errors: List["ErrorDict"]) -> str:
return "\n".join(
f'{_display_error_loc(e)}\n {e["msg"]} ({_display_error_type_and_ctx(e)})'
for e in errors
)


def _display_error_loc(error: "ErrorDict") -> str:
return " -> ".join(str(e) for e in error["loc"])


def _display_error_type_and_ctx(error: "ErrorDict") -> str:
t = "type=" + error["type"]
ctx = error.get("ctx")
if ctx:
return t + "".join(f"; {k}={v}" for k, v in ctx.items())
else:
return t


def flatten_errors(
errors: Sequence[Any], loc: Optional["Loc"] = None
) -> Generator["ErrorDict", None, None]:
for error in errors:
if isinstance(error, ErrorWrapper):
if loc:
error_loc = loc + error.loc_tuple()
else:
error_loc = error.loc_tuple()

if isinstance(error.exc, DataFrameValidationError):
yield from flatten_errors(error.exc.raw_errors, error_loc)
else:
yield error_dict(error.exc, error_loc)
elif isinstance(error, list):
yield from flatten_errors(error, loc=loc)
else:
raise RuntimeError(f"Unknown error object: {error}")


def error_dict(exc: Exception, loc: "Loc") -> "ErrorDict":
type_ = get_exc_type(exc.__class__)
msg_template = getattr(exc, "msg_template", None)
ctx = exc.__dict__
if msg_template:
msg = msg_template.format(**ctx)
else:
msg = str(exc)

d: "ErrorDict" = {"loc": loc, "msg": msg, "type": type_}

if ctx:
d["ctx"] = ctx

return d


_EXC_TYPE_CACHE: Dict[Type[Exception], str] = {}


def get_exc_type(cls: Type[Exception]) -> str:
# slightly more efficient than using lru_cache since we don't need to worry about the cache filling up
try:
return _EXC_TYPE_CACHE[cls]
except KeyError:
r = _get_exc_type(cls)
_EXC_TYPE_CACHE[cls] = r
return r


def _get_exc_type(cls: Type[Exception]) -> str:
if issubclass(cls, AssertionError):
return "assertion_error"

base_name = "type_error" if issubclass(cls, TypeError) else "value_error"
if cls in (TypeError, ValueError):
# just TypeError or ValueError, no extra code
return base_name

# if it's not a TypeError or ValueError, we just take the lowercase of the exception name
# no chaining or snake case logic, use "code" for more complex error types.
code = getattr(cls, "code", None) or cls.__name__.replace("Error", "").lower()
return base_name + "." + code


class WrongColumnsError(TypeError):
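`DataFrameValidationError` mirrors pydantic v1's error aggregation on top of `ErrorWrapper`, `flatten_errors`, and `display_errors`. A rough sketch of how the pieces compose (the model and error messages are invented; patito builds these itself during validation):

```python
import patito as pt
from patito.exceptions import DataFrameValidationError, ErrorWrapper


class Product(pt.Model):
    price: int


errors = [
    ErrorWrapper(TypeError("expected Int64, got String"), loc="price"),
    ErrorWrapper(ValueError("2 missing values"), loc="price"),
]

exc = DataFrameValidationError(errors=errors, model=Product)
print(exc)
# Prints something like:
# 2 validation errors for Product
# price
#   expected Int64, got String (type=type_error)
# price
#   2 missing values (type=value_error)

print(exc.errors()[0]["loc"])  # ('price',)
```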