Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for pydantic 2.0, polars 0.20.10 and remove duckdb support #32

Merged
merged 29 commits into from
Feb 27, 2024
Merged
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
a5ddd94
Downgrade Sphinx to compatible version
JakobGM Jun 21, 2023
2287188
very rough draft for pydantic v2
brendancooley Oct 26, 2023
42299db
fix: `DataFrameValidationError` mirrors pydantic v1 `ValidationError`
brendancooley Oct 26, 2023
4b98da9
wip: subclass `FieldInfo`, extend with patito field attributes
brendancooley Oct 26, 2023
eef74fe
fix: update LDF.collect() for polars==0.19.8
brendancooley Oct 26, 2023
87d57bd
wip: restructure model reconstitution routines, all tests passing
brendancooley Oct 26, 2023
ffe343d
chore: lockfile and formatting
brendancooley Oct 27, 2023
e2bf0d7
fix: handle multiple annotations when `dtype` is present in `Field`
brendancooley Nov 3, 2023
017c59b
check: annotated dtypes match those specified in Field.dtype
brendancooley Nov 3, 2023
161300b
chore: misc typing improvements
brendancooley Nov 3, 2023
9e132bc
fix: search for nested field constraints on validation, ignore nulls
brendancooley Nov 3, 2023
93e171e
fix: better inference for column nullability
brendancooley Nov 3, 2023
e9dbca6
wip: better type mismatch messaging, move Representation to internal
brendancooley Nov 6, 2023
399673e
wip: onto classproperty formulation in lieu of metaclass
brendancooley Nov 6, 2023
a63db3f
wip: custom classproperty (from polars)
brendancooley Nov 6, 2023
db44aa3
wip: robustify array dtype inference, add pt custom fields to `Field()`
brendancooley Nov 6, 2023
438974c
feature: custom Field/FieldInfo constructors for better extensibility
brendancooley Nov 8, 2023
386ba83
fix: make _PatitoFieldInfo private, ensure ext fields -> _schema_props
brendancooley Nov 8, 2023
ee63672
wip: serialization
brendancooley Dec 13, 2023
daf8f59
wip: modularize annotation -> dtype and dtype validation
brendancooley Dec 18, 2023
054d034
chore: remove database/duckdb support
brendancooley Feb 7, 2024
155ce1f
chore: more sql cleanup
brendancooley Feb 7, 2024
6221dd7
chore: more migration/refactor
brendancooley Feb 8, 2024
dbad4cf
chore: add tests, some new features
brendancooley Feb 8, 2024
48ed782
bug: instantiated dtypes, constraints on examples, tz-less datetimes
brendancooley Feb 9, 2024
a2459a9
chore: docs cleanup, doctests running, partial linting
brendancooley Feb 14, 2024
7836d16
chore: wip types
brendancooley Feb 14, 2024
7c36b72
bump version and poetry update
thomasaarholt Feb 27, 2024
b5c7002
Use pl.String instead of pl.Utf8
thomasaarholt Feb 27, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -132,3 +132,6 @@ dmypy.json

# Pyre type checker
.pyre/

.vscode/
.DS_Store
1,705 changes: 909 additions & 796 deletions poetry.lock

Large diffs are not rendered by default.

17 changes: 11 additions & 6 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,11 @@ keywords = ["validation", "dataframe"]

[tool.poetry.dependencies]
python = "^3.8"
pydantic = "^1.7.0"
polars = ">=0.18.7"
pydantic = ">=2.0.0"
polars = ">=0.20.1"
# Required for typing.Literal in python3.7
typing-extensions = "*"
pandas = {version = "*", optional = true, python = "^3.8"}
duckdb = {version = ">=0.6.0", optional = true, python = "^3.8"}
pyarrow = {version = ">=5.0.0", optional = true, python = "^3.8"}
# Optional docs dependencies
Sphinx = {version = "<7", optional = true}
Expand All @@ -30,7 +29,6 @@ sphinxcontrib-mermaid = {version = "*", optional = true}
[tool.poetry.extras]
# The pyarrow.parquet module is required for writing parquet caches to disk
caching = ["pyarrow"]
duckdb = ["duckdb", "pyarrow"]
pandas = ["pandas"]
docs = [
"Sphinx",
Expand All @@ -42,15 +40,14 @@ docs = [
]

[tool.poetry.group.dev.dependencies]
black = ">=22.3.0"
ruff = ">=0.2.1"
coverage = {version = "*", extras = ["toml"]}
flake8 = "3.9.2"
flake8-annotations = "*"
flake8-bandit = "*"
flake8-black = "*"
flake8-bugbear = "*"
flake8-isort = "*"
isort = "*"
pyright = ">=1.1.239"
pytest = ">=7.1.2"
pytest-cov = ">=3.0.0"
Expand Down Expand Up @@ -134,3 +131,11 @@ disable_error_code = [
[[tool.mypy.overrides]]
module = ["tests.test_validators"]
warn_unused_ignores = false

[tool.ruff.lint]
select = ["E4", "E7", "E9", "F", "I"]
ignore = []

# Allow fix for all enabled rules (when `--fix`) is provided.
fixable = ["ALL"]
unfixable = []
25 changes: 3 additions & 22 deletions src/patito/__init__.py
Original file line number Diff line number Diff line change
@@ -1,47 +1,28 @@
"""Patito, a data-modelling library built on top of polars and pydantic."""
from polars import Expr, Series, col

from patito import exceptions, sql
from patito.exceptions import ValidationError
from patito import exceptions
from patito.exceptions import DataFrameValidationError
from patito.polars import DataFrame, LazyFrame
from patito.pydantic import Field, Model

_CACHING_AVAILABLE = False
_DUCKDB_AVAILABLE = False
field = col("_")
__all__ = [
"DataFrame",
"DataFrameValidationError",
"Expr",
"Field",
"LazyFrame",
"Model",
"Series",
"ValidationError",
"_CACHING_AVAILABLE",
"_DUCKDB_AVAILABLE",
"col",
"exceptions",
"field",
"sql",
]

try:
from patito import duckdb

_DUCKDB_AVAILABLE = True
__all__ += ["duckdb"]
except ImportError: # pragma: no cover
pass

try:
from patito.database import Database

_CACHING_AVAILABLE = True
__all__ += ["Database"]
except ImportError:
pass


try:
from importlib.metadata import PackageNotFoundError, version
except ImportError: # pragma: no cover
Expand Down
1 change: 1 addition & 0 deletions src/patito/_pydantic/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

77 changes: 77 additions & 0 deletions src/patito/_pydantic/column_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
from __future__ import annotations

import json
from typing import (
Any,
Dict,
Optional,
Sequence,
Type,
TypeVar,
Union,
)

import polars as pl
from polars.datatypes import DataType, DataTypeClass
from pydantic import BaseModel, field_serializer

from patito._pydantic.dtypes import parse_composite_dtype


class ColumnInfo(BaseModel, arbitrary_types_allowed=True):
    """Patito-side model for storing column metadata.

    Args:
        constraints (Union[polars.Expr, Sequence[polars.Expr]]): A single
            constraint or list of constraints, expressed as polars expression
            objects. All rows must satisfy the given constraint. You can refer
            to the given column with ``pt.field``, which will automatically be
            replaced with ``polars.col(<field_name>)`` before evaluation.
        derived_from (Union[str, polars.Expr]): Marks fields that are meant to
            be derived from other fields. Users can specify a polars expression
            that will be called to derive the column value when
            ``pt.DataFrame.derive`` is called.
        dtype (polars.datatype.DataType): The given dataframe column must have
            the given polars dtype, for instance ``polars.UInt64`` or
            ``pl.Float32``.
        unique (bool): All row values must be unique.
    """

    dtype: Optional[Union[DataTypeClass, DataType]] = None
    constraints: Optional[Union[pl.Expr, Sequence[pl.Expr]]] = None
    derived_from: Optional[Union[str, pl.Expr]] = None
    unique: Optional[bool] = None

    @field_serializer("constraints", "derived_from")
    def serialize_exprs(self, exprs: str | pl.Expr | Sequence[pl.Expr] | None) -> Any:
        """Serialize constraint/derivation expressions to JSON-compatible data.

        NOTE: the ``str`` check must come before the ``Sequence`` check, since
        ``str`` is itself a ``Sequence``.
        """
        if exprs is None:
            return None
        if isinstance(exprs, str):
            return exprs
        if isinstance(exprs, pl.Expr):
            return self._serialize_expr(exprs)
        if isinstance(exprs, Sequence):
            return [self._serialize_expr(member) for member in exprs]
        raise ValueError(f"Invalid type for exprs: {type(exprs)}")

    def _serialize_expr(self, expr: pl.Expr) -> Dict:
        """Serialize a single polars expression into a plain dictionary."""
        if not isinstance(expr, pl.Expr):
            raise ValueError(f"Invalid type for expr: {type(expr)}")
        # polars only exposes expression metadata as a JSON string, so we
        # round-trip through json.loads to obtain a plain dict.
        # TODO: can we access the dictionary directly?
        return json.loads(expr.meta.write_json(None))

    @field_serializer("dtype")
    def serialize_dtype(self, dtype: DataTypeClass | DataType | None) -> Any:
        """Serialize a polars dtype into its composite string representation.

        References
        ----------
        [1] https://stackoverflow.com/questions/76572310/how-to-serialize-deserialize-polars-datatypes
        """
        if dtype is None:
            return None
        if isinstance(dtype, (DataTypeClass, DataType)):
            return parse_composite_dtype(dtype)
        raise ValueError(f"Invalid type for dtype: {type(dtype)}")


# Type variable bound to ColumnInfo subclasses, for APIs that are generic over
# the column-metadata model class.
CI = TypeVar("CI", bound=Type[ColumnInfo])
23 changes: 23 additions & 0 deletions src/patito/_pydantic/dtypes/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
"""Public interface of patito's dtype-resolution helpers.

Re-exports the annotation/dtype machinery from the ``dtypes`` and ``utils``
submodules so callers can import everything from ``patito._pydantic.dtypes``.
"""
from patito._pydantic.dtypes.dtypes import (
    DtypeResolver,
    default_dtypes_for_model,
    valid_dtypes_for_model,
    validate_annotation,
    validate_polars_dtype,
)
from patito._pydantic.dtypes.utils import (
    PYTHON_TO_PYDANTIC_TYPES,
    dtype_from_string,
    parse_composite_dtype,
)

__all__ = [
    "DtypeResolver",
    "validate_annotation",
    "validate_polars_dtype",
    "parse_composite_dtype",
    "dtype_from_string",
    "valid_dtypes_for_model",
    "default_dtypes_for_model",
    "PYTHON_TO_PYDANTIC_TYPES",
]
Loading